Coverage for mddb_workflow/utils/constants.py: 95%
150 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-29 15:48 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-29 15:48 +0000
1from os import environ
2from shutil import which
4# CONSTANTS ---------------------------------------------------------------------------
# Runtime-editable settings
# They are kept in a dict so their values can be changed while the workflow is running
GLOBALS = dict(
    # Flag controlling whether symlinks are allowed
    no_symlinks=False,
    # Flag controlling whether colors are disabled for logging
    no_colors=False,
)
# Names under which the gromacs executable is commonly installed
# They are tried in order when the executable is not forced by the user
GROMACS_EXECUTABLE_COMMON_NAMES = ['gmx', 'gmx_mpi']
# Name of the environment variable read by the workflow to know the gromacs path
GROMACS_ENV = 'MWF_GMX'
# Gromacs executable path
# The user may force it through the environment variable
GROMACS_EXECUTABLE = environ.get(GROMACS_ENV)
if not GROMACS_EXECUTABLE:
    # Not forced: keep the first common name which is found in the PATH
    for common_name in GROMACS_EXECUTABLE_COMMON_NAMES:
        if which(common_name):
            GROMACS_EXECUTABLE = common_name
            break
    else:
        # None of the common names exists, so complain
        raise RuntimeError(f'Cannot find gromacs. Is gromacs installed? Set the env variable {GROMACS_ENV} as the gromacs executable path')
# Typical text editors: display name -> shell command
TEXT_EDITORS = {
    'VIM': 'vim',
    'GNU nano': 'nano',
    'GNOME text editor': 'gedit',
    'VScode': 'code',
}
# Subset of the editors above whose command is found by shutil.which
AVAILABLE_TEXT_EDITORS = dict(
    (editor, executable)
    for editor, executable in TEXT_EDITORS.items()
    if which(executable)
)
# Format used for dates
DATE_STYLE = '%d-%m-%Y %H:%M:%S'

# Database
DEFAULT_API_URL = 'https://irb-dev.mddbr.eu/api/'

# Selections
# Standard selections, including protein and nucleic acid backbones, in vmd syntax
ALL_ATOMS = 'all'
PROTEIN_AND_NUCLEIC = 'protein or nucleic'
PROTEIN_AND_NUCLEIC_BACKBONE = (
    "(protein and name N CA C) or "
    "(nucleic and name P O5' O3' C5' C4' C3')"
)

# Inputs file
DEFAULT_INPUTS_FILENAME = 'inputs.yaml'
# Accepted names: the default, another yaml extension and the legacy json inputs file
ACCEPTED_INPUT_FILENAMES = [DEFAULT_INPUTS_FILENAME, 'inputs.yml', 'inputs.json']

# Default input values, used when the value is not specified
# An input field with no default value here will be set as None
# NOTE(review): the key is spelled 'linkcense' (presumably the license link field) - confirm before renaming
DEFAULT_INPUT_VALUES = {
    'license': (
        'This trajectory dataset is released under a '
        'Creative Commons Attribution 4.0 International Public License'
    ),
    'linkcense': 'https://creativecommons.org/licenses/by/4.0/',
    'mdref': 0,
}

# Expected MD inputs
MD_DIRECTORY = 'mdir'

# Input config file for the NASSA analysis
DEFAULT_NASSA_CONFIG_FILENAME = 'nassa.json'

# Markov State Model input filenames
DEFAULT_POPULATIONS_FILENAME = 'populations.json'
DEFAULT_TRANSITIONS_FILENAME = 'transitions.json'

# Old system for cases where the original topology is very wrong and charges must be provided manually
RAW_CHARGES_FILENAME = 'charges.txt'
# Topology formats accepted for atomic charges mining
ACCEPTED_TOPOLOGY_FORMATS = ['tpr', 'top', 'psf', 'prmtop', 'prm7']
# Input files processing intermediate steps
# Every intermediate file gets a different name and no input or intermediate file is ever renamed/overwritten
# This allows us to know where we were in case the process was interrupted and not repeat steps on reset
# Intermediate files are removed at the end of the process if it was successful

INCOMPLETE_PREFIX = 'incomplete_'

CONVERTED = 'converted'
CONVERTED_STRUCTURE = f'{CONVERTED}.pdb'
CONVERTED_TRAJECTORY = f'{CONVERTED}.xtc'

FILTERED = 'filtered'
FILTERED_STRUCTURE = f'{FILTERED}.pdb'
FILTERED_TRAJECTORY = f'{FILTERED}.xtc'

IMAGED = 'imaged'
IMAGED_STRUCTURE = f'{IMAGED}.pdb'
IMAGED_TRAJECTORY = f'{IMAGED}.xtc'

CORRECTED = 'corrected'
CORRECTED_STRUCTURE = f'{CORRECTED}.pdb'
CORRECTED_TRAJECTORY = f'{CORRECTED}.xtc'

# Output core files
STANDARD_TOPOLOGY_FILENAME = 'topology.json'
STRUCTURE_FILENAME = 'structure.pdb'
TRAJECTORY_FILENAME = 'trajectory.xtc'

# Auxiliary files
REGISTER_FILENAME = '.register.json'
CACHE_FILENAME = '.mwf_cache.json'

# Files saving resorted bonds and charges when atoms have to be resorted
# Note that these files have priority when loading both bonds and charges
RESORTED_CHARGES_FILENAME = 'resorted_charges.json'
RESORTED_BONDS_FILENAME = 'resorted_bonds.json'

# Generated file names
FIRST_FRAME_FILENAME = 'first_frame.pdb'
AVERAGE_STRUCTURE_FILENAME = 'average.pdb'

# Reference labels according to the reference file used
REFERENCE_LABELS = dict([
    (FIRST_FRAME_FILENAME, 'firstframe'),
    (AVERAGE_STRUCTURE_FILENAME, 'average'),
])
# Output files generated to be uploaded to the database

# References files
# PDB (Protein Data Bank) references
PDB_REFERENCES_FILENAME = 'pdb_references.json'
# Protein references
PROTEIN_REFERENCES_FILENAME = 'protein_references.json'
# Ligand references
LIGAND_REFERENCES_FILENAME = 'ligand_references.json'
# InChIKey references
INCHIKEY_REFERENCES_FILENAME = 'inchikey_references.json'

# Chains file
OUTPUT_CHAINS_FILENAME = 'chains.json'

# Metadata file
OUTPUT_METADATA_FILENAME = 'metadata.json'

# Screenshot file
OUTPUT_SCREENSHOT_FILENAME = 'mdf.screenshot.jpg'

# Additional screenshot files (filename pattern with wildcards)
OUTPUT_CLUSTER_SCREENSHOT_FILENAMES = 'mdf.clusters_*_screenshot_??.jpg'

# Analyses files to be generated
OUTPUT_INTERACTIONS_FILENAME = 'mda.interactions.json'
OUTPUT_RMSDS_FILENAME = 'mda.rmsds.json'
OUTPUT_TMSCORES_FILENAME = 'mda.tmscores.json'
OUTPUT_RMSF_FILENAME = 'mda.fluctuation.json'
OUTPUT_RGYR_FILENAME = 'mda.rgyr.json'
OUTPUT_PCA_FILENAME = 'mda.pca.json'
OUTPUT_PCA_PROJECTION_PREFIX = 'mdt.pca_trajectory'
OUTPUT_PCA_CONTACTS_FILENAME = 'mda.pca_contacts.json'
OUTPUT_RMSD_PERRES_FILENAME = 'mda.rmsd_perres.json'
OUTPUT_RMSD_PAIRWISE_FILENAME = 'mda.rmsd_pairwise.json'
OUTPUT_CLUSTERS_FILENAME = 'mda.clusters.json'
OUTPUT_DIST_PERRES_FILENAME = 'mda.dist_perres.json'
OUTPUT_HBONDS_FILENAME = 'mda.hbonds.json'
OUTPUT_SASA_FILENAME = 'mda.sasa.json'
OUTPUT_ENERGIES_FILENAME = 'mda.energies.json'
OUTPUT_DIHEDRAL_ENERGIES_FILENAME = 'mda.dihenergies.json'
OUTPUT_POCKETS_FILENAME = 'mda.pockets.json'
# WARNING: If this prefix is changed then the pockets function must be updated as well
OUTPUT_POCKET_STRUCTURES_PREFIX = 'mdf.pocket'
OUTPUT_HELICAL_PARAMETERS_FILENAME = 'mda.helical.json'
OUTPUT_MARKOV_FILENAME = 'mda.markov.json'
OUTPUT_PROVENANCE_FILENAME = 'mda.provenance.json'
MEMBRANE_MAPPING_FILENAME = 'mda.mem_map.json'
OUTPUT_DENSITY_FILENAME = 'mda.density.json'
OUTPUT_THICKNESS_FILENAME = 'mda.thickness.json'
OUTPUT_APL_FILENAME = 'mda.apl.json'
OUTPUT_LIPID_ORDER_FILENAME = 'mda.lipid_order.json'
OUTPUT_LIPID_INTERACTIONS_FILENAME = 'mda.lipid_inter.json'
OUTPUT_CHANNELS_FILENAME = 'mda.channels.json'
# Problematic signs for directory/folder names
# º is forbidden since paths including this character are not readable by MDtraj
FORBIDDEN_DIRECTORY_CHARACTERS = list('.,;:º/')

# Default parameters
DEFAULT_RMSD_CUTOFF = 9
DEFAULT_INTERACTION_CUTOFF = 0.1

# Register cache flags
SNAPSHOTS_FLAG = 'snapshots'
PDB_TO_PUBCHEM = 'pdb2pubchem'
NOT_MATCHED_LIGANDS = 'notmatchedligands'

# Test flags
STABLE_BONDS_FLAG = 'stabonds'
COHERENT_BONDS_FLAG = 'cohbonds'
TRAJECTORY_INTEGRITY_FLAG = 'intrajrity'
CORRECT_ELEMENTS = 'elements'
REFERENCE_SEQUENCE_FLAG = 'refseq'
STABLE_INTERACTIONS_FLAG = 'interact'
LIGANDS_MATCH_FLAG = 'ligands'
CHAINS_ANALYSIS = 'chains'

# All the available checkings, which may be trusted
AVAILABLE_CHECKINGS = [
    STABLE_BONDS_FLAG,
    COHERENT_BONDS_FLAG,
    TRAJECTORY_INTEGRITY_FLAG,
]
# All critical process failures, which are to be lethal for the workflow unless mercy is given
AVAILABLE_FAILURES = [
    *AVAILABLE_CHECKINGS,
    CORRECT_ELEMENTS,
    REFERENCE_SEQUENCE_FLAG,
    STABLE_INTERACTIONS_FLAG,
    LIGANDS_MATCH_FLAG,
    CHAINS_ANALYSIS,
]

# Tests to be run when some input files are modified
STRUCTURE_TESTS = [STABLE_BONDS_FLAG, COHERENT_BONDS_FLAG]
TRAJECTORY_TESTS = [STABLE_BONDS_FLAG, TRAJECTORY_INTEGRITY_FLAG]
# Same tests as for the structure, kept as an independent list
TOPOLOGY_TESTS = list(STRUCTURE_TESTS)
# Terminal colors (ANSI escape sequences)
# https://stackoverflow.com/questions/287871/how-do-i-print-colored-text-to-the-terminal
# NOTE(review): this check runs once, when the module is imported, so editing
# GLOBALS['no_colors'] afterwards does not update the names below
if GLOBALS['no_colors']:
    # Colors are disabled: every header becomes an empty string
    GREEN_HEADER = CYAN_HEADER = BLUE_HEADER = YELLOW_HEADER = ''
    RED_HEADER = GREY_HEADER = COLOR_END = ''
else:
    GREEN_HEADER = '\033[92m'
    CYAN_HEADER = '\033[96m'
    BLUE_HEADER = '\033[94m'
    YELLOW_HEADER = '\033[93m'
    RED_HEADER = '\033[91m'
    GREY_HEADER = '\033[90m'
    # Sequence resetting the terminal style
    COLOR_END = '\033[0m'
# Dictionary to parse an internal raw name to a pretty human friendly name
NICE_NAMES = dict([
    (STABLE_BONDS_FLAG, 'Stable bonds test'),
    (COHERENT_BONDS_FLAG, 'Coherent bonds test'),
    (TRAJECTORY_INTEGRITY_FLAG, 'Trajectory integrity test'),
    (CORRECT_ELEMENTS, 'Correct elements'),
    (REFERENCE_SEQUENCE_FLAG, 'Reference sequence match'),
    (STABLE_INTERACTIONS_FLAG, 'Interactions are stable'),
    (LIGANDS_MATCH_FLAG, 'Ligands matched residues'),
    (CHAINS_ANALYSIS, 'Chains analysis'),
])
# Maps every possible file extension to its "standard" file format
# Note that some formats have several possible extensions (e.g. nc, cdf, netcdf)
EXTENSION_FORMATS = {
    # Topologies
    'tpr': 'tpr',
    'top': 'top',
    'psf': 'psf',
    'prmtop': 'prmtop', 'parm7': 'prmtop', 'prm7': 'prmtop',
    # charges.txt
    'txt': 'txt',
    # Structures
    'pdb': 'pdb',
    'gro': 'gro',
    'cif': 'cif',
    # Trajectories
    'xtc': 'xtc',
    'trr': 'trr',
    'dcd': 'dcd',
    'nc': 'nc', 'cdf': 'nc', 'netcdf': 'nc',
    'crd': 'crd', 'mdcrd': 'crd', 'trj': 'crd',
    # Restart files (may be used as single frame trajectories)
    'rst7': 'rst7',
    # Other
    'json': 'json',
    'yaml': 'yaml', 'yml': 'yaml',
    'npy': 'npy',
    'in': 'txt',
    'h5': 'h5',
}
# Topology and trajectory file formats supported by PyTraj
# Every line ends with a comma: without the comma after 'pdb', adjacent string literals
# are concatenated and the set would hold 'pdbnc' instead of 'pdb' and 'nc'
PYTRAJ_SUPPORTED_FORMATS = {
    # Topologies
    'prmtop', 'top', 'psf', 'pdb',
    # Trajectories
    'nc', 'crd', 'dcd', 'trr', 'xtc',
}
# For reference, from GitHub:
# ParmFormatDict = {
#     "AMBERPARM": AMBERPARM,
#     "PDBFILE": PDBFILEPARM,
#     "MOL2FILE": MOL2FILEPARM,
#     "CHARMMPSF": CHARMMPSF,
#     "CIFFILE": CIFFILE,
#     "GMXTOP": GMXTOP,
#     "SDFFILE": SDFFILE,
#     "TINKER": TINKERPARM,
#     "UNKNOWN_PARM": UNKNOWN_PARM,
# }

# Flags required to write files with pytraj, for every supported topology format
PYTRAJ_PARM_FORMAT = dict(
    prmtop='AMBERPARM',
    psf='CHARMMPSF',
    top='GMXTOP',
    pdb='PDBFILE',
)
# Elements supported while correcting atom elements
# DANI: Ba was found in PDB 1J6S
# DANI: Lu was found in PDB 1DUH
# DANI: U was found in PDB 2GIC
# DANI: V was found in PDB 2P7E
# DANI: Tb was found in PDB 359D
# DANI: Ag was found in PDB 5AY2
# DANI: Rb was found in PDB 3GGK

# Elements which are always "bonded"
SUPPORTED_POLYMER_ELEMENTS = {'C', 'N', 'O', 'H', 'P', 'S', 'D'}
# Elements which may be found both "bonded" or "alone"
SUPPORTED_COORDINATED_ELEMENTS = {
    'Zn', 'Fe', 'Mn', 'Co', 'Lu', 'U', 'V', 'Al', 'Ba', 'Be', 'F', 'Te',
}
# Elements which are always "alone"
SUPPORTED_ION_ELEMENTS = {
    'K', 'Cl', 'Na', 'Mg', 'Br', 'I', 'Ca', 'Tb', 'Ag', 'Tl', 'Rb',
}
# All the supported elements together
SUPPORTED_ELEMENTS = (
    SUPPORTED_POLYMER_ELEMENTS
    | SUPPORTED_COORDINATED_ELEMENTS
    | SUPPORTED_ION_ELEMENTS
)
# Dictionaries with all residue names and their equivalent letters
# Amino acids: residue name -> one-letter code
# Every amino acid is listed with its N/C suffixed names and other known name variants
PROTEIN_RESIDUE_NAME_LETTERS = {
    # Alanine
    'ALA': 'A', 'ALAN': 'A', 'ALAC': 'A',
    # Arginine
    'ARG': 'R', 'ARGN': 'R', 'ARGC': 'R',
    # Asparagine
    'ASN': 'N', 'ASNN': 'N', 'ASNC': 'N',
    # Aspartic acid
    'ASP': 'D', 'ASPN': 'D', 'ASPC': 'D',
    # Cysteine
    'CYS': 'C', 'CYSN': 'C', 'CYSC': 'C',
    'CYH': 'C', 'CSH': 'C', 'CSS': 'C', 'CYX': 'C', 'CYP': 'C',
    # Glutamine
    'GLN': 'Q', 'GLNN': 'Q', 'GLNC': 'Q',
    # Glutamic acid
    'GLU': 'E', 'GLUN': 'E', 'GLUC': 'E', 'GLUP': 'E',
    # Glycine
    'GLY': 'G', 'GLYN': 'G', 'GLYC': 'G',
    # Histidine
    'HIS': 'H', 'HISN': 'H', 'HISC': 'H',
    'HID': 'H', 'HIE': 'H', 'HIP': 'H', 'HSD': 'H', 'HSE': 'H',
    # Isoleucine
    'ILE': 'I', 'ILEN': 'I', 'ILEC': 'I', 'ILU': 'I',
    # Leucine
    'LEU': 'L', 'LEUN': 'L', 'LEUC': 'L',
    # Lysine
    'LYS': 'K', 'LYSN': 'K', 'LYSC': 'K',
    # Methionine
    'MET': 'M', 'METN': 'M', 'METC': 'M',
    # Phenylalanine
    'PHE': 'F', 'PHEN': 'F', 'PHEC': 'F',
    # Proline
    'PRO': 'P', 'PRON': 'P', 'PROC': 'P',
    'PRØ': 'P', 'PR0': 'P', 'PRZ': 'P',
    # Serine
    'SER': 'S', 'SERN': 'S', 'SERC': 'S',
    # Threonine
    # 'THRC' used to be mapped to 'R' (arginine), which was a typo
    'THR': 'T', 'THRN': 'T', 'THRC': 'T',
    # Tryptophan
    'TRP': 'W', 'TRPN': 'W', 'TRPC': 'W', 'TRY': 'W',
    # Tyrosine
    'TYR': 'Y', 'TYRN': 'Y', 'TYRC': 'Y',
    # Valine
    'VAL': 'V', 'VALN': 'V', 'VALC': 'V',
}
# Nucleotides: residue name -> one-letter code
# DNA residue names
DNA_RESIDUE_NAME_LETTERS = {
    'DA': 'A',
    'T': 'T', 'T3': 'T', 'T5': 'T',
    'DT': 'T',
    'DC': 'C',
    'DG': 'G',
    'DA3': 'A', 'DA5': 'A',
    'DT3': 'T', 'DT5': 'T',
    'DC3': 'C', 'DC5': 'C',
    'DG3': 'G', 'DG5': 'G',
}
# RNA residue names
RNA_RESIDUE_NAME_LETTERS = {
    'RA': 'A',
    'U': 'U', 'U3': 'U', 'U5': 'U',
    'RU': 'U',
    'RC': 'C',
    'RG': 'G',
    'RA3': 'A', 'RA5': 'A',
    'RU3': 'U', 'RU5': 'U',
    'RC3': 'C', 'RC5': 'C',
    'RG3': 'G', 'RG5': 'G',
}
# Both DNA and RNA names, plus the names listed in neither of the dictionaries above
NUCLEIC_RESIDUE_NAME_LETTERS = {
    **DNA_RESIDUE_NAME_LETTERS,
    **RNA_RESIDUE_NAME_LETTERS,
    'A': 'A', 'A3': 'A', 'A5': 'A',
    'C': 'C', 'C3': 'C', 'C5': 'C',
    'G': 'G', 'G3': 'G', 'G5': 'G',
}
# All residue names together: protein names first, then nucleic names
RESIDUE_NAME_LETTERS = dict(PROTEIN_RESIDUE_NAME_LETTERS)
RESIDUE_NAME_LETTERS.update(NUCLEIC_RESIDUE_NAME_LETTERS)
# Lipid common residue names
# Source: https://github.com/NMRLipids/Databank/blob/main/Scripts/DatabankLib/settings/molecules.py#L10
# Meanings: https://github.com/NMRLipids/Databank/blob/48fdf2c4149d0db8900ce08b0e74dc1836dcfab3/Scripts/BuildDatabank/docs/source/moleculesAndMapping.md?plain=1#L50
FATTY_RESIDUE_NAMES = {
    "POPC", "POPG", "POPS", "POPE", "PYPC",
    "PAzePCprot", "PAzePCdeprot",
    "DMPC", "DPPC", "DPPE", "DPPG", "DEPC", "DRPC", "DYPC", "DLPC", "DLIPC",
    "DOG", "DOPC", "DOPE", "DDOPC", "DOPS", "DSPC", "DAPC", "DMTAP",
    "SDG", "SDPE", "SOPC",
    "POPI", "SAPI", "SAPI24", "SAPI25", "SLPI",
    "CER", "CER180", "DHMDMAB", "SLiPC",
    "SM16", "SM18",
    "TOCL", "TLCL_0H", "TMCL",
    "GM1", "DPPGK", "GB3", "BOG",
}
STEROID_RESIDUE_NAMES = {"CHL", "CHL1", "CHOL", "DCHOL"}
# Both fatty and steroid residue names
LIPIDS_RESIDUE_NAMES = FATTY_RESIDUE_NAMES | STEROID_RESIDUE_NAMES

# Typical residue names, used to guess what residues are
STANDARD_SOLVENT_RESIDUE_NAMES = {'SOL', 'WAT', 'HOH', 'TIP', 'TP3', 'SWM4'}
# WARNING: Note that standard names also include + and - symbols
# Use functions such as Structure.select_counter_ions instead of checking if the set includes a name
STANDARD_COUNTER_CATION_ATOM_NAMES = {'K', 'NA', 'SOD', 'POT'}
STANDARD_COUNTER_ANION_ATOM_NAMES = {'CL', 'CLA'}
# Both cation and anion atom names
STANDARD_COUNTER_ION_ATOM_NAMES = (
    STANDARD_COUNTER_CATION_ATOM_NAMES | STANDARD_COUNTER_ANION_ATOM_NAMES
)
STANDARD_DUMMY_ATOM_NAMES = {'MW'}
DUMMY_ATOM_ELEMENT = 'Dm'
CG_ATOM_ELEMENT = 'Cg'
# Topology flags

# Flag to represent a protein which is not referable (e.g. antibodies, synthetic constructs)
NO_REFERABLE_FLAG = 'noref'

# Flag to represent a not found reference
NOT_FOUND_FLAG = 'notfound'

# Reference id formats
# A PDB id: one digit from 1 to 9 followed by three alphanumeric characters
PDB_ID_FORMAT = r'^[1-9]{1}[a-zA-Z0-9]{3}$'

# Available analyses for NASSA
NASSA_ANALYSES_LIST = ['bconf', 'coordist', 'bpcorr', 'crdcorr', 'stiff']

# Corresponding canals archives (.ser) for each NASSA analysis
# The entries for 'bconf' and 'crdcorr' are commented out:
#   'bconf': ['epsilC', 'epsilW', 'zetaC', 'zetaW'],
#   'crdcorr': ['shift', 'slide', 'rise', 'tilt', 'roll', 'twist'],
NASSA_ANALYSES_CANALS = {
    'coordist': [
        'shift', 'slide', 'rise', 'tilt', 'roll', 'twist', 'chiW', 'chiC',
    ],
    'bpcorr': [
        'shift', 'slide', 'rise', 'tilt', 'roll', 'twist',
    ],
    'stiff': [
        'stretch', 'shear', 'buckle', 'stagger', 'propel', 'opening', 'chiW', 'chiC',
    ],
}