Coverage for mddb_workflow / utils / constants.py: 95%
159 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 18:45 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 18:45 +0000
1from os import environ
2from pathlib import Path
3from shutil import which
5# CONSTANTS ---------------------------------------------------------------------------
# Custom registry of global settings.
# Keeping them in a dict lets the value of a setting be edited at runtime.
GLOBALS = dict(
    # Flag controlling whether symlinks are allowed
    no_symlinks=False,
    # Flag controlling whether colors are disabled for logging
    no_colors=False,
)
# Candidate gromacs commands tried when the executable is not forced by the user
GROMACS_EXECUTABLE_COMMON_NAMES = ["gmx", "gmx_mpi"]
# Name of the environment variable read by the workflow to know the gromacs path
GROMACS_ENV = "MWF_GMX"
# Gromacs executable path.
# The user may force it through the environment variable named above.
GROMACS_EXECUTABLE = environ.get(GROMACS_ENV)
if not GROMACS_EXECUTABLE:
    # Not forced: try the known common names until one is found in the PATH
    for common_name in GROMACS_EXECUTABLE_COMMON_NAMES:
        if which(common_name):
            GROMACS_EXECUTABLE = common_name
            break
    else:
        # No candidate was found, so complain
        raise RuntimeError(f'Cannot find gromacs. Is gromacs installed? Set the env variable {GROMACS_ENV} as the gromacs executable path')
# Typical text editors (display name -> command used to launch it)
TEXT_EDITORS = {
    "VIM": "vim",
    "GNU nano": "nano",
    "GNOME text editor": "gedit",
    "VScode": "code",
}
# Subset of the editors above whose command is found by shutil.which
AVAILABLE_TEXT_EDITORS = {
    editor_name: editor_command
    for editor_name, editor_command in TEXT_EDITORS.items()
    if which(editor_command)
}
# Format used for dates
DATE_STYLE = "%d-%m-%Y %H:%M:%S"

# Database
DEFAULT_API_URL = "https://irb-dev.mddbr.eu/api/"

# Selections
# Standard selections, including protein and nucleic acid backbones, in vmd syntax
ALL_ATOMS = "all"
PROTEIN_AND_NUCLEIC = "protein or nucleic"
PROTEIN_AND_NUCLEIC_BACKBONE = (
    "(protein and name N CA C)"
    " or "
    "(nucleic and name P O5' O3' C5' C4' C3')"
)
# Inputs file
DEFAULT_INPUTS_FILENAME = "inputs.yaml"
# Accepted names, in order: the default, the alternative yaml extension
# and the legacy json inputs file
ACCEPTED_INPUT_FILENAMES = [DEFAULT_INPUTS_FILENAME, "inputs.yml", "inputs.json"]

# Default input values used when the value is not specified.
# An input field with no default value here will be set as None.
DEFAULT_INPUT_VALUES = {
    "license": "This trajectory dataset is released under a Creative Commons Attribution 4.0 International Public License",
    "linkcense": "https://creativecommons.org/licenses/by/4.0/",
    "mdref": 0,
}
# Resource files which are always part of the workflow.
# The resources directory sits next to the parent directory of this file.
RESOURCES_DIRECTORY_PATH = resources = str(Path(__file__).parent.parent.joinpath("resources"))
INPUTS_TEMPLATE_FILEPATH = RESOURCES_DIRECTORY_PATH + "/inputs_file_template.yml"
NASSA_TEMPLATE_FILEPATH = RESOURCES_DIRECTORY_PATH + "/nassa_template.yml"
GROMACS_CUSTOM_MASSES_FILEPATH = RESOURCES_DIRECTORY_PATH + "/atommass.dat"
CMIP_INPUTS_CHECKONLY_SOURCE = RESOURCES_DIRECTORY_PATH + "/cmip_check.in"
CMIP_INPUTS_SOURCE = RESOURCES_DIRECTORY_PATH + "/cmip.in"
CMIP_VDW_SOURCE = RESOURCES_DIRECTORY_PATH + "/vdwprm"
ENERGIES_DEBUG_SCRIPT_SOURCE = RESOURCES_DIRECTORY_PATH + "/get_energies_sum.py"
# Expected MD inputs
MD_DIRECTORY = "mdir"

# Input config file for the NASSA analysis
DEFAULT_NASSA_CONFIG_FILENAME = "nassa.json"

# Markov State Model input filenames
DEFAULT_POPULATIONS_FILENAME = "populations.json"
DEFAULT_TRANSITIONS_FILENAME = "transitions.json"

# Old system for when the original topology is very wrong and charges must be provided manually
RAW_CHARGES_FILENAME = "charges.txt"
# Topology formats accepted for atomic charges mining
ACCEPTED_TOPOLOGY_FORMATS = [
    "tpr",
    "top",
    "psf",
    "prmtop",
    "prm7",
]
# Input files processing intermediate steps
# Every intermediate file is named differently and no input or intermediate file is ever renamed/overwritten
# This allows us to know where we were in case the process was interrupted and not repeat steps on reset
# Intermediate files are removed at the end of the process if it was successful

INCOMPLETE_PREFIX = "incomplete_"

# Each step has a base name plus a structure (.pdb) and a trajectory (.xtc) filename
CONVERTED = "converted"
CONVERTED_STRUCTURE = f"{CONVERTED}.pdb"
CONVERTED_TRAJECTORY = f"{CONVERTED}.xtc"

FILTERED = "filtered"
FILTERED_STRUCTURE = f"{FILTERED}.pdb"
FILTERED_TRAJECTORY = f"{FILTERED}.xtc"

IMAGED = "imaged"
IMAGED_STRUCTURE = f"{IMAGED}.pdb"
IMAGED_TRAJECTORY = f"{IMAGED}.xtc"

CORRECTED = "corrected"
CORRECTED_STRUCTURE = f"{CORRECTED}.pdb"
CORRECTED_TRAJECTORY = f"{CORRECTED}.xtc"
# Output core files
STANDARD_TOPOLOGY_FILENAME = "topology.json"
STRUCTURE_FILENAME = "structure.pdb"
TRAJECTORY_FILENAME = "trajectory.xtc"

# Auxiliary files
REGISTER_FILENAME = ".register.json"
CACHE_FILENAME = ".mwf_cache.json"

# Files saving resorted bonds and charges when we have to resort atoms
# Note that these files have priority when loading both bonds and charges
RESORTED_CHARGES_FILENAME = "resorted_charges.json"
RESORTED_BONDS_FILENAME = "resorted_bonds.json"

# Generated file names
FIRST_FRAME_FILENAME = "first_frame.pdb"
AVERAGE_STRUCTURE_FILENAME = "average.pdb"

# Reference labels, keyed by the reference file used
REFERENCE_LABELS = {
    FIRST_FRAME_FILENAME: "firstframe",
    AVERAGE_STRUCTURE_FILENAME: "average",
}
# Output files generated to be uploaded to the database

# PDB (Protein Data Bank) references filename
PDB_REFERENCES_FILENAME = "pdb_references.json"
# Protein references filename
PROTEIN_REFERENCES_FILENAME = "protein_references.json"
# InChIKey references filename
INCHIKEY_REFERENCES_FILENAME = "inchikey_references.json"

# Chains filename
OUTPUT_CHAINS_FILENAME = "chains.json"

# Metadata filename
OUTPUT_METADATA_FILENAME = "metadata.json"

# Screenshot filename
OUTPUT_SCREENSHOT_FILENAME = "mdf.screenshot.jpg"

# Additional screenshot filenames (wildcard pattern)
OUTPUT_CLUSTER_SCREENSHOT_FILENAMES = "mdf.clusters_*_screenshot_??.jpg"
# Filenames (or filename prefixes) of the analysis files to be generated
OUTPUT_INTERACTIONS_FILENAME = "mda.interactions.json"
OUTPUT_RMSDS_FILENAME = "mda.rmsds.json"
OUTPUT_TMSCORES_FILENAME = "mda.tmscores.json"
OUTPUT_RMSF_FILENAME = "mda.fluctuation.json"
OUTPUT_RGYR_FILENAME = "mda.rgyr.json"
OUTPUT_PCA_FILENAME = "mda.pca.json"
OUTPUT_PCA_PROJECTION_PREFIX = "mdt.pca_trajectory"
OUTPUT_PCA_CONTACTS_FILENAME = "mda.pca_contacts.json"
OUTPUT_RMSD_PERRES_FILENAME = "mda.rmsd_perres.json"
OUTPUT_RMSD_PAIRWISE_FILENAME = "mda.rmsd_pairwise.json"
OUTPUT_CLUSTERS_FILENAME = "mda.clusters.json"
OUTPUT_DIST_PERRES_FILENAME = "mda.dist_perres.json"
OUTPUT_HBONDS_FILENAME = "mda.hbonds.json"
OUTPUT_SASA_FILENAME = "mda.sasa.json"
OUTPUT_ENERGIES_FILENAME = "mda.energies.json"
OUTPUT_DIHEDRAL_ENERGIES_FILENAME = "mda.dihenergies.json"
OUTPUT_POCKETS_FILENAME = "mda.pockets.json"
# WARNING: If this prefix is changed then the pockets function must be updated as well
OUTPUT_POCKET_STRUCTURES_PREFIX = "mdf.pocket"
OUTPUT_HELICAL_PARAMETERS_FILENAME = "mda.helical.json"
OUTPUT_MARKOV_FILENAME = "mda.markov.json"
OUTPUT_PROVENANCE_FILENAME = "mda.provenance.json"
MEMBRANE_MAPPING_FILENAME = "mda.mem_map.json"
OUTPUT_DENSITY_FILENAME = "mda.density.json"
OUTPUT_THICKNESS_FILENAME = "mda.thickness.json"
OUTPUT_APL_FILENAME = "mda.apl.json"
OUTPUT_LIPID_ORDER_FILENAME = "mda.lipid_order.json"
OUTPUT_LIPID_INTERACTIONS_FILENAME = "mda.lipid_inter.json"
OUTPUT_CHANNELS_FILENAME = "mda.channels.json"
# Problematic characters for directory/folder names, one list item per character
# º is forbidden since paths including this character are not readable by MDtraj
FORBIDDEN_DIRECTORY_CHARACTERS = list(".,;:º/")

# Default parameters
DEFAULT_RMSD_CUTOFF = 9
DEFAULT_INTERACTION_CUTOFF = 0.1

# Register cache flags
SNAPSHOTS_FLAG = "snapshots"
PDB_TO_PUBCHEM = "pdb2pubchem"
# Test and warning flags
MISSING_BONDS_FLAG = "nobonds"
STABLE_BONDS_FLAG = "stabonds"
COHERENT_BONDS_FLAG = "cohbonds"
TRAJECTORY_INTEGRITY_FLAG = "intrajrity"
CORRECT_ELEMENTS = "elements"
REFERENCE_SEQUENCE_FLAG = "refseq"
STABLE_INTERACTIONS_FLAG = "interact"
LIGANDS_MATCH_FLAG = "ligands"
CHAINS_ANALYSIS = "chains"
FAITH_BYPASS = "faith"

# All the available checkings, which may be trusted
AVAILABLE_CHECKINGS = [
    STABLE_BONDS_FLAG,
    COHERENT_BONDS_FLAG,
    TRAJECTORY_INTEGRITY_FLAG,
]
# All critical process failures, which are to be lethal for the workflow unless mercy is given
AVAILABLE_FAILURES = [
    *AVAILABLE_CHECKINGS,
    CORRECT_ELEMENTS,
    REFERENCE_SEQUENCE_FLAG,
    STABLE_INTERACTIONS_FLAG,
    LIGANDS_MATCH_FLAG,
    CHAINS_ANALYSIS,
]

# Tests to be run when some input files are modified
STRUCTURE_TESTS = [STABLE_BONDS_FLAG, COHERENT_BONDS_FLAG]
TRAJECTORY_TESTS = [STABLE_BONDS_FLAG, TRAJECTORY_INTEGRITY_FLAG]
TOPOLOGY_TESTS = [STABLE_BONDS_FLAG, COHERENT_BONDS_FLAG]
# Terminal colors
# https://stackoverflow.com/questions/287871/how-do-i-print-colored-text-to-the-terminal
# Each constant is an ANSI escape sequence, or an empty string when colors are disabled.
# NOTE: GLOBALS['no_colors'] is read here, while this module is being executed;
# changing it afterwards does not update these constants.
GREEN_HEADER = '' if GLOBALS['no_colors'] else '\033[92m'
CYAN_HEADER = '' if GLOBALS['no_colors'] else '\033[96m'
BLUE_HEADER = '' if GLOBALS['no_colors'] else '\033[94m'
YELLOW_HEADER = '' if GLOBALS['no_colors'] else '\033[93m'
RED_HEADER = '' if GLOBALS['no_colors'] else '\033[91m'
GREY_HEADER = '' if GLOBALS['no_colors'] else '\033[90m'
COLOR_END = '' if GLOBALS['no_colors'] else '\033[0m'
# Dictionary to parse an internal raw name to a pretty human friendly name
NICE_NAMES = {
    STABLE_BONDS_FLAG: "Stable bonds test",
    COHERENT_BONDS_FLAG: "Coherent bonds test",
    TRAJECTORY_INTEGRITY_FLAG: "Trajectory integrity test",
    CORRECT_ELEMENTS: "Correct elements",
    REFERENCE_SEQUENCE_FLAG: "Reference sequence match",
    STABLE_INTERACTIONS_FLAG: "Interactions are stable",
    LIGANDS_MATCH_FLAG: "Ligands matched residues",
    CHAINS_ANALYSIS: "Chains analysis",
}
# Map every possible file extension to its "standard" file format
# Note that some formats have several possible extensions (e.g. nc, cdf, netcdf)
EXTENSION_FORMATS = {
    # Topologies
    "tpr": "tpr",
    "top": "top",
    "psf": "psf",
    "prmtop": "prmtop", "parm7": "prmtop", "prm7": "prmtop",
    "txt": "txt",  # charges.txt
    # Structures
    "pdb": "pdb",
    "gro": "gro",
    "cif": "cif",
    # Trajectories
    "xtc": "xtc",
    "trr": "trr",
    "dcd": "dcd",
    "nc": "nc", "cdf": "nc", "netcdf": "nc",
    "crd": "crd", "mdcrd": "crd", "trj": "crd",
    # Restart files (may be used as single frame trajectories)
    "rst7": "rst7",
    # Other
    "json": "json",
    "yaml": "yaml", "yml": "yaml",
    "npy": "npy",
    "in": "txt",
    "h5": "h5",
    "jpg": "jpg", "jpeg": "jpg",
}
# Topology and trajectory file formats supported by PyTraj
# NOTE: every item needs its own trailing comma. Adjacent string literals are
# silently concatenated by Python, which previously merged 'pdb' and 'nc' into
# a single bogus 'pdbnc' entry and left both real formats out of the set.
PYTRAJ_SUPPORTED_FORMATS = {
    # Topologies
    'prmtop', 'top', 'psf', 'pdb',
    # Trajectories
    'nc', 'crd', 'dcd', 'trr', 'xtc',
}
# From GitHub:
# ParmFormatDict = {
#     "AMBERPARM": AMBERPARM,
#     "PDBFILE": PDBFILEPARM,
#     "MOL2FILE": MOL2FILEPARM,
#     "CHARMMPSF": CHARMMPSF,
#     "CIFFILE": CIFFILE,
#     "GMXTOP": GMXTOP,
#     "SDFFILE": SDFFILE,
#     "TINKER": TINKERPARM,
#     "UNKNOWN_PARM": UNKNOWN_PARM,
# }

# Flags required to write files with pytraj (file format -> pytraj parm format name)
PYTRAJ_PARM_FORMAT = {
    "prmtop": "AMBERPARM",
    "psf": "CHARMMPSF",
    "top": "GMXTOP",
    "pdb": "PDBFILE",
}
# Elements supported while correcting atom elements
# DANI: Ba was found in PDB 1J6S
# DANI: Lu was found in PDB 1DUH
# DANI: U was found in PDB 2GIC
# DANI: V was found in PDB 2P7E
# DANI: Tb was found in PDB 359D
# DANI: Ag was found in PDB 5AY2
# DANI: Rb was found in PDB 3GGK

# Elements which are always "bonded"
SUPPORTED_POLYMER_ELEMENTS = {"C", "N", "O", "H", "P", "S", "D"}
# Elements which may be found both "bonded" or "alone"
SUPPORTED_COORDINATED_ELEMENTS = {
    "Zn", "Fe", "Mn", "Co", "Lu", "U", "V", "Al", "Ba", "Be", "F", "Te",
}
# Elements which are always "alone"
SUPPORTED_ION_ELEMENTS = {
    "K", "Cl", "Na", "Mg", "Br", "I", "Ca", "Tb", "Ag", "Tl", "Rb",
}
# Union of the three groups above
SUPPORTED_ELEMENTS = (
    SUPPORTED_POLYMER_ELEMENTS
    | SUPPORTED_COORDINATED_ELEMENTS
    | SUPPORTED_ION_ELEMENTS
)
# Dictionaries with all residue names and their equivalent one-letter codes
# Amino acids: each row holds the residue names sharing one letter, in the
# original insertion order.
PROTEIN_RESIDUE_NAME_LETTERS = {
    'ALA': 'A', 'ALAN': 'A', 'ALAC': 'A',
    'ARG': 'R', 'ARGN': 'R', 'ARGC': 'R',
    'ASN': 'N', 'ASNN': 'N', 'ASNC': 'N',
    'ASP': 'D', 'ASPN': 'D', 'ASPC': 'D',
    'CYS': 'C', 'CYSN': 'C', 'CYSC': 'C', 'CYH': 'C', 'CSH': 'C', 'CSS': 'C', 'CYX': 'C', 'CYP': 'C',
    'GLN': 'Q', 'GLNN': 'Q', 'GLNC': 'Q',
    'GLU': 'E', 'GLUN': 'E', 'GLUC': 'E', 'GLUP': 'E',
    'GLY': 'G', 'GLYN': 'G', 'GLYC': 'G',
    'HIS': 'H', 'HISN': 'H', 'HISC': 'H', 'HID': 'H', 'HIE': 'H', 'HIP': 'H', 'HSD': 'H', 'HSE': 'H',
    'ILE': 'I', 'ILEN': 'I', 'ILEC': 'I', 'ILU': 'I',
    'LEU': 'L', 'LEUN': 'L', 'LEUC': 'L',
    'LYS': 'K', 'LYSN': 'K', 'LYSC': 'K',
    'MET': 'M', 'METN': 'M', 'METC': 'M',
    'PHE': 'F', 'PHEN': 'F', 'PHEC': 'F',
    'PRO': 'P', 'PRON': 'P', 'PROC': 'P', 'PRØ': 'P', 'PR0': 'P', 'PRZ': 'P',
    'SER': 'S', 'SERN': 'S', 'SERC': 'S',
    # 'THRC' was previously mapped to 'R' (arginine); like 'THR' and 'THRN'
    # it is a threonine name and must map to 'T'
    'THR': 'T', 'THRN': 'T', 'THRC': 'T',
    'TRP': 'W', 'TRPN': 'W', 'TRPC': 'W', 'TRY': 'W',
    'TYR': 'Y', 'TYRN': 'Y', 'TYRC': 'Y',
    'VAL': 'V', 'VALN': 'V', 'VALC': 'V',
}
# Nucleotides
# DNA residue names and their one-letter codes
DNA_RESIDUE_NAME_LETTERS = {
    "DA": "A",
    "T": "T", "T3": "T", "T5": "T",
    "DT": "T",
    "DC": "C",
    "DG": "G",
    "DA3": "A", "DA5": "A",
    "DT3": "T", "DT5": "T",
    "DC3": "C", "DC5": "C",
    "DG3": "G", "DG5": "G",
}
# RNA residue names and their one-letter codes
RNA_RESIDUE_NAME_LETTERS = {
    "RA": "A",
    "U": "U", "U3": "U", "U5": "U",
    "RU": "U",
    "RC": "C",
    "RG": "G",
    "RA3": "A", "RA5": "A",
    "RU3": "U", "RU5": "U",
    "RC3": "C", "RC5": "C",
    "RG3": "G", "RG5": "G",
}
# DNA and RNA names together, plus the names not listed in either dict above
NUCLEIC_RESIDUE_NAME_LETTERS = {
    **DNA_RESIDUE_NAME_LETTERS,
    **RNA_RESIDUE_NAME_LETTERS,
    "A": "A", "A3": "A", "A5": "A",
    "C": "C", "C3": "C", "C5": "C",
    "G": "G", "G3": "G", "G5": "G",
}
# Protein and nucleic residue names all together
# (nucleic entries are applied last, so they win on any shared name)
RESIDUE_NAME_LETTERS = dict(PROTEIN_RESIDUE_NAME_LETTERS, **NUCLEIC_RESIDUE_NAME_LETTERS)
# Lipid common residue names
# Source: https://github.com/NMRLipids/Databank/blob/main/Scripts/DatabankLib/settings/molecules.py#L10
# Meanings: https://github.com/NMRLipids/Databank/blob/48fdf2c4149d0db8900ce08b0e74dc1836dcfab3/Scripts/BuildDatabank/docs/source/moleculesAndMapping.md?plain=1#L50
FATTY_RESIDUE_NAMES = {
    'POPC', 'POPG', 'POPS', 'POPE', 'PYPC',
    'PAzePCprot', 'PAzePCdeprot',
    'DMPC', 'DPPC', 'DPPE', 'DPPG', 'DEPC', 'DRPC', 'DYPC', 'DLPC', 'DLIPC',
    'DOG', 'DOPC', 'DOPE', 'DDOPC', 'DOPS', 'DSPC', 'DAPC', 'DMTAP',
    'SDG', 'SDPE', 'SOPC', 'POPI',
    'SAPI', 'SAPI24', 'SAPI25', 'SLPI',
    'CER', 'CER180', 'DHMDMAB', 'SLiPC',
    'SM16', 'SM18',
    'TOCL', 'TLCL_0H', 'TMCL',
    'GM1', 'DPPGK', 'GB3', 'BOG',
}
STEROID_RESIDUE_NAMES = {'CHL', 'CHL1', 'CHOL', 'DCHOL'}
# All lipid names: fatty and steroid names together
LIPIDS_RESIDUE_NAMES = FATTY_RESIDUE_NAMES | STEROID_RESIDUE_NAMES
# Typical residue names used to guess what residues are
STANDARD_SOLVENT_RESIDUE_NAMES = {"SOL", "WAT", "HOH", "TIP", "TP3", "SWM4", "W"}
# WARNING: Note that standard names also include + and - symbols
# Use functions such as Structure.select_counter_ions instead of checking if the set includes a name
STANDARD_COUNTER_CATION_ATOM_NAMES = {"K", "NA", "SOD", "POT"}
STANDARD_COUNTER_ANION_ATOM_NAMES = {"CL", "CLA"}
# Cation and anion atom names together
STANDARD_COUNTER_ION_ATOM_NAMES = (
    STANDARD_COUNTER_CATION_ATOM_NAMES | STANDARD_COUNTER_ANION_ATOM_NAMES
)
STANDARD_DUMMY_ATOM_NAMES = {"MW"}
DUMMY_ATOM_ELEMENT = "Dm"
CG_ATOM_ELEMENT = "Cg"
# Topology flags

# Flag to represent a protein which is not referable (e.g. antibodies, synthetic constructs)
NO_REFERABLE_FLAG = "noref"

# Flag to represent a not found reference
NOT_FOUND_FLAG = "notfound"

# Reference id formats
# A PDB id: one digit from 1 to 9 followed by three alphanumeric characters
PDB_ID_FORMAT = r"^[1-9]{1}[a-zA-Z0-9]{3}$"
# Available analyses for NASSA
NASSA_ANALYSES_LIST = ["bconf", "coordist", "bpcorr", "crdcorr", "stiff"]

# Corresponding canals archives (.ser) for each NASSA analysis
NASSA_ANALYSES_CANALS = {
    # 'bconf': ['epsilC', 'epsilW', 'zetaC', 'zetaW'],
    "coordist": [
        "shift", "slide", "rise", "tilt", "roll", "twist", "chiW", "chiC",
    ],
    "bpcorr": [
        "shift", "slide", "rise", "tilt", "roll", "twist",
    ],
    # 'crdcorr': ['shift', 'slide', 'rise', 'tilt', 'roll', 'twist'],
    "stiff": [
        "stretch", "shear", "buckle", "stagger", "propel", "opening", "chiW", "chiC",
    ],
}