Coverage for model_workflow/utils/constants.py: 99%

139 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-23 10:54 +0000

1from os import environ 

2from shutil import which 

3 

4# CONSTANTS --------------------------------------------------------------------------- 

5 

# Custom globals dict
# Keeping these values inside a dict lets us edit the value of a constant on runtime
GLOBALS = {
    'no_symlinks': False,  # Set if symlinks are allowed
}

12 

# Executable names tried, in this order, to find gromacs in case it is not forced by the user
GROMACS_EXECUTABLE_COMMON_NAMES = ['gmx', 'gmx_mpi']
# Name of the environment variable which is read by the workflow to know the gromacs path
GROMACS_ENV = 'MWF_GMX'
# The gromacs executable path
# The user may force it through the environment variable named above
GROMACS_EXECUTABLE = environ.get(GROMACS_ENV, None)
if not GROMACS_EXECUTABLE:
    # Not forced: try the known common executable names until an existing one is found
    for common_name in GROMACS_EXECUTABLE_COMMON_NAMES:
        if not which(common_name):
            continue
        GROMACS_EXECUTABLE = common_name
        break
    else:
        # The loop ended without finding any candidate, so complain
        raise RuntimeError(f'Cannot find gromacs. Is gromacs installed? Set the env variable {GROMACS_ENV} as the gromacs executable path')

29 

# Typical text editors: readable name -> command
TEXT_EDITORS = {
    'VIM': 'vim',
    'GNU nano': 'nano',
    'GNOME text editor': 'gedit',
    'VScode': 'code',
}
# Same mapping restricted to the editors whose command is found, i.e. already installed
AVAILABLE_TEXT_EDITORS = dict(
    editor for editor in TEXT_EDITORS.items() if which(editor[1])
)

39 

# Format used for dates
DATE_STYLE = '%d-%m-%Y %H:%M:%S'

# Database: default API URL
DEFAULT_API_URL = 'https://irb-dev.mddbr.eu/api/'

# Selections, written in vmd syntax
# They include a standard selection for protein and nucleic acid backbones
ALL_ATOMS = 'all'
PROTEIN_AND_NUCLEIC = 'protein or nucleic'
PROTEIN_AND_NUCLEIC_BACKBONE = "(protein and name N CA C) or (nucleic and name P O5' O3' C5' C4' C3')"

51 

# Inputs file: the default name and every other accepted name
DEFAULT_INPUTS_FILENAME = 'inputs.yaml'
ACCEPTED_INPUT_FILENAMES = [
    DEFAULT_INPUTS_FILENAME,  # The default
    'inputs.yml',  # Another extension of yaml files
    'inputs.json',  # Legacy inputs file
]

# Values used for input fields whose value is not specified
# An input field which is not listed here has no default value and will be set as None
DEFAULT_INPUT_VALUES = {
    'license': 'This trajectory dataset is released under a Creative Commons Attribution 4.0 International Public License',
    'linkcense': 'https://creativecommons.org/licenses/by/4.0/',
    'mdref': 0,
}

67 

# Expected MD inputs
MD_DIRECTORY = 'mdir'

# Input config file for the NASSA analysis
DEFAULT_NASSA_CONFIG_FILENAME = 'nassa.json'

# Markov State Model input filenames
DEFAULT_POPULATIONS_FILENAME = 'populations.json'
DEFAULT_TRANSITIONS_FILENAME = 'transitions.json'

# An old system for when the original topology is very wrong and charges must be provided manually
RAW_CHARGES_FILENAME = 'charges.txt'
# Accepted topology formats for atomic charges mining
# NOTE(review): EXTENSION_FORMATS normalizes both 'parm7' and 'prm7' to 'prmtop'; confirm whether 'prm7' is needed here
ACCEPTED_TOPOLOGY_FORMATS = ['tpr', 'top', 'psf', 'prmtop', 'prm7']

82 

# Input files processing intermediate steps
# Every intermediate file is named differently and no input or intermediate file is ever renamed/overwritten
# This allows us to know where we were in case the process was interrupted and not repeat steps on reset
# Intermediate files are removed at the end of the process if it was successful

INCOMPLETE_PREFIX = 'incomplete_'

# Conversion step
CONVERTED = 'converted'
CONVERTED_STRUCTURE = 'converted.pdb'
CONVERTED_TRAJECTORY = 'converted.xtc'

# Filtering step
FILTERED = 'filtered'
FILTERED_STRUCTURE = 'filtered.pdb'
FILTERED_TRAJECTORY = 'filtered.xtc'

# Imaging step
IMAGED = 'imaged'
IMAGED_STRUCTURE = 'imaged.pdb'
IMAGED_TRAJECTORY = 'imaged.xtc'

# Correction step
CORRECTED = 'corrected'
CORRECTED_STRUCTURE = 'corrected.pdb'
CORRECTED_TRAJECTORY = 'corrected.xtc'

105 

# Input and output core files
STANDARD_TOPOLOGY_FILENAME = 'topology.json'
STRUCTURE_FILENAME = 'structure.pdb'
TRAJECTORY_FILENAME = 'trajectory.xtc'

# Auxiliary files
REGISTER_FILENAME = '.register.json'
CACHE_FILENAME = '.mwf_cache.json'

# Files saving resorted bonds and charges when we have to resort atoms
# Note that these files have priority when loading both bonds and charges
RESORTED_CHARGES_FILENAME = 'resorted_charges.json'
RESORTED_BONDS_FILENAME = 'resorted_bonds.json'

# Generated file names
FIRST_FRAME_FILENAME = 'first_frame.pdb'
AVERAGE_STRUCTURE_FILENAME = 'average.pdb'

# Reference labels according to the reference file used
REFERENCE_LABELS = {
    FIRST_FRAME_FILENAME: 'firstframe',
    AVERAGE_STRUCTURE_FILENAME: 'average',
}

129 

# Output files generated to be uploaded to the database

# References filenames: PDB (Protein Data Bank), protein, ligand and lipid
PDB_REFERENCES_FILENAME = 'pdb_references.json'
PROTEIN_REFERENCES_FILENAME = 'protein_references.json'
LIGAND_REFERENCES_FILENAME = 'ligand_references.json'
LIPID_REFERENCES_FILENAME = 'lipid_references.json'

# Chains filename
OUTPUT_CHAINS_FILENAME = 'chains.json'

# Metadata filename
OUTPUT_METADATA_FILENAME = 'metadata.json'

# Screenshot filename
OUTPUT_SCREENSHOT_FILENAME = 'mdf.screenshot.jpg'

# Additional screenshot filenames
OUTPUT_CLUSTER_SCREENSHOT_FILENAMES = 'mdf.clusters_*_screenshot_??.jpg'

# Analyses files to be generated
OUTPUT_INTERACTIONS_FILENAME = 'mda.interactions.json'
OUTPUT_RMSDS_FILENAME = 'mda.rmsds.json'
OUTPUT_TMSCORES_FILENAME = 'mda.tmscores.json'
OUTPUT_RMSF_FILENAME = 'mda.fluctuation.json'
OUTPUT_RGYR_FILENAME = 'mda.rgyr.json'
OUTPUT_PCA_FILENAME = 'mda.pca.json'
OUTPUT_PCA_PROJECTION_PREFIX = 'mdt.pca_trajectory'
OUTPUT_PCA_CONTACTS_FILENAME = 'mda.pca_contacts.json'
OUTPUT_RMSD_PERRES_FILENAME = 'mda.rmsd_perres.json'
OUTPUT_RMSD_PAIRWISE_FILENAME = 'mda.rmsd_pairwise.json'
OUTPUT_CLUSTERS_FILENAME = 'mda.clusters.json'
OUTPUT_DIST_PERRES_FILENAME = 'mda.dist_perres.json'
OUTPUT_HBONDS_FILENAME = 'mda.hbonds.json'
OUTPUT_SASA_FILENAME = 'mda.sasa.json'
OUTPUT_ENERGIES_FILENAME = 'mda.energies.json'
OUTPUT_DIHEDRAL_ENERGIES_FILENAME = 'mda.dihenergies.json'
OUTPUT_POCKETS_FILENAME = 'mda.pockets.json'
# WARNING: If this prefix is changed then the pockets function must be updated as well
OUTPUT_POCKET_STRUCTURES_PREFIX = 'mdf.pocket'
OUTPUT_HELICAL_PARAMETERS_FILENAME = 'mda.helical.json'
OUTPUT_MARKOV_FILENAME = 'mda.markov.json'
MEMBRANE_MAPPING_FILENAME = 'mda.mem_map.json'
OUTPUT_DENSITY_FILENAME = 'mda.density.json'
OUTPUT_THICKNESS_FILENAME = 'mda.thickness.json'
OUTPUT_APL_FILENAME = 'mda.apl.json'
OUTPUT_LIPID_ORDER_FILENAME = 'mda.lipid_order.json'
OUTPUT_LIPID_INTERACTIONS_FILENAME = 'mda.lipid_inter.json'

180 

# Problematic signs for directory/folder names
# º is forbidden since paths including this character are not readable by MDtraj
FORBIDDEN_DIRECTORY_CHARACTERS = ['.', ',', ';', ':', 'º', '/']

# Default parameters
DEFAULT_RMSD_CUTOFF = 9
DEFAULT_INTERACTION_CUTOFF = 0.1

# Register cache flags
SNAPSHOTS_FLAG = 'snapshots'
PDB_TO_PUBCHEM = 'pdb2pubchem'
NOT_MATCHED_LIGANDS = 'notmatchedligands'

193 

# The different test flags
STABLE_BONDS_FLAG = 'stabonds'
COHERENT_BONDS_FLAG = 'cohbonds'
TRAJECTORY_INTEGRITY_FLAG = 'intrajrity'
CORRECT_ELEMENTS = 'elements'
REFERENCE_SEQUENCE_FLAG = 'refseq'
STABLE_INTERACTIONS_FLAG = 'interact'
LIGANDS_MATCH_FLAG = 'ligands'
CHAINS_ANALYSIS = 'chains'

# All the available checkings, which may be trusted
AVAILABLE_CHECKINGS = [
    STABLE_BONDS_FLAG,
    COHERENT_BONDS_FLAG,
    TRAJECTORY_INTEGRITY_FLAG,
]
# All critical process failures, which are to be lethal for the workflow unless mercy is given
# This is a new list: the checkings above followed by the remaining flags
AVAILABLE_FAILURES = [
    *AVAILABLE_CHECKINGS,
    CORRECT_ELEMENTS,
    REFERENCE_SEQUENCE_FLAG,
    STABLE_INTERACTIONS_FLAG,
    LIGANDS_MATCH_FLAG,
    CHAINS_ANALYSIS,
]

# Tests to be run when some input files are modified
STRUCTURE_TESTS = [STABLE_BONDS_FLAG, COHERENT_BONDS_FLAG]
TRAJECTORY_TESTS = [STABLE_BONDS_FLAG, TRAJECTORY_INTEGRITY_FLAG]
TOPOLOGY_TESTS = [STABLE_BONDS_FLAG, COHERENT_BONDS_FLAG]

213 

# Terminal colors: escape sequences which start a colored text, plus the one which ends it
# https://stackoverflow.com/questions/287871/how-do-i-print-colored-text-to-the-terminal
GREEN_HEADER = '\033[92m'
CYAN_HEADER = '\033[96m'
BLUE_HEADER = '\033[94m'
YELLOW_HEADER = '\033[93m'
RED_HEADER = '\033[91m'
GREY_HEADER = '\033[90m'
COLOR_END = '\033[0m'

223 

# Dictionary to parse an internal raw name to a pretty human friendly name
NICE_NAMES = {
    STABLE_BONDS_FLAG: 'Stable bonds test',
    COHERENT_BONDS_FLAG: 'Coherent bonds test',
    TRAJECTORY_INTEGRITY_FLAG: 'Trajectory integrity test',
    CORRECT_ELEMENTS: 'Correct elements',
    REFERENCE_SEQUENCE_FLAG: 'Reference sequence match',
    STABLE_INTERACTIONS_FLAG: 'Interactions are stable',
    LIGANDS_MATCH_FLAG: 'Ligands matched residues',
    CHAINS_ANALYSIS: 'Chains analysis',
}

235 

# The "standard" file format of every possible file extension
# Note that some formats have several possible extensions (e.g. nc, cdf, netcdf)
EXTENSION_FORMATS = {
    # Topologies
    'tpr': 'tpr',
    'top': 'top',
    'psf': 'psf',
    'prmtop': 'prmtop', 'parm7': 'prmtop', 'prm7': 'prmtop',
    'txt': 'txt',  # charges.txt
    # Structures
    'pdb': 'pdb',
    'gro': 'gro',
    'cif': 'cif',
    # Trajectories
    'xtc': 'xtc',
    'trr': 'trr',
    'dcd': 'dcd',
    'nc': 'nc', 'cdf': 'nc', 'netcdf': 'nc',
    'crd': 'crd', 'mdcrd': 'crd', 'trj': 'crd',
    # Restart files (may be used as single frame trajectories)
    'rst7': 'rst7',
    # Other
    'json': 'json',
    'yaml': 'yaml', 'yml': 'yaml',
    'npy': 'npy',
    'in': 'txt',
    'h5': 'h5',
}

271 

# Topology and trajectory file formats supported by PyTraj
# Every row ends with a comma on purpose: adjacent string literals with no comma
# in between are concatenated, which used to merge 'pdb' and 'nc' into 'pdbnc'
PYTRAJ_SUPPORTED_FORMATS = {
    # Topologies
    'prmtop', 'top', 'psf', 'pdb',
    # Trajectories
    'nc', 'crd', 'dcd', 'trr', 'xtc',
}

279 

# Reference copied from GitHub:
# ParmFormatDict = {
#     "AMBERPARM": AMBERPARM,
#     "PDBFILE": PDBFILEPARM,
#     "MOL2FILE": MOL2FILEPARM,
#     "CHARMMPSF": CHARMMPSF,
#     "CIFFILE": CIFFILE,
#     "GMXTOP": GMXTOP,
#     "SDFFILE": SDFFILE,
#     "TINKER": TINKERPARM,
#     "UNKNOWN_PARM": UNKNOWN_PARM,
# }

# Flags required to write files with pytraj: file format -> parm format flag
PYTRAJ_PARM_FORMAT = {
    'prmtop': 'AMBERPARM',
    'psf': 'CHARMMPSF',
    'top': 'GMXTOP',
    'pdb': 'PDBFILE',
}

300 

# Elements supported while correcting atom elements
# DANI: Ba was found in PDB 1J6S
# DANI: Lu was found in PDB 1DUH
# DANI: U was found in PDB 2GIC
# DANI: V was found in PDB 2P7E
# DANI: Tb was found in PDB 359D
# DANI: Ag was found in PDB 5AY2
# DANI: Rb was found in PDB 3GGK

# Elements which are always "bonded"
SUPPORTED_POLYMER_ELEMENTS = {'C', 'N', 'O', 'H', 'P', 'S'}
# Elements which may be found both "bonded" or "alone"
SUPPORTED_COORDINATED_ELEMENTS = {'Zn', 'Fe', 'Mn', 'Co', 'Lu', 'U', 'V', 'Al', 'Ba', 'Be', 'F'}
# Elements which are always "alone"
SUPPORTED_ION_ELEMENTS = {'K', 'Cl', 'Na', 'Mg', 'Br', 'I', 'Ca', 'Tb', 'Ag', 'Tl', 'Rb'}
# Union of the three groups above, as a new set
SUPPORTED_ELEMENTS = (
    SUPPORTED_POLYMER_ELEMENTS
    | SUPPORTED_COORDINATED_ELEMENTS
    | SUPPORTED_ION_ELEMENTS
)

321 

# Dictionaries with all residue names and their equivalent letters
# Amino acids: residue name -> one-letter code
# Every residue name variant of an amino acid maps to the letter of that amino acid
# ('THRC' used to map to 'R', the arginine letter, instead of the threonine 'T')
PROTEIN_RESIDUE_NAME_LETTERS = {
    'ALA': 'A', 'ALAN': 'A', 'ALAC': 'A',
    'ARG': 'R', 'ARGN': 'R', 'ARGC': 'R',
    'ASN': 'N', 'ASNN': 'N', 'ASNC': 'N',
    'ASP': 'D', 'ASPN': 'D', 'ASPC': 'D',
    'CYS': 'C', 'CYSN': 'C', 'CYSC': 'C', 'CYH': 'C', 'CSH': 'C', 'CSS': 'C', 'CYX': 'C', 'CYP': 'C',
    'GLN': 'Q', 'GLNN': 'Q', 'GLNC': 'Q',
    'GLU': 'E', 'GLUN': 'E', 'GLUC': 'E', 'GLUP': 'E',
    'GLY': 'G', 'GLYN': 'G', 'GLYC': 'G',
    'HIS': 'H', 'HISN': 'H', 'HISC': 'H', 'HID': 'H', 'HIE': 'H', 'HIP': 'H', 'HSD': 'H', 'HSE': 'H',
    'ILE': 'I', 'ILEN': 'I', 'ILEC': 'I', 'ILU': 'I',
    'LEU': 'L', 'LEUN': 'L', 'LEUC': 'L',
    'LYS': 'K', 'LYSN': 'K', 'LYSC': 'K',
    'MET': 'M', 'METN': 'M', 'METC': 'M',
    'PHE': 'F', 'PHEN': 'F', 'PHEC': 'F',
    'PRO': 'P', 'PRON': 'P', 'PROC': 'P', 'PRØ': 'P', 'PR0': 'P', 'PRZ': 'P',
    'SER': 'S', 'SERN': 'S', 'SERC': 'S',
    'THR': 'T', 'THRN': 'T', 'THRC': 'T',
    'TRP': 'W', 'TRPN': 'W', 'TRPC': 'W', 'TRY': 'W',
    'TYR': 'Y', 'TYRN': 'Y', 'TYRC': 'Y',
    'VAL': 'V', 'VALN': 'V', 'VALC': 'V',
}

# Nucleotides: residue name -> one-letter code
# DNA residue names
DNA_RESIDUE_NAME_LETTERS = {
    'DA': 'A',
    'T': 'T', 'T3': 'T', 'T5': 'T',
    'DT': 'T',
    'DC': 'C',
    'DG': 'G',
    'DA3': 'A', 'DA5': 'A',
    'DT3': 'T', 'DT5': 'T',
    'DC3': 'C', 'DC5': 'C',
    'DG3': 'G', 'DG5': 'G',
}
# RNA residue names
RNA_RESIDUE_NAME_LETTERS = {
    'RA': 'A',
    'U': 'U', 'U3': 'U', 'U5': 'U',
    'RU': 'U',
    'RC': 'C',
    'RG': 'G',
    'RA3': 'A', 'RA5': 'A',
    'RU3': 'U', 'RU5': 'U',
    'RC3': 'C', 'RC5': 'C',
    'RG3': 'G', 'RG5': 'G',
}
# Both DNA and RNA names, followed by the A, C and G names which are in none of the dicts above
NUCLEIC_RESIDUE_NAME_LETTERS = {
    **DNA_RESIDUE_NAME_LETTERS,
    **RNA_RESIDUE_NAME_LETTERS,
    'A': 'A', 'A3': 'A', 'A5': 'A',
    'C': 'C', 'C3': 'C', 'C5': 'C',
    'G': 'G', 'G3': 'G', 'G5': 'G',
}

# All residue names together: a new dict with the protein names first and then the nucleic names
RESIDUE_NAME_LETTERS = dict(PROTEIN_RESIDUE_NAME_LETTERS)
RESIDUE_NAME_LETTERS.update(NUCLEIC_RESIDUE_NAME_LETTERS)

452 

# Lipid common residue names
# Source: https://github.com/NMRLipids/Databank/blob/main/Scripts/DatabankLib/settings/molecules.py#L10
# Meanings: https://github.com/NMRLipids/Databank/blob/48fdf2c4149d0db8900ce08b0e74dc1836dcfab3/Scripts/BuildDatabank/docs/source/moleculesAndMapping.md?plain=1#L50
FATTY_RESIDUE_NAMES = {
    "POPC", "POPG", "POPS", "POPE", "PYPC",
    "PAzePCprot", "PAzePCdeprot",
    "DMPC", "DPPC", "DPPE", "DPPG", "DEPC", "DRPC", "DYPC", "DLPC", "DLIPC",
    "DOG", "DOPC", "DOPE", "DDOPC", "DOPS", "DSPC", "DAPC", "DMTAP",
    "SDG", "SDPE", "SOPC", "POPI",
    "SAPI", "SAPI24", "SAPI25", "SLPI",
    "CER", "CER180", "DHMDMAB", "SLiPC",
    "SM16", "SM18",
    "TOCL", "TLCL_0H", "TMCL",
    "GM1", "DPPGK", "GB3", "BOG",
}
STEROID_RESIDUE_NAMES = {"CHL", "CHL1", "CHOL", "DCHOL"}

464 

# Typical residue names, used to guess what residues are
STANDARD_SOLVENT_RESIDUE_NAMES = {'SOL', 'WAT', 'HOH', 'TIP', 'TP3', 'SWM4'}
# WARNING: Note that standard names also include + and - symbols
# Use functions such as Structure.select_counter_ions instead of checking if the set includes a name
STANDARD_COUNTER_CATION_ATOM_NAMES = {'K', 'NA', 'SOD', 'POT'}
STANDARD_COUNTER_ANION_ATOM_NAMES = {'CL', 'CLA'}
# Cation and anion names together, as a new set
STANDARD_COUNTER_ION_ATOM_NAMES = STANDARD_COUNTER_CATION_ATOM_NAMES | STANDARD_COUNTER_ANION_ATOM_NAMES
STANDARD_DUMMY_ATOM_NAMES = {'MW'}
DUMMY_ATOM_ELEMENT = 'Dm'
CG_ATOM_ELEMENT = 'Cg'

475 

# Topology flags

# Flag to represent a protein which is not referable (e.g. antibodies, synthetic constructs)
NO_REFERABLE_FLAG = 'noref'

# Flag to represent a not found reference
NOT_FOUND_FLAG = 'notfound'

# Reference id formats
# PDB id pattern: one digit from 1 to 9 followed by three alphanumeric characters
PDB_ID_FORMAT = r'^[1-9]{1}[a-zA-Z0-9]{3}$'

486 

# Available analyses for NASSA
NASSA_ANALYSES_LIST = ['bconf', 'coordist', 'bpcorr', 'crdcorr', 'stiff']

# The corresponding canals archives (.ser) for each NASSA analysis
# Note that the 'bconf' and 'crdcorr' entries are commented out, so they are not keys of this dict
NASSA_ANALYSES_CANALS = {
    #'bconf': ['epsilC', 'epsilW', 'zetaC', 'zetaW'],
    'coordist': ['shift', 'slide', 'rise', 'tilt', 'roll', 'twist', 'chiW', 'chiC'],
    'bpcorr': ['shift', 'slide', 'rise', 'tilt', 'roll', 'twist'],
    #'crdcorr': ['shift', 'slide', 'rise', 'tilt', 'roll', 'twist'],
    'stiff': ['stretch', 'shear', 'buckle', 'stagger', 'propel', 'opening', 'chiW', 'chiC'],
}