Coverage for mddb_workflow/utils/constants.py: 95%

1from os import environ

2from shutil import which

4# CONSTANTS ---------------------------------------------------------------------------

# Registry of workflow-wide switches
# They live inside a dict so their values can be edited at runtime
GLOBALS = dict(
    # Switch related to whether symlinks are allowed
    no_symlinks=False,
    # Switch related to whether colors are disabled for logging
    no_colors=False,
)

# Set the possible gromacs calls tried to find the gromacs executable in case it is not forced by the user
GROMACS_EXECUTABLE_COMMON_NAMES = ['gmx', 'gmx_mpi']
# Set the name of the environment variable which is read by the workflow to know the gromacs path
GROMACS_ENV = 'MWF_GMX'
# Set the gromacs executable path
# This may be forced by the user through an environment variable
GROMACS_EXECUTABLE = environ.get(GROMACS_ENV, None)
# Otherwise we try with the known common gromacs executable names, in order, until we find an existing one
# A generator is used (instead of a module-level for loop) so no loop variable is left in the module namespace
if not GROMACS_EXECUTABLE:
    GROMACS_EXECUTABLE = next(
        (name for name in GROMACS_EXECUTABLE_COMMON_NAMES if which(name)),
        None,
    )
# If we do not find it then complain
if not GROMACS_EXECUTABLE:
    raise RuntimeError(f'Cannot find gromacs. Is gromacs installed? Set the env variable {GROMACS_ENV} as the gromacs executable path')

# Typical text editors: display name -> command used to launch it
TEXT_EDITORS = {
    "VIM": "vim",
    "GNU nano": "nano",
    "GNOME text editor": "gedit",
    "VScode": "code",
}
# Subset of the editors above whose command is found by 'which'
AVAILABLE_TEXT_EDITORS = {
    editor_name: editor_command
    for editor_name, editor_command in TEXT_EDITORS.items()
    if which(editor_command)
}

# Format string used for dates
DATE_STYLE = "%d-%m-%Y %H:%M:%S"

# Default database API URL
DEFAULT_API_URL = "https://irb-dev.mddbr.eu/api/"

# Standard selections in vmd syntax
ALL_ATOMS = "all"
PROTEIN_AND_NUCLEIC = "protein or nucleic"
# Backbone atoms of both proteins and nucleic acids
PROTEIN_AND_NUCLEIC_BACKBONE = (
    "(protein and name N CA C)"
    " or "
    "(nucleic and name P O5' O3' C5' C4' C3')"
)

# Inputs file
DEFAULT_INPUTS_FILENAME = "inputs.yaml"
# Accepted names: the default, another extension of yaml files and the legacy inputs file
ACCEPTED_INPUT_FILENAMES = [DEFAULT_INPUTS_FILENAME, "inputs.yml", "inputs.json"]

# Values used for input fields which are not specified
# An input field missing in this dict has no default value and it will be set as None
DEFAULT_INPUT_VALUES = dict(
    license="This trajectory dataset is released under a Creative Commons Attribution 4.0 International Public License",
    linkcense="https://creativecommons.org/licenses/by/4.0/",
    mdref=0,
)

# Expected MD inputs
MD_DIRECTORY = "mdir"

# Input config file for the NASSA analysis
DEFAULT_NASSA_CONFIG_FILENAME = "nassa.json"

# Markov State Model input filenames
DEFAULT_POPULATIONS_FILENAME = "populations.json"
DEFAULT_TRANSITIONS_FILENAME = "transitions.json"

# An old system for when the original topology is very wrong and charges must be provided manually
RAW_CHARGES_FILENAME = "charges.txt"
# Topology formats accepted for atomic charges mining
ACCEPTED_TOPOLOGY_FORMATS = "tpr top psf prmtop prm7".split()

# Input files processing intermediate steps
# Every intermediate file is named differently and no input or intermediate file is ever renamed/overwritten
# This allows us to know where we were in case the process was interrupted and not repeat steps on reset
# Intermediate files are removed at the end of the process if it was successful

INCOMPLETE_PREFIX = "incomplete_"

# For every step: its label and the structure (.pdb) and trajectory (.xtc) filenames named after it
CONVERTED = "converted"
CONVERTED_STRUCTURE = f"{CONVERTED}.pdb"
CONVERTED_TRAJECTORY = f"{CONVERTED}.xtc"

FILTERED = "filtered"
FILTERED_STRUCTURE = f"{FILTERED}.pdb"
FILTERED_TRAJECTORY = f"{FILTERED}.xtc"

IMAGED = "imaged"
IMAGED_STRUCTURE = f"{IMAGED}.pdb"
IMAGED_TRAJECTORY = f"{IMAGED}.xtc"

CORRECTED = "corrected"
CORRECTED_STRUCTURE = f"{CORRECTED}.pdb"
CORRECTED_TRAJECTORY = f"{CORRECTED}.xtc"

107

# Output core files
STANDARD_TOPOLOGY_FILENAME = "topology.json"
STRUCTURE_FILENAME = "structure.pdb"
TRAJECTORY_FILENAME = "trajectory.xtc"

# Auxiliary files
REGISTER_FILENAME = ".register.json"
CACHE_FILENAME = ".mwf_cache.json"

# Files saving resorted bonds and charges when we have to resort atoms
# Note that these files have priority when loading both bonds and charges
RESORTED_CHARGES_FILENAME = "resorted_charges.json"
RESORTED_BONDS_FILENAME = "resorted_bonds.json"

# Generated file names
FIRST_FRAME_FILENAME = "first_frame.pdb"
AVERAGE_STRUCTURE_FILENAME = "average.pdb"

# Reference label for every reference file which may be used
REFERENCE_LABELS = dict([
    (FIRST_FRAME_FILENAME, "firstframe"),
    (AVERAGE_STRUCTURE_FILENAME, "average"),
])

131

# Output files generated to be uploaded to the database

# PDB (Protein Data Bank) references filename
PDB_REFERENCES_FILENAME = "pdb_references.json"
# Protein references filename
PROTEIN_REFERENCES_FILENAME = "protein_references.json"
# Ligand references filename
LIGAND_REFERENCES_FILENAME = "ligand_references.json"
# InChIKey references filename
INCHIKEY_REFERENCES_FILENAME = "inchikey_references.json"

# Chains filename
OUTPUT_CHAINS_FILENAME = "chains.json"

# Metadata filename
OUTPUT_METADATA_FILENAME = "metadata.json"

# Screenshot filename
OUTPUT_SCREENSHOT_FILENAME = "mdf.screenshot.jpg"

# Additional screenshot filenames (a pattern with '*' and '?' wildcards)
OUTPUT_CLUSTER_SCREENSHOT_FILENAMES = "mdf.clusters_*_screenshot_??.jpg"

154

# Analyses files to be generated
OUTPUT_INTERACTIONS_FILENAME = "mda.interactions.json"
OUTPUT_RMSDS_FILENAME = "mda.rmsds.json"
OUTPUT_TMSCORES_FILENAME = "mda.tmscores.json"
OUTPUT_RMSF_FILENAME = "mda.fluctuation.json"
OUTPUT_RGYR_FILENAME = "mda.rgyr.json"
OUTPUT_PCA_FILENAME = "mda.pca.json"
OUTPUT_PCA_PROJECTION_PREFIX = "mdt.pca_trajectory"
OUTPUT_PCA_CONTACTS_FILENAME = "mda.pca_contacts.json"
OUTPUT_RMSD_PERRES_FILENAME = "mda.rmsd_perres.json"
OUTPUT_RMSD_PAIRWISE_FILENAME = "mda.rmsd_pairwise.json"
OUTPUT_CLUSTERS_FILENAME = "mda.clusters.json"
OUTPUT_DIST_PERRES_FILENAME = "mda.dist_perres.json"
OUTPUT_HBONDS_FILENAME = "mda.hbonds.json"
OUTPUT_SASA_FILENAME = "mda.sasa.json"
OUTPUT_ENERGIES_FILENAME = "mda.energies.json"
OUTPUT_DIHEDRAL_ENERGIES_FILENAME = "mda.dihenergies.json"
OUTPUT_POCKETS_FILENAME = "mda.pockets.json"
# WARNING: If this is changed then the pockets function must be updated as well
OUTPUT_POCKET_STRUCTURES_PREFIX = "mdf.pocket"
OUTPUT_HELICAL_PARAMETERS_FILENAME = "mda.helical.json"
OUTPUT_MARKOV_FILENAME = "mda.markov.json"
OUTPUT_PROVENANCE_FILENAME = "mda.provenance.json"
MEMBRANE_MAPPING_FILENAME = "mda.mem_map.json"
OUTPUT_DENSITY_FILENAME = "mda.density.json"
OUTPUT_THICKNESS_FILENAME = "mda.thickness.json"
OUTPUT_APL_FILENAME = "mda.apl.json"
OUTPUT_LIPID_ORDER_FILENAME = "mda.lipid_order.json"
OUTPUT_LIPID_INTERACTIONS_FILENAME = "mda.lipid_inter.json"
OUTPUT_CHANNELS_FILENAME = "mda.channels.json"

184

# Problematic characters for directory/folder names
# º is forbidden since paths including this character are not readable by MDtraj
FORBIDDEN_DIRECTORY_CHARACTERS = list(".,;:º/")

# Default parameters
DEFAULT_RMSD_CUTOFF = 9
DEFAULT_INTERACTION_CUTOFF = 0.1

# Register cache flags
SNAPSHOTS_FLAG = "snapshots"
PDB_TO_PUBCHEM = "pdb2pubchem"
NOT_MATCHED_LIGANDS = "notmatchedligands"

197

# Flags of the different tests
STABLE_BONDS_FLAG = "stabonds"
COHERENT_BONDS_FLAG = "cohbonds"
TRAJECTORY_INTEGRITY_FLAG = "intrajrity"
CORRECT_ELEMENTS = "elements"
REFERENCE_SEQUENCE_FLAG = "refseq"
STABLE_INTERACTIONS_FLAG = "interact"
LIGANDS_MATCH_FLAG = "ligands"
CHAINS_ANALYSIS = "chains"

# All the available checkings, which may be trusted
AVAILABLE_CHECKINGS = [
    STABLE_BONDS_FLAG,
    COHERENT_BONDS_FLAG,
    TRAJECTORY_INTEGRITY_FLAG,
]
# All critical process failures, which are to be lethal for the workflow unless mercy is given
# These are the checkings above followed by the remaining flags
AVAILABLE_FAILURES = [
    *AVAILABLE_CHECKINGS,
    CORRECT_ELEMENTS,
    REFERENCE_SEQUENCE_FLAG,
    STABLE_INTERACTIONS_FLAG,
    LIGANDS_MATCH_FLAG,
    CHAINS_ANALYSIS,
]

# Tests to be run when some input files are modified
STRUCTURE_TESTS = [STABLE_BONDS_FLAG, COHERENT_BONDS_FLAG]
TRAJECTORY_TESTS = [STABLE_BONDS_FLAG, TRAJECTORY_INTEGRITY_FLAG]
TOPOLOGY_TESTS = [STABLE_BONDS_FLAG, COHERENT_BONDS_FLAG]

217

# Terminal colors (escape sequences which open a color and the one which closes it)
# https://stackoverflow.com/questions/287871/how-do-i-print-colored-text-to-the-terminal
# NOTE(review): this switch is evaluated only once, when the module is imported
# Editing GLOBALS['no_colors'] afterwards does not change these constants - confirm this is intended
if GLOBALS['no_colors']:
    # Colors disabled: every header and the end mark are empty strings
    GREEN_HEADER = CYAN_HEADER = BLUE_HEADER = YELLOW_HEADER = ''
    RED_HEADER = GREY_HEADER = COLOR_END = ''
else:
    GREEN_HEADER = '\033[92m'
    CYAN_HEADER = '\033[96m'
    BLUE_HEADER = '\033[94m'
    YELLOW_HEADER = '\033[93m'
    RED_HEADER = '\033[91m'
    GREY_HEADER = '\033[90m'
    COLOR_END = '\033[0m'

236

# Dictionary to parse an internal raw name to a pretty human friendly name
NICE_NAMES = dict((
    (STABLE_BONDS_FLAG, 'Stable bonds test'),
    (COHERENT_BONDS_FLAG, 'Coherent bonds test'),
    (TRAJECTORY_INTEGRITY_FLAG, 'Trajectory integrity test'),
    (CORRECT_ELEMENTS, 'Correct elements'),
    (REFERENCE_SEQUENCE_FLAG, 'Reference sequence match'),
    (STABLE_INTERACTIONS_FLAG, 'Interactions are stable'),
    (LIGANDS_MATCH_FLAG, 'Ligands matched residues'),
    (CHAINS_ANALYSIS, 'Chains analysis'),
))

248

# The "standard" file format of every possible file extension
# Note that some formats have different possible extensions (e.g. nc, cdf, netcdf)
# The table below is written as (standard format, extensions) and then flattened to extension -> format
EXTENSION_FORMATS = {
    extension: standard_format
    for standard_format, extensions in (
        # Topologies
        ('tpr', ('tpr',)),
        ('top', ('top',)),
        ('psf', ('psf',)),
        ('prmtop', ('prmtop', 'parm7', 'prm7')),
        ('txt', ('txt',)),  # charges.txt
        # Structures
        ('pdb', ('pdb',)),
        ('gro', ('gro',)),
        ('cif', ('cif',)),
        # Trajectories
        ('xtc', ('xtc',)),
        ('trr', ('trr',)),
        ('dcd', ('dcd',)),
        ('nc', ('nc', 'cdf', 'netcdf')),
        ('crd', ('crd', 'mdcrd', 'trj')),
        # Restart files (may be used as single frame trajectories)
        ('rst7', ('rst7',)),
        # Other
        ('json', ('json',)),
        ('yaml', ('yaml', 'yml')),
        ('npy', ('npy',)),
        ('txt', ('in',)),
        ('h5', ('h5',)),
    )
    for extension in extensions
}

284

# Topology and trajectory file formats supported by PyTraj
# Every element ends with a comma: adjacent string literals with no comma in between
# are silently concatenated ('pdb' 'nc' would become the single element 'pdbnc')
PYTRAJ_SUPPORTED_FORMATS = {
    # Topologies
    'prmtop', 'top', 'psf', 'pdb',
    # Trajectories
    'nc', 'crd', 'dcd', 'trr', 'xtc',
}

292

# From GitHub:
# ParmFormatDict = {
#     "AMBERPARM": AMBERPARM,
#     "PDBFILE": PDBFILEPARM,
#     "MOL2FILE": MOL2FILEPARM,
#     "CHARMMPSF": CHARMMPSF,
#     "CIFFILE": CIFFILE,
#     "GMXTOP": GMXTOP,
#     "SDFFILE": SDFFILE,
#     "TINKER": TINKERPARM,
#     "UNKNOWN_PARM": UNKNOWN_PARM,
# }

# Flags required to write files with pytraj: file format -> pytraj parm format name
PYTRAJ_PARM_FORMAT = dict(
    prmtop="AMBERPARM",
    psf="CHARMMPSF",
    top="GMXTOP",
    pdb="PDBFILE",
)

313

# Elements supported while correcting atom elements
# DANI: Ba was found in PDB 1J6S
# DANI: Lu was found in PDB 1DUH
# DANI: U was found in PDB 2GIC
# DANI: V was found in PDB 2P7E
# DANI: Tb was found in PDB 359D
# DANI: Ag was found in PDB 5AY2
# DANI: Rb was found in PDB 3GGK

# Elements which are always "bonded"
SUPPORTED_POLYMER_ELEMENTS = {"C", "N", "O", "H", "P", "S", "D"}
# Elements which may be found both "bonded" or "alone"
SUPPORTED_COORDINATED_ELEMENTS = {
    "Zn", "Fe", "Mn", "Co", "Lu", "U",
    "V", "Al", "Ba", "Be", "F", "Te",
}
# Elements which are always "alone"
SUPPORTED_ION_ELEMENTS = {
    "K", "Cl", "Na", "Mg", "Br", "I",
    "Ca", "Tb", "Ag", "Tl", "Rb",
}
# Every supported element: the union of the three sets above
SUPPORTED_ELEMENTS = (
    SUPPORTED_POLYMER_ELEMENTS
    | SUPPORTED_COORDINATED_ELEMENTS
    | SUPPORTED_ION_ELEMENTS
)

334

# Dictionaries with all residue names and their equivalent letters
# Amino acids: residue name -> one-letter code
PROTEIN_RESIDUE_NAME_LETTERS = {
    'ALA': 'A',
    'ALAN': 'A',
    'ALAC': 'A',
    'ARG': 'R',
    'ARGN': 'R',
    'ARGC': 'R',
    'ASN': 'N',
    'ASNN': 'N',
    'ASNC': 'N',
    'ASP': 'D',
    'ASPN': 'D',
    'ASPC': 'D',
    'CYS': 'C',
    'CYSN': 'C',
    'CYSC': 'C',
    'CYH': 'C',
    'CSH': 'C',
    'CSS': 'C',
    'CYX': 'C',
    'CYP': 'C',
    'GLN': 'Q',
    'GLNN': 'Q',
    'GLNC': 'Q',
    'GLU': 'E',
    'GLUN': 'E',
    'GLUC': 'E',
    'GLUP': 'E',
    'GLY': 'G',
    'GLYN': 'G',
    'GLYC': 'G',
    'HIS': 'H',
    'HISN': 'H',
    'HISC': 'H',
    'HID': 'H',
    'HIE': 'H',
    'HIP': 'H',
    'HSD': 'H',
    'HSE': 'H',
    'ILE': 'I',
    'ILEN': 'I',
    'ILEC': 'I',
    'ILU': 'I',
    'LEU': 'L',
    'LEUN': 'L',
    'LEUC': 'L',
    'LYS': 'K',
    'LYSN': 'K',
    'LYSC': 'K',
    'MET': 'M',
    'METN': 'M',
    'METC': 'M',
    'PHE': 'F',
    'PHEN': 'F',
    'PHEC': 'F',
    'PRO': 'P',
    'PRON': 'P',
    'PROC': 'P',
    'PRØ': 'P',
    'PR0': 'P',
    'PRZ': 'P',
    'SER': 'S',
    'SERN': 'S',
    'SERC': 'S',
    'THR': 'T',
    'THRN': 'T',
    # Threonine, like the rest of THR names (this used to be wrongly set as 'R', which is arginine)
    'THRC': 'T',
    'TRP': 'W',
    'TRPN': 'W',
    'TRPC': 'W',
    'TRY': 'W',
    'TYR': 'Y',
    'TYRN': 'Y',
    'TYRC': 'Y',
    'VAL': 'V',
    'VALN': 'V',
    'VALC': 'V',
}
# Nucleotides: residue name -> one-letter code
DNA_RESIDUE_NAME_LETTERS = {
    'DA': 'A',
    'T': 'T',
    'T3': 'T',
    'T5': 'T',
    'DT': 'T',
    'DC': 'C',
    'DG': 'G',
    'DA3': 'A',
    'DA5': 'A',
    'DT3': 'T',
    'DT5': 'T',
    'DC3': 'C',
    'DC5': 'C',
    'DG3': 'G',
    'DG5': 'G',
}
RNA_RESIDUE_NAME_LETTERS = {
    'RA': 'A',
    'U': 'U',
    'U3': 'U',
    'U5': 'U',
    'RU': 'U',
    'RC': 'C',
    'RG': 'G',
    'RA3': 'A',
    'RA5': 'A',
    'RU3': 'U',
    'RU5': 'U',
    'RC3': 'C',
    'RC5': 'C',
    'RG3': 'G',
    'RG5': 'G',
}
# DNA and RNA names together, plus the A/C/G names which are in neither of the two dicts above
NUCLEIC_RESIDUE_NAME_LETTERS = {
    **DNA_RESIDUE_NAME_LETTERS,
    **RNA_RESIDUE_NAME_LETTERS,
    'A': 'A',
    'A3': 'A',
    'A5': 'A',
    'C': 'C',
    'C3': 'C',
    'C5': 'C',
    'G': 'G',
    'G3': 'G',
    'G5': 'G',
}
# All of them together
RESIDUE_NAME_LETTERS = {**PROTEIN_RESIDUE_NAME_LETTERS, **NUCLEIC_RESIDUE_NAME_LETTERS}

465

# Lipid common residue names
# Source: https://github.com/NMRLipids/Databank/blob/main/Scripts/DatabankLib/settings/molecules.py#L10
# Meanings: https://github.com/NMRLipids/Databank/blob/48fdf2c4149d0db8900ce08b0e74dc1836dcfab3/Scripts/BuildDatabank/docs/source/moleculesAndMapping.md?plain=1#L50
FATTY_RESIDUE_NAMES = {
    'POPC', 'POPG', 'POPS', 'POPE', 'PYPC',
    'PAzePCprot', 'PAzePCdeprot', 'DMPC', 'DPPC', 'DPPE',
    'DPPG', 'DEPC', 'DRPC', 'DYPC', 'DLPC',
    'DLIPC', 'DOG', 'DOPC', 'DOPE', 'DDOPC',
    'DOPS', 'DSPC', 'DAPC', 'DMTAP', 'SDG',
    'SDPE', 'SOPC', 'POPI', 'SAPI', 'SAPI24',
    'SAPI25', 'SLPI', 'CER', 'CER180', 'DHMDMAB',
    'SLiPC', 'SM16', 'SM18', 'TOCL', 'TLCL_0H',
    'TMCL', 'GM1', 'DPPGK', 'GB3', 'BOG',
}
STEROID_RESIDUE_NAMES = {'CHL', 'CHL1', 'CHOL', 'DCHOL'}
# Both fatty and steroid residue names together
LIPIDS_RESIDUE_NAMES = FATTY_RESIDUE_NAMES | STEROID_RESIDUE_NAMES

478

# Typical residue names used to guess what residues are
STANDARD_SOLVENT_RESIDUE_NAMES = {"SOL", "WAT", "HOH", "TIP", "TP3", "SWM4"}
# WARNING: Note that standard names also include + and - symbols
# Use functions such as Structure.select_counter_ions instead of checking if the set includes a name
STANDARD_COUNTER_CATION_ATOM_NAMES = {"K", "NA", "SOD", "POT"}
STANDARD_COUNTER_ANION_ATOM_NAMES = {"CL", "CLA"}
# Both cation and anion names together
STANDARD_COUNTER_ION_ATOM_NAMES = (
    STANDARD_COUNTER_CATION_ATOM_NAMES | STANDARD_COUNTER_ANION_ATOM_NAMES
)
STANDARD_DUMMY_ATOM_NAMES = {"MW"}
DUMMY_ATOM_ELEMENT = "Dm"
CG_ATOM_ELEMENT = "Cg"

489

# Topology flags

# Flag to represent a protein which is not referable (e.g. antibodies, synthetic constructs)
NO_REFERABLE_FLAG = "noref"

# Flag to represent a not found reference
NOT_FOUND_FLAG = "notfound"

# Reference id formats
# PDB ids: one digit from 1 to 9 followed by exactly three alphanumeric characters
PDB_ID_FORMAT = r"^[1-9]{1}[a-zA-Z0-9]{3}$"

500

# Available analyses for NASSA
NASSA_ANALYSES_LIST = "bconf coordist bpcorr crdcorr stiff".split()

# The corresponding canals archives (.ser) for each NASSA analysis
# Every analysis gets its own independent list
NASSA_ANALYSES_CANALS = dict(
    # bconf=['epsilC', 'epsilW', 'zetaC', 'zetaW'],
    coordist="shift slide rise tilt roll twist chiW chiC".split(),
    bpcorr="shift slide rise tilt roll twist".split(),
    # crdcorr=['shift', 'slide', 'rise', 'tilt', 'roll', 'twist'],
    stiff="stretch shear buckle stagger propel opening chiW chiC".split(),
)

511}