Coverage for mddb_workflow / mwf.py: 79%
1155 statements
« prev ^ index » next — coverage.py v7.12.0, created at 2025-12-03 18:45 +0000
1# This is the starter script
3from os import chdir, walk, mkdir, getcwd
4from os.path import exists, isdir, isabs, relpath, normpath, split, basename
5import sys
6import io
7import re
8import numpy
9from glob import glob
11# Constants
12# Importing constants first is important
13from mddb_workflow.utils.constants import *
15# Import local utils
16from mddb_workflow.utils.auxiliar import InputError, MISSING_TOPOLOGY
17from mddb_workflow.utils.auxiliar import warn, load_json, save_json, load_yaml, save_yaml
18from mddb_workflow.utils.auxiliar import is_glob, parse_glob, is_url, url_to_source_filename
19from mddb_workflow.utils.auxiliar import read_ndict, write_ndict, get_git_version, download_file
20from mddb_workflow.utils.auxiliar import is_standard_topology
21from mddb_workflow.utils.register import Register
22from mddb_workflow.utils.cache import Cache
23from mddb_workflow.utils.structures import Structure
24from mddb_workflow.utils.topologies import Topology
25from mddb_workflow.utils.file import File
26from mddb_workflow.utils.database import Database
27from mddb_workflow.utils.pyt_spells import get_frames_count, get_average_structure
28from mddb_workflow.utils.selections import Selection
29from mddb_workflow.utils.mda_spells import get_mda_universe
30from mddb_workflow.utils.tasks import Task
31from mddb_workflow.utils.type_hints import *
33# Import local tools
34from mddb_workflow.tools.get_first_frame import get_first_frame
35from mddb_workflow.tools.get_bonds import find_safe_bonds, get_bonds_reference_frame
36from mddb_workflow.tools.process_interactions import process_interactions
37from mddb_workflow.tools.generate_metadata import prepare_project_metadata, generate_md_metadata
38from mddb_workflow.tools.chains import prepare_chain_references
39from mddb_workflow.tools.generate_pdb_references import prepare_pdb_references
40from mddb_workflow.tools.generate_map import generate_protein_mapping
41from mddb_workflow.tools.get_inchi_keys import generate_inchikeys, generate_inchi_references
42from mddb_workflow.tools.get_lipids import generate_lipid_references
43from mddb_workflow.tools.membrane_mapping import generate_membrane_mapping
44from mddb_workflow.tools.get_ligands import generate_ligand_references
45from mddb_workflow.tools.residue_mapping import generate_residue_mapping
46from mddb_workflow.tools.generate_topology import generate_topology
47from mddb_workflow.tools.get_charges import get_charges
48from mddb_workflow.tools.remove_trash import remove_trash
49from mddb_workflow.tools.get_screenshot import get_screenshot
50from mddb_workflow.tools.process_input_files import process_input_files
51from mddb_workflow.tools.provenance import produce_provenance
52from mddb_workflow.tools.get_reduced_trajectory import calculate_frame_step
54# Import local analyses
55from mddb_workflow.analyses.rmsds import rmsds
56from mddb_workflow.analyses.tmscores import tmscores
57from mddb_workflow.analyses.rmsf import rmsf
58from mddb_workflow.analyses.rgyr import rgyr
59from mddb_workflow.analyses.pca import pca
60from mddb_workflow.analyses.density import density
61from mddb_workflow.analyses.thickness import thickness
62from mddb_workflow.analyses.area_per_lipid import area_per_lipid
63from mddb_workflow.analyses.lipid_order import lipid_order
64from mddb_workflow.analyses.lipid_interactions import lipid_interactions
65from mddb_workflow.analyses.channels import channels
66# from mddb_workflow.analyses.pca_contacts import pca_contacts
67from mddb_workflow.analyses.rmsd_per_residue import rmsd_per_residue
68from mddb_workflow.analyses.rmsd_pairwise import rmsd_pairwise
69from mddb_workflow.analyses.clusters import clusters_analysis
70from mddb_workflow.analyses.distance_per_residue import distance_per_residue
71from mddb_workflow.analyses.hydrogen_bonds import hydrogen_bonds
72from mddb_workflow.analyses.sasa import sasa
73from mddb_workflow.analyses.energies import energies
74from mddb_workflow.analyses.dihedral_energies import compute_dihedral_energies
75from mddb_workflow.analyses.pockets import pockets
76from mddb_workflow.analyses.rmsd_check import check_trajectory_integrity
77from mddb_workflow.analyses.helical_parameters import helical_parameters
78from mddb_workflow.analyses.markov import markov
# Make the system output stream to not be buffered
# Only when not in a Jupyter notebook or using pytest
# Check if we're in an interactive Python shell like Jupyter
if not hasattr(sys, 'ps1') and 'pytest' not in sys.modules:
    # This is useful to make prints work on time in Slurm
    # Otherwise, output logs are written after the script has fully run
    # Note that this fix affects all modules and built-ins
    # A zero buffer size is only allowed in binary mode, hence opening the
    # stdout file descriptor in 'wb' and wrapping it back into a text stream
    unbuffered = io.TextIOWrapper(open(sys.stdout.fileno(), 'wb', 0), write_through=True)
    sys.stdout = unbuffered

# Set a special exception for missing inputs
# Used as a sentinel default value in input lookups (see MD.get_input)
MISSING_INPUT_EXCEPTION = Exception('Missing input')
class MD:
    """A Molecular Dynamics (MD) is the union of a structure and a trajectory.

    Having this data several analyses are possible.
    Note that an MD is always defined inside of a Project and thus it has
    additional topology and metadata.
    """
    def __init__(self,
        project: 'Project',
        number: int,
        directory: str,
        input_structure_filepath: str,
        input_trajectory_filepaths: list[str],
    ):
        """Initialize the MD object.

        Args:
            project (Project): The parent project this MD belongs to.
            number (int): The number of the MD according to its accession (1-based).
            directory (str): The local directory where the MD takes place.
            input_structure_filepath (str): The input structure file path.
            input_trajectory_filepaths (list[str]): The input trajectory file paths.

        Raises:
            Exception: If no project is passed.
            InputError: If the MD directory equals the project directory.
        """
        # Save the inputs
        self.project = project
        if not project:
            raise Exception('Project is mandatory to instantiate a new MD')
        # Save the MD number and index
        # The number is 1-based (accession style) while the index is 0-based
        self.number = number
        self.index = number - 1
        # Set the remote project handler for this specific MD
        # Only available when the project has both a database and an accession
        self.accession = None
        self.remote = None
        if self.project.database and self.project.accession:
            self.accession = f'{self.project.accession}.{self.number}'
            self.remote = self.project.database.get_remote_project(self.accession)
        # Save the directory
        self.directory = normpath(directory)
        # Now set the directory relative to the project
        self.directory = self.project.pathify(self.directory)
        if normpath(self.directory) == normpath(self.project.directory):
            raise InputError(f'MD {self.number} has the same directory as the project: {self.directory}')
        # Save the directory name alone apart
        self.directory_location, self.directory_name = split(self.directory)
        # If the directory does not exist then create it
        if not exists(self.directory):
            mkdir(self.directory)
        # Save the input structure filepath
        # They may be relative to the project directory (unique) or relative to the MD directory (one per MD)
        # If the path is absolute then it is considered unique
        # If the file does not exist and it is to be downloaded then it is downloaded for each MD
        # Prioritize the MD directory over the project directory
        self.arg_input_structure_filepath = input_structure_filepath
        self._input_structure_filepath = None
        self._input_structure_url = None
        # Set the internal variable for the input structure file, to be assigned later
        self._input_structure_file = None
        # Save the input trajectory filepaths
        self.arg_input_trajectory_filepaths = input_trajectory_filepaths
        self._input_trajectory_filepaths = None
        self._input_trajectory_urls = None
        # Set the internal variable for the input trajectory files, to be assigned later
        self._input_trajectory_files = None

        # Other values which may be found/calculated on demand
        self._md_inputs = None
        self._structure = None

        # Tests
        self._trajectory_integrity = None

        # Set a new MD specific register
        # In case the directory is the project directory itself, use the project register
        register_filepath = self.pathify(REGISTER_FILENAME)
        register_file = File(register_filepath)
        if register_file.path == self.project.register.file.path:
            self.register = self.project.register
        else:
            self.register = Register(register_file)

        # Set a new MD specific cache
        # In case the directory is the project directory itself, use the project cache
        cache_filepath = self.pathify(CACHE_FILENAME)
        cache_file = File(cache_filepath)
        if cache_file.path == self.project.cache.file.path:
            self.cache = self.project.cache
        else:
            self.cache = Cache(cache_file)

        # Set tasks whose output is to be overwritten
        self.overwritables = set()

        # Get MD inputs just to fill the inputs' "mds" value
        # Some functions may fail otherwise when its value is missing
        if self.project.is_inputs_file_available():
            self.get_md_inputs()
191 def __repr__(self):
192 """Return a string representation of the MD object."""
193 return 'MD'
195 def pathify(self, filename_or_relative_path: str) -> str:
196 """Given a filename or relative path, add the MD directory path at the beginning."""
197 return normpath(self.directory + '/' + filename_or_relative_path)
199 # Input structure file ------------
201 def get_input_structure_filepath(self) -> str:
202 """Set a function to get input structure file path."""
203 # Return the internal value if it is already assigned
204 if self._input_structure_filepath is not None:
205 return self._input_structure_filepath
207 def relativize_and_parse_paths(input_path: str, may_not_exist: bool = False) -> Optional[str]:
208 """Find out if a path is relative to MD directories or to the project directory.
210 To do so just check if the file exists in any of those.
211 In case it exists in both or none then assume it is relative to MD directory.
212 Parse glob notation in the process.
213 """
214 # Check if it is a URL
215 if is_url(input_path):
216 self._input_structure_url = input_path
217 # Set the paths for the further download
218 source_filename = url_to_source_filename(input_path)
219 md_relative_filepath = self.pathify(source_filename)
220 return md_relative_filepath
221 # Check if it is an absolute path
222 if isabs(input_path):
223 abs_glob_parse = parse_glob(input_path)
224 # If we had multiple results then we complain
225 if len(abs_glob_parse) > 1:
226 raise InputError(f'Multiple structures found with "{input_path}": {", ".join(abs_glob_parse)}')
227 # If we had no results then we complain
228 if len(abs_glob_parse) == 0:
229 if self.remote:
230 warn('Spread syntax is not supported to download remote files')
231 raise InputError(f'No structure found with "{input_path}"')
232 abs_parsed_filepath = abs_glob_parse[0]
233 return abs_parsed_filepath
234 # Check the MD directory
235 md_relative_filepath = self.pathify(input_path)
236 md_glob_parse = parse_glob(md_relative_filepath)
237 if len(md_glob_parse) > 1:
238 raise InputError(f'Multiple structures found with "{input_path}": {", ".join(md_glob_parse)}')
239 md_parsed_filepath = md_glob_parse[0] if len(md_glob_parse) == 1 else None
240 if md_parsed_filepath and File(md_parsed_filepath).exists:
241 return md_parsed_filepath
242 # Check the project directory
243 project_relative_filepath = self.project.pathify(input_path)
244 project_glob_parse = parse_glob(project_relative_filepath)
245 if len(project_glob_parse) > 1:
246 raise InputError(f'Multiple structures found with "{input_path}": {", ".join(project_glob_parse)}')
247 project_parsed_filepath = project_glob_parse[0] if len(project_glob_parse) == 1 else None
248 if project_parsed_filepath and File(project_parsed_filepath).exists:
249 return project_parsed_filepath
250 # At this point we can conclude the input structure file does not exist
251 # If we have no paths at all then it means a glob pattern was passed and it didn't match
252 # Note that if a glob pattern existed then it would mean the file actually existed
253 if len(md_glob_parse) == 0 and len(project_glob_parse) == 0:
254 # Warn the user in case it was trying to use glob syntax to donwload remote files
255 if self.remote:
256 warn('Spread syntax is not supported to download remote files')
257 raise InputError('No trajectory file was reached neither in the project directory or MD directories in path(s) ' + ', '.join(input_path))
258 # If the path does not exist anywhere then we asume it will be downloaded and set it relative to the MD
259 # However make sure we have a remote
260 # As an exception, if the 'may not exist' flag is passed then we return the result even if there is no remote
261 if not may_not_exist and not self.remote:
262 raise InputError(f'Cannot find a structure file by "{input_path}" anywhere')
263 md_parsed_filepath = self.project.pathify(input_path) if f'{self.directory_name}/' in md_parsed_filepath else self.pathify(input_path)
264 return md_parsed_filepath
265 # If we have a value passed through command line
266 if self.arg_input_structure_filepath:
267 # Find out if it is relative to MD directories or to the project directory
268 self._input_structure_filepath = relativize_and_parse_paths(self.arg_input_structure_filepath)
269 # Save the parsed value in the inputs file
270 self.project.update_inputs(
271 f'mds.{self.index}.input_structure_filepath',
272 self._input_structure_filepath)
273 return self._input_structure_filepath
274 # If we have a value passed through the inputs file has the value
275 if self.project.is_inputs_file_available():
276 # Get the input value, whose key must exist
277 inputs_value = self.get_input('input_structure_filepath')
278 # If there is a valid input then use it
279 if inputs_value:
280 self._input_structure_filepath = relativize_and_parse_paths(inputs_value)
281 return self._input_structure_filepath
282 # If there is not input structure anywhere then use the input topology
283 # We will extract the structure from it using a sample frame from the trajectory
284 # Note that topology input filepath must exist and an input error will raise otherwise
285 # However if we are using the standard topology file we can not extract the PDB from it (yet)
286 if self.project.input_topology_file != MISSING_TOPOLOGY and \
287 not is_standard_topology(self.project.input_topology_file):
288 return self.project.input_topology_file.path
289 # If we can not use the topology either then surrender
290 raise InputError('There is not input structure at all')
    def get_input_structure_file(self) -> 'File':
        """Get the input structure file from the inputs.

        If the file is not found try to download it, either through the remote
        project handler or directly from the input structure URL.

        Returns:
            File: The existing (or freshly downloaded) input structure file.

        Raises:
            InputError: If the file is missing and cannot be downloaded.
        """
        # If the input structure file is already defined then return it
        if self._input_structure_file:
            return self._input_structure_file
        # Otherwise we must set it
        # First set the input structure filepath
        # Note this may also set self._input_structure_url as a side effect
        input_structure_filepath = self.get_input_structure_filepath()
        # Now set the input structure file
        input_structure_file = File(input_structure_filepath)
        # If there is a structure URL then we must download the structure first
        input_structure_url = self._input_structure_url
        if input_structure_url and not input_structure_file.exists:
            original_filename = input_structure_url.split('/')[-1]
            # If there is a remote then use it
            if self.remote:
                # If the structure filename is the standard structure filename then use the structure endpoint instead
                if original_filename == STRUCTURE_FILENAME:
                    self.remote.download_standard_structure(input_structure_file)
                # Otherwise download the input structure file by its filename
                else:
                    self.remote.download_file(original_filename, input_structure_file)
            # Otherwise use the URL as is
            else:
                download_file(input_structure_url, input_structure_file)
        # If the file already exists then return it
        if input_structure_file.exists:
            self._input_structure_file = input_structure_file
            return self._input_structure_file
        raise InputError(f'Missing input structure file "{input_structure_file.path}"')
    input_structure_file = property(get_input_structure_file, None, None, "Input structure filename (read only)")
    # Input trajectory filename ------------

    def get_input_trajectory_filepaths(self) -> list[str]:
        """Resolve and return the input trajectory file paths.

        Resolution order: command-line argument, then the inputs file.
        The resolved value is cached and persisted to the inputs file.

        Returns:
            list[str]: The resolved input trajectory file paths.

        Raises:
            InputError: If no input trajectory can be found anywhere.
        """
        # Return the internal value if it is already assigned
        if self._input_trajectory_filepaths is not None:
            return self._input_trajectory_filepaths

        def relativize_and_parse_paths(input_paths: list[str]) -> list[str]:
            """Check and fix input trajectory filepaths.

            Also relativize paths to the current MD directory and parse glob notation.
            """
            checked_paths = input_paths
            # Input trajectory filepaths may be both a list or a single string
            # However we must keep a list
            if type(checked_paths) is list:
                pass
            elif type(checked_paths) is str:
                checked_paths = [checked_paths]
            else:
                raise InputError('Input trajectory filepaths must be a list of strings or a string')
            # Make sure all or none of the trajectory paths are URLs
            url_count = sum([is_url(path) for path in checked_paths])
            if not (url_count == 0 or url_count == len(checked_paths)):
                raise InputError('All trajectory paths must be paths or URLs. Mixing is not supported')
            # In case trajectory paths are URLs
            if url_count > 0:
                self._input_trajectory_urls = checked_paths
                # Set the paths for the further download
                parsed_paths = []
                for path in checked_paths:
                    source_filename = url_to_source_filename(path)
                    md_relative_filepath = self.pathify(source_filename)
                    parsed_paths.append(md_relative_filepath)
                return parsed_paths
            # Make sure all or none of the trajectory paths are absolute
            abs_count = sum([isabs(path) for path in checked_paths])
            if not (abs_count == 0 or abs_count == len(checked_paths)):
                raise InputError('All trajectory paths must be relative or absolute. Mixing is not supported')

            def parse_all_glob(paths: list[str]) -> list[str]:
                """Glob-parse and merge all paths."""
                parsed_paths = []
                for path in paths:
                    parsed_paths += parse_glob(path)
                return parsed_paths
            # In case trajectory paths are absolute
            if abs_count > 0:
                absolute_parsed_paths = parse_all_glob(checked_paths)
                # Check we successfully defined some trajectory file
                if len(absolute_parsed_paths) == 0:
                    # Warn the user in case it was trying to use glob syntax to download remote files
                    if self.remote:
                        warn('Spread syntax is not supported to download remote files')
                    # NOTE(review): input_paths may be a bare string here, in which case
                    # ', '.join would join its characters — confirm intended behavior
                    raise InputError('No trajectory file was reached neither in the project directory or MD directories in path(s) ' + ', '.join(input_paths))
                return absolute_parsed_paths
            # If trajectory paths are not absolute then check if they are relative to the MD directory
            # Get paths relative to the current MD directory
            md_relative_paths = [self.pathify(path) for path in checked_paths]
            # In case there are glob characters we must parse the paths
            md_parsed_paths = parse_all_glob(md_relative_paths)
            # Check we successfully defined some trajectory file
            if len(md_parsed_paths) > 0:
                # If so, check at least one of the files do actually exist
                if any([exists(path) for path in md_parsed_paths]):
                    return md_parsed_paths
            # If no trajectory files were found then assume they are relative to the project
            # Get paths relative to the project directory
            project_relative_paths = [self.project.pathify(path) for path in checked_paths]
            # In case there are glob characters we must parse the paths
            project_parsed_paths = parse_all_glob(project_relative_paths)
            # Check we successfully defined some trajectory file
            if len(project_parsed_paths) > 0:
                # If so, check at least one of the files do actually exist
                if any([exists(path) for path in project_parsed_paths]):
                    return project_parsed_paths
            # At this point we can conclude the input trajectory file does not exist
            # If we have no paths at all then it means a glob pattern was passed and it didn't match
            # Note that if a glob pattern existed then it would mean the file actually existed
            if len(md_parsed_paths) == 0 and len(project_parsed_paths) == 0:
                # Warn the user in case it was trying to use glob syntax to download remote files
                if self.remote:
                    warn('Spread syntax is not supported to download remote files')
                raise InputError('No trajectory file was reached neither in the project directory or MD directories in path(s) ' + ', '.join(input_paths))
            # If we have a path however it may be downloaded from the database if we have a remote
            if not self.remote:
                raise InputError(f'Cannot find anywhere a trajectory file with path(s) "{", ".join(input_paths)}"')
            # Note that if input path was not glob based it will be both as project relative and MD relative
            if len(md_parsed_paths) == 0: raise ValueError('This should never happen')
            # If file is to be downloaded then we must make sure the path is relative to the project
            project_relative_paths = [
                self.project.pathify(path) if f'{self.directory_name}/' in path else self.pathify(path) for path in checked_paths
            ]
            return project_relative_paths
        # If we have a value passed through command line
        if self.arg_input_trajectory_filepaths:
            self._input_trajectory_filepaths = relativize_and_parse_paths(self.arg_input_trajectory_filepaths)
            # Save the parsed value in the inputs file
            self.project.update_inputs(
                f'mds.{self.index}.input_trajectory_filepaths',
                self._input_trajectory_filepaths)
            return self._input_trajectory_filepaths
        # Check if the inputs file has the value
        if self.project.is_inputs_file_available():
            # Get the input value
            inputs_value = self.get_input('input_trajectory_filepaths')
            if inputs_value:
                self._input_trajectory_filepaths = relativize_and_parse_paths(inputs_value)
                return self._input_trajectory_filepaths
        # If there is no trajectory available then we surrender
        raise InputError('There is not input trajectory at all')
    def get_input_trajectory_files(self) -> list['File']:
        """Get the input trajectory file(s) from the inputs.

        If file(s) are not found try to download them, either through the
        remote project handler or directly from the input trajectory URLs.

        Returns:
            list[File]: The existing (or freshly downloaded) trajectory files.

        Raises:
            InputError: If any trajectory file is missing and cannot be
                downloaded, or if sampling options are used without support.
        """
        # If we already defined input trajectory files then return them
        if self._input_trajectory_files is not None:
            return self._input_trajectory_files
        # Otherwise we must set the input trajectory files
        # Note this may also set self._input_trajectory_urls as a side effect
        input_trajectory_filepaths = self.get_input_trajectory_filepaths()
        input_trajectory_files = [File(path) for path in input_trajectory_filepaths]
        # If there are input trajectory URLs then download the trajectories first
        input_trajectory_urls = self._input_trajectory_urls
        if input_trajectory_urls:
            for trajectory_url, trajectory_file in zip(input_trajectory_urls, input_trajectory_files):
                # If the trajectory file already exists then skip it
                if trajectory_file.exists: continue
                original_filename = trajectory_url.split('/')[-1]
                # If there is a remote then use it
                if self.remote:
                    # If the trajectory filename is the standard trajectory filename then use the trajectory endpoint instead
                    if original_filename == TRAJECTORY_FILENAME:
                        frame_selection = None
                        # When sampling, download a reduced frame selection only
                        if self.project.sample_trajectory:
                            remote_frames = self.remote.snapshots
                            maximum_desired_frames = self.project.sample_trajectory
                            step, final_frames = calculate_frame_step(remote_frames, maximum_desired_frames)
                            frame_selection = f'1:{remote_frames}:{step}'
                        self.remote.download_trajectory(trajectory_file, frame_selection=frame_selection, format='xtc')
                    # Otherwise download the input trajectory file by its filename
                    else:
                        if self.project.sample_trajectory:
                            raise InputError('The "-smp" argument is supported only when asking for the standard trajectory')
                        self.remote.download_file(original_filename, trajectory_file)
                # Otherwise use the URL as is
                else:
                    if self.project.sample_trajectory:
                        raise InputError('The "-smp" argument is supported only when using the "-proj" argument')
                    download_file(trajectory_url, trajectory_file)
        # Find missing trajectory files
        missing_input_trajectory_files = []
        for trajectory_file in input_trajectory_files:
            if not trajectory_file.exists:
                missing_input_trajectory_files.append(trajectory_file)
        # If all files already exist then we are done
        if len(missing_input_trajectory_files) == 0:
            self._input_trajectory_files = input_trajectory_files
            return self._input_trajectory_files
        missing_filepaths = [trajectory_file.path for trajectory_file in missing_input_trajectory_files]
        raise InputError('Missing input trajectory files: ' + ', '.join(missing_filepaths))
    input_trajectory_files = property(get_input_trajectory_files, None, None, "Input trajectory filenames (read only)")
489 def get_md_inputs(self) -> dict:
490 """Get MD specific inputs."""
491 # If we already have a value stored then return it
492 if self._md_inputs:
493 return self._md_inputs
494 # Otherwise we must find its value
495 # If we have MD inputs in the inputs file then use them
496 if self.project.input_mds:
497 # Iterate over the different MD inputs to find out each directory
498 # We must find the MD inputs whcih belong to this specific MD according to this directory
499 for md in self.project.input_mds:
500 # Get the directory according to the inputs
501 directory = md.get(MD_DIRECTORY, None)
502 if directory:
503 check_directory(directory)
504 # If no directory is specified in the inputs then guess it from the MD name
505 else:
506 name = md.get('name', None)
507 if not name: raise InputError('There is a MD with no name and no directory. Please define at least one of them.')
508 directory = name_2_directory(name)
509 # If the directory matches then this is our MD inputs
510 if self.project.pathify(directory) == self.directory:
511 self._md_inputs = md
512 return self._md_inputs
513 # If this MD directory has not associated inputs then it means it was passed through command line
514 # We set a new MD inputs for it
515 new_md_name = directory_2_name(self.directory)
516 self._md_inputs = {'name': new_md_name, 'mdir': self.directory}
517 # Update the inputs file with the new MD inputs
518 mds = self.project.inputs.get('mds', [])
519 new_mds_inputs = [*mds, self._md_inputs]
520 self.project.update_inputs('mds', new_mds_inputs)
521 return self._md_inputs
523 md_inputs = property(get_md_inputs, None, None, "MD specific inputs (read only)")
525 def get_input(self, name: str):
526 """Get a specific 'input' value from MD inputs."""
527 value = self.md_inputs.get(name, MISSING_INPUT_EXCEPTION)
528 # If we had a value then return it
529 if value != MISSING_INPUT_EXCEPTION:
530 return value
531 return self.project.get_input(name)
533 # ---------------------------------
535 def get_file(self, target_file: File) -> bool:
536 """Check if a file exists. If not, try to download it from the database.
537 If the file is not found in the database it is fine, we do not even warn the user.
538 Note that this function is used to get populations and transitions files, which are not common.
539 """
540 # If it exists we are done
541 if target_file.exists:
542 return True
543 # Try to download the missing file
544 # If we do not have the required parameters to download it then we surrender here
545 if not self.remote:
546 return False
547 # Check if the file is among the available remote files
548 # If it is no then stop here
549 if target_file.filename not in self.remote.available_files:
550 return False
551 # Download the file
552 self.remote.download_file(target_file.filename, target_file)
553 return True
555 def print_tests_summary(self):
556 """Make a summary of tests and their status."""
557 print('Tests summary:')
558 for test_name in AVAILABLE_CHECKINGS:
559 test_result = self.register.tests.get(test_name, None)
560 # Print things pretty
561 test_nice_name = NICE_NAMES[test_name]
562 test_nice_result = None
563 if test_result is None:
564 test_nice_result = YELLOW_HEADER + 'Not run' + COLOR_END
565 elif test_result is False:
566 test_nice_result = RED_HEADER + 'Failed' + COLOR_END
567 elif test_result is True:
568 test_nice_result = GREEN_HEADER + 'Passed' + COLOR_END
569 elif test_result == 'na':
570 test_nice_result = BLUE_HEADER + 'Not applicable' + COLOR_END
571 else:
572 raise ValueError()
574 print(f' - {test_nice_name} -> {test_nice_result}')
576 # Issue some warnings if failed or never run tests are skipped
577 # This is run after processing input files
578 # RUBEN: mover a inpro?
579 def _issue_required_test_warnings(self):
580 for test_name in AVAILABLE_CHECKINGS:
581 # If test was not skipped then proceed
582 if test_name not in self.project.trust: continue
583 # If test passed in a previous run the proceed
584 test_result = self.register.tests.get(test_name)
585 if test_result is True: continue
586 # If test failed in a previous run we can also proceed
587 # The failing warning must be among the inherited warnings, so there is no need to add more warnings here
588 if test_result is False: continue
589 # If the test has been always skipped then issue a warning
590 if test_result is None:
591 # Remove previous warnings
592 self.register.remove_warnings(test_name)
593 # Get test pretty name
594 test_nice_name = NICE_NAMES[test_name]
595 # Issue the corresponding warning
596 self.register.add_warning(test_name, test_nice_name + ' was skipped and never run before')
597 continue
598 raise ValueError('Test value is not supported')
600 # Processed files ----------------------------------------------------
602 def get_topology_filepath(self) -> str:
603 """Get the processed topology file path."""
604 return self.project.get_topology_filepath()
605 topology_filepath = property(get_topology_filepath, None, None, "Topology file path (read only)")
    # Run the actual processing to generate output processed files out of input raw files
    # And by "files" I mean structure, trajectory and topology
    # Task wires process_input_files to its expected output filenames
    input_files_processing = Task('inpro', 'Input files processing', process_input_files,
        output_filenames={
            'output_topology_file': get_topology_filepath,
            'output_structure_file': STRUCTURE_FILENAME,
            'output_trajectory_file': TRAJECTORY_FILENAME,
        })

    # Output main files
    # These getters run the processing task on demand and return the resulting files
    get_structure_file = input_files_processing.get_output_file_getter('output_structure_file')
    structure_file = property(get_structure_file, None, None, "Structure file (read only)")
    get_trajectory_file = input_files_processing.get_output_file_getter('output_trajectory_file')
    trajectory_file = property(get_trajectory_file, None, None, "Trajectory file (read only)")
622 def get_topology_file(self) -> 'File':
623 """Get the processed topology from the project."""
624 return self.project.get_topology_file()
625 topology_file = property(get_topology_file, None, None,
626 "Topology filename from the project (read only)")
    # ---------------------------------------------------------------------------------
    # Others values which may be found/calculated and files to be generated on demand
    # ---------------------------------------------------------------------------------

    # Trajectory snapshots
    # Task runs get_frames_count on demand to count trajectory frames
    get_snapshots = Task('frames', 'Count trajectory frames', get_frames_count)
    snapshots = property(get_snapshots, None, None, "Trajectory snapshots (read only)")
636 def get_reference_bonds(self) -> list[list[int]]:
637 """Get the reference bonds."""
638 return self.project.reference_bonds
639 reference_bonds = property(get_reference_bonds, None, None, "Atom bonds to be trusted (read only)")
    def get_structure(self) -> 'Structure':
        """Get the parsed structure, loading it from the structure file on first access.

        Returns:
            Structure: The parsed (and possibly bond-corrected) structure.

        Raises:
            ValueError: If the structure file does not exist yet (i.e. input
                files were not processed before accessing the structure).
        """
        # If we already have a stored value then return it
        if self._structure:
            return self._structure
        # Otherwise we must set the structure
        # Make sure the structure file exists at this point
        if not self.structure_file.exists:
            raise ValueError('Trying to set standard structure but file '
                f'{self.structure_file.path} does not exist yet. Are you trying '
                'to access the standard structure before processing input files?')
        # Note that this is not only the structure class, but it also contains additional logic
        self._structure = Structure.from_pdb_file(self.structure_file.path)
        # If the stable bonds test failed and we had mercy then it is sure our structure will have wrong bonds
        # In order to make it coherent with the topology we will mine topology bonds from here and force them in the structure
        # If we fail to get bonds from topology then just go along with the default structure bonds
        # NOTE(review): a missing (never run) test result is falsy and also triggers
        # the bond override here, not only an explicit failure — confirm intended
        if not self.register.tests.get(STABLE_BONDS_FLAG, None):
            self._structure.bonds = self.reference_bonds
        # Same procedure if we have coarse grain atoms
        elif self.cg_selection:
            self._structure.bonds = self.reference_bonds
        return self._structure
    structure = property(get_structure, None, None, "Parsed structure (read only)")
# First frame PDB file
# NOTE: this Task assignment shadows the imported get_first_frame tool at class level
get_first_frame = Task('firstframe', 'Get first frame structure',
    get_first_frame, output_filenames={'output_file': FIRST_FRAME_FILENAME})
get_first_frame_file = get_first_frame.get_output_file_getter('output_file')
first_frame_file = property(get_first_frame_file, None, None, "First frame (read only)")

# Average structure filename
# NOTE: this Task assignment shadows the imported get_average_structure tool at class level
get_average_structure = Task('average', 'Get average structure',
    get_average_structure, output_filenames={'output_file': AVERAGE_STRUCTURE_FILENAME})
get_average_structure_file = get_average_structure.get_output_file_getter('output_file')
average_structure_file = property(get_average_structure_file, None, None, "Average structure filename (read only)")

# Produce the MD metadata file to be uploaded to the database
prepare_metadata = Task('mdmeta', 'Prepare MD metadata',
    generate_md_metadata, output_filenames={'output_file': OUTPUT_METADATA_FILENAME})

# The processed interactions
get_processed_interactions = Task('inter', 'Interactions processing',
    process_interactions, {'frames_limit': 1000})
interactions = property(get_processed_interactions, None, None, "Processed interactions (read only)")

# MDAnalysis Universe object
# Caching is disabled: the universe is rebuilt every time it is requested
get_MDAnalysis_Universe = Task('mda_univ', 'MDAnalysis Universe object',
    get_mda_universe, use_cache=False)
universe = property(get_MDAnalysis_Universe, None, None, "MDAnalysis Universe object (read only)")
def input_getter(name: str):
    """Build a getter for an input value which may be MD specific.

    The returned getter reads the MD-level input first; when it is missing
    (None) it falls back to the project-level 'input_<name>' attribute.
    """
    project_attribute = f'input_{name}'
    # Set the getter
    def getter(self):
        md_value = self.md_inputs.get(name)
        if md_value is None:
            # No MD-specific value -> fall back to the project value
            return getattr(self.project, project_attribute)
        return md_value
    return getter
# Assign the MD input getters
# Each property reads the MD-level input first and falls back to the project-level value
input_interactions = property(input_getter('interactions'), None, None, "Interactions to be analyzed (read only)")
input_pbc_selection = property(input_getter('pbc_selection'), None, None, "Selection of atoms which are still in periodic boundary conditions (read only)")
input_cg_selection = property(input_getter('cg_selection'), None, None, "Selection of atoms which are not actual atoms but coarse grain beads (read only)")
def _set_pbc_selection(self, reference_structure: 'Structure', verbose: bool = False) -> 'Selection':
    """Set the Periodic Boundary Conditions (PBC) atoms selection.

    It parses the inputs file selection string if it is available or guesses
    the PBC atoms automatically otherwise.

    Args:
        reference_structure: structure used to parse or guess the selection.
        verbose: if True, log the parsing steps and a few selected residue names.

    Returns:
        The parsed PBC atoms selection (may be empty).

    Raises:
        InputError: if PBC atoms must be guessed on a CG system, or if the
            selection string selects no atoms.
    """
    if verbose: print('Setting Periodic Boundary Conditions (PBC) atoms selection')
    selection_string = None
    # If there is an inputs file then get the input pbc selection
    if self.project.is_inputs_file_available():
        if verbose: print(' Using selection string in the inputs file')
        selection_string = self.input_pbc_selection
    # If there is no inputs file we guess PBC atoms automatically
    else:
        if verbose: print(' No inputs file -> Selection string will be set automatically')
        selection_string = 'auto'
    # Parse the selection string using the reference structure
    parsed_selection = None
    # If the input PBC selection is 'auto' then guess it automatically
    if selection_string == 'auto':
        # To guess PBC atoms (with the current implementation) we must make sure there is no CG
        if reference_structure.has_cg():
            raise InputError('We can not guess PBC atoms in CG systems. Please set PBC atoms manually.\n'
                ' Use the "-pbc" argument or set the inputs file "pbc_selection" field.')
        if verbose: print(' Guessing PBC atoms as solvent, counter ions and lipids')
        parsed_selection = reference_structure.select_pbc_guess()
    # If we have a valid input value then use it
    elif selection_string:
        if verbose: print(f' Selecting PBC atoms "{selection_string}"')
        parsed_selection = reference_structure.select(selection_string)
        if not parsed_selection:
            raise InputError(f'PBC selection "{selection_string}" selected no atoms')
    # If we have an input value but it is empty then we set an empty selection
    else:
        if verbose: print(' No PBC atoms selected')
        parsed_selection = Selection()
    # Log a few of the selected residue names
    if verbose and parsed_selection:
        print(f' Parsed PBC selection has {len(parsed_selection)} atoms')
        selected_residues = reference_structure.get_selection_residues(parsed_selection)
        selected_residue_names = list(set([residue.name for residue in selected_residues]))
        limit = 3 # Show a maximum of 3 residue names
        example_residue_names = ', '.join(selected_residue_names[0:limit])
        if len(selected_residue_names) > limit: example_residue_names += ', etc.'
        print(' e.g. ' + example_residue_names)
    return parsed_selection
def get_pbc_selection(self) -> 'Selection':
    """Return the periodic boundary conditions atom selection.

    The selection is computed once and cached on the parent project.
    """
    cached = self.project._pbc_selection
    if cached is None:
        # Not computed yet -> parse or guess it and store it on the project
        cached = self._set_pbc_selection(self.structure)
        self.project._pbc_selection = cached
    return cached
pbc_selection = property(get_pbc_selection, None, None, "Periodic boundary conditions atom selection (read only)")
# WARNING: Do not inherit project pbc residues
# WARNING: It may trigger all the processing logic of the reference MD when there is no need
def get_pbc_residues(self) -> list[int]:
    """Return indices of residues in periodic boundary conditions.

    The result is cached on the parent project.
    """
    project = self.project
    # Reuse a previously computed non-empty list
    if project._pbc_residues:
        return project._pbc_residues
    # Without a PBC selection there are no PBC residues at all
    selection = self.pbc_selection
    if not selection:
        project._pbc_residues = []
        return project._pbc_residues
    # Resolve the residue indices covered by the selection
    project._pbc_residues = self.structure.get_selection_residue_indices(selection)
    print(f'PBC residues "{self.input_pbc_selection}" -> {len(project._pbc_residues)} residues')
    return project._pbc_residues
pbc_residues = property(get_pbc_residues, None, None, "Indices of residues in periodic boundary conditions (read only)")
# DANI: Someday we should try to automate this
def _set_cg_selection(self, reference_structure: 'Structure', verbose: bool = False) -> 'Selection':
    """Set the coarse grain (CG) atoms selection.

    Parse the CG selection string from the inputs file when available.
    Without an inputs file, or with an empty selection string, an empty
    selection is returned, i.e. the system is assumed to have no CG beads.

    Args:
        reference_structure: structure used to parse the selection string.
        verbose: if True, log the parsing steps and a few selected residue names.

    Returns:
        The parsed CG atoms selection (may be empty).
    """
    if verbose: print('Setting Coarse Grained (CG) atoms selection')
    # If there is no inputs file then assume there is no CG selection
    if not self.project.is_inputs_file_available():
        if verbose: print(' No inputs file -> Assuming there is no CG at all')
        return Selection()
    # Otherwise we use the selection string from the inputs
    if verbose: print(' Using selection string in the inputs file')
    selection_string = self.input_cg_selection
    # If the selection is empty, again, assume there is no CG selection
    if not selection_string:
        if verbose: print(' Empty selection -> There is no CG at all')
        return Selection()
    # Otherwise parse the selection string using the reference structure
    # NOTE: the original code had an additional "no CG atoms selected" branch here,
    # but it was unreachable since the selection string is always truthy at this point
    if verbose: print(f' Selecting CG atoms "{selection_string}"')
    parsed_selection = reference_structure.select(selection_string)
    # Log the parsed selection size
    if verbose: print(f' Parsed CG selection has {len(parsed_selection)} atoms')
    # Log a few of the selected residue names
    if verbose and parsed_selection:
        selected_residues = reference_structure.get_selection_residues(parsed_selection)
        selected_residue_names = list(set([residue.name for residue in selected_residues]))
        limit = 3 # Show a maximum of 3 residue names
        example_residue_names = ', '.join(selected_residue_names[0:limit])
        if len(selected_residue_names) > limit: example_residue_names += ', etc.'
        print(' e.g. ' + example_residue_names)
    return parsed_selection
def get_cg_selection(self) -> 'Selection':
    """Get the coarse grain atom selection.

    The selection is computed once and cached on the parent project.
    """
    # If we already have a stored value then return it
    if self.project._cg_selection:
        return self.project._cg_selection
    # Otherwise we must set the CG selection
    self.project._cg_selection = self._set_cg_selection(self.structure)
    return self.project._cg_selection
# NOTE: the property description previously said "Periodic boundary conditions atom selection"
# which was a copy-paste error from the PBC property
cg_selection = property(get_cg_selection, None, None, "Coarse grain atom selection (read only)")
# WARNING: Do not inherit project cg residues
# WARNING: It may trigger all the processing logic of the reference MD when there is no need
def get_cg_residues(self) -> list[int]:
    """Return indices of residues in coarse grain.

    The result is cached on the parent project.
    """
    project = self.project
    # Reuse a previously computed non-empty list
    if project._cg_residues:
        return project._cg_residues
    # Without a CG selection there are no CG residues at all
    selection = self.cg_selection
    if not selection:
        project._cg_residues = []
        return project._cg_residues
    # Resolve the residue indices covered by the selection
    project._cg_residues = self.structure.get_selection_residue_indices(selection)
    print(f'CG residues "{self.input_cg_selection}" -> {len(project._cg_residues)} residues')
    return project._cg_residues
cg_residues = property(get_cg_residues, None, None, "Indices of residues in coarse grain (read only)")
def get_populations(self) -> list[float]:
    """Return the MSM equilibrium populations stored on the parent project."""
    parent = self.project
    return parent.populations
populations = property(get_populations, None, None, "Equilibrium populations from a MSM (read only)")
def get_transitions(self) -> list[list[float]]:
    """Return the MSM transition probabilities stored on the parent project."""
    parent = self.project
    return parent.transitions
transitions = property(get_transitions, None, None, "Transition probabilities from a MSM (read only)")
def get_protein_map(self) -> dict:
    """Return the residues mapping stored on the parent project."""
    parent = self.project
    return parent.protein_map
protein_map = property(get_protein_map, None, None, "Residues mapping (read only)")
def get_charges(self) -> dict:
    """Get the atom charges from the project.

    NOTE(review): the original docstring said "residues mapping", which
    describes get_protein_map; this getter actually returns the project charges.
    """
    return self.project.charges
charges = property(get_charges, None, None, "Residues charges (read only)")
# Reference frame to be used to represent the MD (resolved from the bonds reference frame)
get_reference_frame = Task('reframe', 'Reference frame', get_bonds_reference_frame)
reference_frame = property(get_reference_frame, None, None, "Reference frame to be used to represent the MD (read only)")
# ---------------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------------

def is_trajectory_integral(self) -> Optional[bool]:
    """Run (and cache) the trajectory integrity test, i.e. the sudden jumps test.

    Returns:
        The cached test result; None when the test has not been resolved.
    """
    # If we already have a stored value then return it
    if self._trajectory_integrity is not None:
        return self._trajectory_integrity
    # Otherwise we must find the value
    self._trajectory_integrity = check_trajectory_integrity(
        input_structure_filename=self.structure_file.path,
        input_trajectory_filename=self.trajectory_file.path,
        structure=self.structure,
        pbc_selection=self.pbc_selection,
        mercy=self.project.mercy,
        trust=self.project.trust,
        register=self.register,
        # time_length = self.time_length,
        check_selection=ALL_ATOMS,
        standard_deviations_cutoff=self.project.rmsd_cutoff,
        snapshots=self.snapshots,
    )
    return self._trajectory_integrity
# ---------------------------------------------------------------------------------
# Analyses
# ---------------------------------------------------------------------------------

# RMSDs analysis
run_rmsds_analysis = Task('rmsds', 'RMSDs analysis',
    rmsds, {'frames_limit': 5000})

# TM scores analysis
run_tmscores_analysis = Task('tmscore', 'TM scores analysis',
    tmscores, {'frames_limit': 200})

# RMSF, atom fluctuation analysis
run_rmsf_analysis = Task('rmsf', 'Fluctuation analysis', rmsf)

# Radius of gyration analysis
run_rgyr_analysis = Task('rgyr', 'Radius of gyration analysis',
    rgyr, {'frames_limit': 5000})

# PCA, principal component analysis
run_pca_analysis = Task('pca', 'Principal component analysis',
    pca, {'frames_limit': 2000, 'projection_frames': 20})

# PCA contacts
# DANI: It tries to use a lot of memory, this must be reviewed
# DANI: It may raise an error about being impossible to allocate so much memory
# DANI: It may eat all the RAM and finally die with a 'Terminated (killed)' error
# DANI: For now I am skipping it
# DANI: It has been unmaintained for a long time; several things must change to recover it
# run_pca_contacts('pcacons', 'PCA contacts', pca_contacts)

# RMSD per residue analysis
run_rmsd_perres_analysis = Task('perres', 'RMSD per residue analysis',
    rmsd_per_residue, {'frames_limit': 100})

# RMSD pairwise
# Perform an analysis for the overall structure and then one more analysis for each interaction
run_rmsd_pairwise_analysis = Task('pairwise', 'RMSD pairwise',
    rmsd_pairwise, {'frames_limit': 200, 'overall_selection': "name CA or name C5'"})

# Run the cluster analysis
run_clusters_analysis = Task('clusters', 'Clusters analysis',
    clusters_analysis, {'frames_limit': 1000, 'desired_n_clusters': 20})

# Calculate the distance mean and standard deviation of each pair of residues
run_dist_perres_analysis = Task('dist', 'Distance per residue',
    distance_per_residue, {'frames_limit': 200})

# Hydrogen bonds
run_hbonds_analysis = Task('hbonds', 'Hydrogen bonds analysis',
    hydrogen_bonds, {'time_splits': 100})

# SASA, solvent accessible surface analysis
run_sas_analysis = Task('sas', 'Solvent accessible surface analysis',
    sasa, {'frames_limit': 100})

# Perform the electrostatic and vdw energies analysis for each pair of interaction agents
run_energies_analysis = Task('energies', 'Energies analysis',
    energies, {'frames_limit': 100})

# Calculate torsions and then dihedral energies for every dihedral along the trajectory
run_dihedral_energies = Task('dihedrals', 'Dihedral energies analysis',
    compute_dihedral_energies, {'frames_limit': 100})

# Perform the pockets analysis
run_pockets_analysis = Task('pockets', 'Pockets analysis',
    pockets, {'frames_limit': 100, 'maximum_pockets_number': 10})

# Helical parameters
run_helical_analysis = Task('helical', 'Helical parameters', helical_parameters)

# Markov
run_markov_analysis = Task('markov', 'Markov', markov, {'rmsd_selection': PROTEIN_AND_NUCLEIC})

# Membrane density analysis
run_density_analysis = Task('density', 'Membrane density analysis',
    density, {'frames_limit': 1000})

# Membrane thickness analysis
run_thickness_analysis = Task('thickness', 'Membrane thickness analysis',
    thickness, {'frames_limit': 100})

# Area per lipid analysis
run_apl_analysis = Task('apl', 'Membrane area per lipid analysis', area_per_lipid)

# Calculate lipid order parameters for membranes
run_lipid_order_analysis = Task('lorder', 'Membrane lipid order analysis',
    lipid_order, {'frames_limit': 100})

# Lipid-protein interactions analysis
run_lipid_interactions_analysis = Task('linter', 'Membrane lipid-protein interactions analysis',
    lipid_interactions, {'frames_limit': 100})

# Membrane channels analysis
run_channels_analysis = Task('channels', 'Membrane channels analysis',
    channels, {'frames_limit': 10})
def get_warnings(self) -> list:
    """Return the MD warnings, always read live from the register.

    The warnings list should not be reassigned, but it was back in the day.
    Reading it straight from the register every time avoids silent bugs.
    """
    return self.register.warnings
warnings = property(get_warnings, None, None, "MD warnings to be written in metadata")
class Project:
    """Main project of an MDDB accession.

    A project is a set of related MDs.
    These MDs share all or most topology and metadata.
    """
def __init__(self,
    directory: str = '.',
    accession: Optional[str] = None,
    database_url: str = DEFAULT_API_URL,
    inputs_filepath: Optional[str] = None,
    input_topology_filepath: Optional[str] = None,
    input_structure_filepath: Optional[str] = None,
    input_trajectory_filepaths: Optional[list[str]] = None,
    md_directories: Optional[list[str]] = None,
    md_config: Optional[list[list[str]]] = None,
    reference_md_index: Optional[int] = None,
    forced_inputs: Optional[list[list[str]]] = None,
    populations_filepath: str = DEFAULT_POPULATIONS_FILENAME,
    transitions_filepath: str = DEFAULT_TRANSITIONS_FILENAME,
    aiida_data_filepath: Optional[str] = None,
    filter_selection: bool | str = False,
    pbc_selection: Optional[str] = None,
    cg_selection: Optional[str] = None,
    image: bool = False,
    fit: bool = False,
    translation: list[float] = [0, 0, 0],
    mercy: list[str] | bool = [],
    trust: list[str] | bool = [],
    faith: bool = False,
    ssleep: bool = False,
    pca_analysis_selection: str = PROTEIN_AND_NUCLEIC_BACKBONE,
    pca_fit_selection: str = PROTEIN_AND_NUCLEIC_BACKBONE,
    rmsd_cutoff: float = DEFAULT_RMSD_CUTOFF,
    interaction_cutoff: float = DEFAULT_INTERACTION_CUTOFF,
    interactions_auto: Optional[str] = None,
    guess_bonds: bool = False,
    ignore_bonds: bool = False,
    sample_trajectory: Optional[int] = None,
):
    """Initialize a Project.

    Args:
        directory (str):
            Local directory where the project takes place.
        accession (Optional[str]):
            Project accession to download missing input files from the database (if already uploaded).
        database_url (str):
            API URL to download missing data when an accession is provided.
        inputs_filepath (Optional[str]):
            Path to a file with inputs for metadata, simulation parameters and analysis config.
        input_topology_filepath (Optional[str]):
            Path to input topology file relative to the project directory.
        input_structure_filepath (Optional[str]):
            Path to input structure file. It may be relative to the project or to each MD directory.
            If this value is not passed then the standard structure file is used as input by default.
        input_trajectory_filepaths (Optional[list[str]]):
            Paths to input trajectory files relative to each MD directory.
            If this value is not passed then the standard trajectory file path is used as input by default.
        md_directories (Optional[list[str]]):
            Path to the different MD directories.
            Each directory is to contain an independent trajectory and structure.
            Several output files will be generated in every MD directory.
        md_config (Optional[list[list[str]]]):
            Configuration of a specific MD. You may declare as many as you want.
            Every MD requires a directory name and at least one trajectory path.
            The structure is -md <directory> <trajectory_1> <trajectory_2> ...
            Note that all trajectories from the same MD will be merged.
            For legacy reasons, you may also provide a specific structure for an MD.
            e.g. -md <directory> <structure> <trajectory_1> <trajectory_2> ...
        reference_md_index (Optional[int]):
            Index of the reference MD (used by project-level functions; defaults to first MD).
        forced_inputs (Optional[list[list[str]]]):
            Force a specific input through the command line.
            Inputs passed through the command line have priority over the ones from the inputs file.
            In fact, these values will be overwritten or appended in the inputs file.
            Every forced input requires an input name and a value.
            The structure is -fin <input name> <new input value>
        populations_filepath (str):
            Path to equilibrium populations file (Markov State Model only).
        transitions_filepath (str):
            Path to transition probabilities file (Markov State Model only).
        aiida_data_filepath (Optional[str]):
            Path to the AiiDA data file.
            This file may be generated by the aiida-gromacs plugin and contains provenance data.
        filter_selection (bool | str):
            Atoms selection to be filtered in VMD format.
            If the argument is passed alone (i.e. with no selection) then water and counter ions are filtered.
        pbc_selection (Optional[str]):
            Selection of atoms which stay in Periodic Boundary Conditions even after imaging the trajectory.
            e.g. remaining solvent, ions, membrane lipids, etc.
            Selection passed through console overrides the one in the inputs file.
        cg_selection (Optional[str]):
            Selection of atoms which are not actual atoms but Coarse Grained beads.
            Selection passed through console overrides the one in the inputs file.
        image (bool):
            Set if the trajectory is to be imaged so atoms stay in the PBC box. See -pbc for more information.
        fit (bool):
            Set if the trajectory is to be fitted (both rotation and translation) to minimize the RMSD to the PROTEIN_AND_NUCLEIC_BACKBONE selection.
        translation (list[float]):
            Set the x y z translation for the imaging process.
            e.g. -trans 0.5 -1 0
        mercy (list[str] | bool):
            Failures to be tolerated (or boolean to set all/none).
        trust (list[str] | bool):
            Tests to skip/trust (or boolean to set all/none).
        faith (bool):
            If True, require input files to match expected output files and skip processing.
        ssleep (bool):
            If True, SSL certificate authentication is skipped when downloading data from an API.
        pca_analysis_selection (str):
            Atom selection for PCA analysis in VMD syntax.
        pca_fit_selection (str):
            Atom selection for the PCA fitting in VMD syntax.
        rmsd_cutoff (float):
            Set the cutoff for the RMSD sudden jumps analysis to fail.
            This cutoff stands for the number of standard deviations away from the mean an RMSD value is to be.
        interaction_cutoff (float):
            Set the cutoff for the interactions analysis to fail.
            This cutoff stands for the percent of the trajectory where the interaction happens (from 0 to 1).
        interactions_auto (Optional[str]):
            Guess input interactions automatically. A VMD selection may be passed to limit guessed interactions to a specific subset of atoms.
        guess_bonds (bool):
            Force the workflow to guess atom bonds based on distance and atom radii in different frames along the trajectory instead of mining topology bonds.
        ignore_bonds (bool):
            Force the workflow to ignore atom bonds. This will result in many checks being skipped.
        sample_trajectory (Optional[int]):
            If passed, download the first 10 (by default) frames from the trajectory.
            You can specify a different number by providing an integer value.
    """
    # Save input parameters
    # RUBEN: directory is never used, remove it?
    self.directory = normpath(directory)
    # If it is an absolute path then make it relative to the project
    if isabs(self.directory):
        self.directory = relpath(self.directory)
    # Save the directory name alone apart
    if self.directory == '.':
        self.directory_name = basename(getcwd())
    else:
        self.directory_name = basename(self.directory)

    # Set the database handler
    self.ssleep = ssleep
    self.database = Database(database_url, self.ssleep) if database_url else None
    # Set the remote project handler
    self.accession = accession
    self.remote = None
    if self.database and self.accession:
        self.remote = self.database.get_remote_project(self.accession)

    # Set the inputs file
    # Set the expected default name in case there is no inputs file since it may be downloaded
    self._inputs_file = File(self.pathify(DEFAULT_INPUTS_FILENAME))
    # If there is an input filepath then use it
    # NOTE(review): an explicit inputs path is not pathified while the default is —
    # presumably the explicit path is relative to the working directory; confirm
    if inputs_filepath:
        self._inputs_file = File(inputs_filepath)
    # Otherwise guess the inputs file using the accepted filenames
    else:
        for filename in ACCEPTED_INPUT_FILENAMES:
            if exists(filename):
                self._inputs_file = File(filename)
                break
    # Set the input topology file
    # Note that even if the input topology path is passed we do not check it exists
    # Never forget we can download some input files from the database on the fly
    self.arg_input_topology_filepath = input_topology_filepath
    self._input_topology_filepath = None
    self._input_topology_file = None
    self._input_topology_url = None
    # Input structure and trajectory filepaths
    # Do not parse them to files yet, let this to the MD class
    self.input_structure_filepath = input_structure_filepath
    self.input_trajectory_filepaths = input_trajectory_filepaths

    # Make sure the new MD configuration (-md) was not passed as well as old MD inputs (-mdir, -traj)
    if md_config and (md_directories or input_trajectory_filepaths):
        raise InputError('MD configurations (-md) is not compatible with old MD inputs (-mdir, -traj)')
    # Save the MD configurations
    self.md_config = md_config
    # Make sure MD configuration has the correct format
    if self.md_config:
        # Make sure every MD configuration has at least 2 values: a directory and one trajectory
        for mdc in self.md_config:
            if len(mdc) < 2:
                raise InputError('Wrong MD configuration: the patter is -md <directory> <trajectory> <trajectory 2> ...')
        # Make sure there are no duplicated MD directories
        md_directories = [mdc[0] for mdc in self.md_config]
        if len(md_directories) > len(set(md_directories)):
            raise InputError('There are duplicated MD directories')

    # Input populations and transitions for MSM
    self.populations_filepath = populations_filepath
    self._populations_file = File(self.populations_filepath)
    self.transitions_filepath = transitions_filepath
    self._transitions_file = File(self.transitions_filepath)
    # Input AiiDA data
    self.aiida_data_filepath = aiida_data_filepath
    self._aiida_data_file = File(self.aiida_data_filepath) if aiida_data_filepath else None

    # Set the processed topology filepath, which depends on the input topology filename
    # Note that this file is different from the standard topology, although it may be standard as well
    self._topology_filepath = None

    # Set the standard topology file
    self._standard_topology_file = None

    # Set the MD directories
    self._md_directories = md_directories
    # Check input MDs are correct so far
    if self._md_directories:
        self.check_md_directories()

    # Set the reference MD
    self._reference_md = None
    self._reference_md_index = reference_md_index

    # Set the rest of inputs
    # Note that the filter selection variable is not handled here at all
    # This is just passed to the filtering function which knows how to handle the default
    self.filter_selection = filter_selection
    # PBC selection may come from the console or from the inputs
    self._input_pbc_selection = pbc_selection
    self._input_cg_selection = cg_selection
    self.image = image
    self.fit = fit
    self.translation = translation
    self.mercy = mercy
    # Fix the mercy input, if needed
    # If a boolean is passed instead of a list then we set its corresponding value
    if type(mercy) is bool:
        if mercy:
            self.mercy = AVAILABLE_FAILURES
        else:
            self.mercy = []
    self.trust = trust
    # Fix the trust input, if needed
    # If a boolean is passed instead of a list then we set its corresponding value
    if type(trust) is bool:
        if trust:
            self.trust = AVAILABLE_CHECKINGS
        else:
            self.trust = []
    self.faith = faith
    self.pca_analysis_selection = pca_analysis_selection
    self.pca_fit_selection = pca_fit_selection
    self.rmsd_cutoff = rmsd_cutoff
    self.interaction_cutoff = interaction_cutoff
    self.sample_trajectory = sample_trajectory
    self.interactions_auto = interactions_auto
    self.guess_bonds = guess_bonds
    self.ignore_bonds = ignore_bonds
    # Set the inputs, where values from the inputs file will be stored
    self._inputs = None

    # Other values which may be found/calculated on demand
    self._pbc_selection = None
    self._pbc_residues = None
    self._cg_selection = None
    self._cg_residues = None
    self._reference_bonds = None
    self._topology_reader = None
    self._dihedrals = None
    self._populations = None
    self._transitions = None
    self._pdb_ids = None
    self._mds = None

    # Save forced inputs and use them when necessary
    self.forced_inputs = {}
    if forced_inputs:
        # Make sure the format is respected
        for forced_input in forced_inputs:
            n_values = len(forced_input)
            if n_values == 0:
                raise InputError('There is an empty "-fin". Please remove it from the command line.')
            if n_values == 1:
                only_value = forced_input[0]
                raise InputError(f'There is a "-fin {only_value}" which is missing the new input value.')
            if n_values > 2:
                suggested_fix = f'{forced_input[0]} "{" ".join(forced_input[1:])}"'
                raise InputError(f'Too many values in "-fin {" ".join(forced_input)}".\n' +
                    ' Note that only two values are expected: -fin <input name> <new input value>\n' +
                    f' Did you forget the quotes maybe? Try this: -fin {suggested_fix}')

        # Save forced inputs as a dict
        self.forced_inputs = {name: value for name, value in forced_inputs}

        # Check that forced inputs exist
        # This is to prevent the user from losing a lot of time for a silly typo
        template_inputs = load_yaml(INPUTS_TEMPLATE_FILEPATH)
        for input_name in self.forced_inputs.keys():
            if input_name in template_inputs: continue
            available_inputs = ', '.join(template_inputs.keys())
            raise InputError(f'Unrecognized forced input "{input_name}". Available inputs: {available_inputs}')

        # Overwrite input file values
        for input_name, input_value in self.forced_inputs.items():
            self.update_inputs(input_name, input_value)

    # Force a couple of extraordinary files which are generated if atoms are resorted
    self.resorted_bonds_file = File(self.pathify(RESORTED_BONDS_FILENAME))
    self.resorted_charges_file = File(self.pathify(RESORTED_CHARGES_FILENAME))

    # Set a new entry for the register
    # This is useful to track previous workflow runs and problems
    register_filepath = self.pathify(REGISTER_FILENAME)
    register_file = File(register_filepath)
    self.register = Register(register_file)

    # Set the cache
    cache_filepath = self.pathify(CACHE_FILENAME)
    cache_file = File(cache_filepath)
    self.cache = Cache(cache_file)

    # Set tasks whose output is to be overwritten
    self.overwritables = set()
def __repr__(self):
    """Return a string representation of the Project object.

    Note this is a fixed label; it intentionally carries no project state.
    """
    return 'Project'
def pathify(self, filename_or_relative_path: str) -> str:
    """Prepend the project directory to a filename or relative path.

    The combined path is normalized, so redundant separators collapse.
    """
    combined = f'{self.directory}/{filename_or_relative_path}'
    return normpath(combined)
def check_md_directories(self):
    """Validate the declared MD directories.

    Raises an InputError when no MD is declared or when any directory
    appears more than once.
    """
    directories = self._md_directories
    # At least one MD is mandatory
    if len(directories) < 1:
        raise InputError('There must be at least one MD')
    # Directories must be unique
    if len(directories) != len(set(directories)):
        raise InputError('There are duplicated MD directories')
def get_md_directories(self) -> list:
    """Get MD directories.

    Resolved lazily on first access: taken from the inputs file when
    available, otherwise guessed by scanning the project directory for
    subdirectories containing a register file.
    """
    # If MD directories are already declared then return them
    if self._md_directories:
        return self._md_directories
    # Otherwise use the default MDs
    self._md_directories = []
    # Use the MDs from the inputs file when available
    if self.is_inputs_file_available() and self.input_mds:
        for input_md in self.input_mds:
            # Get the directory according to the inputs
            directory = input_md.get(MD_DIRECTORY, None)
            if directory:
                # check_directory is a helper defined elsewhere in this module
                check_directory(directory)
            # If no directory is specified in the inputs then guess it from the MD name
            else:
                name = input_md['name']
                if not name:
                    name = 'unnamed'
                directory = name_2_directory(name)
            self._md_directories.append(directory)
    # Otherwise, guess MD directories by checking which directories include a register file
    else:
        available_directories = sorted(next(walk(self.directory))[1])
        for directory in available_directories:
            # NOTE(review): this path is relative to the current working directory,
            # not to self.directory — presumably the workflow has already chdir'ed
            # into the project directory; confirm against the caller
            if exists(directory + '/' + REGISTER_FILENAME):
                self._md_directories.append(directory)
    # If we found no MD directory then it means MDs were never declared before
    if len(self._md_directories) == 0:
        raise InputError('Impossible to know which are the MD directories. '
            'You can either declare them using the "-mdir" option or by providing an inputs file')
    self.check_md_directories()
    return self._md_directories
md_directories = property(get_md_directories, None, None, "MD directories (read only)")
def get_reference_md_index(self) -> int:
    """Get the reference MD index.

    The value is cached after the first resolution. When an inputs file
    is available its 'mdref' field is used; otherwise (or when the field
    is missing) the first MD is used as reference and the user is warned.
    """
    # If we already have a cached value then return it
    # Note the explicit comparison against None: index 0 is a perfectly
    # valid cached value, so a simple truthiness check would re-resolve
    # the index (and repeat the warning) on every call whenever the
    # reference MD happens to be the first one
    if self._reference_md_index is not None:
        return self._reference_md_index
    # Otherwise we must find the reference MD index
    # If the inputs file is available then it must declare the reference MD index
    if self.is_inputs_file_available():
        self._reference_md_index = self.get_input('mdref')
    # Otherwise we simply set the first MD as the reference and warn the user about this
    if self._reference_md_index is None:
        warn('No reference MD was specified. The first MD will be used as reference.')
        self._reference_md_index = 0
    return self._reference_md_index
reference_md_index = property(get_reference_md_index, None, None, "Reference MD index (read only)")
def get_reference_md(self) -> MD:
    """Return the MD object used as the project reference.

    The lookup is lazy: the first call resolves the MD through the
    reference MD index and caches it for later calls.
    """
    cached = self._reference_md
    if cached:
        return cached
    self._reference_md = self.mds[self.reference_md_index]
    return self._reference_md
reference_md: MD = property(get_reference_md, None, None, "Reference MD (read only)")
def get_mds(self) -> list:
    """Get the available MDs (read only).

    Instantiates one MD object per declared MD on first access and
    caches the resulting list.
    """
    # If MDs are already declared then return them
    if self._mds:
        return self._mds
    # Now instantiate a new MD for each declared MD and save the reference MD
    self._mds = []
    # New system with MD configurations (-md)
    if self.md_config:
        for n, config in enumerate(self.md_config, 1):
            directory = config[0]
            # LEGACY
            # In a previous version, the md config argument also held the structure
            # This was the second argument, so we check if we have more than 2 arguments
            # If this is the case, then check if the second argument has different format
            # Note that PDB format is also a trajectory supported format
            has_structure = False
            if len(config) > 2:
                first_sample = File(config[1])
                second_sample = File(config[2])
                if first_sample.format != second_sample.format:
                    has_structure = True
            # Finally set the input structure and trajectories
            input_structure_filepath = config[1] if has_structure else self.input_structure_filepath
            input_trajectory_filepaths = config[2:] if has_structure else config[1:]
            # Define the MD
            md = MD(
                project=self, number=n, directory=directory,
                input_structure_filepath=input_structure_filepath,
                input_trajectory_filepaths=input_trajectory_filepaths,
            )
            self._mds.append(md)
    # Old system (-mdir, -stru -traj)
    else:
        for n, md_directory in enumerate(self.md_directories, 1):
            md = MD(
                project=self, number=n, directory=md_directory,
                input_structure_filepath=self.input_structure_filepath,
                input_trajectory_filepaths=self.input_trajectory_filepaths,
            )
            self._mds.append(md)
    return self._mds
mds: list[MD] = property(get_mds, None, None, "Available MDs (read only)")
1444 # Check input files exist when their filenames are read
1445 # If they do not exist then try to download them
1446 # If the download is not possible then raise an error
1448 # Inputs filename ------------
def is_inputs_file_available(self) -> bool:
    """Tell whether the inputs file can be reached.

    The file is considered available when it is declared and either
    already exists on disk or can be downloaded from a remote.
    Note that asking for it when it is not available will lead to raising an input error.
    """
    declared = self._inputs_file
    # An undeclared inputs file can never be reached
    if not declared:
        return False
    # Reachable when present on disk or downloadable from the remote
    return bool(declared.exists or self.remote)
def get_inputs_file(self) -> File:
    """Return the inputs file, downloading it from the remote if needed.

    Raises an InputError when the filename was never declared or when
    the file neither exists nor can be downloaded.
    """
    inputs_file = self._inputs_file
    # A filename must have been declared beforehand
    if not inputs_file:
        raise InputError('Not defined inputs filename')
    # Nothing else to do when the file is already on disk
    if inputs_file.exists:
        return inputs_file
    # Without a remote there is no way to obtain the missing file
    if not self.remote:
        raise InputError(f'Missing inputs file "{inputs_file.filename}"')
    # Download the inputs file from the remote
    self.remote.download_inputs_file(inputs_file)
    return inputs_file
inputs_file = property(get_inputs_file, None, None, "Inputs filename (read only)")
1482 # Topology filename ------------
def get_input_topology_filepath(self) -> Optional[str]:
    """Get the input topology filepath from the inputs or try to guess it.

    Resolution priority: command line argument first, then the inputs file.

    If the input topology filepath is a 'no' flag then we consider there is no topology at all.
    So far we extract atom charges and atom bonds from the topology file.
    In this scenario we can keep working but there are some consequences:

    1. Analyses using atom charges such as 'energies' will be skipped
    2. The standard topology file will not include atom charges
    3. Bonds will be guessed
    """
    # If we already have an internal value calculated then return it
    if self._input_topology_filepath is not None:
        return self._input_topology_filepath

    def parse(filepath: str) -> str:
        """Parse possible glob notation, resolving URLs to local source filenames."""
        # If it is a URL then set the paths for the further download
        # Note this also records the URL on self for the later download step
        if is_url(filepath):
            self._input_topology_url = filepath
            source_filename = url_to_source_filename(filepath)
            return source_filename
        if not is_glob(filepath):
            return filepath
        # If there is glob pattern then parse it
        parsed_filepaths = glob(filepath)
        if len(parsed_filepaths) == 0:
            # Warn the user in case it was trying to use glob syntax to download remote files
            if self.remote:
                warn('Spread syntax is not supported to download remote files')
            raise InputError(f'No topologies found with "{filepath}"')
        if len(parsed_filepaths) > 1:
            raise InputError(f'Multiple topologies found with "{filepath}": {", ".join(parsed_filepaths)}')
        return parsed_filepaths[0]
    # If this value was passed through command line then parse it and cache it
    if self.arg_input_topology_filepath:
        # A 'no' flag means running deliberately without a topology
        if self.arg_input_topology_filepath.lower() in {'no', 'not', 'na'}:
            self._input_topology_filepath = MISSING_TOPOLOGY
            return self._input_topology_filepath
        self._input_topology_filepath = parse(self.arg_input_topology_filepath)
        # Update the input topology filepath in the inputs file, in case it is not matching
        self.update_inputs('input_topology_filepath', relpath(self._input_topology_filepath, self.directory))
        return self._input_topology_filepath
    # Check if the inputs file has the value
    if self.is_inputs_file_available():
        # Get the input value, whose key must exist
        inputs_value = self.get_input('input_topology_filepath')
        # If there is a valid input then use it
        if inputs_value is not None:
            # WARNING: the yaml parser automatically converts 'no' to False
            if inputs_value is False or inputs_value.lower() in {'no', 'not', 'na'}:
                self._input_topology_filepath = MISSING_TOPOLOGY
                return self._input_topology_filepath
            parsed_input_value = parse(inputs_value)
            self._input_topology_filepath = self.pathify(parsed_input_value)
            return self._input_topology_filepath
    # If nothing worked then surrender
    raise InputError('Missing input topology file path. Please provide a topology file using the "-top" argument.\n' +
        ' Note that you may run the workflow without a topology file. To do so, use the "-top no" argument.\n' +
        ' However this has implications since we usually mine atom charges and bonds from the topology file.\n' +
        ' Some analyses such us the interaction energies will be skiped')
def get_input_topology_file(self) -> Optional[File]:
    """Get the input topology file.

    If the file is not found locally, try to download it, either through
    the remote (when available) or directly from the recorded URL.
    """
    # If we already have a value then return it
    if self._input_topology_file is not None:
        return self._input_topology_file
    # Set the input topology filepath
    input_topology_filepath = self.get_input_topology_filepath()
    # If the input filepath is the missing-topology flag then we must proceed without a topology
    if input_topology_filepath == MISSING_TOPOLOGY:
        self._input_topology_file = MISSING_TOPOLOGY
        return self._input_topology_file
    # Set the file
    input_topology_file = File(input_topology_filepath)
    # If there is a topology URL then we must download the topology first
    # The URL, if any, was recorded while resolving the topology filepath
    input_topology_url = self._input_topology_url
    if input_topology_url and not input_topology_file.exists:
        original_filename = input_topology_url.split('/')[-1]
        # If there is a remote then use it
        if self.remote:
            # If the topology filename is the standard topology filename then use the topology endpoint instead
            if original_filename == STANDARD_TOPOLOGY_FILENAME:
                self.remote.download_standard_topology(input_topology_file)
            # Otherwise download the input topology file by its filename
            else:
                self.remote.download_file(original_filename, input_topology_file)
            # In case the topology is a '.top' file we consider it is a Gromacs topology
            # It may come with additional itp files we must download as well
            if input_topology_file.format == 'top':
                # Find available .itp files and download each of them
                itp_filenames = [filename for filename in self.remote.available_files if filename[-4:] == '.itp']
                for itp_filename in itp_filenames:
                    itp_filepath = self.pathify(itp_filename)
                    itp_file = File(itp_filepath)
                    self.remote.download_file(itp_file.filename, itp_file)
        # Otherwise use the URL as is
        else:
            if input_topology_file.format == 'top':
                warn('Automatic download of itp files is not supported without the "-proj" argument.' +
                    ' Thus if the topology has associated itp files they will not be downloaded.')
            download_file(input_topology_url, input_topology_file)
    # If the file already exists then we are done
    if input_topology_file.exists:
        self._input_topology_file = input_topology_file
        return self._input_topology_file
    raise InputError(f'Missing input topology file "{input_topology_file.filename}"')
input_topology_file = property(get_input_topology_file, None, None, "Input topology file (read only)")
def get_input_structure_file(self) -> File:
    """Return the input structure file of the reference MD.

    When calling this function make sure all MDs have the file or try to download it.
    """
    reference = self.reference_md
    return reference._input_structure_file
input_structure_file = property(get_input_structure_file, None, None, "Input structure filename for each MD (read only)")
def get_input_trajectory_files(self) -> list[File]:
    """Return the input trajectory file(s) of the reference MD.

    If file(s) are not found try to download them.
    """
    reference = self.reference_md
    return reference._input_trajectory_files
input_trajectory_files = property(get_input_trajectory_files, None, None, "Input trajectory filenames for each MD (read only)")
def get_populations_file(self) -> Optional[File]:
    """Return the MSM equilibrium populations file, or None when it cannot be retrieved."""
    # get_file checks existence and attempts a download when missing
    found = self.get_file(self._populations_file)
    return self._populations_file if found else None
populations_file = property(get_populations_file, None, None, "MSM equilibrium populations file (read only)")
def get_transitions_file(self) -> Optional[File]:
    """Return the MSM transition probabilities file, or None when it cannot be retrieved."""
    # get_file checks existence and attempts a download when missing
    found = self.get_file(self._transitions_file)
    return self._transitions_file if found else None
transitions_file = property(get_transitions_file, None, None, "MSM transition probabilities file (read only)")
def get_aiida_data_file(self) -> Optional[File]:
    """Return the AiiDA data file, or None when it is undeclared or unavailable."""
    declared = self._aiida_data_file
    if not declared:
        return None
    # get_file checks existence and attempts a download when missing
    return declared if self.get_file(declared) else None
aiida_data_file = property(get_aiida_data_file, None, None, "AiiDA data file (read only)")
1630 # ---------------------------------
def get_file(self, target_file: File) -> bool:
    """Check that a file exists, trying to download it from the database when missing.

    Delegates to the reference MD. A file missing from the database is fine;
    the user is not even warned.
    Note that nowadays this function is used to get populations and transitions files, which are not common.
    """
    reference = self.reference_md
    return reference.get_file(target_file)
1640 # Input file values -----------------------------------------
1642 # First of all set input themselves
def get_inputs(self) -> dict:
    """Load, cache and return the content of the inputs file.

    Supports json and yaml inputs files. Some placeholder values
    (e.g. $DIR) are replaced while loading and a legacy field is
    renamed to its current name.
    """
    # Return the cached inputs when they were already loaded
    if self._inputs:
        return self._inputs
    # Values to be replaced automatically while loading the inputs file
    replaces = [('$DIR', self.directory_name)]
    # Parse the inputs file according to its format
    file_format = self.inputs_file.format
    if file_format == 'json':
        inputs_data = load_json(self.inputs_file.path, replaces)
    elif file_format == 'yaml':
        inputs_data = load_yaml(self.inputs_file.path, replaces)
    else:
        raise InputError('Input file format is not supported. Please use json or yaml files.')
    if not inputs_data:
        raise InputError('Input file is empty')
    self._inputs = inputs_data
    # Legacy fix: 'pdbIds' was renamed to 'pdb_ids'
    old_pdb_ids = self._inputs.get('pdbIds', None)
    if old_pdb_ids:
        self._inputs['pdb_ids'] = old_pdb_ids
    # Finally return the updated inputs
    return self._inputs
inputs = property(get_inputs, None, None, "Inputs from the inputs file (read only)")
def update_inputs(self, nested_key: str, new_value):
    """Permanently write a new value into the inputs file.

    This may be done when command line inputs do not match file inputs.
    Does nothing when there is no inputs file or when the current value
    already matches the new one.
    """
    # Without an inputs file there is nothing to update
    if not self.is_inputs_file_available():
        return
    # Skip the write when the stored value already matches
    current_value = read_ndict(self.inputs, nested_key, MISSING_INPUT_EXCEPTION)
    if current_value == new_value:
        return
    # Update the in-memory inputs
    write_ndict(self.inputs, nested_key, new_value)
    # Double check the inputs file is still available before saving
    if not self.is_inputs_file_available():
        return
    print(f'* Field "{nested_key}" in the inputs file will be permanently modified')
    # Persist the updated inputs to disk in the original format
    file_format = self.inputs_file.format
    if file_format == 'json':
        save_json(self.inputs, self.inputs_file.path)
    elif file_format == 'yaml':
        # Note that comments in the original YAML file will be not kept
        save_yaml(self.inputs, self.inputs_file.path)
    else:
        raise InputError('Input file format is not supported. Please use json or yaml files.')
1695 # Then set getters for every value in the inputs file
def get_input(self, name: str) -> Any:
    """Get a specific 'input' value.

    Resolution order: command-line forced inputs first, then the inputs
    file, and finally the default input values (with a warning).
    """
    # Check if the value of this input was forced from command line
    if name in self.forced_inputs:
        return self.forced_inputs[name]
    # Get the input value from the inputs file
    value = self.inputs.get(name, MISSING_INPUT_EXCEPTION)
    # If we had a value then return it
    # Note the sentinel is compared by identity rather than equality:
    # an input value with a custom __eq__ (e.g. a numpy array) could
    # otherwise break or falsely match the sentinel
    if value is not MISSING_INPUT_EXCEPTION:
        return value
    # If the field is not specified in the inputs file then set a default value
    default_value = DEFAULT_INPUT_VALUES.get(name, None)
    # Warn the user about this
    warn(f'Missing input "{name}" -> Using default value: {default_value}')
    return default_value
def input_property(name: str, doc: str = ""):
    """Build a read-only property that proxies a specific 'input' value.

    Note that we return the property without calling the getter; it is
    meant to be assigned at class level so instances resolve the input
    lazily through get_input.
    """
    return property(lambda self: self.get_input(name), doc=doc)
# Assign the getters
# Each class attribute below is a lazy read-only property resolving the
# homonymous (or aliased) key from the inputs file through get_input
input_interactions = input_property('interactions', "Interactions to be analyzed (read only)")
input_protein_references = input_property('forced_references', "Uniprot IDs to be used first when aligning protein sequences (read only)")
input_pdb_ids = input_property('pdb_ids', "Protein Data Bank IDs used for the setup of the system (read only)")
input_type = input_property('type', "Set if its a trajectory or an ensemble (read only)")
input_mds = input_property('mds', "Input MDs configuration (read only)")
input_ligands = input_property('ligands', "Input ligand references (read only)")
input_force_fields = input_property('ff', "Input force fields (read only)")
input_collections = input_property('collections', "Input collections (read only)")
input_chain_names = input_property('chainnames', "Input chain names (read only)")
input_framestep = input_property('framestep', "Input framestep (read only)")
input_name = input_property('name', "Input name (read only)")
input_description = input_property('description', "Input description (read only)")
input_authors = input_property('authors', "Input authors (read only)")
input_groups = input_property('groups', "Input groups (read only)")
input_contact = input_property('contact', "Input contact (read only)")
input_program = input_property('program', "Input program (read only)")
input_version = input_property('version', "Input version (read only)")
input_method = input_property('method', "Input method (read only)")
input_license = input_property('license', "Input license (read only)")
input_linkcense = input_property('linkcense', "Input license link (read only)")
input_citation = input_property('citation', "Input citation (read only)")
input_thanks = input_property('thanks', "Input acknowledgements (read only)")
input_links = input_property('links', "Input links (read only)")
input_timestep = input_property('timestep', "Input timestep (read only)")
input_temperature = input_property('temp', "Input temperature (read only)")
input_ensemble = input_property('ensemble', "Input ensemble (read only)")
input_water = input_property('wat', "Input water force field (read only)")
input_boxtype = input_property('boxtype', "Input boxtype (read only)")
# NOTE(review): the next two assignments are rebound further down in the class
# body by dedicated getters which add console priority, so these two generic
# properties are effectively dead — confirm whether they can be removed
input_pbc_selection = input_property('pbc_selection', "Input Periodic Boundary Conditions (PBC) selection (read only)")
input_cg_selection = input_property('cg_selection', "Input Coarse Grained (CG) selection (read only)")
input_customs = input_property('customs', "Input custom representations (read only)")
input_orientation = input_property('orientation', "Input orientation (read only)")
input_multimeric = input_property('multimeric', "Input multimeric labels (read only)")
# Additional topic-specific inputs
input_cv19_unit = input_property('cv19_unit', "Input Covid-19 Unit (read only)")
input_cv19_startconf = input_property('cv19_startconf', "Input Covid-19 starting conformation (read only)")
input_cv19_abs = input_property('cv19_abs', "Input Covid-19 antibodies (read only)")
input_cv19_nanobs = input_property('cv19_nanobs', "Input Covid-19 nanobodies (read only)")
def get_input_pbc_selection(self) -> Optional[str]:
    """Return the PBC selection, preferring the console value.

    Console has priority over the inputs file. When no console value was
    given and the inputs file is unavailable, None is returned instead of
    raising, since this input may be needed during early processing steps.
    """
    # Return the console/cached value when present
    if self._input_pbc_selection:
        return self._input_pbc_selection
    # Without an inputs file there is nothing else to check
    if not self.is_inputs_file_available():
        return None
    # Read (and cache) the value from the inputs
    self._input_pbc_selection = self.get_input('pbc_selection')
    return self._input_pbc_selection
input_pbc_selection = property(get_input_pbc_selection, None, None, "Selection of atoms which are still in periodic boundary conditions (read only)")
def get_input_cg_selection(self) -> Optional[str]:
    """Return the CG selection, preferring the console value.

    Console has priority over the inputs file. When no console value was
    given and the inputs file is unavailable, None is returned instead of
    raising, since this input may be needed during early processing steps.
    """
    # Return the console/cached value when present
    if self._input_cg_selection:
        return self._input_cg_selection
    # Without an inputs file there is nothing else to check
    if not self.is_inputs_file_available():
        return None
    # Read (and cache) the value from the inputs
    self._input_cg_selection = self.get_input('cg_selection')
    return self._input_cg_selection
input_cg_selection = property(get_input_cg_selection, None, None, "Selection of atoms which are not acutal atoms but Coarse Grained beads (read only)")
1794 # Set additional values infered from input values
def check_is_time_dependent(self) -> bool:
    """Tell whether MD frames are time dependent.

    A 'trajectory' type is time dependent while an 'ensemble' is not.
    Any other value for the 'type' input is rejected with an InputError.
    """
    md_type = self.input_type
    if md_type == 'trajectory':
        return True
    if md_type == 'ensemble':
        return False
    raise InputError(f'Not supported input "type" value: {self.input_type}. It must be "trajectory" or "ensemble"')
is_time_dependent = property(check_is_time_dependent, None, None, "Check if trajectory frames are time dependent (read only)")
1805 # Processed files ----------------------------------------------------
def inherit_topology_filename(self) -> Optional[str]:
    """Derive the output topology filename from the input topology.

    The file format is conserved. Returns the missing-topology flag
    when there is no input topology at all.
    """
    source = self.input_topology_file
    if source == MISSING_TOPOLOGY:
        return MISSING_TOPOLOGY
    filename = source.filename
    if not filename:
        raise RuntimeError('Unnamed file?')
    # Raw charges topologies keep their original name
    if filename == RAW_CHARGES_FILENAME:
        return filename
    # Otherwise use the standard name while conserving the original format
    return 'topology.' + source.format
def get_topology_filepath(self) -> str:
    """Return (and cache) the processed topology file path."""
    # Return the cached value when available
    if self._topology_filepath:
        return self._topology_filepath
    # Otherwise derive it from the input topology filename
    inherited_filename = self.inherit_topology_filename()
    self._topology_filepath = (MISSING_TOPOLOGY if inherited_filename == MISSING_TOPOLOGY
        else self.pathify(inherited_filename))
    return self._topology_filepath
topology_filepath = property(get_topology_filepath, None, None, "Topology file path (read only)")
def get_topology_file(self) -> 'File':
    """Return the processed topology file of the reference MD.

    The topology is an output of the input files processing task.
    """
    reference = self.reference_md
    getter = reference.input_files_processing.get_output_file_getter('output_topology_file')
    return getter(reference)
topology_file = property(get_topology_file, None, None, "Topology file (read only)")
def get_structure_file(self) -> 'File':
    """Return the processed structure file of the reference MD."""
    reference = self.reference_md
    return reference.structure_file
structure_file = property(get_structure_file, None, None, "Structure filename from the reference MD (read only)")
def get_trajectory_file(self) -> 'File':
    """Return the processed trajectory file of the reference MD."""
    reference = self.reference_md
    return reference.trajectory_file
trajectory_file = property(get_trajectory_file, None, None, "Trajectory filename from the reference MD (read only)")
1850 # ---------------------------------------------------------------------------------
1851 # Others values which may be found/calculated and files to be generated on demand
1852 # ---------------------------------------------------------------------------------
def get_structure(self) -> 'Structure':
    """Return the parsed structure of the reference MD."""
    reference = self.reference_md
    return reference.structure
structure = property(get_structure, None, None, "Parsed structure from the reference MD (read only)")
def get_pbc_residues(self) -> list[int]:
    """Return the indices of residues in periodic boundary conditions, from the reference MD."""
    reference = self.reference_md
    return reference.pbc_residues
pbc_residues = property(get_pbc_residues, None, None, "Indices of residues in periodic boundary conditions (read only)")
def get_cg_residues(self) -> list[int]:
    """Return the indices of coarse grained residues, from the reference MD."""
    reference = self.reference_md
    return reference.cg_residues
cg_residues = property(get_cg_residues, None, None, "Indices of residues in coarse grain (read only)")
def get_snapshots(self) -> int:
    """Return the snapshots count of the reference MD."""
    reference = self.reference_md
    return reference.snapshots
snapshots = property(get_snapshots, None, None, "Reference MD snapshots (read only)")
def get_universe(self) -> 'Universe':
    """Get the MDAnalysis Universe from the reference MD.

    Note the previous '-> int' annotation was wrong: the returned value
    is the reference MD's MDAnalysis Universe object.
    """
    return self.reference_md.universe
universe = property(get_universe, None, None, "MDAnalysis Universe object (read only)")
def get_processed_interactions(self) -> dict:
    """Return the processed interactions from the reference replica, which are the same for all replicas."""
    reference = self.reference_md
    return reference.interactions
interactions = property(get_processed_interactions, None, None, "Processed interactions (read only)")
def get_check_stable_bonds(self) -> bool:
    """Tell whether stable bonds must be checked.

    The check is skipped when the register already recorded a successful
    stable bonds test, or when the user explicitly trusts stable bonds.
    """
    # If this analysis has been already passed then we can trust structure bonds
    if self.register.tests.get(STABLE_BONDS_FLAG, None) is True:
        return False
    # Otherwise check unless the user declared the flag as trusted
    return STABLE_BONDS_FLAG not in self.trust
must_check_stable_bonds = property(get_check_stable_bonds, None, None, "Check if we must check stable bonds (read only)")
# Reference bonds
# Task computing the atom bonds we can trust, backed by find_safe_bonds
get_reference_bonds = Task('refbonds', 'Reference bonds', find_safe_bonds)
reference_bonds = property(get_reference_bonds, None, None, "Atom bonds to be trusted (read only)")

# Atom charges
# NOTE(review): the third argument resolves to the module-level get_charges
# function (the class attribute being defined here reuses, and thus shadows,
# that name inside the class) — confirm this shadowing is intentional
get_charges = Task('charges', 'Getting atom charges', get_charges)
charges = property(get_charges, None, None, "Atom charges (read only)")
def get_topology_reader(self) -> 'Topology':
    """Return (and cache) the topology data reader."""
    reader = self._topology_reader
    # Instantiate the reader once, on first access
    if not reader:
        reader = Topology(self.topology_file)
        self._topology_reader = reader
    return reader
topology_reader = property(get_topology_reader, None, None, "Topology reader (read only)")
def get_dihedrals(self) -> list[dict]:
    """Return (and cache) the topology dihedrals."""
    dihedrals = self._dihedrals
    # Compute the dihedrals once, on first access
    if not dihedrals:
        dihedrals = self.topology_reader.get_dihedrals_data()
        self._dihedrals = dihedrals
    return dihedrals
dihedrals = property(get_dihedrals, None, None, "Topology dihedrals (read only)")
def get_populations(self) -> Optional[list[float]]:
    """Return (and cache) the MSM equilibrium populations, if any."""
    # Return the cached value when available
    if self._populations:
        return self._populations
    # Without a populations file there are no populations
    if not self.populations_file:
        return None
    # Read and cache the populations from the file
    self._populations = read_file(self.populations_file)
    return self._populations
populations = property(get_populations, None, None, "Equilibrium populations from a MSM (read only)")
def get_transitions(self) -> Optional[list[list[float]]]:
    """Return (and cache) the MSM transition probabilities, if any."""
    # Return the cached value when available
    if self._transitions:
        return self._transitions
    # Without a transitions file there are no transitions
    if not self.transitions_file:
        return None
    # Read and cache the transitions from the file
    self._transitions = read_file(self.transitions_file)
    return self._transitions
transitions = property(get_transitions, None, None, "Transition probabilities from a MSM (read only)")
def get_pdb_ids(self) -> list[str]:
    """Return (and cache) the tested and standardized PDB ids.

    Input ids are validated against the PDB id format and upper-cased.
    """
    # Return the cached value when available
    if self._pdb_ids is not None:
        return self._pdb_ids
    self._pdb_ids = []
    # No input PDB ids (may be None) means an empty result
    if not self.input_pdb_ids:
        return []
    # Accept a single id passed as a bare string by wrapping it in a list
    raw_ids = self.input_pdb_ids
    if type(raw_ids) is str:
        raw_ids = [raw_ids]
    # Validate and normalize every id
    for raw_id in raw_ids:
        if not re.match(PDB_ID_FORMAT, raw_id):
            raise InputError(f'Input PDB id "{raw_id}" does not look like a PDB id')
        self._pdb_ids.append(raw_id.upper())
    return self._pdb_ids
pdb_ids = property(get_pdb_ids, None, None, "Tested and standarized PDB ids (read only)")
# Prepare the PDB references file to be uploaded to the database
get_pdb_references = Task('pdbs', 'Prepare PDB references',
    prepare_pdb_references, output_filenames={'pdb_references_file': PDB_REFERENCES_FILENAME})
# Getter for the PDB references output file
pdb_references_file = get_pdb_references.get_output_file_getter('pdb_references_file')

# Map the structure aminoacids sequences against the Uniprot reference sequences
get_protein_map = Task('protmap', 'Protein residues mapping', generate_protein_mapping,
    output_filenames={'protein_references_file': PROTEIN_REFERENCES_FILENAME})
protein_map = property(get_protein_map, None, None, "Protein residues mapping (read only)")

# Define the output file of the protein mapping including protein references
get_protein_references_file = get_protein_map.get_output_file_getter('protein_references_file')
protein_references_file = property(get_protein_references_file, None, None, "File including protein refereces data mined from UniProt (read only)")

# Prepare the chain references file
get_chain_references = Task('chains', 'Chain references', prepare_chain_references,
    output_filenames={'chains_references_file': OUTPUT_CHAINS_FILENAME})

# Map residues to InChI keys
get_inchikeys = Task('inchikeys', 'InChI keys residues mapping', generate_inchikeys)
inchikeys = property(get_inchikeys, None, None, "InChI keys (read only)")

# Mine lipid references
get_lipid_references = Task('lipmap', 'Lipid references', generate_lipid_references)
lipid_references = property(get_lipid_references, None, None, "Lipid references (read only)")

# Map membrane residues
get_membrane_map = Task('memmap', 'Membrane residues mapping', generate_membrane_mapping,
    output_filenames={'output_file': MEMBRANE_MAPPING_FILENAME})
membrane_map = property(get_membrane_map, None, None, "Membrane mapping (read only)")

# Map ligand residues
get_ligand_references = Task('ligmap', 'Ligand residues mapping', generate_ligand_references)
ligand_references = property(get_ligand_references, None, None, "Ligand references (read only)")

# Mine InChI references
get_inchi_references = Task('inchimap', 'InChI references', generate_inchi_references,
    output_filenames={'output_file': INCHIKEY_REFERENCES_FILENAME})
inchikey_map = property(get_inchi_references, None, None, "InChI references (read only)")

# Build the residue map from both proteins and ligands maps
# This is formatted as both the standard topology and metadata producers expect them
get_residue_map = Task('resmap', 'Residue mapping', generate_residue_mapping)
residue_map = property(get_residue_map, None, None, "Residue map (read only)")

# Prepare the project metadata file to be uploaded to the database
prepare_metadata = Task('pmeta', 'Prepare project metadata',
    prepare_project_metadata, output_filenames={'output_file': OUTPUT_METADATA_FILENAME})

# Prepare the standard topology file to be uploaded to the database
prepare_standard_topology = Task('stopology', 'Standard topology file',
    generate_topology, output_filenames={'output_file': STANDARD_TOPOLOGY_FILENAME})
get_standard_topology_file = prepare_standard_topology.get_output_file_getter('output_file')
standard_topology_file = property(get_standard_topology_file, None, None, "Standard topology filename (read only)")

# Get a screenshot of the system
get_screenshot_filename = Task('screenshot', 'Screenshot file',
    get_screenshot, output_filenames={'output_file': OUTPUT_SCREENSHOT_FILENAME})
screenshot_filename = property(get_screenshot_filename, None, None, "Screenshot filename (read only)")

# Provenance data
# NOTE(review): the Task captures produce_provenance before this assignment shadows
# the name in this scope — presumably the callee is a module-level function; confirm
produce_provenance = Task('aiidata', 'Produce provenance', produce_provenance)
def get_warnings(self) -> list:
    """Get the project warnings.

    The warnings list should never be reassigned, although it was back in the
    day. To avoid silent bugs we always read it fresh from the register.
    """
    register = self.register
    return register.warnings
warnings = property(get_warnings, None, None, "Project warnings to be written in metadata")
# AUXILIARY FUNCTIONS ---------------------------------------------------------------------------
# DANI: Once the Markov model format is settled this function will no longer be needed
def read_file(target_file: 'File'):
    """Read a file which may be in different formats.

    Args:
        target_file (File): file to be read; the format is guessed from its extension.

    Returns:
        The parsed content: a numpy array for .npy files or the loaded data for JSON files.

    Raises:
        ValueError: if the file extension is not supported.
    """
    # Get the file format from the filename extension
    file_format = target_file.filename.split('.')[-1]
    # Read numpy files
    if file_format == 'npy':
        return numpy.load(target_file.path)
    # Read JSON files
    if file_format == 'json':
        return load_json(target_file.path)
    # Previously an unsupported format silently returned None, which callers
    # then cached and kept re-reading; fail loudly instead
    raise ValueError(f'Not supported file format "{file_format}" in file {target_file.filename}')
def name_2_directory(name: str) -> str:
    """Convert an MD name into an equivalent MD directory."""
    # White spaces are not welcome in directory names: use underscores instead
    directory = name.replace(' ', '_')
    # Strip every problematic character
    for forbidden in FORBIDDEN_DIRECTORY_CHARACTERS:
        directory = directory.replace(forbidden, '')
    return directory
def check_directory(directory: str) -> None:
    """Check for problematic characters in a directory path.

    Note this function returns nothing (the original '-> str' annotation was wrong).

    Raises:
        InputError: if the path includes any forbidden character.
    """
    # Build a set of the path characters once so every membership test is O(1)
    directory_characters = set(directory)
    for character in FORBIDDEN_DIRECTORY_CHARACTERS:
        if character in directory_characters:
            raise InputError(f'Directory path "{directory}" includes the forbidden character "{character}"')
def directory_2_name(directory: str) -> str:
    """Convert an MD directory into an equivalent MD name."""
    # Keep only the last path component (this also drops a possible './' prefix)
    last_component = directory.rsplit('/', 1)[-1]
    # Underscores become white spaces
    return last_component.replace('_', ' ')
# Project input files
# Each entry maps a console flag to the corresponding file getter
project_input_files = {
    'itopology': Project.get_input_topology_file,
    'inputs': Project.get_inputs_file,
    'populations': Project.get_populations_file,
    'transitions': Project.get_transitions_file
}
# MD input files
md_input_files = {
    'istructure': MD.get_input_structure_file,
    'itrajectory': MD.get_input_trajectory_files
}
# Both project and MD input files
input_files = {**project_input_files, **md_input_files}

# Project processed files
project_processed_files = {
    'topology': Project.get_topology_file
}
# MD processed files
md_processed_files = {
    'structure': MD.get_structure_file,
    'trajectory': MD.get_trajectory_file
}
# Both project and MD processed files
processed_files = {**project_processed_files, **md_processed_files}
# Project requestable tasks
project_requestables = {
    **project_input_files,
    **project_processed_files,
}
# Add available tasks to project requestables
# The loop variable is named 'member' so it does not shadow the builtin 'callable'
for member in vars(Project).values():
    if isinstance(member, Task): project_requestables[member.flag] = member
# MD requestable tasks
md_requestables = {
    **md_input_files,
    **md_processed_files,
}
# Add available tasks to MD requestables
for member in vars(MD).values():
    if isinstance(member, Task): md_requestables[member.flag] = member
# Requestables for the console
# Note that this constant is global
requestables = {**project_requestables, **md_requestables}
# Set groups of dependencies to be requested together using only one flag
DEPENDENCY_FLAGS = {
    # Every input file
    'download': list(input_files.keys()),
    # Every processed file
    'setup': list(processed_files.keys()),
    # Project and MD metadata
    'meta': ['pmeta', 'mdmeta'],
    # Reference/mapping tasks — presumably those needing remote services; confirm
    'network': ['resmap', 'ligmap', 'lipmap', 'chains', 'pdbs', 'memmap'],
    'minimal': ['pmeta', 'mdmeta', 'stopology'],
    # Tasks which depend on the interactions analysis — confirm
    'interdeps': ['inter', 'pairwise', 'hbonds', 'energies', 'perres', 'clusters', 'dist'],
    # Membrane-related tasks
    'membs': ['memmap', 'density', 'thickness', 'apl', 'lorder', 'linter', 'channels']
}

# Set the default analyses to be run when no task is specified
DEFAULT_ANALYSES = ['clusters', 'dist', 'energies', 'hbonds', 'pca', 'pockets',
    'rgyr', 'rmsds', 'perres', 'pairwise', 'rmsf', 'sas', 'tmscore', 'density',
    'thickness', 'apl', 'lorder', 'linter']
def workflow(
    project_parameters: Optional[dict] = None,
    working_directory: str = '.',
    download: bool = False,
    setup: bool = False,
    include: Optional[list[str]] = None,
    exclude: Optional[list[str]] = None,
    overwrite: Optional[list[str] | bool] = None,
):
    """Run the MDDB workflow.

    Args:
        project_parameters (dict | None): Parameters to initiate the project.
        working_directory (str): Working directory where the project is located.
        download (bool): (Deprecated: use -i download) If passed, only download required files. Then exits.
        setup (bool): (Deprecated: use -i setup) If passed, only download required files and run mandatory dependencies. Then exits.
        include (list[str] | None): Set the unique analyses or tools to be run. All other steps will be skipped.
        exclude (list[str] | None): List of analyses or tools to be skipped. All other steps will be run. Note that if we exclude a dependency of something else then it will be run anyway.
        overwrite (list[str] | bool | None): List of tasks that will be re-run, overwriting previous output files. Use this flag alone to overwrite everything.

    Raises:
        InputError: on incompatible or invalid arguments or working directory.
    """
    # Avoid the shared-mutable-default-argument pitfall: the default is None
    # and replaced by a fresh dict on every call
    if project_parameters is None:
        project_parameters = {}

    # Check there are no input errors
    # Include and exclude are not compatible
    # This is to protect the user from doing something which makes no sense
    if include and exclude:
        raise InputError('Include (-i) and exclude (-e) are not compatible. Use one of these options.')

    # Save the directory where the workflow is called from so we can come back at the very end
    workflow_call_directory = getcwd()

    # Make sure the working directory exists and is actually a directory
    if not exists(working_directory):
        raise InputError(f'Working directory "{working_directory}" does not exist')
    if not isdir(working_directory):
        raise InputError(f'Working directory "{working_directory}" is actually not a directory')

    # Move the current directory to the working directory
    chdir(working_directory)
    # Restore the calling directory no matter how we leave this function:
    # the previous code missed the restore on the "no MD tasks" early return
    # and on any error raised while running tasks
    try:
        current_directory_name = getcwd().split('/')[-1]
        git_version = get_git_version()
        print(f'\n{CYAN_HEADER}Running MDDB workflow ({git_version}) for project at {current_directory_name}{COLOR_END}')

        # Initiate the project
        project = Project(**project_parameters)
        print(f' {len(project.mds)} MDs are to be run')

        # Set the tasks to be run
        tasks = _resolve_tasks(download, setup, include, exclude)
        # Set the tasks whose outputs must be overwritten
        overwritables = _resolve_overwritables(overwrite, tasks)

        # Split tasks between project tasks and MD tasks
        project_tasks = [task for task in tasks if task in project_requestables]
        md_tasks = [task for task in tasks if task in md_requestables]

        # Set project overwritables
        project.overwritables = {task for task in project_tasks if task in overwritables}
        # Set MD overwritables
        # Note that this must be done before running project tasks
        # Some project tasks rely on MD tasks
        for md in project.mds:
            md.overwritables = {task for task in md_tasks if task in overwritables}

        # Run the project tasks now
        for task in project_tasks:
            # Get the function to be called and call it
            getter = requestables[task]
            getter(project)

        # If there are no MD tasks then we are done already
        if len(md_tasks) == 0:
            print("Finished!")
            return

        # Now iterate over the different MDs
        for md in project.mds:
            print(f'\n{CYAN_HEADER} Processing MD at {md.directory}{COLOR_END}')
            # Run the MD tasks
            for task in md_tasks:
                getter = requestables[task]
                getter(md)
            # Remove gromacs backups and other trash files from this MD
            remove_trash(md.directory)

        # Remove gromacs backups and other trash files from the project
        remove_trash(project.directory)
    finally:
        # Return back to the place where the workflow was called originally
        # This is not important for many applications
        # But if you call the workflow function from a python script then this is important
        chdir(workflow_call_directory)

    print("Done!")


def _resolve_tasks(download: bool, setup: bool,
    include: Optional[list[str]], exclude: Optional[list[str]]) -> list[str]:
    """Resolve the list of task names to be run from the workflow arguments."""
    # If the download argument is passed then just make sure input files are available
    if download:
        warn('The "-d" or "--download" argument is deprecated. Please use "-i download" instead.')
        return list(input_files.keys())
    # If the setup argument is passed then just process input files
    if setup:
        warn('The "-s" or "--setup" argument is deprecated. Please use "-i setup" instead.')
        return list(processed_files.keys())
    # If the include argument is passed then add only the specified tasks to the list
    if include and len(include) > 0:
        tasks = [*include]
        # Search for special flags among included
        for flag, dependencies in DEPENDENCY_FLAGS.items():
            if flag not in tasks: continue
            # If the flag is found then remove it and write the corresponding dependencies instead
            # Make sure not to duplicate a dependency if it was already included
            tasks.remove(flag)
            for dep in dependencies:
                if dep in tasks: continue
                tasks.append(dep)
        return tasks
    # Set the default tasks otherwise
    tasks = [
        # Project tasks
        'stopology',
        'screenshot',
        'pmeta',
        'pdbs',
        'chains',
        # MD tasks
        'mdmeta',
        'inter',
        *DEFAULT_ANALYSES,
    ]
    # If the exclude parameter was passed then remove excluded tasks from the default tasks
    if exclude and len(exclude) > 0:
        excluded_dependencies = [*exclude]
        # Search for special flags among excluded
        for flag, dependencies in DEPENDENCY_FLAGS.items():
            if flag not in exclude: continue
            # If the flag is found then exclude its dependencies instead
            excluded_dependencies.remove(flag)
            for dep in dependencies:
                if dep in exclude: continue
                excluded_dependencies.append(dep)
        tasks = [name for name in tasks if name not in excluded_dependencies]
    return tasks


def _resolve_overwritables(overwrite: Optional[list[str] | bool], tasks: list[str]) -> set:
    """Resolve the set of task names whose previous outputs must be overwritten.

    Raises:
        InputError: if a task to overwrite is not among the tasks to be run.
        ValueError: if the overwrite argument has an unexpected type.
    """
    overwritables = set()
    if not overwrite:
        return overwritables
    # If the overwrite argument is simply true then every requested task is overwritable
    if type(overwrite) is bool:
        overwritables.update(tasks)
    # If the overwrite argument is a list of tasks then iterate them
    elif type(overwrite) is list:
        for task in overwrite:
            # Make sure the task to be overwriten is among the tasks to be run
            if task not in tasks:
                raise InputError(f'Task "{task}" is to be overwriten but it is not in the tasks list. Either include it or do not exclude it')
            overwritables.add(task)
    else: raise ValueError('Not supported overwrite type')
    return overwritables