Coverage for mddb_workflow/mwf.py: 79%
1263 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-29 15:48 +0000
1#!/usr/bin/env python
3# This is the starter script
5# Import python libraries
6from os import chdir, rename, remove, walk, mkdir, getcwd
7from os.path import exists, isdir, isabs, relpath, normpath, split, basename
8from shutil import rmtree
9import sys
10import io
11import re
12import numpy
13from glob import glob
14from inspect import getfullargspec
15import time
17# Constants
18# Importing constants first is important
19from mddb_workflow.utils.constants import *
21# Import local utils
22#from mddb_workflow.utils.httpsf import mount
23from mddb_workflow.utils.auxiliar import InputError, MISSING_TOPOLOGY
24from mddb_workflow.utils.auxiliar import warn, load_json, save_json, load_yaml, save_yaml
25from mddb_workflow.utils.auxiliar import is_directory_empty, is_glob, parse_glob, safe_getattr
26from mddb_workflow.utils.auxiliar import read_ndict, write_ndict, get_git_version
27from mddb_workflow.utils.arg_cksum import get_cksum_id
28from mddb_workflow.utils.register import Register
29from mddb_workflow.utils.cache import Cache
30from mddb_workflow.utils.structures import Structure
31from mddb_workflow.utils.topologies import Topology
32from mddb_workflow.utils.file import File
33from mddb_workflow.utils.remote import Remote
34from mddb_workflow.utils.pyt_spells import get_frames_count, get_average_structure
35from mddb_workflow.utils.selections import Selection
36from mddb_workflow.utils.mda_spells import get_mda_universe
37from mddb_workflow.utils.type_hints import *
39# Import local tools
40from mddb_workflow.tools.get_first_frame import get_first_frame
41from mddb_workflow.tools.get_bonds import find_safe_bonds, get_bonds_reference_frame
42from mddb_workflow.tools.process_interactions import process_interactions
43from mddb_workflow.tools.generate_metadata import prepare_project_metadata, generate_md_metadata
44from mddb_workflow.tools.generate_ligands_desc import generate_ligand_mapping
45from mddb_workflow.tools.chains import prepare_chain_references
46from mddb_workflow.tools.generate_pdb_references import prepare_pdb_references
47from mddb_workflow.tools.residue_mapping import generate_residue_mapping
48from mddb_workflow.tools.generate_map import generate_protein_mapping
49from mddb_workflow.tools.generate_lipid_references import generate_lipid_references
50from mddb_workflow.tools.generate_membrane_mapping import generate_membrane_mapping
51from mddb_workflow.tools.generate_topology import generate_topology
52from mddb_workflow.tools.get_charges import get_charges
53from mddb_workflow.tools.get_inchi_keys import get_inchikeys
54from mddb_workflow.tools.remove_trash import remove_trash
55from mddb_workflow.tools.get_screenshot import get_screenshot
56from mddb_workflow.tools.process_input_files import process_input_files
57from mddb_workflow.tools.provenance import produce_provenance
59# Import local analyses
60from mddb_workflow.analyses.rmsds import rmsds
61from mddb_workflow.analyses.tmscores import tmscores
62from mddb_workflow.analyses.rmsf import rmsf
63from mddb_workflow.analyses.rgyr import rgyr
64from mddb_workflow.analyses.pca import pca
65from mddb_workflow.analyses.density import density
66from mddb_workflow.analyses.thickness import thickness
67from mddb_workflow.analyses.area_per_lipid import area_per_lipid
68from mddb_workflow.analyses.lipid_order import lipid_order
69from mddb_workflow.analyses.lipid_interactions import lipid_interactions
70from mddb_workflow.analyses.channels import channels
71#from mddb_workflow.analyses.pca_contacts import pca_contacts
72from mddb_workflow.analyses.rmsd_per_residue import rmsd_per_residue
73from mddb_workflow.analyses.rmsd_pairwise import rmsd_pairwise
74from mddb_workflow.analyses.clusters import clusters_analysis
75from mddb_workflow.analyses.distance_per_residue import distance_per_residue
76from mddb_workflow.analyses.hydrogen_bonds import hydrogen_bonds
77from mddb_workflow.analyses.sasa import sasa
78from mddb_workflow.analyses.energies import energies
79from mddb_workflow.analyses.dihedral_energies import compute_dihedral_energies
80from mddb_workflow.analyses.pockets import pockets
81from mddb_workflow.analyses.rmsd_check import check_trajectory_integrity
82from mddb_workflow.analyses.helical_parameters import helical_parameters
83from mddb_workflow.analyses.markov import markov
# Make the system output stream unbuffered
# Only when not in a Jupyter notebook or using pytest
# Check if we're in an interactive Python shell like Jupyter
if not hasattr(sys, 'ps1') and 'pytest' not in sys.modules:
    # This is useful to make prints work on time in Slurm
    # Otherwise, output logs are written after the script has fully run
    # Note that this fix affects all modules and built-ins
    unbuffered = io.TextIOWrapper(open(sys.stdout.fileno(), 'wb', 0), write_through=True)
    sys.stdout = unbuffered

# Set a special exception for missing inputs
# These exception instances are used as sentinels and compared by identity
MISSING_INPUT_EXCEPTION = Exception('Missing input')

# Set a special exception for missing task function arguments
# This is used for easy debug when a new function is added wrongly
MISSING_ARGUMENT_EXCEPTION = Exception('Missing argument')

# Set a special exception for when a value is missing
MISSING_VALUE_EXCEPTION = Exception('Missing value')

# Name of the arguments used by all functions to know where to write output
OUTPUT_FILEPATH_ARG = 'output_filepath'
OUTPUT_DIRECTORY_ARG = 'output_directory'

# Set some variables which are filled at the end but are referred by previously defined functions
requestables = {}
inverted_requestables = {}
class Task:
    """ Descriptor class to handle a generic task.
    It implements lazy properties, caching and overwriting.

    Since its properties are static, results are stored in the parent object
    (MD/Project), or otherwise all MDs would share the same task values. """

    def __init__ (self,
        flag : str,
        name : str,
        func : Callable,
        args : dict = {},
        output_filename : Optional[str] = None,
        use_cache : bool = True,
        debug : bool = False,
    ):
        """
        Initialize the Task object.

        Args:
            flag (str):
                The task flag.
                This name is used by the include/exclude/overwrite arguments and to name the analysis output directory.
            name (str):
                The task user-friendly name, to be used in the logs.
            func (Callable):
                The task function.
                Function argument names must correspond with Project/MD property names.
            args (dict, optional):
                The task function additional arguments.
                Project/MD properties are automatically sent to the function as arguments.
                However some analyses have additional arguments (e.g. frames limit, cutoffs, etc.)
                NOTE(review): this default is a shared mutable dict; it is only read here,
                never mutated, but callers should not mutate it either.
            output_filename (str, optional):
                The task output filename.
                Path will be set automatically relative to its project/MD.
                For those tasks which generate a directory with multiple outputs this is not necessary.
                However this may come in handy by tasks with a single file output.
                Specially when this output file is used later in this workflow.
            use_cache (bool, optional):
                Set if the returned output is to be cached.
                Note that argument values are always cached, this is not optional.
            debug (bool, optional):
                If the task is run in debug mode, producing more output logs. Defaults to False.
        """
        # Save input arguments
        self.flag = flag
        self.name = name
        self.func = func
        self.args = args
        self.output_filename = output_filename
        self.use_cache = use_cache
        self.debug = debug
        # Set the keys used to store and retrieve data in the parent and cache
        self.parent_output_key = f'_{self.flag}_task_output'
        self.parent_output_filepath_key = f'{self.flag}_task_output_filepath'
        self.cache_output_key = f'{self.flag}_task_output'
        self.cache_arg_cksums = f'{self.flag}_task_arg_cksums'
        # Expose the flag as __name__ so the argument checksum logic (arg_cksum) can identify this task
        self.__name__ = self.flag

    # Set internal functions to handle parent saved output
    # This output is not saved in the task itself, but in the parent, because the task is static
    def _get_parent_output (self, parent):
        # Returns the MISSING_VALUE_EXCEPTION sentinel when the parent holds no output yet
        return safe_getattr(parent, self.parent_output_key, MISSING_VALUE_EXCEPTION)
    def _set_parent_output (self, parent, new_output):
        return setattr(parent, self.parent_output_key, new_output)
    def _get_parent_output_filepath (self, parent):
        # Returns the MISSING_VALUE_EXCEPTION sentinel when the parent holds no filepath yet
        return safe_getattr(parent, self.parent_output_filepath_key, MISSING_VALUE_EXCEPTION)
    def _set_parent_output_filepath (self, parent, new_filepath):
        return setattr(parent, self.parent_output_filepath_key, new_filepath)
    # Get the task output, running the task if necessary
    def get_output (self, parent):
        # If we already have a value stored from this run then return it
        output = self._get_parent_output(parent)
        if output != MISSING_VALUE_EXCEPTION: return output
        # Otherwise run the task and return the output
        return self(parent)
    output = property(get_output, None, None, "Task output (read only)")
    # Asking for the output file or filepath implies running the Task, then returning the file/filepath
    def get_output_filepath (self, parent) -> str:
        # If we already have a filepath stored from this run then return it
        filepath = self._get_parent_output_filepath(parent)
        if filepath != MISSING_VALUE_EXCEPTION: return filepath
        # Otherwise run the task and return the filepath
        self(parent)
        filepath = self._get_parent_output_filepath(parent)
        if filepath != MISSING_VALUE_EXCEPTION: return filepath
        raise ValueError(f'Task {self.flag} has no output filepath after running')
    output_filepath = property(get_output_filepath, None, None, "Task output filepath (read only)")
    def get_output_file (self, parent) -> 'File':
        # Run the task if needed and wrap its output filepath in a File handler
        filepath = self.get_output_filepath(parent)
        return File(filepath)
    output_file = property(get_output_file, None, None, "Task output file (read only)")

    # When the task is printed, show the flag
    def __repr__ (self): return f'<Task ({self.flag})>'

    # When a task is called
    def __call__(self, parent: Union['Project', 'MD']):
        """ Run the task over the given parent (Project or MD), honoring the
        cache, the overwrite flags and any already existing output. """
        # First of all check if this task has been already done in this very run
        # If so then return the stored value
        output = self._get_parent_output(parent)
        if output != MISSING_VALUE_EXCEPTION: return output
        # Process the task function arguments
        processed_args = {}
        # Get the task function expected arguments
        specification = getfullargspec(self.func)
        expected_arguments = specification.args
        n_default_arguments = len(specification.defaults) if specification.defaults else 0
        # Find out which arguments are optional since they have default values
        # Defaults are matched against the LAST expected arguments, hence the reversal
        default_arguments = set(expected_arguments[::-1][:n_default_arguments])
        # If one of the expected arguments is the output_filename then set it here
        output_filepath = None
        writes_output_file = OUTPUT_FILEPATH_ARG in expected_arguments
        if writes_output_file:
            # The task should have a defined output file
            if not self.output_filename:
                raise RuntimeError(f'Task {self.flag} must have an "output_filename"')
            # Set the output file path
            output_filepath = parent.pathify(self.output_filename)
            self._set_parent_output_filepath(parent, output_filepath)
            # Add it to the processed args
            processed_args[OUTPUT_FILEPATH_ARG] = output_filepath
            # Remove the expected argument from the list
            expected_arguments.remove(OUTPUT_FILEPATH_ARG)
        # If one of the expected arguments is the output_directory then set it here
        # We will set a new directory with the flag name of the task, in the corresponding path
        # Note that while the task is being done the output directory has a different name
        # Thus the directory is hidden and marked as incomplete
        # The final output directory is the one without the incomplete prefix
        writes_output_dir = OUTPUT_DIRECTORY_ARG in expected_arguments
        incomplete_output_directory = None
        final_output_directory = None
        if writes_output_dir:
            # Set the output directory path
            incomplete_output_directory = parent.pathify(INCOMPLETE_PREFIX + self.flag)
            final_output_directory = incomplete_output_directory.replace(INCOMPLETE_PREFIX, '')
            # Add it to the processed args
            processed_args[OUTPUT_DIRECTORY_ARG] = incomplete_output_directory
            # Remove the expected argument from the list
            expected_arguments.remove(OUTPUT_DIRECTORY_ARG)
        # Iterate the remaining expected arguments
        for arg in expected_arguments:
            # First find the argument among the parent properties
            arg_value = self.find_arg_value(arg, parent, default_arguments)
            if arg_value == MISSING_ARGUMENT_EXCEPTION: continue
            # Add the processed argument
            processed_args[arg] = arg_value
        # Check again if the task has output already
        # It may happen that some dependencies assign output on their own
        # e.g. charges, bonds
        # If so then return the stored value
        output = self._get_parent_output(parent)
        if output != MISSING_VALUE_EXCEPTION: return output
        # Find if we have cached output
        if self.use_cache:
            output = parent.cache.retrieve(self.cache_output_key, MISSING_VALUE_EXCEPTION)
            self._set_parent_output(parent, output)
        # Check if this dependency is to be overwritten
        forced_overwrite = self.flag in parent.overwritables
        # Get the list of inputs which have changed compared to a previous run
        # WARNING: Always get changed inputs, since this function updates the cache
        # If had_cache is false then it means this is the first time the task is ever done
        changed_inputs, had_cache, cache_cksums = self.get_changed_inputs(parent, processed_args)
        any_input_changed = len(changed_inputs) > 0
        # Update the cache inputs
        parent.cache.update(self.cache_arg_cksums, cache_cksums)
        # We must overwrite outputs either if inputs changed or if it was forced by the user
        must_overwrite = forced_overwrite or any_input_changed
        # Check if output already exists
        # If the incomplete directory already exists then it means the task was started in a previous run
        existing_incomplete_output = writes_output_dir and exists(incomplete_output_directory)
        # If the final directory already exists then it means the task was done in a previous run
        existing_final_output = writes_output_dir and exists(final_output_directory)
        # If the output file already exists then it also means the task was done in a previous run
        existing_output_file = writes_output_file and exists(output_filepath)
        # If we already have a cached output result
        existing_output_data = output != MISSING_VALUE_EXCEPTION
        # If we must overwrite then purge previous outputs
        if must_overwrite:
            if existing_incomplete_output: rmtree(incomplete_output_directory)
            if existing_final_output: rmtree(final_output_directory)
            if existing_output_file: remove(output_filepath)
            if existing_output_data: parent.cache.delete(self.cache_output_key)
        # If already existing output is not to be overwritten then check if it is already what we need
        else:
            # If output files/directories are expected then they must exist
            # If output data is expected then it must be cached
            satisfied_output = (not writes_output_dir or exists(final_output_directory)) \
                and (not writes_output_file or exists(output_filepath)) \
                and (output != MISSING_VALUE_EXCEPTION)
            # If we already have the expected output then we can skip the task at all
            if satisfied_output:
                print(f'{GREY_HEADER}-> Task {self.flag} ({self.name}) already completed{COLOR_END}')
                return output
        # If we are at this point then we are missing some output so we must proceed to run the task
        # Use the final output directory instead of the incomplete one if exists
        # Note that we must check if it exists again since it may have been deleted since the last check
        if writes_output_dir and exists(final_output_directory):
            processed_args[OUTPUT_DIRECTORY_ARG] = final_output_directory
        # Create the incomplete output directory, if necessary
        missing_incomplete_output = writes_output_dir \
            and not exists(incomplete_output_directory) \
            and not exists(final_output_directory)
        if missing_incomplete_output: mkdir(incomplete_output_directory)
        # Finally call the function
        print(f'{GREEN_HEADER}-> Running task {self.flag} ({self.name}){COLOR_END}')
        start_time = time.time()
        # If the task is to be run again because an input changed then let the user know
        if any_input_changed and had_cache and not forced_overwrite:
            changes = ''.join([ '\n - ' + inp for inp in changed_inputs ])
            print(f'{GREEN_HEADER} The task is run again since the following inputs changed:{changes}{COLOR_END}')
        # Save a few internal values in the task, although the task is static
        # We save it right before calling the function in case the function uses this task as input
        self.changed_inputs = changed_inputs
        self.cache_cksums = cache_cksums
        # Run the actual task
        output = self.func(**processed_args)
        end_time = time.time()
        print(f' Task {self.flag} completed in {end_time - start_time:.2f} seconds{COLOR_END}')
        self._set_parent_output(parent, output)
        # Set the output to be saved in cache
        # Note that all must be JSON serializable values
        cache_output = output
        # Update cache output unless it is marked to not save it
        if self.use_cache: parent.cache.update(self.cache_output_key, cache_output)
        # Update the overwritables so this is not remade further in the same run
        parent.overwritables.discard(self.flag)
        # As a brief cleanup, if the output directory is empty at the end, then remove it
        # Otherwise, change the incomplete directory name to its final name
        if writes_output_dir and exists(incomplete_output_directory):
            if is_directory_empty(incomplete_output_directory): rmtree(incomplete_output_directory)
            else: rename(incomplete_output_directory, final_output_directory)
        # Now return the function result
        return output

    def find_arg_value (self, arg : str, parent : Union['Project', 'MD'], default_arguments : set):
        """ Find argument values, thus running any dependency if necessary.

        Returns the MISSING_ARGUMENT_EXCEPTION sentinel when the argument has a
        default value and may be safely skipped. """
        # Word 'task' is reserved for getting the task itself
        if arg == 'task': return self
        # Word 'self' is reserved for getting the caller Project/MD
        if arg == 'self': return parent
        # Check if the argument is an MD property
        arg_value = safe_getattr(parent, arg, MISSING_ARGUMENT_EXCEPTION)
        if arg_value != MISSING_ARGUMENT_EXCEPTION: return arg_value
        # If the parent is an MD then it may happen the property is from the Project
        if isinstance(parent, MD):
            arg_value = safe_getattr(parent.project, arg, MISSING_ARGUMENT_EXCEPTION)
            if arg_value != MISSING_ARGUMENT_EXCEPTION: return arg_value
        # If the property is missing then search among the additional arguments
        arg_value = self.args.get(arg, MISSING_ARGUMENT_EXCEPTION)
        if arg_value != MISSING_ARGUMENT_EXCEPTION: return arg_value
        # It may also happen that the argument has a default value
        # If this is the case then we can skip it
        if arg in default_arguments: return MISSING_ARGUMENT_EXCEPTION
        # NEVER FORGET: Function arguments must have the same name that the Project/MD property
        # If the argument is still missing then you programmed the function wrongly or...
        # You may have forgotten the additional argument in the task args
        raise RuntimeError(f'Function "{self.func.__name__}" from task "{self.flag}" expects argument "{arg}" but it is missing')

    def get_changed_inputs (self,
        parent : Union['Project', 'MD'],
        processed_args : dict) -> tuple[ list[str], bool, dict ]:
        """ Find out if input arguments changed regarding the last run.

        Returns a tuple with the list of changed argument names, whether a
        previous checksum cache existed at all, and the updated checksums dict. """
        # Get cache argument references
        cache_cksums = parent.cache.retrieve(self.cache_arg_cksums, MISSING_VALUE_EXCEPTION)
        had_cache = False if cache_cksums == MISSING_VALUE_EXCEPTION else True
        if cache_cksums == MISSING_VALUE_EXCEPTION: cache_cksums = {}
        # Check argument by argument
        # Keep a list with arguments which have changed
        unmatched_arguments = []
        for arg_name, arg_value in processed_args.items():
            # Skip the output directory argument
            # Changes in this argument are not actual changes
            if arg_name == OUTPUT_DIRECTORY_ARG: continue
            # Get the cksum from the new argument value
            new_cksum = get_cksum_id(arg_value)
            # Retrieve the cksum from the old argument value
            old_cksum = cache_cksums.get(arg_name, None)
            if self.debug: print(f'Task "{self.name}" -> argument "{arg_name}"\n' +
                f' new value: {arg_value}\n' +
                f' new value cksum: {new_cksum}\n' +
                f' old value cksum: {old_cksum}\n' +
                f' match: {new_cksum == old_cksum}')
            # Compare new and old cksums
            if new_cksum != old_cksum:
                # If we found a mismatch then add it to the list
                unmatched_arguments.append(arg_name)
                # Update the references
                cache_cksums[arg_name] = new_cksum
        return unmatched_arguments, had_cache, cache_cksums
413class MD:
414 """ A Molecular Dynamics (MD) is the union of a structure and a trajectory.
415 Having this data several analyses are possible.
416 Note that an MD is always defined inside of a Project and thus it has additional topology and metadata. """
    def __init__ (self,
        project : 'Project',
        number : int,
        directory : str,
        input_structure_filepath : str,
        input_trajectory_filepaths : list[str],
    ):
        """
        Initialize the MD object.

        Args:
            project (Project): The parent project this MD belongs to.
            number (int): The number of the MD according to its accession.
            directory (str): The local directory where the MD takes place.
            input_structure_filepath (str): The input structure file path.
            input_trajectory_filepaths (list[str]): The input trajectory file paths.

        Raises:
            Exception: If no project is passed.
            InputError: If the MD directory matches the project directory.
        """

        # Save the inputs
        self.project = project
        if not project:
            raise Exception('Project is mandatory to instantiate a new MD')
        # Save the MD number and index
        self.number = number
        self.index = number - 1
        # Set the MD accession and request URL
        # Both stay None when the project has no database URL or accession
        self.accession = None
        self.remote = None
        if self.project.database_url and self.project.accession:
            self.accession = f'{self.project.accession}.{self.number}'
            self.remote = Remote(self.project.database_url, self.accession)
        # Save the directory
        self.directory = normpath(directory)
        # Now set the directory relative to the project
        self.directory = self.project.pathify(self.directory)
        # The MD may not share its directory with the parent project
        if normpath(self.directory) == normpath(self.project.directory):
            raise InputError(f'MD {self.number} has the same directory as the project: {self.directory}')
        # Save the directory name alone apart
        self.directory_location, self.directory_name = split(self.directory)
        # If the directory does not exist then create it
        if not exists(self.directory):
            mkdir(self.directory)
        # Save the input structure filepath
        # They may be relative to the project directory (unique) or relative to the MD directory (one per MD)
        # If the path is absolute then it is considered unique
        # If the file does not exist and it is to be downloaded then it is downloaded for each MD
        # Prioritize the MD directory over the project directory
        self.arg_input_structure_filepath = input_structure_filepath
        self._input_structure_filepath = None
        # Set the internal variable for the input structure file, to be assigned later
        self._input_structure_file = None
        # Save the input trajectory filepaths
        self.arg_input_trajectory_filepaths = input_trajectory_filepaths
        self._input_trajectory_filepaths = None
        # Set the internal variable for the input trajectory files, to be assigned later
        self._input_trajectory_files = None

        # Processed structure and trajectory files, lazily resolved
        self._structure_file = None
        self._trajectory_file = None

        # Other values which may be found/calculated on demand
        self._md_inputs = None
        self._structure = None

        # Tests
        self._trajectory_integrity = None

        # Set a new MD specific register
        # In case the directory is the project directory itself, use the project register
        register_filepath = self.pathify(REGISTER_FILENAME)
        register_file = File(register_filepath)
        if register_file.path == self.project.register.file.path:
            self.register = self.project.register
        else:
            self.register = Register(register_file)
        # Save also warnings apart since they are to be used as an input for metadata tasks
        self.warnings = self.register.warnings

        # Set a new MD specific cache
        # In case the directory is the project directory itself, use the project cache
        cache_filepath = self.pathify(CACHE_FILENAME)
        cache_file = File(cache_filepath)
        if cache_file.path == self.project.cache.file.path:
            self.cache = self.project.cache
        else:
            self.cache = Cache(cache_file)

        # Set tasks whose output is to be overwritten
        self.overwritables = set()

        # Get MD inputs just to fill the inputs' "mds" value
        # Some functions may fail otherwise when its value is missing
        self.get_md_inputs()
513 def __repr__ (self):
514 return 'MD'
516 def pathify (self, filename_or_relative_path : str) -> str:
517 """ Given a filename or relative path, add the MD directory path at the beginning. """
518 return normpath(self.directory + '/' + filename_or_relative_path)
520 # Input structure file ------------
522 def get_input_structure_filepath (self) -> str:
523 """ Set a function to get input structure file path. """
524 # Return the internal value if it is already assigned
525 if self._input_structure_filepath != None:
526 return self._input_structure_filepath
527 # Set a function to find out if a path is relative to MD directories or to the project directory
528 # To do so just check if the file exists in any of those
529 # In case it exists in both or none then assume it is relative to MD directory
530 # Parse glob notation in the process
531 def relativize_and_parse_paths (input_path : str, may_not_exist : bool = False) -> Optional[str]:
532 # Check if it is an absolute path
533 if isabs(input_path):
534 abs_glob_parse = parse_glob(input_path)
535 # If we had multiple results then we complain
536 if len(abs_glob_parse) > 1:
537 raise InputError(f'Multiple structures found with "{input_path}": {", ".join(abs_glob_parse)}')
538 # If we had no results then we complain
539 if len(abs_glob_parse) == 0:
540 if self.remote:
541 warn('Spread syntax is not supported to download remote files')
542 raise InputError(f'No structure found with "{input_path}"')
543 abs_parsed_filepath = abs_glob_parse[0]
544 return abs_parsed_filepath
545 # Check the MD directory
546 md_relative_filepath = self.pathify(input_path)
547 md_glob_parse = parse_glob(md_relative_filepath)
548 if len(md_glob_parse) > 1:
549 raise InputError(f'Multiple structures found with "{input_path}": {", ".join(md_glob_parse)}')
550 md_parsed_filepath = md_glob_parse[0] if len(md_glob_parse) == 1 else None
551 if md_parsed_filepath and File(md_parsed_filepath).exists:
552 return md_parsed_filepath
553 # Check the project directory
554 project_relative_filepath = self.project.pathify(input_path)
555 project_glob_parse = parse_glob(project_relative_filepath)
556 if len(project_glob_parse) > 1:
557 raise InputError(f'Multiple structures found with "{input_path}": {", ".join(project_glob_parse)}')
558 project_parsed_filepath = project_glob_parse[0] if len(project_glob_parse) == 1 else None
559 if project_parsed_filepath and File(project_parsed_filepath).exists:
560 return project_parsed_filepath
561 # At this point we can conclude the input structure file does not exist
562 # If we have no paths at all then it means a glob pattern was passed and it didn't match
563 # Note that if a glob pattern existed then it would mean the file actually existed
564 if len(md_glob_parse) == 0 and len(project_glob_parse) == 0:
565 # Warn the user in case it was trying to use glob syntax to donwload remote files
566 if self.remote:
567 warn('Spread syntax is not supported to download remote files')
568 raise InputError('No trajectory file was reached neither in the project directory or MD directories in path(s) ' + ', '.join(input_path))
569 # If the path does not exist anywhere then we asume it will be downloaded and set it relative to the MD
570 # However make sure we have a remote
571 # As an exception, if the 'may not exist' flag is passed then we return the result even if there is no remote
572 if not may_not_exist and not self.remote:
573 raise InputError(f'Cannot find a structure file by "{input_path}" anywhere')
574 md_parsed_filepath = self.project.pathify(input_path) if f'{self.directory_name}/' in md_parsed_filepath else self.pathify(input_path)
575 return md_parsed_filepath
576 # If we have a value passed through command line
577 if self.arg_input_structure_filepath:
578 # Find out if it is relative to MD directories or to the project directory
579 self._input_structure_filepath = relativize_and_parse_paths(self.arg_input_structure_filepath)
580 # Save the parsed value in the inputs file
581 self.project.update_inputs(
582 f'mds.{self.index}.input_structure_filepath',
583 self._input_structure_filepath)
584 return self._input_structure_filepath
585 # If we have a value passed through the inputs file has the value
586 if self.project.is_inputs_file_available():
587 # Get the input value, whose key must exist
588 inputs_value = self.get_input('input_structure_filepath')
589 # If there is a valid input then use it
590 if inputs_value:
591 self._input_structure_filepath = relativize_and_parse_paths(inputs_value)
592 return self._input_structure_filepath
593 # If there is not input structure anywhere then use the input topology
594 # We will extract the structure from it using a sample frame from the trajectory
595 # Note that topology input filepath must exist and an input error will raise otherwise
596 # However if we are using the standard topology file we can not extract the PDB from it (yet)
597 if self.project.input_topology_file != MISSING_TOPOLOGY and \
598 self.project.input_topology_file.filename != STANDARD_TOPOLOGY_FILENAME:
599 return self.project.input_topology_file.path
600 # If we can not use the topology either then surrender
601 raise InputError('There is not input structure at all')
def get_input_structure_file (self) -> str:
    """ Return the input structure file, resolving and downloading it on demand.

    The resolved file is cached on the instance. When the file is missing
    locally it is fetched through the remote: the standard structure filename
    uses the dedicated structure endpoint, anything else is downloaded by name.

    Raises:
        InputError: when the file is missing and there is no remote to fetch it from.
    """
    # Serve the cached file when it was already resolved
    if self._input_structure_file:
        return self._input_structure_file
    # Resolve the input structure path and wrap it in a File handler
    structure_file = File(self.get_input_structure_filepath())
    self._input_structure_file = structure_file
    # Nothing else to do when the file is already on disk
    if structure_file.exists:
        return structure_file
    # A missing file can only be recovered through a remote
    if not self.remote:
        raise InputError(f'Missing input structure file "{structure_file.path}"')
    # The standard structure has its own download endpoint
    if structure_file.filename == STRUCTURE_FILENAME:
        self.remote.download_standard_structure(structure_file)
    else:
        self.remote.download_file(structure_file)
    return structure_file
input_structure_file = property(get_input_structure_file, None, None, "Input structure filename (read only)")
631 # Input trajectory filename ------------
def get_input_trajectory_filepaths (self) -> list[str]:
    """ Get the input trajectory file paths.

    Resolution order: the cached value, then the command line argument
    (persisting the parsed result in the inputs file), then the inputs file
    itself. Paths may use glob syntax and may be absolute, MD-relative or
    project-relative, but absolute and relative paths cannot be mixed.

    Raises:
        InputError: when no trajectory can be resolved at all.
    """
    # Return the internal value if it is already assigned
    if self._input_trajectory_filepaths != None:
        return self._input_trajectory_filepaths
    # Set a function to check and fix input trajectory filepaths
    # Also relativize paths to the current MD directory and parse glob notation
    def relativize_and_parse_paths (input_paths : list[str]) -> list[str]:
        checked_paths = input_paths
        # Input trajectory filepaths may be both a list or a single string
        # However we must keep a list
        if type(checked_paths) == list:
            pass
        elif type(checked_paths) == str:
            checked_paths = [ checked_paths ]
        else:
            raise InputError('Input trajectory filepaths must be a list of strings or a string')
        # Make sure all or none of the trajectory paths are absolute
        abs_count = sum([ isabs(path) for path in checked_paths ])
        if not (abs_count == 0 or abs_count == len(checked_paths)):
            raise InputError('All trajectory paths must be relative or absolute. Mixing is not supported')
        # Set a function to glob-parse and merge all paths
        def parse_all_glob (paths : list[str]) -> list[str]:
            parsed_paths = []
            for path in paths:
                parsed_paths += parse_glob(path)
            return parsed_paths
        # In case trajectory paths are absolute
        if abs_count > 0:
            absolute_parsed_paths = parse_all_glob(checked_paths)
            # Check we successfully defined some trajectory file
            if len(absolute_parsed_paths) == 0:
                # Warn the user in case it was trying to use glob syntax to download remote files
                if self.remote:
                    warn('Spread syntax is not supported to download remote files')
                raise InputError('No trajectory file was reached neither in the project directory or MD directories in path(s) ' + ', '.join(input_paths))
            return absolute_parsed_paths
        # If trajectory paths are not absolute then check if they are relative to the MD directory
        # Get paths relative to the current MD directory
        md_relative_paths = [ self.pathify(path) for path in checked_paths ]
        # In case there are glob characters we must parse the paths
        md_parsed_paths = parse_all_glob(md_relative_paths)
        # Check we successfully defined some trajectory file
        if len(md_parsed_paths) > 0:
            # If so, check at least one of the files do actually exist
            if any([ File(path).exists for path in md_parsed_paths ]):
                return md_parsed_paths
        # If no trajectory files were found then assume they are relative to the project
        # Get paths relative to the project directory
        project_relative_paths = [ self.project.pathify(path) for path in checked_paths ]
        # In case there are glob characters we must parse the paths
        project_parsed_paths = parse_all_glob(project_relative_paths)
        # Check we successfully defined some trajectory file
        if len(project_parsed_paths) > 0:
            # If so, check at least one of the files do actually exist
            if any([ File(path).exists for path in project_parsed_paths ]):
                return project_parsed_paths
        # At this point we can conclude the input trajectory file does not exist
        # If we have no paths at all then it means a glob pattern was passed and it didn't match
        # Note that if a glob pattern existed then it would mean the file actually existed
        if len(md_parsed_paths) == 0 and len(project_parsed_paths) == 0:
            # Warn the user in case it was trying to use glob syntax to download remote files
            if self.remote:
                warn('Spread syntax is not supported to download remote files')
            raise InputError('No trajectory file was reached neither in the project directory or MD directories in path(s) ' + ', '.join(input_paths))
        # If we have a path however it may be downloaded from the database if we have a remote
        if not self.remote:
            raise InputError(f'Cannot find anywhere a trajectory file with path(s) "{", ".join(input_paths)}"')
        # Note that if input path was not glob based it will be both as project relative and MD relative
        if len(md_parsed_paths) == 0: raise ValueError('This should never happen')
        # If file is to be downloaded then we must make sure the path is relative to the project
        project_relative_paths = [
            self.project.pathify(path) if f'{self.directory_name}/' in path else self.pathify(path) for path in checked_paths
        ]
        return project_relative_paths
    # If we have a value passed through command line
    if self.arg_input_trajectory_filepaths:
        self._input_trajectory_filepaths = relativize_and_parse_paths(self.arg_input_trajectory_filepaths)
        # Save the parsed value in the inputs file
        self.project.update_inputs(
            f'mds.{self.index}.input_trajectory_filepaths',
            self._input_trajectory_filepaths)
        return self._input_trajectory_filepaths
    # Check if the inputs file has the value
    if self.project.is_inputs_file_available():
        # Get the input value
        inputs_value = self.get_input('input_trajectory_filepaths')
        if inputs_value:
            self._input_trajectory_filepaths = relativize_and_parse_paths(inputs_value)
            return self._input_trajectory_filepaths
    # If there is no trajectory available then we surrender
    raise InputError('There is not input trajectory at all')
def get_input_trajectory_files (self) -> str:
    """ Return the input trajectory files, downloading any missing one.

    The resolved list is cached on the instance. When some file is missing
    locally the remote is used to fetch it: the main trajectory filename uses
    the dedicated trajectory endpoint (honoring the sample selection), any
    other file is downloaded by name.

    Raises:
        InputError: when files are missing and there is no remote to fetch them from.
    """
    # Serve the cached list when it was already resolved
    if self._input_trajectory_files != None:
        return self._input_trajectory_files
    # Wrap every resolved trajectory path in a File handler
    trajectory_files = [ File(path) for path in self.get_input_trajectory_filepaths() ]
    self._input_trajectory_files = trajectory_files
    # Collect those files which are not on disk yet
    missing_files = [ trajectory_file for trajectory_file in trajectory_files if not trajectory_file.exists ]
    # When everything is already on disk we are done
    if not missing_files:
        return trajectory_files
    # Missing files can only be recovered through a remote
    if not self.remote:
        missing_filepaths = [ trajectory_file.path for trajectory_file in missing_files ]
        raise InputError('Missing input trajectory files: ' + ', '.join(missing_filepaths))
    # Download each trajectory file (usually it will be just one)
    for trajectory_file in trajectory_files:
        # The main trajectory (the usual one) has a dedicated endpoint
        if trajectory_file.filename == TRAJECTORY_FILENAME:
            frame_selection = f'1:{self.project.sample_trajectory}:1' if self.project.sample_trajectory else None
            self.remote.download_trajectory(trajectory_file, frame_selection=frame_selection, format='xtc')
        # Otherwise, download it by its filename
        else:
            self.remote.download_file(trajectory_file)
    return trajectory_files
input_trajectory_files = property(get_input_trajectory_files, None, None, "Input trajectory filenames (read only)")
def get_md_inputs (self) -> dict:
    """ Get MD specific inputs.

    Finds among the project's MD inputs the entry whose directory matches this
    MD's directory. When no entry matches (e.g. the MD was passed through the
    command line) a new entry is created, appended to the project inputs and
    persisted in the inputs file. The result is cached on the instance.
    """
    # If we already have a value stored then return it
    if self._md_inputs:
        return self._md_inputs
    # Otherwise we must find its value
    # If we have MD inputs in the inputs file then use them
    if self.project.input_mds:
        # Iterate over the different MD inputs to find out each directory
        # We must find the MD inputs which belong to this specific MD according to this directory
        for md in self.project.input_mds:
            # Get the directory according to the inputs
            directory = md.get(MD_DIRECTORY, None)
            if directory:
                check_directory(directory)
            # If no directory is specified in the inputs then guess it from the MD name
            else:
                name = md.get('name', None)
                if not name: raise InputError('There is a MD with no name and no directory. Please define at least one of them.')
                directory = name_2_directory(name)
            # If the directory matches then this is our MD inputs
            if self.project.pathify(directory) == self.directory:
                self._md_inputs = md
                return self._md_inputs
    # If this MD directory has not associated inputs then it means it was passed through command line
    # We set a new MD inputs for it
    new_md_name = directory_2_name(self.directory)
    # NOTE(review): 'mdir' is presumably the literal value of MD_DIRECTORY — confirm
    self._md_inputs = { 'name': new_md_name, 'mdir': self.directory }
    # Update the inputs file with the new MD inputs
    mds = self.project.inputs.get('mds', None)
    if mds == None: mds = []
    new_mds_inputs = [ *mds, self._md_inputs ]
    self.project.update_inputs('mds', new_mds_inputs)
    return self._md_inputs

md_inputs = property(get_md_inputs, None, None, "MD specific inputs (read only)")
def get_input (self, name: str):
    """ Fetch a specific 'input' value, preferring the MD-specific inputs.

    Falls back to the project-level input when the key is absent from the MD
    inputs. The sentinel marks a truly missing key, so any stored value (even
    a falsy one) wins over the project value. """
    md_value = self.md_inputs.get(name, MISSING_INPUT_EXCEPTION)
    # Only delegate to the project when the MD inputs lack the key entirely
    if md_value == MISSING_INPUT_EXCEPTION:
        return self.project.get_input(name)
    return md_value
805 # ---------------------------------
def get_file (self, target_file : File) -> bool:
    """ Ensure a file is locally available, downloading it when possible.

    If the file is not found in the database it is fine, we do not even warn
    the user. Note that this function is used to get populations and
    transitions files, which are not common.

    Returns:
        True when the file exists locally (possibly after downloading), False otherwise.
    """
    # A file already on disk needs no further work
    if target_file.exists:
        return True
    # Download only when we have a remote which actually lists the file
    remote = self.remote
    if remote and target_file.filename in remote.available_files:
        remote.download_file(target_file)
        return True
    # No remote, or the file is unknown to the remote: silently give up
    return False
def print_tests_summary (self):
    """ Print a colored summary with the status of every available test. """
    print('Tests summary:')
    for test_name in AVAILABLE_CHECKINGS:
        test_result = self.register.tests.get(test_name, None)
        # Translate the raw result into a colored, human friendly label
        if test_result == None:
            nice_result = YELLOW_HEADER + 'Not run' + COLOR_END
        elif test_result == 'na':
            nice_result = BLUE_HEADER + 'Not applicable' + COLOR_END
        elif test_result == True:
            nice_result = GREEN_HEADER + 'Passed' + COLOR_END
        elif test_result == False:
            nice_result = RED_HEADER + 'Failed' + COLOR_END
        else:
            raise ValueError()
        # Print things pretty
        print(f' - {NICE_NAMES[test_name]} -> {nice_result}')
# Issue some warnings if failed or never run tests are skipped
# This is run after processing input files
def _issue_required_test_warnings (self):
    """ Issue a warning for every trusted (skipped) test which was never run.

    Tests which passed or failed in a previous run need no new warning:
    failures already left their warning in the register. Tests which were
    always skipped get their previous warnings replaced by a 'skipped and
    never run before' warning. """
    for test_name in AVAILABLE_CHECKINGS:
        # Only trusted (skipped) tests are of interest here
        if test_name not in self.project.trust: continue
        # If the test passed in a previous run then proceed
        test_result = self.register.tests.get(test_name)
        if test_result == True: continue
        # If the test failed in a previous run we can also proceed
        # The failing warning must be among the inherited warnings, so there is no need to add more warnings here
        if test_result == False: continue
        # If the test has been always skipped then issue a warning
        if test_result == None:
            # Remove previous warnings
            self.register.remove_warnings(test_name)
            # Get the test pretty name
            test_nice_name = NICE_NAMES[test_name]
            # Issue the corresponding warning
            self.register.add_warning(test_name, test_nice_name + ' was skipped and never run before')
            continue
        # NOTE(review): an 'na' result reaches this raise — confirm 'na' tests cannot be trusted
        raise ValueError('Test value is not supported')
# Processed files ----------------------------------------------------

# Run the actual processing to generate output processed files out of input raw files
# And by "files" I mean structure, trajectory and topology
input_files_processing = Task('inpro', 'Input files processing', process_input_files)
def get_structure_file (self) -> str:
    """ Get the processed structure file.

    The file is cached once found or generated. With the "faith" flag the
    input structure is taken as already processed: it must then be the very
    output file and it must exist. Otherwise the input files processing task
    is run to generate the file.

    Raises:
        InputError: when the "faith" flag is used but the input file does not
            match the output file or does not exist.
    """
    # If we have a stored value then return it
    # This means we already found or generated this file
    if self._structure_file:
        return self._structure_file
    # Set the file
    structure_filepath = self.pathify(STRUCTURE_FILENAME)
    self._structure_file = File(structure_filepath)
    # If the faith flag was passed then simply make sure the input file makes sense
    if self.project.faith:
        if self.input_structure_file != self._structure_file:
            raise InputError('Input structure file is not equal to output structure file but the "faith" flag was used.\n'
                ' Please refrain from using the faith argument (-f) if you ignore its effect.')
        if not self.input_structure_file.exists:
            raise InputError('Input structure file does not exist but the "faith" flag was used.\n'
                ' Please refrain from using the faith argument (-f) if you ignore its effect.')
        return self._structure_file
    # Run the processing logic
    self.input_files_processing(self)
    # Now that the file is sure to exist we return it
    return self._structure_file
structure_file = property(get_structure_file, None, None, "Structure file (read only)")
def get_trajectory_file (self) -> str:
    """ Get the processed trajectory file.

    The file is cached once found or generated. With the "faith" flag the
    single input trajectory is taken as already processed: it must then be the
    very output file and it must exist. Otherwise the input files processing
    task is run to generate the file.

    Raises:
        InputError: when the "faith" flag is used but there are several input
            trajectories, or the input does not match the output, or it does not exist.
    """
    # If we have a stored value then return it
    # This means we already found or generated this file
    if self._trajectory_file:
        return self._trajectory_file
    # Set the expected output trajectory file
    trajectory_filepath = self.pathify(TRAJECTORY_FILENAME)
    self._trajectory_file = File(trajectory_filepath)
    # If the faith flag was passed then simply make sure the input file makes sense
    if self.project.faith:
        if len(self.input_trajectory_files) > 1:
            raise InputError('Several input trajectory files but the "faith" flag was used.\n'
                ' Please refrain from using the faith argument (-f) if you ignore its effect.')
        sample = self.input_trajectory_files[0]
        if sample != self._trajectory_file:
            raise InputError('Input trajectory file is not equal to output trajectory file but the "faith" flag was used.\n'
                ' Please refrain from using the faith argument (-f) if you ignore its effect.')
        if not self._trajectory_file.exists:
            raise InputError('Input trajectory file does not exist but the "faith" flag was used.\n'
                ' Please refrain from using the faith argument (-f) if you ignore its effect.')
        return self._trajectory_file
    # Run the processing logic
    self.input_files_processing(self)
    # Now that the file is sure to exist we return it
    return self._trajectory_file
trajectory_file = property(get_trajectory_file, None, None, "Trajectory file (read only)")
def get_topology_file (self) -> str:
    """ Get the processed topology file by delegating to the parent project. """
    return self.project.get_topology_file()
topology_file = property(get_topology_file, None, None,
    "Topology filename from the project (read only)")
# ---------------------------------------------------------------------------------
# Others values which may be found/calculated and files to be generated on demand
# ---------------------------------------------------------------------------------

# Trajectory snapshots (i.e. the trajectory frame count), computed by a task
get_snapshots = Task('frames', 'Count trajectory frames', get_frames_count)
snapshots = property(get_snapshots, None, None, "Trajectory snapshots (read only)")
def get_reference_bonds (self) -> list[list[int]]:
    """ Get the reference atom bonds by delegating to the parent project. """
    return self.project.reference_bonds
reference_bonds = property(get_reference_bonds, None, None, "Atom bonds to be trusted (read only)")
def get_structure (self) -> 'Structure':
    """ Get the parsed structure.

    Parses the processed structure PDB file once and caches the result. When
    the stable bonds test did not pass (or coarse grain atoms are present) the
    structure bonds are overridden with the reference bonds so the structure
    stays coherent with the topology.

    Raises:
        ValueError: when the structure file does not exist yet (i.e. input
            files were not processed).
    """
    # If we already have a stored value then return it
    if self._structure:
        return self._structure
    # Otherwise we must set the structure
    # Make sure the structure file exists at this point
    if not self.structure_file.exists:
        raise ValueError('Trying to set standard structure but file '
            f'{self.structure_file.path} does not exist yet. Are you trying '
            'to access the standard structure before processing input files?')
    # Note that this is not only the structure class, but it also contains additional logic
    self._structure = Structure.from_pdb_file(self.structure_file.path)
    # If the stable bonds test failed and we had mercy then it is sure our structure will have wrong bonds
    # In order to make it coherent with the topology we will mine topology bonds from here and force them in the structure
    # If we fail to get bonds from topology then just go along with the default structure bonds
    if not self.register.tests.get(STABLE_BONDS_FLAG, None):
        self._structure.bonds = self.reference_bonds
    # Same procedure if we have coarse grain atoms
    elif self.cg_selection:
        self._structure.bonds = self.reference_bonds
    return self._structure
structure = property(get_structure, None, None, "Parsed structure (read only)")
# First frame PDB file
# NOTE: this class attribute shadows the imported get_first_frame function,
# which is still the one passed as the task callable here
get_first_frame = Task('firstframe', 'Get first frame structure',
    get_first_frame, output_filename = FIRST_FRAME_FILENAME)
get_first_frame_file = get_first_frame.get_output_file
first_frame_file = property(get_first_frame_file, None, None, "First frame (read only)")

# Average structure filename
# Same shadowing pattern: the imported get_average_structure is the task callable
get_average_structure = Task('average', 'Get average structure',
    get_average_structure, output_filename = AVERAGE_STRUCTURE_FILENAME)
get_average_structure_file = get_average_structure.get_output_file
average_structure_file = property(get_average_structure_file, None, None, "Average structure filename (read only)")
# Produce the MD metadata file to be uploaded to the database
prepare_metadata = Task('mdmeta', 'Prepare MD metadata',
    generate_md_metadata, output_filename=OUTPUT_METADATA_FILENAME)

# The processed interactions
# Fix: the display name had a typo ("Interaccions")
get_processed_interactions = Task('inter', 'Interactions processing',
    process_interactions, { 'frames_limit': 1000 })
interactions = property(get_processed_interactions, None, None, "Processed interactions (read only)")

# MDAnalysis Universe object
# NOTE(review): caching is disabled (use_cache=False) — presumably the universe
# object is not suitable for serialization; confirm
get_MDAnalysis_Universe = Task('mda_univ', 'MDAnalysis Universe object',
    get_mda_universe, use_cache = False)
universe = property(get_MDAnalysis_Universe, None, None, "MDAnalysis Universe object (read only)")
def input_getter (name : str):
    """ Build a getter for an input value which may be MD specific.

    The returned getter looks the value up in the MD-specific inputs first and
    falls back to the project-level 'input_<name>' attribute when the MD value
    is missing (None). """
    def getter (self):
        md_value = self.md_inputs.get(name, None)
        # Delegate to the project only when there is no MD-specific value
        if md_value == None:
            return getattr(self.project, f'input_{name}')
        return md_value
    return getter

# Assign the MD input getters
input_interactions = property(input_getter('interactions'), None, None, "Interactions to be analyzed (read only)")
input_pbc_selection = property(input_getter('pbc_selection'), None, None, "Selection of atoms which are still in periodic boundary conditions (read only)")
input_cg_selection = property(input_getter('cg_selection'), None, None, "Selection of atoms which are not actual atoms but coarse grain beads (read only)")
def _set_pbc_selection (self, reference_structure : 'Structure', verbose : bool = False) -> 'Selection':
    """ Internal function to set the PBC selection.

    Parses the inputs file selection string when available, or guesses the PBC
    atoms otherwise. The special 'auto' string also triggers the guess. An
    empty input string yields an empty selection.

    Args:
        reference_structure: structure used to parse or guess the selection.
        verbose: when True, log the decision process and a residue name sample.

    Raises:
        InputError: when guessing is requested on a CG system, or the parsed
            selection string selects no atoms.
    """
    if verbose: print('Setting Periodic Boundary Conditions (PBC) atoms selection')
    selection_string = None
    # If there is an inputs file then get the input pbc selection
    if self.project.is_inputs_file_available():
        if verbose: print(' Using selection string in the inputs file')
        selection_string = self.input_pbc_selection
    # If there is no inputs file we guess PBC atoms automatically
    else:
        if verbose: print(' No inputs file -> Selection string will be set automatically')
        selection_string = 'auto'
    # Parse the selection string using the reference structure
    parsed_selection = None
    # If the input PBC selection is 'auto' then guess it automatically
    if selection_string == 'auto':
        # To guess PBC atoms (with the current implementation) we must make sure there is no CG
        if reference_structure.has_cg():
            raise InputError('We can not guess PBC atoms in CG systems. Please set PBC atoms manually.\n'
                ' Use the "-pbc" argument or set the inputs file "pbc_selection" field.')
        if verbose: print(' Guessing PBC atoms as solvent, counter ions and lipids')
        parsed_selection = reference_structure.select_pbc_guess()
    # If we have a valid input value then use it
    elif selection_string:
        if verbose: print(f' Selecting PBC atoms "{selection_string}"')
        parsed_selection = reference_structure.select(selection_string)
        if not parsed_selection:
            raise InputError(f'PBC selection "{selection_string}" selected no atoms')
    # If we have an input value but it is empty then we set an empty selection
    else:
        if verbose: print(' No PBC atoms selected')
        parsed_selection = Selection()
    # Log a few of the selected residue names
    if verbose and parsed_selection:
        print(f' Parsed PBC selection has {len(parsed_selection)} atoms')
        selected_residues = reference_structure.get_selection_residues(parsed_selection)
        selected_residue_names = list(set([ residue.name for residue in selected_residues ]))
        limit = 3 # Show a maximum of 3 residue names
        example_residue_names = ', '.join(selected_residue_names[0:limit])
        if len(selected_residue_names) > limit: example_residue_names += ', etc.'
        print(' e.g. ' + example_residue_names)
    return parsed_selection
def get_pbc_selection (self) -> 'Selection':
    """ Get the periodic boundary conditions atom selection.
    The value is computed once and cached on the parent project. """
    project = self.project
    # Compute and cache the selection on first access
    if project._pbc_selection == None:
        project._pbc_selection = self._set_pbc_selection(self.structure)
    return project._pbc_selection
pbc_selection = property(get_pbc_selection, None, None, "Periodic boundary conditions atom selection (read only)")
# WARNING: Do not inherit project pbc residues
# WARNING: It may trigger all the processing logic of the reference MD when there is no need
def get_pbc_residues (self) -> list[int]:
    """ Get indices of residues in periodic boundary conditions.
    The result is cached on the parent project. """
    project = self.project
    # Serve the cached value when available
    if project._pbc_residues:
        return project._pbc_residues
    # Without a PBC selection there are no PBC residues at all
    if not self.pbc_selection:
        project._pbc_residues = []
        return project._pbc_residues
    # Resolve the selection into residue indices and cache them
    project._pbc_residues = self.structure.get_selection_residue_indices(self.pbc_selection)
    print(f'PBC residues "{self.input_pbc_selection}" -> {len(project._pbc_residues)} residues')
    return project._pbc_residues
pbc_residues = property(get_pbc_residues, None, None, "Indices of residues in periodic boundary conditions (read only)")
# DANI: Someday we should try to automate this
def _set_cg_selection (self, reference_structure : 'Structure', verbose : bool = False) -> 'Selection':
    """ Set the coarse grain (CG) atoms selection.

    Parses the inputs file CG selection string against the reference
    structure. When there is no inputs file, or the selection string is empty,
    an empty selection is returned (i.e. no CG at all).

    Fix: the original had an unreachable 'elif selection_string' / 'else'
    construct after the early return on an empty string; the dead branch has
    been removed.

    Args:
        reference_structure: structure used to parse the selection string.
        verbose: when True, log the decision process and a residue name sample.
    """
    if verbose: print('Setting Coarse Grained (CG) atoms selection')
    # If there is no inputs file then assume there is no CG selection
    if not self.project.is_inputs_file_available():
        if verbose: print(' No inputs file -> Asuming there is no CG at all')
        return Selection()
    # Otherwise we use the selection string from the inputs
    if verbose: print(' Using selection string in the inputs file')
    selection_string = self.input_cg_selection
    # If the selection is empty, again, assume there is no CG selection
    if not selection_string:
        if verbose: print(' Empty selection -> There is no CG at all')
        return Selection()
    # At this point the selection string is guaranteed to be non-empty: parse it
    if verbose: print(f' Selecting CG atoms "{selection_string}"')
    parsed_selection = reference_structure.select(selection_string)
    # Log the parsed selection size
    if verbose: print(f' Parsed CG selection has {len(parsed_selection)} atoms')
    # Log a few of the selected residue names
    if verbose and parsed_selection:
        selected_residues = reference_structure.get_selection_residues(parsed_selection)
        selected_residue_names = list(set([ residue.name for residue in selected_residues ]))
        limit = 3 # Show a maximum of 3 residue names
        example_residue_names = ', '.join(selected_residue_names[0:limit])
        if len(selected_residue_names) > limit: example_residue_names += ', etc.'
        print(' e.g. ' + example_residue_names)
    return parsed_selection
def get_cg_selection (self) -> 'Selection':
    """ Get the coarse grain (CG) atom selection.
    The value is computed once and cached on the parent project. """
    # If we already have a stored value then return it
    if self.project._cg_selection:
        return self.project._cg_selection
    # Otherwise we must set the CG selection
    self.project._cg_selection = self._set_cg_selection(self.structure)
    return self.project._cg_selection
# Fix: the property docstring wrongly described the PBC selection (copy-paste)
cg_selection = property(get_cg_selection, None, None, "Coarse grain atom selection (read only)")
# WARNING: Do not inherit project cg residues
# WARNING: It may trigger all the processing logic of the reference MD when there is no need
def get_cg_residues (self) -> list[int]:
    """ Get indices of residues in coarse grain.
    The result is cached on the parent project. """
    project = self.project
    # Serve the cached value when available
    if project._cg_residues:
        return project._cg_residues
    # Without a CG selection there are no CG residues at all
    if not self.cg_selection:
        project._cg_residues = []
        return project._cg_residues
    # Resolve the selection into residue indices and cache them
    project._cg_residues = self.structure.get_selection_residue_indices(self.cg_selection)
    print(f'CG residues "{self.input_cg_selection}" -> {len(project._cg_residues)} residues')
    return project._cg_residues
cg_residues = property(get_cg_residues, None, None, "Indices of residues in coarse grain (read only)")
def get_populations (self) -> list[float]:
    """ Get the MSM equilibrium populations by delegating to the parent project. """
    return self.project.populations
populations = property(get_populations, None, None, "Equilibrium populations from a MSM (read only)")
def get_transitions (self) -> list[list[float]]:
    """ Get the MSM transition probabilities by delegating to the parent project. """
    return self.project.transitions
transitions = property(get_transitions, None, None, "Transition probabilities from a MSM (read only)")
def get_protein_map (self) -> dict:
    """ Get the protein residues mapping by delegating to the parent project. """
    return self.project.protein_map
protein_map = property(get_protein_map, None, None, "Residues mapping (read only)")
def get_charges (self) -> dict:
    """ Get the atom charges by delegating to the parent project.
    Fix: the docstring wrongly said "residues mapping" (copy-paste error). """
    return self.project.charges
charges = property(get_charges, None, None, "Residues charges (read only)")
# Reference frame: the trajectory frame used to represent the MD, computed by a task
get_reference_frame = Task('reframe', 'Reference frame', get_bonds_reference_frame)
reference_frame = property(get_reference_frame, None, None, "Reference frame to be used to represent the MD (read only)")
# ---------------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------------

def is_trajectory_integral (self) -> Optional[bool]:
    """ Sudden jumps test.

    Runs the trajectory integrity check over all atoms once and caches the
    result on the instance. Returns the check outcome (None means it was not
    conclusive/run, per the Optional annotation).
    """
    # If we already have a stored value then return it
    if self._trajectory_integrity != None:
        return self._trajectory_integrity
    # Otherwise we must find the value
    self._trajectory_integrity = check_trajectory_integrity(
        input_structure_filename = self.structure_file.path,
        input_trajectory_filename = self.trajectory_file.path,
        structure = self.structure,
        pbc_selection = self.pbc_selection,
        mercy = self.project.mercy,
        trust = self.project.trust,
        register = self.register,
        # time_length = self.time_length,
        check_selection = ALL_ATOMS,
        standard_deviations_cutoff = self.project.rmsd_cutoff,
        snapshots = self.snapshots,
    )
    return self._trajectory_integrity
# ---------------------------------------------------------------------------------
# Analyses
# ---------------------------------------------------------------------------------

# RMSDs analysis
run_rmsds_analysis = Task('rmsds', 'RMSDs analysis',
    rmsds, { 'frames_limit': 5000 })

# TM scores analysis
run_tmscores_analysis = Task('tmscore', 'TM scores analysis',
    tmscores, { 'frames_limit': 200 })

# RMSF, atom fluctuation analysis
run_rmsf_analysis = Task('rmsf', 'Fluctuation analysis', rmsf)

# Radius of gyration analysis
run_rgyr_analysis = Task('rgyr', 'Radius of gyration analysis',
    rgyr, { 'frames_limit': 5000 })

# PCA, principal component analysis
run_pca_analysis = Task('pca', 'Principal component analysis',
    pca, { 'frames_limit': 2000, 'projection_frames': 20 })

# PCA contacts
# DANI: It tries to use a lot of memory, this must be reviewed
# DANI: It may raise an error about being unable to allocate so much memory
# DANI: It may eat all the RAM so in the end a 'Terminado (killed)' error is raised
# DANI: For now I skip it
# DANI: It has been unmaintained for a long time, several things will have to change to recover it
# run_pca_contacts('pcacons', 'PCA contacts', pca_contacts)

# RMSD per residue analysis
run_rmsd_perres_analysis = Task('perres', 'RMSD per residue analysis',
    rmsd_per_residue, { 'frames_limit': 100 })

# RMSD pairwise
# Perform an analysis for the overall structure and then one more analysis for each interaction
run_rmsd_pairwise_analysis = Task('pairwise', 'RMSD pairwise',
    rmsd_pairwise, { 'frames_limit': 200, 'overall_selection': "name CA or name C5'" })

# Run the cluster analysis
run_clusters_analysis = Task('clusters', 'Clusters analysis',
    clusters_analysis, { 'frames_limit': 1000, 'desired_n_clusters': 20 })

# Calculate the distance mean and standard deviation of each pair of residues
run_dist_perres_analysis = Task('dist', 'Distance per residue',
    distance_per_residue, { 'frames_limit': 200 })

# Hydrogen bonds
run_hbonds_analysis = Task('hbonds', 'Hydrogen bonds analysis',
    hydrogen_bonds, { 'time_splits': 100 })

# SASA, solvent accessible surface analysis
run_sas_analysis = Task('sas', 'Solvent accessible surface analysis',
    sasa, { 'frames_limit': 100 })

# Perform the electrostatic and vdw energies analysis for each pair of interaction agents
run_energies_analysis = Task('energies', 'Energies analysis',
    energies, { 'frames_limit': 100 })

# Calculate torsions and then dihedral energies for every dihedral along the trajectory
run_dihedral_energies = Task('dihedrals', 'Dihedral energies analysis',
    compute_dihedral_energies, { 'frames_limit': 100 })

# Perform the pockets analysis
run_pockets_analysis = Task('pockets', 'Pockets analysis',
    pockets, { 'frames_limit': 100, 'maximum_pockets_number': 10 })

# Helical parameters
run_helical_analysis = Task('helical', 'Helical parameters', helical_parameters)

# Markov
run_markov_analysis = Task('markov', 'Markov', markov, { 'rmsd_selection': PROTEIN_AND_NUCLEIC })

# Membrane density analysis
run_density_analysis = Task('density', 'Membrane density analysis',
    density, { 'frames_limit': 1000 })

# Membrane thickness analysis
run_thickness_analysis = Task('thickness', 'Membrane thickness analysis',
    thickness, { 'frames_limit': 100 })

# Area per lipid analysis
run_apl_analysis = Task('apl', 'Membrane area per lipid analysis', area_per_lipid)

# Calculate lipid order parameters for membranes
run_lipid_order_analysis = Task('lorder', 'Membrane lipid order analysis',
    lipid_order, { 'frames_limit': 100 })

# Lipid-protein interactions analysis
run_lipid_interactions_analysis = Task('linter', 'Membrane lipid-protein interactions analysis',
    lipid_interactions, { 'frames_limit': 100 })
1292 run_lipid_interactions_analysis = Task('channels', 'Membrane channels analysis',
1293 channels, { 'frames_limit': 10 })
class Project:
    """ Class for the main project of an MDDB accession.
    A project is a set of related MDs.
    These MDs share all or most topology and metadata.
    Most project values are resolved lazily through getters which, when an
    accession and a database URL are available, may download missing input
    files from the database on the fly. """
1301 def __init__ (self,
1302 directory : str = '.',
1303 accession : Optional[str] = None,
1304 database_url : str = DEFAULT_API_URL,
1305 inputs_filepath : str = None,
1306 input_topology_filepath : Optional[str] = None,
1307 input_structure_filepath : Optional[str] = None,
1308 input_trajectory_filepaths : Optional[list[str]] = None,
1309 md_directories : Optional[list[str]] = None,
1310 md_config : Optional[list[list[str]]] = None,
1311 reference_md_index : Optional[int] = None,
1312 populations_filepath : str = DEFAULT_POPULATIONS_FILENAME,
1313 transitions_filepath : str = DEFAULT_TRANSITIONS_FILENAME,
1314 aiida_data_filepath : Optional[str] = None,
1315 filter_selection : bool | str = False,
1316 pbc_selection : Optional[str] = None,
1317 cg_selection : Optional[str] = None,
1318 image : bool = False,
1319 fit : bool = False,
1320 translation : list[float] = [0, 0, 0],
1321 mercy : list[str] | bool = [],
1322 trust : list[str] | bool = [],
1323 faith : bool = False,
1324 pca_analysis_selection : str = PROTEIN_AND_NUCLEIC_BACKBONE,
1325 pca_fit_selection : str = PROTEIN_AND_NUCLEIC_BACKBONE,
1326 rmsd_cutoff : float = DEFAULT_RMSD_CUTOFF,
1327 interaction_cutoff : float = DEFAULT_INTERACTION_CUTOFF,
1328 interactions_auto : Optional[str] = None,
1329 guess_bonds : bool = False,
1330 sample_trajectory : Optional[int] = None,
1331 ):
1332 """
1333 Initialize a Project.
1335 Args:
1336 directory (str):
1337 Local directory where the project takes place.
1338 accession (Optional[str]):
1339 Project accession to download missing input files from the database (if already uploaded).
1340 database_url (str):
1341 API URL to download missing data. when an accession is provided.
1342 inputs_filepath (str):
1343 Path to a file with inputs for metadata, simulation parameters and analysis config.
1344 input_topology_filepath (Optional[str]):
1345 Path to input topology file relative to the project directory.
1346 Multiple formats accepted; default is our parsed JSON topology.
1347 input_structure_filepath (Optional[str]):
1348 Path to input structure file. It may be relative to the project or to each MD directory.
1349 If this value is not passed then the standard structure file is used as input by default.
1350 input_trajectory_filepaths (Optional[list[str]]):
1351 Paths to input trajectory files relative to each MD directory.
1352 If this value is not passed then the standard trajectory file path is used as input by default.
1353 md_directories (Optional[list[str]]):
1354 Path to the different MD directories.
1355 Each directory is to contain an independent trajectory and structure.
1356 Several output files will be generated in every MD directory.
1357 md_config (Optional[list]):
1358 Configuration of a specific MD. You may declare as many as you want.
1359 Every MD requires a directory name and at least one trajectory path.
1360 The structure is -md <directory> <trajectory_1> <trajectory_2> ...
1361 Note that all trajectories from the same MD will be merged.
1362 For legacy reasons, you may also provide a specific structure for an MD.
1363 e.g. -md <directory> <structure> <trajectory_1> <trajectory_2> ...
1364 reference_md_index (Optional[int]):
1365 Index of the reference MD (used by project-level functions; defaults to first MD).
1366 populations_filepath (str):
1367 Path to equilibrium populations file (Markov State Model only)
1368 transitions_filepath (str):
1369 Path to transition probabilities file (Markov State Model only).
1370 aiida_data_filepath (Optional[str]):
1371 Path to the AiiDA data file.
1372 This file may be generated by the aiida-gromacs plugin and contains provenance data.
1373 filter_selection (bool|str):
1374 Atoms selection to be filtered in VMD format.
1375 If the argument is passed alone (i.e. with no selection) then water and counter ions are filtered.
1376 pbc_selection (Optional[str]):
1377 Selection of atoms which stay in Periodic Boundary Conditions even after imaging the trajectory.
1378 e.g. remaining solvent, ions, membrane lipids, etc.
1379 Selection passed through console overrides the one in inputs file.
1380 cg_selection (Optional[str]):
1381 Selection of atoms which are not actual atoms but Coarse Grained beads.
1382 Selection passed through console overrides the one in inputs file.
1383 image (bool):
1384 Set if the trajectory is to be imaged so atoms stay in the PBC box. See -pbc for more information.
1385 fit (bool):
1386 Set if the trajectory is to be fitted (both rotation and translation) to minimize the RMSD to PROTEIN_AND_NUCLEIC_BACKBONE selection.
1387 translation (list[float]):
1388 Set the x y z translation for the imaging process.
1389 e.g. -trans 0.5 -1 0
1390 mercy (list[str]|bool):
1391 Failures to be tolerated (or boolean to set all/none).
1392 trust (list[str]|bool):
1393 Tests to skip/trust (or boolean to set all/none).
1394 faith (bool):
1395 If True, require input files to match expected output files and skip processing.
1396 pca_analysis_selection (str):
1397 Atom selection for PCA analysis in VMD syntax.
1398 pca_fit_selection (str):
1399 Atom selection for the PCA fitting in VMD syntax.
1400 rmsd_cutoff (float):
1401 Set the cutoff for the RMSD sudden jumps analysis to fail.
1402 This cutoff stands for the number of standard deviations away from the mean an RMSD value is to be.
1403 interaction_cutoff (float):
1404 Set the cutoff for the interactions analysis to fail.
1405 This cutoff stands for percent of the trajectory where the interaction happens (from 0 to 1).
1406 interactions_auto (Optional[str]):
1407 Guess input interactions automatically. A VMD selection may be passed to limit guessed interactions to a specific subset of atoms.
1408 guess_bonds (bool):
1409 Force the workflow to guess atom bonds based on distance and atom radii in different frames along the trajectory instead of mining topology bonds.
1410 sample_trajectory (Optional[int]):
1411 If passed, download the first 10 (by default) frames from the trajectory.
1412 You can specify a different number by providing an integer value.
1413 """
1414 # Save input parameters
1415 self.directory = normpath(directory)
1416 # If it is an absolute path then make it relative to the project
1417 if isabs(self.directory):
1418 self.directory = relpath(self.directory)
1419 # Save the directory name alone apart
1420 if self.directory == '.':
1421 self.directory_name = basename(getcwd())
1422 else:
1423 self.directory_name = basename(self.directory)
1425 self.database_url = database_url
1426 self.accession = accession
1427 # Set the project URL in case we have the required data
1428 self.remote = None
1429 if self.database_url and self.accession:
1430 self.remote = Remote(self.database_url, self.accession)
1432 # Set the inputs file
1433 # Set the expected default name in case there is no inputs file since it may be downloaded
1434 self._inputs_file = File(self.pathify(DEFAULT_INPUTS_FILENAME))
1435 # If there is an input filepath then use it
1436 if inputs_filepath:
1437 self._inputs_file = File(inputs_filepath)
1438 # Otherwise guess the inputs file using the accepted filenames
1439 else:
1440 for filename in ACCEPTED_INPUT_FILENAMES:
1441 inputs_file = File(filename)
1442 if inputs_file.exists:
1443 self._inputs_file = inputs_file
1444 break
1445 # Set the input topology file
1446 # Note that even if the input topology path is passed we do not check it exists
1447 # Never forget we can donwload some input files from the database on the fly
1448 self.arg_input_topology_filepath = input_topology_filepath
1449 self._input_topology_filepath = None
1450 self._input_topology_file = None
1451 # Input structure and trajectory filepaths
1452 # Do not parse them to files yet, let this to the MD class
1453 self.input_structure_filepath = input_structure_filepath
1454 self.input_trajectory_filepaths = input_trajectory_filepaths
1456 # Make sure the new MD configuration (-md) was not passed as well as old MD inputs (-mdir, -stru, -traj)
1457 if md_config and (md_directories or input_trajectory_filepaths):
1458 raise InputError('MD configurations (-md) is not compatible with old MD inputs (-mdir, -traj)')
1459 # Save the MD configurations
1460 self.md_config = md_config
1461 # Make sure MD configuration has the correct format
1462 if self.md_config:
1463 # Make sure all MD configurations have at least 3 values each
1464 for mdc in self.md_config:
1465 if len(mdc) < 2:
1466 raise InputError('Wrong MD configuration: the patter is -md <directory> <trajectory> <trajectory 2> ...')
1467 # Make sure there are no duplictaed MD directories
1468 md_directories = [ mdc[0] for mdc in self.md_config ]
1469 if len(md_directories) > len(set(md_directories)):
1470 raise InputError('There are duplicated MD directories')
1472 # Input populations and transitions for MSM
1473 self.populations_filepath = populations_filepath
1474 self._populations_file = File(self.populations_filepath)
1475 self.transitions_filepath = transitions_filepath
1476 self._transitions_file = File(self.transitions_filepath)
1477 # Input AiiDA data
1478 self.aiida_data_filepath = aiida_data_filepath
1479 self._aiida_data_file = File(self.aiida_data_filepath) if aiida_data_filepath else None
1481 # Set the processed topology filepath, which depends on the input topology filename
1482 # Note that this file is different from the standard topology, although it may be standard as well
1483 self._topology_filepath = None
1484 self._topology_file = None
1486 # Set the standard topology file
1487 self._standard_topology_file = None
1489 # Set the MD directories
1490 self._md_directories = md_directories
1491 # Check input MDs are correct to far
1492 if self._md_directories:
1493 self.check_md_directories()
1495 # Set the reference MD
1496 self._reference_md = None
1497 self._reference_md_index = reference_md_index
1499 # Set the rest of inputs
1500 # Note that the filter selection variable is not handled here at all
1501 # This is just pased to the filtering function which knows how to handle the default
1502 self.filter_selection = filter_selection
1503 # PBC selection may come from the console or from the inputs
1504 self._input_pbc_selection = pbc_selection
1505 self._input_cg_selection = cg_selection
1506 self.image = image
1507 self.fit = fit
1508 self.translation = translation
1509 self.mercy = mercy
1510 # Fix the mercy input, if needed
1511 # If a boolean is passed instead of a list then we set its corresponding value
1512 if type(mercy) == bool:
1513 if mercy:
1514 self.mercy = AVAILABLE_FAILURES
1515 else:
1516 self.mercy = []
1517 self.trust = trust
1518 # Fix the trust input, if needed
1519 # If a boolean is passed instead of a list then we set its corresponding value
1520 if type(trust) == bool:
1521 if trust:
1522 self.trust = AVAILABLE_CHECKINGS
1523 else:
1524 self.trust = []
1525 self.faith = faith
1526 self.pca_analysis_selection = pca_analysis_selection
1527 self.pca_fit_selection = pca_fit_selection
1528 self.rmsd_cutoff = rmsd_cutoff
1529 self.interaction_cutoff = interaction_cutoff
1530 self.sample_trajectory = sample_trajectory
1531 self.interactions_auto = interactions_auto
1532 self.guess_bonds = guess_bonds
1533 # Set the inputs, where values from the inputs file will be stored
1534 self._inputs = None
1536 # Other values which may be found/calculated on demand
1537 self._pbc_selection = None
1538 self._pbc_residues = None
1539 self._cg_selection = None
1540 self._cg_residues = None
1541 self._reference_bonds = None
1542 self._topology_reader = None
1543 self._dihedrals = None
1544 self._populations = None
1545 self._transitions = None
1546 self._pdb_ids = None
1547 self._mds = None
1549 # Force a couple of extraordinary files which is generated if atoms are resorted
1550 self.resorted_bonds_file = File(self.pathify(RESORTED_BONDS_FILENAME))
1551 self.resorted_charges_file = File(self.pathify(RESORTED_CHARGES_FILENAME))
1553 # Set a new entry for the register
1554 # This is useful to track previous workflow runs and problems
1555 register_filepath = self.pathify(REGISTER_FILENAME)
1556 register_file = File(register_filepath)
1557 self.register = Register(register_file)
1558 # Save also warnings apart since they are to be used as an input for metadata tasks
1559 self.warnings = self.register.warnings
1561 # Set the cache
1562 cache_filepath = self.pathify(CACHE_FILENAME)
1563 cache_file = File(cache_filepath)
1564 self.cache = Cache(cache_file)
1566 # Set tasks whose output is to be overwritten
1567 self.overwritables = set()
1569 def __repr__ (self):
1570 return 'Project'
1572 def pathify (self, filename_or_relative_path : str) -> str:
1573 """ Given a filename or relative path, add the project directory path at the beginning. """
1574 return normpath(self.directory + '/' + filename_or_relative_path)
1576 # Check MD directories to be right
1577 # If there is any problem then directly raise an input error
1578 def check_md_directories (self):
1579 # Check there is at least one MD
1580 if len(self._md_directories) < 1:
1581 raise InputError('There must be at least one MD')
1582 # Check there are not duplicated MD directories
1583 if len(set(self._md_directories)) != len(self._md_directories):
1584 raise InputError('There are duplicated MD directories')
1586 # Set a function to get MD directories
1587 def get_md_directories (self) -> list:
1588 # If MD directories are already declared then return them
1589 if self._md_directories:
1590 return self._md_directories
1591 # Otherwise use the default MDs
1592 self._md_directories = []
1593 # Use the MDs from the inputs file when available
1594 if self.is_inputs_file_available() and self.input_mds:
1595 for input_md in self.input_mds:
1596 # Get the directory according to the inputs
1597 directory = input_md.get(MD_DIRECTORY, None)
1598 if directory:
1599 check_directory(directory)
1600 # If no directory is specified in the inputs then guess it from the MD name
1601 else:
1602 name = input_md['name']
1603 if not name:
1604 name = 'unnamed'
1605 directory = name_2_directory(name)
1606 self._md_directories.append(directory)
1607 # Otherwise, guess MD directories by checking which directories include a register file
1608 else:
1609 available_directories = sorted(next(walk(self.directory))[1])
1610 for directory in available_directories:
1611 if exists(directory + '/' + REGISTER_FILENAME):
1612 self._md_directories.append(directory)
1613 # If we found no MD directory then it means MDs were never declared before
1614 if len(self._md_directories) == 0:
1615 raise InputError('Impossible to know which are the MD directories. '
1616 'You can either declare them using the "-mdir" option or by providing an inputs file')
1617 self.check_md_directories()
1618 return self._md_directories
1619 md_directories = property(get_md_directories, None, None, "MD directories (read only)")
1621 # Set the reference MD index
1622 def get_reference_md_index (self) -> int:
1623 # If we are already have a value then return it
1624 if self._reference_md_index:
1625 return self._reference_md_index
1626 # Otherwise we must find the reference MD index
1627 # If the inputs file is available then it must declare the reference MD index
1628 if self.is_inputs_file_available():
1629 self._reference_md_index = self.get_input('mdref')
1630 # Otherwise we simply set the first MD as the reference and warn the user about this
1631 if self._reference_md_index == None:
1632 warn('No reference MD was specified. The first MD will be used as reference.')
1633 self._reference_md_index = 0
1634 return self._reference_md_index
1635 reference_md_index = property(get_reference_md_index, None, None, "Reference MD index (read only)")
1637 # Set the reference MD
1638 def get_reference_md (self) -> MD:
1639 # If we are already have a value then return it
1640 if self._reference_md:
1641 return self._reference_md
1642 # Otherwise we must find the reference MD
1643 self._reference_md = self.mds[self.reference_md_index]
1644 return self._reference_md
1645 reference_md: MD = property(get_reference_md, None, None, "Reference MD (read only)")
1647 # Setup the MDs
1648 def get_mds (self) -> list:
1649 """ Get the available MDs (read only). """
1650 # If MDs are already declared then return them
1651 if self._mds:
1652 return self._mds
1653 # Now instantiate a new MD for each declared MD and save the reference MD
1654 self._mds = []
1655 # New system with MD configurations (-md)
1656 if self.md_config:
1657 for n, config in enumerate(self.md_config, 1):
1658 directory = config[0]
1659 # LEGACY
1660 # In a previous version, the md config argument also holded the structure
1661 # This was the second argument, so we check if we have more than 2 arguments
1662 # If this is the case, then check if the second argument has different format
1663 # Note that PDB format is also a trajectory supported format
1664 has_structure = False
1665 if len(config) > 2:
1666 first_sample = File(config[1])
1667 second_sample = File(config[2])
1668 if first_sample.format != second_sample.format:
1669 has_structure = True
1670 # Finally set the input structure and trajectories
1671 input_structure_filepath = config[1] if has_structure else self.input_structure_filepath
1672 input_trajectory_filepaths = config[2:] if has_structure else config[1:]
1673 # Define the MD
1674 md = MD(
1675 project = self, number = n, directory = directory,
1676 input_structure_filepath = input_structure_filepath,
1677 input_trajectory_filepaths = input_trajectory_filepaths,
1678 )
1679 self._mds.append(md)
1680 # Old system (-mdir, -stru -traj)
1681 else:
1682 for n, md_directory in enumerate(self.md_directories, 1):
1683 md = MD(
1684 project = self, number = n, directory = md_directory,
1685 input_structure_filepath = self.input_structure_filepath,
1686 input_trajectory_filepaths = self.input_trajectory_filepaths,
1687 )
1688 self._mds.append(md)
1689 return self._mds
1690 mds: list[MD] = property(get_mds, None, None, "Available MDs (read only)")
1692 # Check input files exist when their filenames are read
1693 # If they do not exist then try to download them
1694 # If the download is not possible then raise an error
1696 # Inputs filename ------------
1698 def is_inputs_file_available (self) -> bool:
1699 """ Set a function to check if inputs file is available.
1700 Note that asking for it when it is not available will lead to raising an input error. """
1701 # If name is not declared then it is impossible to reach it
1702 if not self._inputs_file:
1703 return False
1704 # If the file already exists then it is available
1705 if self._inputs_file.exists:
1706 return True
1707 # If it does not exist but it may be downloaded then it is available
1708 if self.remote:
1709 return True
1710 return False
1712 def get_inputs_file (self) -> File:
1713 """ Set a function to load the inputs yaml file. """
1714 # There must be an inputs filename
1715 if not self._inputs_file:
1716 raise InputError('Not defined inputs filename')
1717 # If the file already exists then we are done
1718 if self._inputs_file.exists:
1719 return self._inputs_file
1720 # Try to download it
1721 # If we do not have the required parameters to download it then we surrender here
1722 if not self.remote:
1723 raise InputError(f'Missing inputs file "{self._inputs_file.filename}"')
1724 # Download the inputs json file if it does not exists
1725 self.remote.download_inputs_file(self._inputs_file)
1726 return self._inputs_file
1727 inputs_file = property(get_inputs_file, None, None, "Inputs filename (read only)")
    # Topology filename ------------

    def get_input_topology_filepath (self) -> Optional[str]:
        """ Get the input topology filepath from the inputs or try to guess it.
        If the input topology filepath is a 'no' flag then we consider there is no topology at all.
        So far we extract atom charges and atom bonds from the topology file.
        In this scenario we can keep working but there are some consequences:
        1 - Analysis using atom charges such as 'energies' will be skipped
        2 - The standard topology file will not include atom charges
        3 - Bonds will be guessed

        Returns the filepath, or the MISSING_TOPOLOGY flag when the user
        explicitly declared there is no topology.
        """
        # If we already have an internal value calculated then return it
        if self._input_topology_filepath != None:
            return self._input_topology_filepath
        # Set a function to parse possible glob notation
        def parse (filepath : str) -> str:
            # If there is no glob pattern then just return the string as is
            if not is_glob(filepath):
                return filepath
            # If there is glob pattern then parse it
            # The pattern must match exactly one local file
            parsed_filepaths = glob(filepath)
            if len(parsed_filepaths) == 0:
                # Warn the user in case it was trying to use glob syntax to download remote files
                if self.remote:
                    warn('Spread syntax is not supported to download remote files')
                raise InputError(f'No topologies found with "{filepath}"')
            if len(parsed_filepaths) > 1:
                raise InputError(f'Multiple topologies found with "{filepath}": {", ".join(parsed_filepaths)}')
            return parsed_filepaths[0]
        # If this value was passed through command line then save it as the internal value
        if self.arg_input_topology_filepath:
            # The user may explicitly declare there is no topology at all
            if self.arg_input_topology_filepath.lower() in { 'no', 'not', 'na' }:
                self._input_topology_filepath = MISSING_TOPOLOGY
                return self._input_topology_filepath
            self._input_topology_filepath = parse(self.arg_input_topology_filepath)
            # Update the input topology filepath in the inputs file, in case it is not matching
            self.update_inputs('input_topology_filepath', relpath(self._input_topology_filepath, self.directory))
            return self._input_topology_filepath
        # Check if the inputs file has the value
        if self.is_inputs_file_available():
            # Get the input value, whose key must exist
            inputs_value = self.get_input('input_topology_filepath')
            # If there is a valid input then use it
            if inputs_value != None:
                # WARNING: the yaml parser automatically converts 'no' to False
                if inputs_value == False or inputs_value.lower() in { 'no', 'not', 'na' }:
                    self._input_topology_filepath = MISSING_TOPOLOGY
                    return self._input_topology_filepath
                parsed_input_value = parse(inputs_value)
                # Inputs-file paths are relative to the project directory
                self._input_topology_filepath = self.pathify(parsed_input_value)
                return self._input_topology_filepath
        # If nothing worked then surrender
        raise InputError('Missing input topology file path. Please provide a topology file using the "-top" argument.\n' +
            ' Note that you may run the workflow without a topology file. To do so, use the "-top no" argument.\n' +
            ' However this has implications since we usually mine atom charges and bonds from the topology file.\n' +
            ' Some analyses such us the interaction energies will be skiped')
    def get_input_topology_file (self) -> Optional[File]:
        """ Get the input topology file.
        If the file is not found try to download it.

        Returns the topology File, or the MISSING_TOPOLOGY flag when the
        project runs without a topology.
        """
        # If we already have a value then return it
        if self._input_topology_file != None:
            return self._input_topology_file
        # Set the input topology filepath
        input_topology_filepath = self.get_input_topology_filepath()
        # If the input filepath is the missing flag then it means we must proceed without a topology
        if input_topology_filepath == MISSING_TOPOLOGY:
            self._input_topology_file = MISSING_TOPOLOGY
            return self._input_topology_file
        # Set the file
        self._input_topology_file = File(input_topology_filepath)
        # If the file already exists then we are done
        if self._input_topology_file.exists:
            return self._input_topology_file
        # Try to download it
        # If we do not have the required parameters to download it then we surrender here
        if not self.remote:
            raise InputError(f'Missing input topology file "{self._input_topology_file.filename}"')
        # Otherwise, try to download it using the files endpoint
        # Note that this is not usually required
        if self._input_topology_file.filename == STANDARD_TOPOLOGY_FILENAME:
            self.remote.download_standard_topology(self._input_topology_file)
        else:
            self.remote.download_file(self._input_topology_file)
        # In case the topology is a '.top' file we consider it is a Gromacs topology
        # It may come with additional itp files we must download as well
        if self._input_topology_file.format == 'top':
            # Find available .itp files and download each of them
            itp_filenames = [filename for filename in self.remote.available_files if filename[-4:] == '.itp']
            for itp_filename in itp_filenames:
                itp_filepath = self.pathify(itp_filename)
                itp_file = File(itp_filepath)
                self.remote.download_file(itp_file)
        return self._input_topology_file
    input_topology_file = property(get_input_topology_file, None, None, "Input topology file (read only)")
1826 def get_input_structure_file (self) -> File:
1827 """ Get the input structure filename. """
1828 # When calling this function make sure all MDs have the file or try to download it
1829 return self.reference_md._input_structure_file
1830 input_structure_file = property(get_input_structure_file, None, None, "Input structure filename for each MD (read only)")
1832 def get_input_trajectory_files (self) -> list[File]:
1833 """ Get the input trajectory filename(s) from the inputs.
1834 If file(s) are not found try to download it. """
1835 return self.reference_md._input_trajectory_files
1836 input_trajectory_files = property(get_input_trajectory_files, None, None, "Input trajectory filenames for each MD (read only)")
1838 def get_populations_file (self) -> Optional[File]:
1839 """ Get the MSM equilibrium populations file. """
1840 if not self.get_file(self._populations_file):
1841 return None
1842 return self._populations_file
1843 populations_file = property(get_populations_file, None, None, "MSM equilibrium populations file (read only)")
1845 def get_transitions_file (self) -> Optional[File]:
1846 """ Get the MSM transition probabilities file. """
1847 if not self.get_file(self._transitions_file):
1848 return None
1849 return self._transitions_file
1850 transitions_file = property(get_transitions_file, None, None, "MSM transition probabilities file (read only)")
1852 def get_aiida_data_file (self) -> Optional[File]:
1853 """ Get the AiiDA data file."""
1854 if not self._aiida_data_file: return None
1855 if not self.get_file(self._aiida_data_file): return None
1856 return self._aiida_data_file
1857 aiida_data_file = property(get_aiida_data_file, None, None, "AiiDA data file (read only)")
1859 # ---------------------------------
1861 def get_file (self, target_file : File) -> bool:
1862 """ Check if a file exists.
1863 If not, try to download it from the database.
1864 If the file is not found in the database it is fine, we do not even warn the user.
1865 Note that nowadays this function is used to get populations and transitions files, which are not common.
1866 """
1867 return self.reference_md.get_file(target_file)
1869 # Input file values -----------------------------------------
1871 # First of all set input themselves
1873 # Get inputs
1874 def get_inputs (self) -> dict:
1875 # If inputs are already loaded then return them
1876 if self._inputs:
1877 return self._inputs
1878 # When loading the inuts file, replace some values automatically
1879 replaces = [
1880 ( '$DIR', self.directory_name )
1881 ]
1882 # Otherwise, load inputs from the inputs file
1883 inputs_data = None
1884 if self.inputs_file.format == 'json':
1885 inputs_data = load_json(self.inputs_file.path, replaces)
1886 elif self.inputs_file.format == 'yaml':
1887 inputs_data = load_yaml(self.inputs_file.path, replaces)
1888 else:
1889 raise InputError('Input file format is not supported. Please use json or yaml files.')
1890 if not inputs_data:
1891 raise InputError('Input file is empty')
1892 self._inputs = inputs_data
1893 # Legacy fixes
1894 old_pdb_ids = self._inputs.get('pdbIds', None)
1895 if old_pdb_ids:
1896 self._inputs['pdb_ids'] = old_pdb_ids
1897 # Finally return the updated inputs
1898 return self._inputs
1899 inputs = property(get_inputs, None, None, "Inputs from the inputs file (read only)")
1901 def update_inputs (self, nested_key : str, new_value):
1902 """ Permanently update the inputs file.
1903 This may be done when command line inputs do not match file inputs. """
1904 # If the input already matches then do nothing
1905 current_value = read_ndict(self.inputs, nested_key, MISSING_INPUT_EXCEPTION)
1906 if current_value == new_value: return
1907 # Set the new value
1908 write_ndict(self.inputs, nested_key, new_value)
1909 # If there is no inputs file then do not try to save anything
1910 if not self.is_inputs_file_available(): return
1911 print(f'* Field "{nested_key}" in the inputs file will be permanently modified')
1912 # Write the new inputs to disk
1913 if self.inputs_file.format == 'json':
1914 save_json(self.inputs, self.inputs_file.path)
1915 elif self.inputs_file.format == 'yaml':
1916 # Note that comments in the original YAML file will be not kept
1917 save_yaml(self.inputs, self.inputs_file.path)
1918 else:
1919 raise InputError('Input file format is not supported. Please use json or yaml files.')
1921 # Then set getters for every value in the inputs file
1923 def get_input (self, name: str):
1924 """ Get a specific 'input' value. """
1925 value = self.inputs.get(name, MISSING_INPUT_EXCEPTION)
1926 # If we had a value then return it
1927 if value != MISSING_INPUT_EXCEPTION:
1928 return value
1929 # If the field is not specified in the inputs file then set a defualt value
1930 default_value = DEFAULT_INPUT_VALUES.get(name, None)
1931 # Warn the user about this
1932 warn(f'Missing input "{name}" -> Using default value: {default_value}')
1933 return default_value
1935 # Set a function to get a specific 'input' value by its key/name
1936 # Note that we return the getter function but we do not call it just yet
1937 def input_getter (name : str):
1938 def getter (self):
1939 return self.get_input(name)
1940 return getter
    # Assign the getters
    # Each 'input_*' property lazily reads one field from the inputs file,
    # falling back to its default value when the field is missing
    input_interactions = property(input_getter('interactions'), None, None, "Interactions to be analyzed (read only)")
    input_protein_references = property(input_getter('forced_references'), None, None, "Uniprot IDs to be used first when aligning protein sequences (read only)")
    input_pdb_ids = property(input_getter('pdb_ids'), None, None, "Protein Data Bank IDs used for the setup of the system (read only)")
    input_type = property(input_getter('type'), None, None, "Set if its a trajectory or an ensemble (read only)")
    input_mds = property(input_getter('mds'), None, None, "Input MDs configuration (read only)")
    input_ligands = property(input_getter('ligands'), None, None, "Input ligand references (read only)")
    input_force_fields = property(input_getter('ff'), None, None, "Input force fields (read only)")
    input_collections = property(input_getter('collections'), None, None, "Input collections (read only)")
    input_chain_names = property(input_getter('chainnames'), None, None, "Input chain names (read only)")
    input_framestep = property(input_getter('framestep'), None, None, "Input framestep (read only)")
    input_name = property(input_getter('name'), None, None, "Input name (read only)")
    input_description = property(input_getter('description'), None, None, "Input description (read only)")
    input_authors = property(input_getter('authors'), None, None, "Input authors (read only)")
    input_groups = property(input_getter('groups'), None, None, "Input groups (read only)")
    input_contact = property(input_getter('contact'), None, None, "Input contact (read only)")
    input_program = property(input_getter('program'), None, None, "Input program (read only)")
    input_version = property(input_getter('version'), None, None, "Input version (read only)")
    input_method = property(input_getter('method'), None, None, "Input method (read only)")
    input_license = property(input_getter('license'), None, None, "Input license (read only)")
    input_linkcense = property(input_getter('linkcense'), None, None, "Input license link (read only)")
    input_citation = property(input_getter('citation'), None, None, "Input citation (read only)")
    input_thanks = property(input_getter('thanks'), None, None, "Input acknowledgements (read only)")
    input_links = property(input_getter('links'), None, None, "Input links (read only)")
    input_timestep = property(input_getter('timestep'), None, None, "Input timestep (read only)")
    input_temperature = property(input_getter('temp'), None, None, "Input temperature (read only)")
    input_ensemble = property(input_getter('ensemble'), None, None, "Input ensemble (read only)")
    input_water = property(input_getter('wat'), None, None, "Input water force field (read only)")
    input_boxtype = property(input_getter('boxtype'), None, None, "Input boxtype (read only)")
    # NOTE(review): the next two properties are overridden further down in this
    # class by get_input_pbc_selection / get_input_cg_selection, so these two
    # definitions are dead code
    input_pbc_selection = property(input_getter('pbc_selection'), None, None, "Input Periodic Boundary Conditions (PBC) selection (read only)")
    input_cg_selection = property(input_getter('cg_selection'), None, None, "Input Coarse Grained (CG) selection (read only)")
    input_customs = property(input_getter('customs'), None, None, "Input custom representations (read only)")
    input_orientation = property(input_getter('orientation'), None, None, "Input orientation (read only)")
    input_multimeric = property(input_getter('multimeric'), None, None, "Input multimeric labels (read only)")
    # Additional topic-specific inputs
    input_cv19_unit = property(input_getter('cv19_unit'), None, None, "Input Covid-19 Unit (read only)")
    input_cv19_startconf = property(input_getter('cv19_startconf'), None, None, "Input Covid-19 starting conformation (read only)")
    input_cv19_abs = property(input_getter('cv19_abs'), None, None, "Input Covid-19 antibodies (read only)")
    input_cv19_nanobs = property(input_getter('cv19_nanobs'), None, None, "Input Covid-19 nanobodies (read only)")
1982 # PBC selection may come from the console or from the inputs file
1983 # Console has priority over the inputs file
1984 def get_input_pbc_selection (self) -> Optional[str]:
1985 # If we have an internal value then return it
1986 if self._input_pbc_selection:
1987 return self._input_pbc_selection
1988 # As an exception, we avoid asking for the inputs file if it is not available
1989 # This input is required for some early processing steps where we do not need the inputs file for anything else
1990 if not self.is_inputs_file_available():
1991 return None
1992 # Otherwise, find it in the inputs
1993 # Get the input value, whose key must exist
1994 self._input_pbc_selection = self.get_input('pbc_selection')
1995 return self._input_pbc_selection
1996 input_pbc_selection = property(get_input_pbc_selection, None, None, "Selection of atoms which are still in periodic boundary conditions (read only)")
1998 # CG selection may come from the console or from the inputs file
1999 # Console has priority over the inputs file
2000 def get_input_cg_selection (self) -> Optional[str]:
2001 # If we have an internal value then return it
2002 if self._input_cg_selection:
2003 return self._input_cg_selection
2004 # As an exception, we avoid asking for the inputs file if it is not available
2005 # This input is required for some early processing steps where we do not need the inputs file for anything else
2006 if not self.is_inputs_file_available():
2007 return None
2008 # Otherwise, find it in the inputs
2009 # Get the input value, whose key must exist
2010 self._input_cg_selection = self.get_input('cg_selection')
2011 return self._input_cg_selection
2012 input_cg_selection = property(get_input_cg_selection, None, None, "Selection of atoms which are not acutal atoms but Coarse Grained beads (read only)")
2014 # Set additional values infered from input values
2016 def check_is_time_dependent (self) -> bool:
2017 """ Set if MDs are time dependent. """
2018 if self.input_type == 'trajectory':
2019 return True
2020 elif self.input_type == 'ensemble':
2021 return False
2022 raise InputError(f'Not supported input "type" value: {self.input_type}. It must be "trajectory" or "ensemble"')
2023 is_time_dependent = property(check_is_time_dependent, None, None, "Check if trajectory frames are time dependent (read only)")
2025 # Processed files ----------------------------------------------------
2027 def inherit_topology_filename (self) -> Optional[str]:
2028 """ Set the expected output topology filename given the input topology filename.
2029 Note that topology formats are conserved. """
2030 if self.input_topology_file == MISSING_TOPOLOGY:
2031 return None
2032 filename = self.input_topology_file.filename
2033 if not filename:
2034 return None
2035 if filename == RAW_CHARGES_FILENAME:
2036 return filename
2037 standard_format = self.input_topology_file.format
2038 return 'topology.' + standard_format
2040 def get_topology_filepath (self) -> str:
2041 """ Get the processed topology file path. """
2042 # If we have a stored value then return it
2043 if self._topology_filepath:
2044 return self._topology_filepath
2045 # Otherwise we must find it
2046 inherited_filename = self.inherit_topology_filename()
2047 self._topology_filepath = self.pathify(inherited_filename) if inherited_filename else None
2048 return self._topology_filepath
2049 topology_filepath = property(get_topology_filepath, None, None, "Topology file path (read only)")
2051 def get_topology_file (self) -> str:
2052 """ Get the processed topology file. """
2053 # If we have a stored value then return it
2054 # This means we already found or generated this file
2055 if self._topology_file != None:
2056 return self._topology_file
2057 # If the file already exists then we are done
2058 self._topology_file = File(self.topology_filepath) if self.topology_filepath != None else MISSING_TOPOLOGY
2059 # If the faith flag was passed then simply make sure the input file makes sense
2060 if self.faith:
2061 if self.input_topology_file != self._topology_file:
2062 raise InputError('Input topology file is not equal to output topology file but the "faith" flag was used.\n'
2063 ' Please refrain from using the faith argument (-f) if you ignore its effect.')
2064 if not self.input_topology_file.exists:
2065 raise InputError('Input topology file does not exist but the "faith" flag was used.\n'
2066 ' Please refrain from using the faith argument (-f) if you ignore its effect.')
2067 return self._topology_file
2068 # Run the processing logic
2069 self.reference_md.input_files_processing(self.reference_md)
2070 # Now that the file is sure to exist we return it
2071 return self._topology_file
2072 topology_file = property(get_topology_file, None, None, "Topology file (read only)")
2074 def get_structure_file (self) -> str:
2075 """ Get the processed structure from the reference MD. """
2076 return self.reference_md.structure_file
2077 structure_file = property(get_structure_file, None, None, "Structure filename from the reference MD (read only)")
2079 def get_trajectory_file (self) -> str:
2080 """ Get the processed trajectory from the reference MD. """
2081 return self.reference_md.trajectory_file
2082 trajectory_file = property(get_trajectory_file, None, None, "Trajectory filename from the reference MD (read only)")
2084 # ---------------------------------------------------------------------------------
2085 # Others values which may be found/calculated and files to be generated on demand
2086 # ---------------------------------------------------------------------------------
2088 def get_structure (self) -> 'Structure':
2089 """ Get the parsed structure from the reference MD. """
2090 return self.reference_md.structure
2091 structure = property(get_structure, None, None, "Parsed structure from the reference MD (read only)")
2093 def get_pbc_residues (self) -> list[int]:
2094 """ Get the indices of residues in periodic boundary conditions. """
2095 return self.reference_md.pbc_residues
2096 pbc_residues = property(get_pbc_residues, None, None, "Indices of residues in periodic boundary conditions (read only)")
2098 def get_cg_residues (self) -> list[int]:
2099 """ Get the indices of residues in coarse grain. """
2100 return self.reference_md.cg_residues
2101 cg_residues = property(get_cg_residues, None, None, "Indices of residues in coarse grain (read only)")
2103 def get_snapshots (self) -> int:
2104 """ Get the reference MD snapshots. """
2105 return self.reference_md.snapshots
2106 snapshots = property(get_snapshots, None, None, "Reference MD snapshots (read only)")
2108 def get_universe (self) -> int:
2109 """ Get the MDAnalysis Universe from the reference MD. """
2110 return self.reference_md.universe
2111 universe = property(get_universe, None, None, "MDAnalysis Universe object (read only)")
2113 def get_processed_interactions (self) -> dict:
2114 """ Get the processed interactions from the reference replica, which are the same for all replicas. """
2115 return self.reference_md.interactions
2116 interactions = property(get_processed_interactions, None, None, "Processed interactions (read only)")
2118 def get_check_stable_bonds (self) -> bool:
2119 """ Check if we must check stable bonds. """
2120 # Set if stable bonds have to be checked
2121 must_check = STABLE_BONDS_FLAG not in self.trust
2122 # If this analysis has been already passed then we can trust structure bonds
2123 if self.register.tests.get(STABLE_BONDS_FLAG, None) == True:
2124 must_check = False
2125 return must_check
2126 must_check_stable_bonds = property(get_check_stable_bonds, None, None, "Check if we must check stable bonds (read only)")
    # Reference bonds
    # Task wraps the callable so its result is cached under the given flag
    get_reference_bonds = Task('refbonds', 'Reference bonds', find_safe_bonds)
    reference_bonds = property(get_reference_bonds, None, None, "Atom bonds to be trusted (read only)")
    # Atom charges
    # NOTE(review): this class attribute shadows the get_charges callable it wraps — confirm intentional
    get_charges = Task('charges', 'Getting atom charges', get_charges)
    charges = property(get_charges, None, None, "Atom charges (read only)")
    # InChI keys
    get_inchi_keys = Task('inchikeys', 'Getting InChI keys', get_inchikeys)
    inchikeys = property(get_inchi_keys, None, None, "InChI keys (read only)")
2140 def get_topology_reader (self) -> 'Topology':
2141 """ Get the topology data reader. """
2142 # If we already have a stored value then return it
2143 if self._topology_reader: return self._topology_reader
2144 # Instantiate the topology reader
2145 self._topology_reader = Topology(self.topology_file)
2146 return self._topology_reader
2147 topology_reader = property(get_topology_reader, None, None, "Topology reader (read only)")
2149 def get_dihedrals (self) -> list[dict]:
2150 """ Get the topology dihedrals. """
2151 # If we already have a stored value then return it
2152 if self._dihedrals: return self._dihedrals
2153 # Calculate the dihedrals otherwise
2154 self._dihedrals = self.topology_reader.get_dihedrals_data()
2155 return self._dihedrals
2156 dihedrals = property(get_dihedrals, None, None, "Topology dihedrals (read only)")
2158 def get_populations (self) -> Optional[list[float]]:
2159 """ Get the equilibrium populations from a MSM. """
2160 # If we already have a stored value then return it
2161 if self._populations:
2162 return self._populations
2163 # Otherwise we must find the value
2164 if not self.populations_file:
2165 return None
2166 self._populations = read_file(self.populations_file)
2167 return self._populations
2168 populations = property(get_populations, None, None, "Equilibrium populations from a MSM (read only)")
2170 def get_transitions (self) -> Optional[list[list[float]]]:
2171 """ Get the transition probabilities from a MSM. """
2172 # If we already have a stored value then return it
2173 if self._transitions:
2174 return self._transitions
2175 # Otherwise we must find the value
2176 if not self.transitions_file:
2177 return None
2178 self._transitions = read_file(self.transitions_file)
2179 return self._transitions
2180 transitions = property(get_transitions, None, None, "Transition probabilities from a MSM (read only)")
2182 def get_pdb_ids (self) -> list[str]:
2183 """ Get the tested and standardized PDB ids. """
2184 # If we already have a stored value then return it
2185 if self._pdb_ids != None:
2186 return self._pdb_ids
2187 # Otherwise test and standarize input PDB ids
2188 self._pdb_ids = []
2189 # If there is no input pdb ids (may be None) then stop here
2190 if not self.input_pdb_ids:
2191 return []
2192 # If input PDB ids is a string instead of a list then fix it
2193 input_pdb_ids = self.input_pdb_ids
2194 if type(input_pdb_ids) == str:
2195 input_pdb_ids = [ input_pdb_ids ]
2196 # Iterate input PDB ids
2197 for input_pdb_id in input_pdb_ids:
2198 # First make sure this is a PDB id
2199 if not re.match(PDB_ID_FORMAT, input_pdb_id):
2200 raise InputError(f'Input PDB id "{input_pdb_id}" does not look like a PDB id')
2201 # Make letters upper
2202 pdb_id = input_pdb_id.upper()
2203 self._pdb_ids.append(pdb_id)
2204 return self._pdb_ids
2205 pdb_ids = property(get_pdb_ids, None, None, "Tested and standarized PDB ids (read only)")
    # Class-level Task declarations: each Task wraps a producer callable and,
    # when an output_filename is given, exposes the produced file as well

    # Prepare the PDB references file to be uploaded to the database
    get_pdb_references = Task('pdbs', 'Prepare PDB references',
        prepare_pdb_references, output_filename = PDB_REFERENCES_FILENAME)
    pdb_references_file = get_pdb_references.get_output_file

    # Map the structure aminoacids sequences against the Uniprot reference sequences
    get_protein_map = Task('protmap', 'Protein residues mapping',
        generate_protein_mapping, output_filename=PROTEIN_REFERENCES_FILENAME)
    protein_map = property(get_protein_map, None, None, "Protein residues mapping (read only)")

    # Define the output file of the protein mapping including protein references
    get_protein_references_file = get_protein_map.get_output_file
    protein_references_file = property(get_protein_references_file, None, None, "File including protein refereces data mined from UniProt (read only)")

    # Get chain references
    get_chain_references = Task('chains', 'Chain references',
        prepare_chain_references, output_filename = OUTPUT_CHAINS_FILENAME)

    # Get the ligand residues mapping
    get_ligand_map = Task('ligmap', 'Ligand residues mapping',
        generate_ligand_mapping, output_filename = LIGAND_REFERENCES_FILENAME)
    ligand_map = property(get_ligand_map, None, None, "Ligand references (read only)")

    # Define the output file of the ligand mapping including ligand references
    get_ligand_references_file = get_ligand_map.get_output_file
    ligand_references_file = property(get_ligand_references_file, None, None, "File including ligand refereces data mined from PubChem (read only)")

    # Get the lipid references
    get_lipid_map = Task('lipmap', 'Lipid mapping',
        generate_lipid_references, output_filename = INCHIKEY_REFERENCES_FILENAME)
    lipid_map = property(get_lipid_map, None, None, "Lipid mapping (read only)")

    # Define the output file of the lipid mapping including lipid references
    get_lipid_references_file = get_lipid_map.get_output_file
    lipid_references_file = property(get_lipid_references_file, None, None, "File including lipid references data mined from PubChem (read only)")

    # Get mapping of residues in the membrane
    get_membrane_map = Task('memmap', 'Membrane mapping',
        generate_membrane_mapping, output_filename = MEMBRANE_MAPPING_FILENAME)
    membrane_map = property(get_membrane_map, None, None, "Membrane mapping (read only)")

    # Build the residue map from both proteins and ligands maps
    # This is formatted as both the standard topology and metadata producers expect them
    get_residue_map = Task('resmap', 'Residue mapping', generate_residue_mapping)
    residue_map = property(get_residue_map, None, None, "Residue map (read only)")

    # Prepare the project metadata file to be upload to the database
    prepare_metadata = Task('pmeta', 'Prepare project metadata',
        prepare_project_metadata, output_filename=OUTPUT_METADATA_FILENAME)

    # Prepare the standard topology file to be uploaded to the database
    prepare_standard_topology = Task('stopology', 'Standard topology file',
        generate_topology, output_filename = STANDARD_TOPOLOGY_FILENAME)
    get_standard_topology_file = prepare_standard_topology.get_output_file
    standard_topology_file = property(get_standard_topology_file, None, None, "Standard topology filename (read only)")

    # Get a screenshot of the system
    get_screenshot_filename = Task('screenshot', 'Screenshot file',
        get_screenshot, output_filename = OUTPUT_SCREENSHOT_FILENAME)
    screenshot_filename = property(get_screenshot_filename, None, None, "Screenshot filename (read only)")

    # Provenance data
    # NOTE(review): this class attribute shadows the produce_provenance callable it wraps — confirm intentional
    produce_provenance = Task('aiidata', 'Produce provenance', produce_provenance)
2271# AUXILIAR FUNCTIONS ---------------------------------------------------------------------------
# DANI: Once the markov files format is settled this function will no longer be needed
def read_file (target_file : 'File') -> 'dict | numpy.ndarray':
    """ Read a file which may be in different formats.

    Supports numpy binary files (.npy) and JSON files (.json).
    Raises ValueError for any other extension (previously it silently
    returned None, so callers re-read the file on every access). """
    # Get the file format from the filename extension
    file_format = target_file.filename.split('.')[-1]
    # Read numpy files
    if file_format == 'npy':
        return numpy.load(target_file.path)
    # Read JSON files
    if file_format == 'json':
        return load_json(target_file.path)
    # Fail loudly on any other format
    raise ValueError(f'Unsupported file format "{file_format}" in {target_file.filename}')
def name_2_directory (name : str) -> str:
    """ Convert an MD name into an equivalent MD directory. """
    # White spaces become underscores
    directory = name.replace(' ', '_')
    # Forbidden characters are simply dropped
    for forbidden_character in FORBIDDEN_DIRECTORY_CHARACTERS:
        directory = directory.replace(forbidden_character, '')
    return directory
def check_directory (directory : str) -> None:
    """ Check for problematic characters in a directory path.

    Raises InputError when a forbidden character is found; returns nothing.
    Note the previous '-> str' annotation was wrong: nothing is returned. """
    directory_characters = set(directory)
    for character in FORBIDDEN_DIRECTORY_CHARACTERS:
        if character in directory_characters:
            raise InputError(f'Directory path "{directory}" includes the forbidden character "{character}"')
def directory_2_name (directory : str) -> str:
    """ Convert an MD directory into an equivalent MD name.

    Only the last path component is kept (this also discards a possible
    leading './'), then underscores become white spaces. """
    last_component = directory.split('/')[-1]
    return last_component.replace('_', ' ')
# Project input files
project_input_files = {
    'itopology': Project.get_input_topology_file,
    'inputs': Project.get_inputs_file,
    'populations': Project.get_populations_file,
    'transitions': Project.get_transitions_file
}
# MD input files
md_input_files = {
    'istructure': MD.get_input_structure_file,
    'itrajectory': MD.get_input_trajectory_files
}
# Both project and MD input files
input_files = { **project_input_files, **md_input_files }

# Project processed files
project_processed_files = {
    'topology': Project.get_topology_file
}
# MD processed files
md_processed_files = {
    'structure': MD.get_structure_file,
    'trajectory': MD.get_trajectory_file
}
# Both project and MD processed files
processed_files = { **project_processed_files, **md_processed_files }

# Project requestable tasks
project_requestables = {
    **project_input_files,
    **project_processed_files,
}
# Add available tasks to project requestables
# Note the loop variable was renamed from 'callable', which shadowed the builtin
for class_attribute in vars(Project).values():
    if isinstance(class_attribute, Task): project_requestables[class_attribute.flag] = class_attribute
# MD requestable tasks
md_requestables = {
    **md_input_files,
    **md_processed_files,
}
# Add available tasks to MD requestables
for class_attribute in vars(MD).values():
    if isinstance(class_attribute, Task): md_requestables[class_attribute.flag] = class_attribute
# Requestables for the console
# Note that this constant is global
requestables.update({ **project_requestables, **md_requestables })
# Inverted requestables for every function to know which is its 'label'
inverted_requestables.update({ v: k for k, v in requestables.items() })

# Set groups of dependencies to be requested together using only one flag
DEPENDENCY_FLAGS = {
    'download': list(input_files.keys()),
    'setup': list(processed_files.keys()),
    'meta': ['pmeta', 'mdmeta'],
    'network': [ 'resmap', 'ligands', 'chains', 'pdbs', 'memmap' ],
    'minimal': [ 'pmeta', 'mdmeta', 'stopology' ],
    'interdeps': [ 'interactions', 'pairwise', 'hbonds', 'energies', 'perres', 'clusters', 'dist' ],
    'membs': ['memmap', 'density', 'thickness', 'apl', 'lorder', 'linter', 'channels']
}

# Set the default analyses to be run when no task is specified
DEFAULT_ANALYSES = ['clusters', 'dist', 'energies', 'hbonds', 'pca', 'pockets',
    'rgyr', 'rmsds', 'perres', 'pairwise', 'rmsf', 'sas', 'tmscore', 'density',
    'thickness', 'apl', 'lorder', 'linter']
# The actual main function
def workflow (
    # Project parameters, forwarded to the Project constructor
    # Note the previous mutable default ({}) was replaced by None
    project_parameters : Optional[dict] = None,
    # The actual workflow parameters
    # The working directory
    working_directory : str = '.',
    # Download only (deprecated)
    download : bool = False,
    # Download and correct only (deprecated)
    setup : bool = False,
    # Run only specific analyses/processes
    include : Optional[list[str]] = None,
    # Run everything but specific analyses/processes
    exclude : Optional[list[str]] = None,
    # Overwrite already existing output files
    overwrite : Optional[ list[str] | bool ] = None,
):
    """ Run the MDDB workflow in the given working directory.

    Tasks are resolved from the include/exclude arguments (or the defaults),
    project level tasks are run first and then every MD runs its own tasks.
    Raises InputError on incompatible or invalid arguments. """

    # Check there are no input errors

    # Include and exclude are not compatible
    # This is to protect the user to do something which makes not sense
    if include and exclude:
        raise InputError('Include (-i) and exclude (-e) are not compatible. Use one of these options.')

    # Make sure the working directory exists
    if not exists(working_directory):
        raise InputError(f'Working directory "{working_directory}" does not exist')

    # Make sure the working directory is actually a directory
    if not isdir(working_directory):
        raise InputError(f'Working directory "{working_directory}" is actually not a directory')

    # Move the current directory to the working directory
    chdir(working_directory)
    current_directory_name = getcwd().split('/')[-1]
    git_version = get_git_version()
    print(f'\n{CYAN_HEADER}Running MDDB workflow ({git_version}) for project at {current_directory_name}{COLOR_END}')

    # Initiate the project
    # A None default avoids the shared mutable default argument pitfall
    project = Project(**(project_parameters or {}))
    print(f' {len(project.mds)} MDs are to be run')

    # Set the tasks to be run
    tasks = None
    # If the download argument is passed then just make sure input files are available
    if download:
        warn('The "-d" or "--download" argument is deprecated. Please use "-i download" instead.')
        tasks = list(input_files.keys())
    # If the setup argument is passed then just process input files
    elif setup:
        warn('The "-s" or "--setup" argument is deprecated. Please use "-i setup" instead.')
        tasks = list(processed_files.keys())
    # If the include argument then add only the specified tasks to the list
    elif include:
        tasks = [ *include ]
        # Search for special flags among included
        for flag, dependencies in DEPENDENCY_FLAGS.items():
            if flag not in tasks: continue
            # If the flag is found then remove it and write the corresponding dependencies instead
            # Make sure not to duplicate a dependency if it was already included
            tasks.remove(flag)
            for dependency in dependencies:
                if dependency in tasks: continue
                tasks.append(dependency)
    # Set the default tasks otherwise
    else:
        tasks = [
            # Project tasks
            'stopology',
            'screenshot',
            'pmeta',
            'pdbs',
            'chains',
            # MD tasks
            'mdmeta',
            'inter',
            *DEFAULT_ANALYSES,
        ]
        # If the exclude parameter was passed then remove excluded tasks from the default tasks
        if exclude:
            excluded_dependencies = [ *exclude ]
            # Search for special flags among excluded
            for flag, dependencies in DEPENDENCY_FLAGS.items():
                if flag not in exclude: continue
                # If the flag is found then exclude the dependencies instead
                # Make sure not to duplicate a dependency if it was already included
                excluded_dependencies.remove(flag)
                for dependency in dependencies:
                    if dependency in exclude: continue
                    excluded_dependencies.append(dependency)
            tasks = [ name for name in tasks if name not in excluded_dependencies ]

    # If the user requested to overwrite something, make sure it is in the tasks list

    # Update the overwritable variable with the requested overwrites
    overwritables = set()
    if overwrite:
        # If the overwrite argument is simply true then add all requestables to the overwritable
        if isinstance(overwrite, bool):
            overwritables.update(tasks)
        # If the overwrite argument is a list of tasks then iterate them
        elif isinstance(overwrite, list):
            for task in overwrite:
                # Make sure the task to be overwriten is among the tasks to be run
                if task not in tasks:
                    raise InputError(f'Task "{task}" is to be overwriten but it is not in the tasks list. Either include it or do not exclude it')
                # Add it to the global variable
                overwritables.add(task)
        else: raise ValueError('Not supported overwrite type')

    # Get project tasks
    project_tasks = [ task for task in tasks if task in project_requestables ]
    # Get the MD tasks
    md_tasks = [ task for task in tasks if task in md_requestables ]

    # Set project overwritables
    project.overwritables = { task for task in project_tasks if task in overwritables }
    # Set MD overwritables
    # Note that this must be done before running project tasks
    # Some project tasks rely in MD tasks
    for md in project.mds:
        md.overwritables = { task for task in md_tasks if task in overwritables }

    # Run the project tasks now
    for task in project_tasks:
        # Get the function to be called and call it
        getter = requestables[task]
        getter(project)

    # If there are no MD tasks then we are done already
    if len(md_tasks) == 0:
        print("Finished!")
        return

    # Now iterate over the different MDs
    for md in project.mds:
        print(f'\n{CYAN_HEADER} Processing MD at {md.directory}{COLOR_END}')
        # Run the MD tasks
        for task in md_tasks:
            # Get the function to be called and call it
            getter = requestables[task]
            getter(md)
        # Remove gromacs backups and other trash files from this MD
        remove_trash(md.directory)

    # Remove gromacs backups and other trash files from the project
    remove_trash(project.directory)

    print("Done!")