Coverage for mddb_workflow / mwf.py: 79%

1155 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-03 18:45 +0000

1# This is the starter script 

2 

3from os import chdir, walk, mkdir, getcwd 

4from os.path import exists, isdir, isabs, relpath, normpath, split, basename 

5import sys 

6import io 

7import re 

8import numpy 

9from glob import glob 

10 

11# Constants 

12# Importing constants first is important 

13from mddb_workflow.utils.constants import * 

14 

15# Import local utils 

16from mddb_workflow.utils.auxiliar import InputError, MISSING_TOPOLOGY 

17from mddb_workflow.utils.auxiliar import warn, load_json, save_json, load_yaml, save_yaml 

18from mddb_workflow.utils.auxiliar import is_glob, parse_glob, is_url, url_to_source_filename 

19from mddb_workflow.utils.auxiliar import read_ndict, write_ndict, get_git_version, download_file 

20from mddb_workflow.utils.auxiliar import is_standard_topology 

21from mddb_workflow.utils.register import Register 

22from mddb_workflow.utils.cache import Cache 

23from mddb_workflow.utils.structures import Structure 

24from mddb_workflow.utils.topologies import Topology 

25from mddb_workflow.utils.file import File 

26from mddb_workflow.utils.database import Database 

27from mddb_workflow.utils.pyt_spells import get_frames_count, get_average_structure 

28from mddb_workflow.utils.selections import Selection 

29from mddb_workflow.utils.mda_spells import get_mda_universe 

30from mddb_workflow.utils.tasks import Task 

31from mddb_workflow.utils.type_hints import * 

32 

33# Import local tools 

34from mddb_workflow.tools.get_first_frame import get_first_frame 

35from mddb_workflow.tools.get_bonds import find_safe_bonds, get_bonds_reference_frame 

36from mddb_workflow.tools.process_interactions import process_interactions 

37from mddb_workflow.tools.generate_metadata import prepare_project_metadata, generate_md_metadata 

38from mddb_workflow.tools.chains import prepare_chain_references 

39from mddb_workflow.tools.generate_pdb_references import prepare_pdb_references 

40from mddb_workflow.tools.generate_map import generate_protein_mapping 

41from mddb_workflow.tools.get_inchi_keys import generate_inchikeys, generate_inchi_references 

42from mddb_workflow.tools.get_lipids import generate_lipid_references 

43from mddb_workflow.tools.membrane_mapping import generate_membrane_mapping 

44from mddb_workflow.tools.get_ligands import generate_ligand_references 

45from mddb_workflow.tools.residue_mapping import generate_residue_mapping 

46from mddb_workflow.tools.generate_topology import generate_topology 

47from mddb_workflow.tools.get_charges import get_charges 

48from mddb_workflow.tools.remove_trash import remove_trash 

49from mddb_workflow.tools.get_screenshot import get_screenshot 

50from mddb_workflow.tools.process_input_files import process_input_files 

51from mddb_workflow.tools.provenance import produce_provenance 

52from mddb_workflow.tools.get_reduced_trajectory import calculate_frame_step 

53 

54# Import local analyses 

55from mddb_workflow.analyses.rmsds import rmsds 

56from mddb_workflow.analyses.tmscores import tmscores 

57from mddb_workflow.analyses.rmsf import rmsf 

58from mddb_workflow.analyses.rgyr import rgyr 

59from mddb_workflow.analyses.pca import pca 

60from mddb_workflow.analyses.density import density 

61from mddb_workflow.analyses.thickness import thickness 

62from mddb_workflow.analyses.area_per_lipid import area_per_lipid 

63from mddb_workflow.analyses.lipid_order import lipid_order 

64from mddb_workflow.analyses.lipid_interactions import lipid_interactions 

65from mddb_workflow.analyses.channels import channels 

66# from mddb_workflow.analyses.pca_contacts import pca_contacts 

67from mddb_workflow.analyses.rmsd_per_residue import rmsd_per_residue 

68from mddb_workflow.analyses.rmsd_pairwise import rmsd_pairwise 

69from mddb_workflow.analyses.clusters import clusters_analysis 

70from mddb_workflow.analyses.distance_per_residue import distance_per_residue 

71from mddb_workflow.analyses.hydrogen_bonds import hydrogen_bonds 

72from mddb_workflow.analyses.sasa import sasa 

73from mddb_workflow.analyses.energies import energies 

74from mddb_workflow.analyses.dihedral_energies import compute_dihedral_energies 

75from mddb_workflow.analyses.pockets import pockets 

76from mddb_workflow.analyses.rmsd_check import check_trajectory_integrity 

77from mddb_workflow.analyses.helical_parameters import helical_parameters 

78from mddb_workflow.analyses.markov import markov 

79 

# Make the system output stream to not be buffered
# Only when not in a Jupyter notebook or using pytest
# Check if we're in an interactive Python shell like Jupyter
# (interactive shells define sys.ps1; pytest registers itself in sys.modules)
if not hasattr(sys, 'ps1') and 'pytest' not in sys.modules:
    # This is useful to make prints work on time in Slurm
    # Otherwise, output logs are written after the script has fully run
    # Note that this fix affects all modules and built-ins
    # The stdout file descriptor is reopened fully unbuffered ('wb', 0) and
    # wrapped back into a text stream that flushes on every write
    unbuffered = io.TextIOWrapper(open(sys.stdout.fileno(), 'wb', 0), write_through=True)
    sys.stdout = unbuffered

# Set a special exception for missing inputs
# NOTE(review): this is used as a sentinel default (see MD.get_input), not raised;
# a plain object() sentinel compared with 'is' would be more conventional, but the
# name suggests it may be raised elsewhere in the file — confirm before changing
MISSING_INPUT_EXCEPTION = Exception('Missing input')

92 

93 

94class MD: 

95 """A Molecular Dynamics (MD) is the union of a structure and a trajectory. 

96 Having this data several analyses are possible. 

97 Note that an MD is always defined inside of a Project and thus it has additional topology and metadata. 

98 """ 

99 

    def __init__(self,
        project: 'Project',
        number: int,
        directory: str,
        input_structure_filepath: str,
        input_trajectory_filepaths: list[str],
    ):
        """Initialize the MD object.

        Args:
            project (Project): The parent project this MD belongs to.
            number (int): The number of the MD according to its accession.
            directory (str): The local directory where the MD takes place.
            input_structure_filepath (str): The input structure file path.
            input_trajectory_filepaths (list[str]): The input trajectory file paths.

        Raises:
            Exception: If no project is passed.
            InputError: If the MD directory equals the project directory.
        """
        # Save the inputs
        self.project = project
        if not project:
            raise Exception('Project is mandatory to instantiate a new MD')
        # Save the MD number and index
        self.number = number
        self.index = number - 1
        # Set the remote project handler for this specific MD
        # Only available when the project has both a database and an accession
        self.accession = None
        self.remote = None
        if self.project.database and self.project.accession:
            self.accession = f'{self.project.accession}.{self.number}'
            self.remote = self.project.database.get_remote_project(self.accession)
        # Save the directory
        self.directory = normpath(directory)
        # Now set the directory relative to the project
        self.directory = self.project.pathify(self.directory)
        if normpath(self.directory) == normpath(self.project.directory):
            raise InputError(f'MD {self.number} has the same directory as the project: {self.directory}')
        # Save the directory name alone apart
        self.directory_location, self.directory_name = split(self.directory)
        # If the directory does not exist then create it
        if not exists(self.directory):
            mkdir(self.directory)
        # Save the input structure filepath
        # They may be relative to the project directory (unique) or relative to the MD directory (one per MD)
        # If the path is absolute then it is considered unique
        # If the file does not exist and it is to be downloaded then it is downloaded for each MD
        # Prioritize the MD directory over the project directory
        self.arg_input_structure_filepath = input_structure_filepath
        self._input_structure_filepath = None
        self._input_structure_url = None
        # Set the internal variable for the input structure file, to be assigned later
        self._input_structure_file = None
        # Save the input trajectory filepaths
        self.arg_input_trajectory_filepaths = input_trajectory_filepaths
        self._input_trajectory_filepaths = None
        self._input_trajectory_urls = None
        # Set the internal variable for the input trajectory files, to be assigned later
        self._input_trajectory_files = None

        # Other values which may be found/calculated on demand
        self._md_inputs = None
        self._structure = None

        # Tests
        self._trajectory_integrity = None

        # Set a new MD specific register
        # In case the directory is the project directory itself, use the project register
        register_filepath = self.pathify(REGISTER_FILENAME)
        register_file = File(register_filepath)
        if register_file.path == self.project.register.file.path:
            self.register = self.project.register
        else:
            self.register = Register(register_file)

        # Set a new MD specific cache
        # In case the directory is the project directory itself, use the project cache
        cache_filepath = self.pathify(CACHE_FILENAME)
        cache_file = File(cache_filepath)
        if cache_file.path == self.project.cache.file.path:
            self.cache = self.project.cache
        else:
            self.cache = Cache(cache_file)

        # Set tasks whose output is to be overwritten
        self.overwritables = set()

        # Get MD inputs just to fill the inputs' "mds" value
        # Some functions may fail otherwise when its value is missing
        if self.project.is_inputs_file_available():
            self.get_md_inputs()

190 

191 def __repr__(self): 

192 """Return a string representation of the MD object.""" 

193 return 'MD' 

194 

195 def pathify(self, filename_or_relative_path: str) -> str: 

196 """Given a filename or relative path, add the MD directory path at the beginning.""" 

197 return normpath(self.directory + '/' + filename_or_relative_path) 

198 

199 # Input structure file ------------ 

200 

201 def get_input_structure_filepath(self) -> str: 

202 """Set a function to get input structure file path.""" 

203 # Return the internal value if it is already assigned 

204 if self._input_structure_filepath is not None: 

205 return self._input_structure_filepath 

206 

207 def relativize_and_parse_paths(input_path: str, may_not_exist: bool = False) -> Optional[str]: 

208 """Find out if a path is relative to MD directories or to the project directory. 

209 

210 To do so just check if the file exists in any of those. 

211 In case it exists in both or none then assume it is relative to MD directory. 

212 Parse glob notation in the process. 

213 """ 

214 # Check if it is a URL 

215 if is_url(input_path): 

216 self._input_structure_url = input_path 

217 # Set the paths for the further download 

218 source_filename = url_to_source_filename(input_path) 

219 md_relative_filepath = self.pathify(source_filename) 

220 return md_relative_filepath 

221 # Check if it is an absolute path 

222 if isabs(input_path): 

223 abs_glob_parse = parse_glob(input_path) 

224 # If we had multiple results then we complain 

225 if len(abs_glob_parse) > 1: 

226 raise InputError(f'Multiple structures found with "{input_path}": {", ".join(abs_glob_parse)}') 

227 # If we had no results then we complain 

228 if len(abs_glob_parse) == 0: 

229 if self.remote: 

230 warn('Spread syntax is not supported to download remote files') 

231 raise InputError(f'No structure found with "{input_path}"') 

232 abs_parsed_filepath = abs_glob_parse[0] 

233 return abs_parsed_filepath 

234 # Check the MD directory 

235 md_relative_filepath = self.pathify(input_path) 

236 md_glob_parse = parse_glob(md_relative_filepath) 

237 if len(md_glob_parse) > 1: 

238 raise InputError(f'Multiple structures found with "{input_path}": {", ".join(md_glob_parse)}') 

239 md_parsed_filepath = md_glob_parse[0] if len(md_glob_parse) == 1 else None 

240 if md_parsed_filepath and File(md_parsed_filepath).exists: 

241 return md_parsed_filepath 

242 # Check the project directory 

243 project_relative_filepath = self.project.pathify(input_path) 

244 project_glob_parse = parse_glob(project_relative_filepath) 

245 if len(project_glob_parse) > 1: 

246 raise InputError(f'Multiple structures found with "{input_path}": {", ".join(project_glob_parse)}') 

247 project_parsed_filepath = project_glob_parse[0] if len(project_glob_parse) == 1 else None 

248 if project_parsed_filepath and File(project_parsed_filepath).exists: 

249 return project_parsed_filepath 

250 # At this point we can conclude the input structure file does not exist 

251 # If we have no paths at all then it means a glob pattern was passed and it didn't match 

252 # Note that if a glob pattern existed then it would mean the file actually existed 

253 if len(md_glob_parse) == 0 and len(project_glob_parse) == 0: 

254 # Warn the user in case it was trying to use glob syntax to donwload remote files 

255 if self.remote: 

256 warn('Spread syntax is not supported to download remote files') 

257 raise InputError('No trajectory file was reached neither in the project directory or MD directories in path(s) ' + ', '.join(input_path)) 

258 # If the path does not exist anywhere then we asume it will be downloaded and set it relative to the MD 

259 # However make sure we have a remote 

260 # As an exception, if the 'may not exist' flag is passed then we return the result even if there is no remote 

261 if not may_not_exist and not self.remote: 

262 raise InputError(f'Cannot find a structure file by "{input_path}" anywhere') 

263 md_parsed_filepath = self.project.pathify(input_path) if f'{self.directory_name}/' in md_parsed_filepath else self.pathify(input_path) 

264 return md_parsed_filepath 

265 # If we have a value passed through command line 

266 if self.arg_input_structure_filepath: 

267 # Find out if it is relative to MD directories or to the project directory 

268 self._input_structure_filepath = relativize_and_parse_paths(self.arg_input_structure_filepath) 

269 # Save the parsed value in the inputs file 

270 self.project.update_inputs( 

271 f'mds.{self.index}.input_structure_filepath', 

272 self._input_structure_filepath) 

273 return self._input_structure_filepath 

274 # If we have a value passed through the inputs file has the value 

275 if self.project.is_inputs_file_available(): 

276 # Get the input value, whose key must exist 

277 inputs_value = self.get_input('input_structure_filepath') 

278 # If there is a valid input then use it 

279 if inputs_value: 

280 self._input_structure_filepath = relativize_and_parse_paths(inputs_value) 

281 return self._input_structure_filepath 

282 # If there is not input structure anywhere then use the input topology 

283 # We will extract the structure from it using a sample frame from the trajectory 

284 # Note that topology input filepath must exist and an input error will raise otherwise 

285 # However if we are using the standard topology file we can not extract the PDB from it (yet) 

286 if self.project.input_topology_file != MISSING_TOPOLOGY and \ 

287 not is_standard_topology(self.project.input_topology_file): 

288 return self.project.input_topology_file.path 

289 # If we can not use the topology either then surrender 

290 raise InputError('There is not input structure at all') 

291 

    def get_input_structure_file(self) -> 'File':
        """Get the input structure file from the inputs.

        If the file is not found locally then try to download it, either
        through the remote project handler or from a plain URL.

        Returns:
            File: The input structure file, guaranteed to exist on return.

        Raises:
            InputError: If the file cannot be found nor downloaded.
        """
        # If the input structure file is already defined then return it
        if self._input_structure_file:
            return self._input_structure_file
        # Otherwise we must set it
        # First set the input structure filepath
        input_structure_filepath = self.get_input_structure_filepath()
        # Now set the input structure file
        input_structure_file = File(input_structure_filepath)
        # If there is a structure URL then we must download the structure first
        input_structure_url = self._input_structure_url
        if input_structure_url and not input_structure_file.exists:
            original_filename = input_structure_url.split('/')[-1]
            # If there is a remote then use it
            if self.remote:
                # If the structure filename is the standard structure filename then use the structure endpoint instead
                if original_filename == STRUCTURE_FILENAME:
                    self.remote.download_standard_structure(input_structure_file)
                # Otherwise download the input structure file by its filename
                else:
                    self.remote.download_file(original_filename, input_structure_file)
            # Otherwise use the URL as is
            else:
                download_file(input_structure_url, input_structure_file)
        # If the file already exists then return it
        if input_structure_file.exists:
            self._input_structure_file = input_structure_file
            return self._input_structure_file
        raise InputError(f'Missing input structure file "{input_structure_file.path}"')
    input_structure_file = property(get_input_structure_file, None, None, "Input structure filename (read only)")

325 

326 # Input trajectory filename ------------ 

327 

    def get_input_trajectory_filepaths(self) -> list[str]:
        """Get the input trajectory file paths.

        Resolves the paths from the command line argument or the inputs file,
        relativizing them to the MD or project directory and parsing glob
        notation in the process.

        Returns:
            list[str]: The resolved input trajectory file paths.

        Raises:
            InputError: If no input trajectory can be resolved at all.
        """
        # Return the internal value if it is already assigned
        if self._input_trajectory_filepaths is not None:
            return self._input_trajectory_filepaths

        def relativize_and_parse_paths(input_paths: list[str]) -> list[str]:
            """Check and fix input trajectory filepaths.

            Also relativize paths to the current MD directory and parse glob notation.
            """
            checked_paths = input_paths
            # Input trajectory filepaths may be both a list or a single string
            # However we must keep a list
            if type(checked_paths) is list:
                pass
            elif type(checked_paths) is str:
                checked_paths = [checked_paths]
            else:
                raise InputError('Input trajectory filepaths must be a list of strings or a string')
            # Make sure all or none of the trajectory paths are URLs
            url_count = sum([is_url(path) for path in checked_paths])
            if not (url_count == 0 or url_count == len(checked_paths)):
                raise InputError('All trajectory paths must be paths or URLs. Mixing is not supported')
            # In case trajectory paths are URLs
            if url_count > 0:
                self._input_trajectory_urls = checked_paths
                # Set the paths for the further download
                parsed_paths = []
                for path in checked_paths:
                    source_filename = url_to_source_filename(path)
                    md_relative_filepath = self.pathify(source_filename)
                    parsed_paths.append(md_relative_filepath)
                return parsed_paths
            # Make sure all or none of the trajectory paths are absolute
            abs_count = sum([isabs(path) for path in checked_paths])
            if not (abs_count == 0 or abs_count == len(checked_paths)):
                raise InputError('All trajectory paths must be relative or absolute. Mixing is not supported')

            def parse_all_glob(paths: list[str]) -> list[str]:
                """Glob-parse and merge all paths."""
                parsed_paths = []
                for path in paths:
                    parsed_paths += parse_glob(path)
                return parsed_paths
            # In case trajectory paths are absolute
            if abs_count > 0:
                absolute_parsed_paths = parse_all_glob(checked_paths)
                # Check we successfully defined some trajectory file
                if len(absolute_parsed_paths) == 0:
                    # Warn the user in case it was trying to use glob syntax to download remote files
                    if self.remote:
                        warn('Spread syntax is not supported to download remote files')
                    raise InputError('No trajectory file was reached neither in the project directory or MD directories in path(s) ' + ', '.join(input_paths))
                return absolute_parsed_paths
            # If trajectory paths are not absolute then check if they are relative to the MD directory
            # Get paths relative to the current MD directory
            md_relative_paths = [self.pathify(path) for path in checked_paths]
            # In case there are glob characters we must parse the paths
            md_parsed_paths = parse_all_glob(md_relative_paths)
            # Check we successfully defined some trajectory file
            if len(md_parsed_paths) > 0:
                # If so, check at least one of the files do actually exist
                if any([exists(path) for path in md_parsed_paths]):
                    return md_parsed_paths
            # If no trajectory files were found then assume they are relative to the project
            # Get paths relative to the project directory
            project_relative_paths = [self.project.pathify(path) for path in checked_paths]
            # In case there are glob characters we must parse the paths
            project_parsed_paths = parse_all_glob(project_relative_paths)
            # Check we successfully defined some trajectory file
            if len(project_parsed_paths) > 0:
                # If so, check at least one of the files do actually exist
                if any([exists(path) for path in project_parsed_paths]):
                    return project_parsed_paths
            # At this point we can conclude the input trajectory file does not exist
            # If we have no paths at all then it means a glob pattern was passed and it didn't match
            # Note that if a glob pattern existed then it would mean the file actually existed
            if len(md_parsed_paths) == 0 and len(project_parsed_paths) == 0:
                # Warn the user in case it was trying to use glob syntax to download remote files
                if self.remote:
                    warn('Spread syntax is not supported to download remote files')
                raise InputError('No trajectory file was reached neither in the project directory or MD directories in path(s) ' + ', '.join(input_paths))
            # If we have a path however it may be downloaded from the database if we have a remote
            if not self.remote:
                raise InputError(f'Cannot find anywhere a trajectory file with path(s) "{", ".join(input_paths)}"')
            # Note that if input path was not glob based it will be both as project relative and MD relative
            if len(md_parsed_paths) == 0: raise ValueError('This should never happen')
            # If file is to be downloaded then we must make sure the path is relative to the project
            project_relative_paths = [
                self.project.pathify(path) if f'{self.directory_name}/' in path else self.pathify(path) for path in checked_paths
            ]
            return project_relative_paths
        # If we have a value passed through command line
        if self.arg_input_trajectory_filepaths:
            self._input_trajectory_filepaths = relativize_and_parse_paths(self.arg_input_trajectory_filepaths)
            # Save the parsed value in the inputs file
            self.project.update_inputs(
                f'mds.{self.index}.input_trajectory_filepaths',
                self._input_trajectory_filepaths)
            return self._input_trajectory_filepaths
        # Check if the inputs file has the value
        if self.project.is_inputs_file_available():
            # Get the input value
            inputs_value = self.get_input('input_trajectory_filepaths')
            if inputs_value:
                self._input_trajectory_filepaths = relativize_and_parse_paths(inputs_value)
                return self._input_trajectory_filepaths
        # If there is no trajectory available then we surrender
        raise InputError('There is not input trajectory at all')

437 

    def get_input_trajectory_files(self) -> list['File']:
        """Get the input trajectory file(s) from the inputs.

        If file(s) are not found locally then try to download them, either
        through the remote project handler or from plain URLs.

        Returns:
            list[File]: The input trajectory files, guaranteed to exist on return.

        Raises:
            InputError: If any trajectory file cannot be found nor downloaded,
                or if the sample-trajectory option is combined with an unsupported source.
        """
        # If we already defined input trajectory files then return them
        if self._input_trajectory_files is not None:
            return self._input_trajectory_files
        # Otherwise we must set the input trajectory files
        input_trajectory_filepaths = self.get_input_trajectory_filepaths()
        input_trajectory_files = [File(path) for path in input_trajectory_filepaths]
        # If there are input trajectory URLs then download the trajectories first
        input_trajectory_urls = self._input_trajectory_urls
        if input_trajectory_urls:
            for trajectory_url, trajectory_file in zip(input_trajectory_urls, input_trajectory_files):
                # If the trajectory file already exists then skip it
                if trajectory_file.exists: continue
                original_filename = trajectory_url.split('/')[-1]
                # If there is a remote then use it
                if self.remote:
                    # If the trajectory filename is the standard trajectory filename then use the trajectory endpoint instead
                    if original_filename == TRAJECTORY_FILENAME:
                        frame_selection = None
                        # When sampling, download only a reduced selection of frames
                        if self.project.sample_trajectory:
                            remote_frames = self.remote.snapshots
                            maximum_desired_frames = self.project.sample_trajectory
                            step, final_frames = calculate_frame_step(remote_frames, maximum_desired_frames)
                            frame_selection = f'1:{remote_frames}:{step}'
                        self.remote.download_trajectory(trajectory_file, frame_selection=frame_selection, format='xtc')
                    # Otherwise download the input trajectory file by its filename
                    else:
                        if self.project.sample_trajectory:
                            raise InputError('The "-smp" argument is supported only when asking for the standard trajectory')
                        self.remote.download_file(original_filename, trajectory_file)
                # Otherwise use the URL as is
                else:
                    if self.project.sample_trajectory:
                        raise InputError('The "-smp" argument is supported only when using the "-proj" argument')
                    download_file(trajectory_url, trajectory_file)
        # Find missing trajectory files
        missing_input_trajectory_files = []
        for trajectory_file in input_trajectory_files:
            if not trajectory_file.exists:
                missing_input_trajectory_files.append(trajectory_file)
        # If all files already exist then we are done
        if len(missing_input_trajectory_files) == 0:
            self._input_trajectory_files = input_trajectory_files
            return self._input_trajectory_files
        missing_filepaths = [trajectory_file.path for trajectory_file in missing_input_trajectory_files]
        raise InputError('Missing input trajectory files: ' + ', '.join(missing_filepaths))
    input_trajectory_files = property(get_input_trajectory_files, None, None, "Input trajectory filenames (read only)")

488 

    def get_md_inputs(self) -> dict:
        """Get MD specific inputs.

        Matches this MD against the inputs file entries by directory. When no
        entry matches, a new one is created from this MD's directory name and
        persisted to the inputs file.

        Returns:
            dict: The inputs belonging to this specific MD.

        Raises:
            InputError: If an MD entry has neither name nor directory.
        """
        # If we already have a value stored then return it
        if self._md_inputs:
            return self._md_inputs
        # Otherwise we must find its value
        # If we have MD inputs in the inputs file then use them
        if self.project.input_mds:
            # Iterate over the different MD inputs to find out each directory
            # We must find the MD inputs which belong to this specific MD according to this directory
            for md in self.project.input_mds:
                # Get the directory according to the inputs
                directory = md.get(MD_DIRECTORY, None)
                if directory:
                    check_directory(directory)
                # If no directory is specified in the inputs then guess it from the MD name
                else:
                    name = md.get('name', None)
                    if not name: raise InputError('There is a MD with no name and no directory. Please define at least one of them.')
                    directory = name_2_directory(name)
                # If the directory matches then this is our MD inputs
                if self.project.pathify(directory) == self.directory:
                    self._md_inputs = md
                    return self._md_inputs
        # If this MD directory has no associated inputs then it means it was passed through command line
        # We set a new MD inputs for it
        new_md_name = directory_2_name(self.directory)
        self._md_inputs = {'name': new_md_name, 'mdir': self.directory}
        # Update the inputs file with the new MD inputs
        mds = self.project.inputs.get('mds', [])
        new_mds_inputs = [*mds, self._md_inputs]
        self.project.update_inputs('mds', new_mds_inputs)
        return self._md_inputs

    md_inputs = property(get_md_inputs, None, None, "MD specific inputs (read only)")

524 

525 def get_input(self, name: str): 

526 """Get a specific 'input' value from MD inputs.""" 

527 value = self.md_inputs.get(name, MISSING_INPUT_EXCEPTION) 

528 # If we had a value then return it 

529 if value != MISSING_INPUT_EXCEPTION: 

530 return value 

531 return self.project.get_input(name) 

532 

533 # --------------------------------- 

534 

535 def get_file(self, target_file: File) -> bool: 

536 """Check if a file exists. If not, try to download it from the database. 

537 If the file is not found in the database it is fine, we do not even warn the user. 

538 Note that this function is used to get populations and transitions files, which are not common. 

539 """ 

540 # If it exists we are done 

541 if target_file.exists: 

542 return True 

543 # Try to download the missing file 

544 # If we do not have the required parameters to download it then we surrender here 

545 if not self.remote: 

546 return False 

547 # Check if the file is among the available remote files 

548 # If it is no then stop here 

549 if target_file.filename not in self.remote.available_files: 

550 return False 

551 # Download the file 

552 self.remote.download_file(target_file.filename, target_file) 

553 return True 

554 

555 def print_tests_summary(self): 

556 """Make a summary of tests and their status.""" 

557 print('Tests summary:') 

558 for test_name in AVAILABLE_CHECKINGS: 

559 test_result = self.register.tests.get(test_name, None) 

560 # Print things pretty 

561 test_nice_name = NICE_NAMES[test_name] 

562 test_nice_result = None 

563 if test_result is None: 

564 test_nice_result = YELLOW_HEADER + 'Not run' + COLOR_END 

565 elif test_result is False: 

566 test_nice_result = RED_HEADER + 'Failed' + COLOR_END 

567 elif test_result is True: 

568 test_nice_result = GREEN_HEADER + 'Passed' + COLOR_END 

569 elif test_result == 'na': 

570 test_nice_result = BLUE_HEADER + 'Not applicable' + COLOR_END 

571 else: 

572 raise ValueError() 

573 

574 print(f' - {test_nice_name} -> {test_nice_result}') 

575 

576 # Issue some warnings if failed or never run tests are skipped 

577 # This is run after processing input files 

578 # RUBEN: mover a inpro? 

579 def _issue_required_test_warnings(self): 

580 for test_name in AVAILABLE_CHECKINGS: 

581 # If test was not skipped then proceed 

582 if test_name not in self.project.trust: continue 

583 # If test passed in a previous run the proceed 

584 test_result = self.register.tests.get(test_name) 

585 if test_result is True: continue 

586 # If test failed in a previous run we can also proceed 

587 # The failing warning must be among the inherited warnings, so there is no need to add more warnings here 

588 if test_result is False: continue 

589 # If the test has been always skipped then issue a warning 

590 if test_result is None: 

591 # Remove previous warnings 

592 self.register.remove_warnings(test_name) 

593 # Get test pretty name 

594 test_nice_name = NICE_NAMES[test_name] 

595 # Issue the corresponding warning 

596 self.register.add_warning(test_name, test_nice_name + ' was skipped and never run before') 

597 continue 

598 raise ValueError('Test value is not supported') 

599 

600 # Processed files ---------------------------------------------------- 

601 

602 def get_topology_filepath(self) -> str: 

603 """Get the processed topology file path.""" 

604 return self.project.get_topology_filepath() 

605 topology_filepath = property(get_topology_filepath, None, None, "Topology file path (read only)") 

606 

    # Run the actual processing to generate output processed files out of input raw files
    # And by "files" I mean structure, trajectory and topology
    input_files_processing = Task('inpro', 'Input files processing', process_input_files,
        output_filenames={
            'output_topology_file': get_topology_filepath,
            'output_structure_file': STRUCTURE_FILENAME,
            'output_trajectory_file': TRAJECTORY_FILENAME,
        })

    # Output main files
    # The Task generates getters so the processed files are produced on demand
    get_structure_file = input_files_processing.get_output_file_getter('output_structure_file')
    structure_file = property(get_structure_file, None, None, "Structure file (read only)")
    get_trajectory_file = input_files_processing.get_output_file_getter('output_trajectory_file')
    trajectory_file = property(get_trajectory_file, None, None, "Trajectory file (read only)")

621 

def get_topology_file(self) -> 'File':
    """Return the processed topology file handle, delegating to the project."""
    return self.project.get_topology_file()
topology_file = property(fget=get_topology_file,
    doc="Topology filename from the project (read only)")

627 

628 # --------------------------------------------------------------------------------- 

629 # Others values which may be found/calculated and files to be generated on demand 

630 # --------------------------------------------------------------------------------- 

631 

# Trajectory snapshots
# Number of frames in the processed trajectory, counted lazily through the Task machinery
get_snapshots = Task('frames', 'Count trajectory frames', get_frames_count)
snapshots = property(get_snapshots, None, None, "Trajectory snapshots (read only)")

635 

def get_reference_bonds(self) -> list[list[int]]:
    """Return the trusted atom bonds, as resolved by the project."""
    return self.project.reference_bonds
reference_bonds = property(fget=get_reference_bonds, doc="Atom bonds to be trusted (read only)")

640 

def get_structure(self) -> 'Structure':
    """Get the parsed structure.

    Lazily parses the processed structure (PDB) file and caches the result in
    ``self._structure``. When the stable bonds test did not pass — or never
    ran — or when there are coarse grain atoms, the structure bonds are
    replaced by the reference bonds so they stay coherent with the topology.

    Raises:
        ValueError: if the processed structure file does not exist yet.
    """
    # If we already have a stored value then return it
    if self._structure:
        return self._structure
    # Otherwise we must set the structure
    # Make sure the structure file exists at this point
    if not self.structure_file.exists:
        raise ValueError('Trying to set standard structure but file '
            f'{self.structure_file.path} does not exist yet. Are you trying '
            'to access the standard structure before processing input files?')
    # Note that this is not only the structure class, but it also contains additional logic
    self._structure = Structure.from_pdb_file(self.structure_file.path)
    # If the stable bonds test failed and we had mercy then it is sure our structure will have wrong bonds
    # In order to make it coherent with the topology we will mine topology bonds from here and force them in the structure
    # If we fail to get bonds from topology then just go along with the default structure bonds
    # NOTE: 'not ... get(..., None)' also matches a never-run (None) test result, not only a failed one
    if not self.register.tests.get(STABLE_BONDS_FLAG, None):
        self._structure.bonds = self.reference_bonds
    # Same procedure if we have coarse grain atoms
    elif self.cg_selection:
        self._structure.bonds = self.reference_bonds
    return self._structure
structure = property(get_structure, None, None, "Parsed structure (read only)")

664 

# First frame PDB file
# NOTE: this class attribute shadows the imported get_first_frame function, which is wrapped here
get_first_frame = Task('firstframe', 'Get first frame structure',
    get_first_frame, output_filenames={'output_file': FIRST_FRAME_FILENAME})
get_first_frame_file = get_first_frame.get_output_file_getter('output_file')
first_frame_file = property(get_first_frame_file, None, None, "First frame (read only)")

# Average structure filename
# NOTE: this class attribute shadows the imported get_average_structure function, which is wrapped here
get_average_structure = Task('average', 'Get average structure',
    get_average_structure, output_filenames={'output_file': AVERAGE_STRUCTURE_FILENAME})
get_average_structure_file = get_average_structure.get_output_file_getter('output_file')
average_structure_file = property(get_average_structure_file, None, None, "Average structure filename (read only)")

# Produce the MD metadata file to be uploaded to the database
prepare_metadata = Task('mdmeta', 'Prepare MD metadata',
    generate_md_metadata, output_filenames={'output_file': OUTPUT_METADATA_FILENAME})

# The processed interactions
get_processed_interactions = Task('inter', 'Interactions processing',
    process_interactions, {'frames_limit': 1000})
interactions = property(get_processed_interactions, None, None, "Processed interactions (read only)")

# MDAnalysis Universe object
# Not cached: presumably the universe holds open file handles which must not be reused — confirm
get_MDAnalysis_Universe = Task('mda_univ', 'MDAnalysis Universe object',
    get_mda_universe, use_cache=False)
universe = property(get_MDAnalysis_Universe, None, None, "MDAnalysis Universe object (read only)")

690 

def input_getter(name: str):
    """Build a getter for an input value which may be MD specific.

    The returned getter looks the value up in the MD inputs first and falls
    back to the project-level input (attribute ``input_<name>``) when the MD
    value is missing (None).
    """
    def getter(self):
        md_value = self.md_inputs.get(name, None)
        if md_value is None:
            # No MD-specific value -> fall back to the project input
            return getattr(self.project, f'input_{name}')
        return md_value
    return getter

704 

# Assign the MD input getters
# Each property resolves the value from the MD inputs first, falling back to the project inputs
input_interactions = property(input_getter('interactions'), None, None, "Interactions to be analyzed (read only)")
input_pbc_selection = property(input_getter('pbc_selection'), None, None, "Selection of atoms which are still in periodic boundary conditions (read only)")
input_cg_selection = property(input_getter('cg_selection'), None, None, "Selection of atoms which are not actual atoms but coarse grain beads (read only)")

709 

def _set_pbc_selection(self, reference_structure: 'Structure', verbose: bool = False) -> 'Selection':
    """Set the PBC (Periodic Boundary Conditions) atom selection.

    Parses the inputs file selection string when available, or guesses the
    selection automatically otherwise.

    Args:
        reference_structure (Structure): structure used to parse or guess the selection.
        verbose (bool): if True, print how the selection is being resolved.

    Returns:
        Selection: the parsed PBC atom selection (may be empty).

    Raises:
        InputError: if automatic guessing is requested on a CG system, or if
            the input selection string selects no atoms.
    """
    if verbose: print('Setting Periodic Boundary Conditions (PBC) atoms selection')
    selection_string = None
    # If there is an inputs file then get the input pbc selection
    if self.project.is_inputs_file_available():
        if verbose: print(' Using selection string in the inputs file')
        selection_string = self.input_pbc_selection
    # If there is no inputs file we guess PBC atoms automatically
    else:
        if verbose: print(' No inputs file -> Selection string will be set automatically')
        selection_string = 'auto'
    # Parse the selection string using the reference structure
    parsed_selection = None
    # If the input PBC selection is 'auto' then guess it automatically
    if selection_string == 'auto':
        # To guess PBC atoms (with the current implementation) we must make sure there is no CG
        if reference_structure.has_cg():
            raise InputError('We can not guess PBC atoms in CG systems. Please set PBC atoms manually.\n'
                ' Use the "-pbc" argument or set the inputs file "pbc_selection" field.')
        if verbose: print(' Guessing PBC atoms as solvent, counter ions and lipids')
        parsed_selection = reference_structure.select_pbc_guess()
    # If we have a valid input value then use it
    elif selection_string:
        if verbose: print(f' Selecting PBC atoms "{selection_string}"')
        parsed_selection = reference_structure.select(selection_string)
        if not parsed_selection:
            raise InputError(f'PBC selection "{selection_string}" selected no atoms')
    # If we have an input value but it is empty then we set an empty selection
    else:
        if verbose: print(' No PBC atoms selected')
        parsed_selection = Selection()
    # Log the selection size and a few of the selected residue names
    if verbose and parsed_selection:
        print(f' Parsed PBC selection has {len(parsed_selection)} atoms')
        selected_residues = reference_structure.get_selection_residues(parsed_selection)
        selected_residue_names = list(set([residue.name for residue in selected_residues]))
        limit = 3 # Show a maximum of 3 residue names
        example_residue_names = ', '.join(selected_residue_names[0:limit])
        if len(selected_residue_names) > limit: example_residue_names += ', etc.'
        print(' e.g. ' + example_residue_names)
    return parsed_selection

755 

def get_pbc_selection(self) -> 'Selection':
    """Return the PBC atom selection, computing and caching it on first access.

    The value is cached on the project so sibling MDs share it.
    """
    cached = self.project._pbc_selection
    if cached is not None:
        return cached
    # Not computed yet -> resolve it from the inputs (or guess it) and cache it
    self.project._pbc_selection = self._set_pbc_selection(self.structure)
    return self.project._pbc_selection
pbc_selection = property(get_pbc_selection, None, None, "Periodic boundary conditions atom selection (read only)")

765 

# WARNING: Do not inherit project pbc residues
# WARNING: It may trigger all the processing logic of the reference MD when there is no need
def get_pbc_residues(self) -> list[int]:
    """Get indices of residues in periodic boundary conditions.

    The result is cached on the project. The cache is checked against None
    explicitly: an empty list (no PBC residues) is a valid, final result and
    must not trigger a recomputation — and possibly the whole PBC selection
    machinery — on every access.
    """
    # If we already have a stored value then return it
    if self.project._pbc_residues is not None:
        return self.project._pbc_residues
    # If there is no PBC selection then assume there are no PBC residues
    if not self.pbc_selection:
        self.project._pbc_residues = []
        return self.project._pbc_residues
    # Otherwise we parse the selection and return the list of residue indices
    self.project._pbc_residues = self.structure.get_selection_residue_indices(self.pbc_selection)
    print(f'PBC residues "{self.input_pbc_selection}" -> {len(self.project._pbc_residues)} residues')
    return self.project._pbc_residues
pbc_residues = property(get_pbc_residues, None, None, "Indices of residues in periodic boundary conditions (read only)")

782 

# DANI: Someday we should try to automate this

def _set_cg_selection(self, reference_structure: 'Structure', verbose: bool = False) -> 'Selection':
    """Set the coarse grain (CG) atom selection.

    Args:
        reference_structure (Structure): structure used to parse the selection.
        verbose (bool): if True, print how the selection is being resolved.

    Returns:
        Selection: the parsed CG selection. An empty Selection is returned
        when there is no inputs file or the input selection string is empty.
    """
    if verbose: print('Setting Coarse Grained (CG) atoms selection')
    # If there is no inputs file then assume there is no CG selection
    if not self.project.is_inputs_file_available():
        if verbose: print(' No inputs file -> Asuming there is no CG at all')
        return Selection()
    # Otherwise we use the selection string from the inputs
    if verbose: print(' Using selection string in the inputs file')
    selection_string = self.input_cg_selection
    # If the selection is empty, again, assume there is no CG selection
    if not selection_string:
        if verbose: print(' Empty selection -> There is no CG at all')
        return Selection()
    # Otherwise parse the selection string against the reference structure
    # Note the previous 'elif/else' here was dead code: at this point the
    # selection string is guaranteed to be non-empty
    if verbose: print(f' Selecting CG atoms "{selection_string}"')
    parsed_selection = reference_structure.select(selection_string)
    # Log the parsed selection size
    if verbose: print(f' Parsed CG selection has {len(parsed_selection)} atoms')
    # Log a few of the selected residue names
    if verbose and parsed_selection:
        selected_residues = reference_structure.get_selection_residues(parsed_selection)
        selected_residue_names = list(set([residue.name for residue in selected_residues]))
        limit = 3 # Show a maximum of 3 residue names
        example_residue_names = ', '.join(selected_residue_names[0:limit])
        if len(selected_residue_names) > limit: example_residue_names += ', etc.'
        print(' e.g. ' + example_residue_names)
    return parsed_selection

818 

def get_cg_selection(self) -> 'Selection':
    """Get the coarse grain (CG) atom selection.

    The selection is computed once through _set_cg_selection and cached on the
    project. The cache is checked against None explicitly: an empty selection
    (no CG atoms at all) is a valid, final result and must not trigger a
    recomputation on every access.
    """
    # If we already have a stored value then return it
    if self.project._cg_selection is not None:
        return self.project._cg_selection
    # Otherwise we must set the CG selection
    self.project._cg_selection = self._set_cg_selection(self.structure)
    return self.project._cg_selection
cg_selection = property(get_cg_selection, None, None, "Coarse grain atom selection (read only)")

828 

# WARNING: Do not inherit project cg residues
# WARNING: It may trigger all the processing logic of the reference MD when there is no need
def get_cg_residues(self) -> list[int]:
    """Get indices of residues in coarse grain.

    The result is cached on the project. The cache is checked against None
    explicitly: an empty list (no CG residues) is a valid, final result and
    must not trigger a recomputation on every access.
    """
    # If we already have a stored value then return it
    if self.project._cg_residues is not None:
        return self.project._cg_residues
    # If there is no CG selection then assume there are no CG residues
    if not self.cg_selection:
        self.project._cg_residues = []
        return self.project._cg_residues
    # Otherwise we parse the selection and return the list of residue indices
    self.project._cg_residues = self.structure.get_selection_residue_indices(self.cg_selection)
    print(f'CG residues "{self.input_cg_selection}" -> {len(self.project._cg_residues)} residues')
    return self.project._cg_residues
cg_residues = property(get_cg_residues, None, None, "Indices of residues in coarse grain (read only)")

845 

def get_populations(self) -> list[float]:
    """Return the MSM equilibrium populations stored in the project."""
    return self.project.populations
populations = property(fget=get_populations, doc="Equilibrium populations from a MSM (read only)")

850 

def get_transitions(self) -> list[list[float]]:
    """Return the MSM transition probabilities stored in the project."""
    return self.project.transitions
transitions = property(fget=get_transitions, doc="Transition probabilities from a MSM (read only)")

855 

def get_protein_map(self) -> dict:
    """Return the protein residues mapping stored in the project."""
    return self.project.protein_map
protein_map = property(fget=get_protein_map, doc="Residues mapping (read only)")

860 

def get_charges(self) -> dict:
    """Return the charges stored in the project.

    Note the previous docstring wrongly said "residues mapping" (copy-paste
    from get_protein_map); this getter delegates to project.charges.
    """
    return self.project.charges
charges = property(fget=get_charges, doc="Residues charges (read only)")

865 

# Reference frame
# NOTE(review): computed by get_bonds_reference_frame — presumably the frame whose
# bonds are considered trustworthy; confirm against the get_bonds tool
get_reference_frame = Task('reframe', 'Reference frame', get_bonds_reference_frame)
reference_frame = property(get_reference_frame, None, None, "Reference frame to be used to represent the MD (read only)")

869 

870 # --------------------------------------------------------------------------------- 

871 # Tests 

872 # --------------------------------------------------------------------------------- 

873 

def is_trajectory_integral(self) -> Optional[bool]:
    """Run the trajectory integrity (RMSD sudden jumps) test.

    The check runs once over all atoms and the result is cached in
    ``self._trajectory_integrity``.

    Returns:
        Optional[bool]: the check result as returned by
        check_trajectory_integrity; cached only when not None.
    """
    # If we already have a stored value then return it
    if self._trajectory_integrity is not None:
        return self._trajectory_integrity
    # Otherwise we must find the value
    self._trajectory_integrity = check_trajectory_integrity(
        input_structure_filename=self.structure_file.path,
        input_trajectory_filename=self.trajectory_file.path,
        structure=self.structure,
        pbc_selection=self.pbc_selection,
        # mercy/trust decide whether a failure is tolerated or the test is skipped
        mercy=self.project.mercy,
        trust=self.project.trust,
        register=self.register,
        # time_length = self.time_length,
        check_selection=ALL_ATOMS,
        # Number of standard deviations away from the mean for an RMSD value to count as a jump
        standard_deviations_cutoff=self.project.rmsd_cutoff,
        snapshots=self.snapshots,
    )
    return self._trajectory_integrity

894 

895 # --------------------------------------------------------------------------------- 

896 # Analyses 

897 # --------------------------------------------------------------------------------- 

898 

# Analysis tasks
# NOTE(review): the optional dict after the analysis function appears to hold extra
# keyword arguments forwarded to it (e.g. frames_limit) — confirm against Task

# RMSDs analysis
run_rmsds_analysis = Task('rmsds', 'RMSDs analysis',
    rmsds, {'frames_limit': 5000})

# TM scores analysis
run_tmscores_analysis = Task('tmscore', 'TM scores analysis',
    tmscores, {'frames_limit': 200})

# RMSF, atom fluctuation analysis
run_rmsf_analysis = Task('rmsf', 'Fluctuation analysis', rmsf)

# Radius of gyration analysis
run_rgyr_analysis = Task('rgyr', 'Radius of gyration analysis',
    rgyr, {'frames_limit': 5000})

# PCA, principal component analysis
run_pca_analysis = Task('pca', 'Principal component analysis',
    pca, {'frames_limit': 2000, 'projection_frames': 20})

# PCA contacts
# DANI: It tries to use a lot of memory, this needs a review
# DANI: It may raise an error for failing to allocate so much memory
# DANI: It may eat all the RAM and end with a 'Terminado (killed)' error
# DANI: For now I am skipping it
# DANI: It has been unmaintained for a long time, several things must change to recover it
# run_pca_contacts('pcacons', 'PCA contacts', pca_contacts)

# RMSD per residue analysis
run_rmsd_perres_analysis = Task('perres', 'RMSD per residue analysis',
    rmsd_per_residue, {'frames_limit': 100})

# RMSD pairwise
# Perform an analysis for the overall structure and then one more analysis for each interaction
run_rmsd_pairwise_analysis = Task('pairwise', 'RMSD pairwise',
    rmsd_pairwise, {'frames_limit': 200, 'overall_selection': "name CA or name C5'"})

# Run the cluster analysis
run_clusters_analysis = Task('clusters', 'Clusters analysis',
    clusters_analysis, {'frames_limit': 1000, 'desired_n_clusters': 20})

# Calculate the distance mean and standard deviation of each pair of residues
run_dist_perres_analysis = Task('dist', 'Distance per residue',
    distance_per_residue, {'frames_limit': 200})

# Hydrogen bonds
run_hbonds_analysis = Task('hbonds', 'Hydrogen bonds analysis',
    hydrogen_bonds, {'time_splits': 100})

# SASA, solvent accessible surface analysis
run_sas_analysis = Task('sas', 'Solvent accessible surface analysis',
    sasa, {'frames_limit': 100})

# Perform the electrostatic and vdw energies analysis for each pair of interaction agents
run_energies_analysis = Task('energies', 'Energies analysis',
    energies, {'frames_limit': 100})

# Calculate torsions and then dihedral energies for every dihedral along the trajectory
run_dihedral_energies = Task('dihedrals', 'Dihedral energies analysis',
    compute_dihedral_energies, {'frames_limit': 100})

# Perform the pockets analysis
run_pockets_analysis = Task('pockets', 'Pockets analysis',
    pockets, {'frames_limit': 100, 'maximum_pockets_number': 10})

# Helical parameters
run_helical_analysis = Task('helical', 'Helical parameters', helical_parameters)

# Markov
run_markov_analysis = Task('markov', 'Markov', markov, {'rmsd_selection': PROTEIN_AND_NUCLEIC})

# Membrane density analysis
run_density_analysis = Task('density', 'Membrane density analysis',
    density, {'frames_limit': 1000})

# Membrane thickness analysis
run_thickness_analysis = Task('thickness', 'Membrane thickness analysis',
    thickness, {'frames_limit': 100})

# Area per lipid analysis
run_apl_analysis = Task('apl', 'Membrane area per lipid analysis', area_per_lipid)

# Calculate lipid order parameters for membranes
run_lipid_order_analysis = Task('lorder', 'Membrane lipid order analysis',
    lipid_order, {'frames_limit': 100})

# Lipid-protein interactions analysis
run_lipid_interactions_analysis = Task('linter', 'Membrane lipid-protein interactions analysis',
    lipid_interactions, {'frames_limit': 100})

# Membrane channels analysis
run_channels_analysis = Task('channels', 'Membrane channels analysis',
    channels, {'frames_limit': 10})

990 

def get_warnings(self) -> list:
    """Return the live warnings list held by the register.

    The list is deliberately read from the register on every access instead of
    being cached here: it used to be reassigned back in the day, and caching
    could silently hide such reassignments.
    """
    return self.register.warnings
warnings = property(fget=get_warnings, doc="MD warnings to be written in metadata")

999 

1000 

1001class Project: 

1002 """Class for the main project of an MDDB accession. 

1003 A project is a set of related MDs. 

1004 These MDs share all or most topology and metadata. 

1005 """ 

1006 

1007 def __init__(self, 

1008 directory: str = '.', 

1009 accession: Optional[str] = None, 

1010 database_url: str = DEFAULT_API_URL, 

1011 inputs_filepath: str = None, 

1012 input_topology_filepath: Optional[str] = None, 

1013 input_structure_filepath: Optional[str] = None, 

1014 input_trajectory_filepaths: Optional[list[str]] = None, 

1015 md_directories: Optional[list[str]] = None, 

1016 md_config: Optional[list[list[str]]] = None, 

1017 reference_md_index: Optional[int] = None, 

1018 forced_inputs: Optional[list[list[str]]] = None, 

1019 populations_filepath: str = DEFAULT_POPULATIONS_FILENAME, 

1020 transitions_filepath: str = DEFAULT_TRANSITIONS_FILENAME, 

1021 aiida_data_filepath: Optional[str] = None, 

1022 filter_selection: bool | str = False, 

1023 pbc_selection: Optional[str] = None, 

1024 cg_selection: Optional[str] = None, 

1025 image: bool = False, 

1026 fit: bool = False, 

1027 translation: list[float] = [0, 0, 0], 

1028 mercy: list[str] | bool = [], 

1029 trust: list[str] | bool = [], 

1030 faith: bool = False, 

1031 ssleep: bool = False, 

1032 pca_analysis_selection: str = PROTEIN_AND_NUCLEIC_BACKBONE, 

1033 pca_fit_selection: str = PROTEIN_AND_NUCLEIC_BACKBONE, 

1034 rmsd_cutoff: float = DEFAULT_RMSD_CUTOFF, 

1035 interaction_cutoff: float = DEFAULT_INTERACTION_CUTOFF, 

1036 interactions_auto: Optional[str] = None, 

1037 guess_bonds: bool = False, 

1038 ignore_bonds: bool = False, 

1039 sample_trajectory: Optional[int] = None, 

1040 ): 

1041 """Initialize a Project. 

1042 

1043 Args: 

1044 directory (str): 

1045 Local directory where the project takes place. 

1046 accession (Optional[str]): 

1047 Project accession to download missing input files from the database (if already uploaded). 

1048 database_url (str): 

1049 API URL to download missing data. when an accession is provided. 

1050 inputs_filepath (str): 

1051 Path to a file with inputs for metadata, simulation parameters and analysis config. 

1052 input_topology_filepath (Optional[str]): 

1053 Path to input topology file relative to the project directory. 

1054 input_structure_filepath (Optional[str]): 

1055 Path to input structure file. It may be relative to the project or to each MD directory. 

1056 If this value is not passed then the standard structure file is used as input by default. 

1057 input_trajectory_filepaths (Optional[list[str]]): 

1058 Paths to input trajectory files relative to each MD directory. 

1059 If this value is not passed then the standard trajectory file path is used as input by default. 

1060 md_directories (Optional[list[str]]): 

1061 Path to the different MD directories. 

1062 Each directory is to contain an independent trajectory and structure. 

1063 Several output files will be generated in every MD directory. 

1064 md_config (Optional[list]): 

1065 Configuration of a specific MD. You may declare as many as you want. 

1066 Every MD requires a directory name and at least one trajectory path. 

1067 The structure is -md <directory> <trajectory_1> <trajectory_2> ... 

1068 Note that all trajectories from the same MD will be merged. 

1069 For legacy reasons, you may also provide a specific structure for an MD. 

1070 e.g. -md <directory> <structure> <trajectory_1> <trajectory_2> ... 

1071 reference_md_index (Optional[int]): 

1072 Index of the reference MD (used by project-level functions; defaults to first MD). 

1073 forced_inputs (Optional[list]): 

1074 Force a specific input through the command line. 

1075 Inputs passed through command line have priority over the ones from the inputs file. 

1076 In fact, these values will overwritten or be appended in the inputs file. 

1077 Every forced input requires an input name and a value. 

1078 The structure is -fi <input name> <new input value> 

1079 populations_filepath (str): 

1080 Path to equilibrium populations file (Markov State Model only) 

1081 transitions_filepath (str): 

1082 Path to transition probabilities file (Markov State Model only). 

1083 aiida_data_filepath (Optional[str]): 

1084 Path to the AiiDA data file. 

1085 This file may be generated by the aiida-gromacs plugin and contains provenance data. 

1086 filter_selection (bool|str): 

1087 Atoms selection to be filtered in VMD format. 

1088 If the argument is passed alone (i.e. with no selection) then water and counter ions are filtered. 

1089 pbc_selection (Optional[str]): 

1090 Selection of atoms which stay in Periodic Boundary Conditions even after imaging the trajectory. 

1091 e.g. remaining solvent, ions, membrane lipids, etc. 

1092 Selection passed through console overrides the one in inputs file. 

1093 cg_selection (Optional[str]): 

1094 Selection of atoms which are not actual atoms but Coarse Grained beads. 

1095 Selection passed through console overrides the one in inputs file. 

1096 image (bool): 

1097 Set if the trajectory is to be imaged so atoms stay in the PBC box. See -pbc for more information. 

1098 fit (bool): 

1099 Set if the trajectory is to be fitted (both rotation and translation) to minimize the RMSD to PROTEIN_AND_NUCLEIC_BACKBONE selection. 

1100 translation (list[float]): 

1101 Set the x y z translation for the imaging process. 

1102 e.g. -trans 0.5 -1 0 

1103 mercy (list[str]|bool): 

1104 Failures to be tolerated (or boolean to set all/none). 

1105 trust (list[str]|bool): 

1106 Tests to skip/trust (or boolean to set all/none). 

1107 faith (bool): 

1108 If True, require input files to match expected output files and skip processing. 

1109 ssleep (bool): 

1110 If True, SSL certificate authentication is skipped when downloading data from an API. 

1111 pca_analysis_selection (str): 

1112 Atom selection for PCA analysis in VMD syntax. 

1113 pca_fit_selection (str): 

1114 Atom selection for the PCA fitting in VMD syntax. 

1115 rmsd_cutoff (float): 

1116 Set the cutoff for the RMSD sudden jumps analysis to fail. 

1117 This cutoff stands for the number of standard deviations away from the mean an RMSD value is to be. 

1118 interaction_cutoff (float): 

1119 Set the cutoff for the interactions analysis to fail. 

1120 This cutoff stands for percent of the trajectory where the interaction happens (from 0 to 1). 

1121 interactions_auto (Optional[str]): 

1122 Guess input interactions automatically. A VMD selection may be passed to limit guessed interactions to a specific subset of atoms. 

1123 guess_bonds (bool): 

1124 Force the workflow to guess atom bonds based on distance and atom radii in different frames along the trajectory instead of mining topology bonds. 

1125 ignore_bonds (bool): 

1126 Force the workflow to ignore atom bonds. This will result in many check-ins being skipped 

1127 sample_trajectory (Optional[int]): 

1128 If passed, download the first 10 (by default) frames from the trajectory. 

1129 You can specify a different number by providing an integer value. 

1130 

1131 """ 

1132 # Save input parameters 

1133 # RUBEN: directory nunca se usa, eliminar? 

1134 self.directory = normpath(directory) 

1135 # If it is an absolute path then make it relative to the project 

1136 if isabs(self.directory): 

1137 self.directory = relpath(self.directory) 

1138 # Save the directory name alone apart 

1139 if self.directory == '.': 

1140 self.directory_name = basename(getcwd()) 

1141 else: 

1142 self.directory_name = basename(self.directory) 

1143 

1144 # Set the database handler 

1145 self.ssleep = ssleep 

1146 self.database = Database(database_url, self.ssleep) if database_url else None 

1147 # Set the remote project handler 

1148 self.accession = accession 

1149 self.remote = None 

1150 if self.database and self.accession: 

1151 self.remote = self.database.get_remote_project(self.accession) 

1152 

1153 # Set the inputs file 

1154 # Set the expected default name in case there is no inputs file since it may be downloaded 

1155 self._inputs_file = File(self.pathify(DEFAULT_INPUTS_FILENAME)) 

1156 # If there is an input filepath then use it 

1157 if inputs_filepath: 

1158 self._inputs_file = File(inputs_filepath) 

1159 # Otherwise guess the inputs file using the accepted filenames 

1160 else: 

1161 for filename in ACCEPTED_INPUT_FILENAMES: 

1162 if exists(filename): 

1163 self._inputs_file = File(filename) 

1164 break 

1165 # Set the input topology file 

1166 # Note that even if the input topology path is passed we do not check it exists 

1167 # Never forget we can donwload some input files from the database on the fly 

1168 self.arg_input_topology_filepath = input_topology_filepath 

1169 self._input_topology_filepath = None 

1170 self._input_topology_file = None 

1171 self._input_topology_url = None 

1172 # Input structure and trajectory filepaths 

1173 # Do not parse them to files yet, let this to the MD class 

1174 self.input_structure_filepath = input_structure_filepath 

1175 self.input_trajectory_filepaths = input_trajectory_filepaths 

1176 

1177 # Make sure the new MD configuration (-md) was not passed as well as old MD inputs (-mdir, -traj) 

1178 if md_config and (md_directories or input_trajectory_filepaths): 

1179 raise InputError('MD configurations (-md) is not compatible with old MD inputs (-mdir, -traj)') 

1180 # Save the MD configurations 

1181 self.md_config = md_config 

1182 # Make sure MD configuration has the correct format 

1183 if self.md_config: 

1184 # Make sure all MD configurations have at least 3 values each 

1185 for mdc in self.md_config: 

1186 if len(mdc) < 2: 

1187 raise InputError('Wrong MD configuration: the patter is -md <directory> <trajectory> <trajectory 2> ...') 

1188 # Make sure there are no duplictaed MD directories 

1189 md_directories = [mdc[0] for mdc in self.md_config] 

1190 if len(md_directories) > len(set(md_directories)): 

1191 raise InputError('There are duplicated MD directories') 

1192 

1193 # Input populations and transitions for MSM 

1194 self.populations_filepath = populations_filepath 

1195 self._populations_file = File(self.populations_filepath) 

1196 self.transitions_filepath = transitions_filepath 

1197 self._transitions_file = File(self.transitions_filepath) 

1198 # Input AiiDA data 

1199 self.aiida_data_filepath = aiida_data_filepath 

1200 self._aiida_data_file = File(self.aiida_data_filepath) if aiida_data_filepath else None 

1201 

1202 # Set the processed topology filepath, which depends on the input topology filename 

1203 # Note that this file is different from the standard topology, although it may be standard as well 

1204 self._topology_filepath = None 

1205 

1206 # Set the standard topology file 

1207 self._standard_topology_file = None 

1208 

1209 # Set the MD directories 

1210 self._md_directories = md_directories 

1211 # Check input MDs are correct to far 

1212 if self._md_directories: 

1213 self.check_md_directories() 

1214 

1215 # Set the reference MD 

1216 self._reference_md = None 

1217 self._reference_md_index = reference_md_index 

1218 

1219 # Set the rest of inputs 

1220 # Note that the filter selection variable is not handled here at all 

1221 # This is just pased to the filtering function which knows how to handle the default 

1222 self.filter_selection = filter_selection 

1223 # PBC selection may come from the console or from the inputs 

1224 self._input_pbc_selection = pbc_selection 

1225 self._input_cg_selection = cg_selection 

1226 self.image = image 

1227 self.fit = fit 

1228 self.translation = translation 

1229 self.mercy = mercy 

1230 # Fix the mercy input, if needed 

1231 # If a boolean is passed instead of a list then we set its corresponding value 

1232 if type(mercy) is bool: 

1233 if mercy: 

1234 self.mercy = AVAILABLE_FAILURES 

1235 else: 

1236 self.mercy = [] 

1237 self.trust = trust 

1238 # Fix the trust input, if needed 

1239 # If a boolean is passed instead of a list then we set its corresponding value 

1240 if type(trust) is bool: 

1241 if trust: 

1242 self.trust = AVAILABLE_CHECKINGS 

1243 else: 

1244 self.trust = [] 

1245 self.faith = faith 

1246 self.pca_analysis_selection = pca_analysis_selection 

1247 self.pca_fit_selection = pca_fit_selection 

1248 self.rmsd_cutoff = rmsd_cutoff 

1249 self.interaction_cutoff = interaction_cutoff 

1250 self.sample_trajectory = sample_trajectory 

1251 self.interactions_auto = interactions_auto 

1252 self.guess_bonds = guess_bonds 

1253 self.ignore_bonds = ignore_bonds 

1254 # Set the inputs, where values from the inputs file will be stored 

1255 self._inputs = None 

1256 

1257 # Other values which may be found/calculated on demand 

1258 self._pbc_selection = None 

1259 self._pbc_residues = None 

1260 self._cg_selection = None 

1261 self._cg_residues = None 

1262 self._reference_bonds = None 

1263 self._topology_reader = None 

1264 self._dihedrals = None 

1265 self._populations = None 

1266 self._transitions = None 

1267 self._pdb_ids = None 

1268 self._mds = None 

1269 

1270 # Save forced inputs and use them when necessary 

1271 self.forced_inputs = {} 

1272 if forced_inputs: 

1273 # Make sure the format is respected 

1274 for forced_input in forced_inputs: 

1275 n_values = len(forced_input) 

1276 if n_values == 0: 

1277 raise InputError('There is an empty "-fin". Please remove it from the command line.') 

1278 if n_values == 1: 

1279 only_value = forced_input[0] 

1280 raise InputError(f'There is a "-fin {only_value}" which is missing the new input value.') 

1281 if n_values > 2: 

1282 suggested_fix = f'{forced_input[0]} "{" ".join(forced_input[1:])}"' 

1283 raise InputError(f'Too many values in "-fin {" ".join(forced_input)}".\n' + 

1284 ' Note that only two values are expected: -fin <input name> <new input value>\n' + 

1285 f' Did you forget the quotes maybe? Try this: -fin {suggested_fix}') 

1286 

1287 # Save forced inputs as a dict 

1288 self.forced_inputs = {name: value for name, value in forced_inputs} 

1289 

1290 # Check that forced inputs exist 

1291 # This is to prevent the user from losing a lot of time over a silly typo 

1292 template_inputs = load_yaml(INPUTS_TEMPLATE_FILEPATH) 

1293 for input_name in self.forced_inputs.keys(): 

1294 if input_name in template_inputs: continue 

1295 available_inputs = ', '.join(template_inputs.keys()) 

1296 raise InputError(f'Unrecognized forced input "{input_name}". Available inputs: {available_inputs}') 

1297 

1298 # Overwrite input file values 

1299 for input_name, input_value in self.forced_inputs.items(): 

1300 self.update_inputs(input_name, input_value) 

1301 

1302 # Force a couple of extraordinary files which is generated if atoms are resorted 

1303 self.resorted_bonds_file = File(self.pathify(RESORTED_BONDS_FILENAME)) 

1304 self.resorted_charges_file = File(self.pathify(RESORTED_CHARGES_FILENAME)) 

1305 

1306 # Set a new entry for the register 

1307 # This is useful to track previous workflow runs and problems 

1308 register_filepath = self.pathify(REGISTER_FILENAME) 

1309 register_file = File(register_filepath) 

1310 self.register = Register(register_file) 

1311 

1312 # Set the cache 

1313 cache_filepath = self.pathify(CACHE_FILENAME) 

1314 cache_file = File(cache_filepath) 

1315 self.cache = Cache(cache_file) 

1316 

1317 # Set tasks whose output is to be overwritten 

1318 self.overwritables = set() 

1319 

def __repr__(self):
    """Short textual identifier for this Project instance."""
    # Every project renders the same way; no per-instance state is shown
    return 'Project'

1323 

def pathify(self, filename_or_relative_path: str) -> str:
    """Prefix *filename_or_relative_path* with the project directory.

    The joined path is normalized (redundant separators and '..'
    segments are collapsed by normpath).
    """
    joined = self.directory + '/' + filename_or_relative_path
    return normpath(joined)

1327 

def check_md_directories(self):
    """Validate the declared MD directories.

    Raises InputError when the list is empty or contains duplicates.
    """
    directories = self._md_directories
    # The workflow needs at least one MD to work on
    if len(directories) < 1:
        raise InputError('There must be at least one MD')
    # Duplicated directories would make two MDs collide
    if len(directories) != len(set(directories)):
        raise InputError('There are duplicated MD directories')

1338 

def get_md_directories(self) -> list:
    """Return the list of MD directories, resolving it on first access.

    Resolution order: an explicitly declared list, then the inputs file
    MDs, then a guess based on which subdirectories hold a register file.
    """
    # Explicitly declared (or previously resolved) directories win
    if self._md_directories:
        return self._md_directories
    self._md_directories = []
    # Prefer the MDs declared in the inputs file, when available
    if self.is_inputs_file_available() and self.input_mds:
        for input_md in self.input_mds:
            directory = input_md.get(MD_DIRECTORY, None)
            if directory:
                check_directory(directory)
            else:
                # No explicit directory: derive it from the MD name
                directory = name_2_directory(input_md['name'] or 'unnamed')
            self._md_directories.append(directory)
    # Otherwise guess: any subdirectory containing a register file counts as an MD
    else:
        for directory in sorted(next(walk(self.directory))[1]):
            if exists(directory + '/' + REGISTER_FILENAME):
                self._md_directories.append(directory)
        # An empty guess means MDs were never declared before
        if len(self._md_directories) == 0:
            raise InputError('Impossible to know which are the MD directories. '
                'You can either declare them using the "-mdir" option or by providing an inputs file')
    self.check_md_directories()
    return self._md_directories
md_directories = property(get_md_directories, None, None, "MD directories (read only)")

1373 

def get_reference_md_index(self) -> int:
    """Return the index of the reference MD, resolving it on first access.

    Resolution order:
      1. The cached/constructor-provided value.
      2. The 'mdref' field from the inputs file, when available.
      3. Index 0 (the first MD), warning the user about the default.
    """
    # Compare against None explicitly: 0 is a perfectly valid index and
    # must not trigger a recompute (the previous truthiness check did)
    if self._reference_md_index is not None:
        return self._reference_md_index
    # If the inputs file is available then it may declare the reference MD index
    if self.is_inputs_file_available():
        self._reference_md_index = self.get_input('mdref')
    # Otherwise we simply set the first MD as the reference and warn the user about this
    if self._reference_md_index is None:
        warn('No reference MD was specified. The first MD will be used as reference.')
        self._reference_md_index = 0
    return self._reference_md_index
reference_md_index = property(get_reference_md_index, None, None, "Reference MD index (read only)")

1389 

def get_reference_md(self) -> MD:
    """Return the MD object acting as the project reference."""
    # Resolve lazily from the MD list and cache the result
    if not self._reference_md:
        self._reference_md = self.mds[self.reference_md_index]
    return self._reference_md
reference_md: MD = property(get_reference_md, None, None, "Reference MD (read only)")

1399 

def get_mds(self) -> list:
    """Get the available MDs (read only).

    Lazily instantiates one MD object per declared MD and caches the
    resulting list. Two declaration systems are supported: the newer
    '-md' configurations and the older '-mdir'/'-stru'/'-traj' arguments.
    """
    # If MDs are already declared then return them
    if self._mds:
        return self._mds
    # Now instantiate a new MD for each declared MD and save the reference MD
    self._mds = []
    # New system with MD configurations (-md)
    if self.md_config:
        for n, config in enumerate(self.md_config, 1):
            # The first value of every config is the MD directory
            directory = config[0]
            # LEGACY
            # In a previous version, the md config argument also held the structure
            # This was the second argument, so we check if we have more than 2 arguments
            # If this is the case, then check if the second argument has different format
            # Note that PDB format is also a trajectory supported format
            has_structure = False
            if len(config) > 2:
                first_sample = File(config[1])
                second_sample = File(config[2])
                # Differing formats suggest the second value is a structure, not a trajectory
                if first_sample.format != second_sample.format:
                    has_structure = True
            # Finally set the input structure and trajectories
            input_structure_filepath = config[1] if has_structure else self.input_structure_filepath
            input_trajectory_filepaths = config[2:] if has_structure else config[1:]
            # Define the MD
            md = MD(
                project=self, number=n, directory=directory,
                input_structure_filepath=input_structure_filepath,
                input_trajectory_filepaths=input_trajectory_filepaths,
            )
            self._mds.append(md)
    # Old system (-mdir, -stru -traj)
    else:
        # Every MD shares the project-level structure and trajectory inputs
        for n, md_directory in enumerate(self.md_directories, 1):
            md = MD(
                project=self, number=n, directory=md_directory,
                input_structure_filepath=self.input_structure_filepath,
                input_trajectory_filepaths=self.input_trajectory_filepaths,
            )
            self._mds.append(md)
    return self._mds
mds: list[MD] = property(get_mds, None, None, "Available MDs (read only)")

1443 

1444 # Check input files exist when their filenames are read 

1445 # If they do not exist then try to download them 

1446 # If the download is not possible then raise an error 

1447 

1448 # Inputs filename ------------ 

1449 

def is_inputs_file_available(self) -> bool:
    """Tell whether the inputs file can be obtained.

    True when the file is already on disk, or when it can be downloaded
    through the remote. Note that asking for an unavailable inputs file
    leads to an input error elsewhere.
    """
    inputs_file = self._inputs_file
    # Without a declared filename the file is unreachable
    if not inputs_file:
        return False
    # Available when on disk already, or when downloadable from the remote
    return bool(inputs_file.exists or self.remote)

1464 

def get_inputs_file(self) -> File:
    """Return the inputs file, downloading it from the remote when needed.

    Raises InputError when no filename was declared, or when the file is
    neither on disk nor downloadable.
    """
    inputs_file = self._inputs_file
    # A filename must have been declared
    if not inputs_file:
        raise InputError('Not defined inputs filename')
    # Already on disk -> nothing else to do
    if inputs_file.exists:
        return inputs_file
    # Without remote access there is no way to obtain the missing file
    if not self.remote:
        raise InputError(f'Missing inputs file "{self._inputs_file.filename}"')
    # Download the inputs file from the remote database
    self.remote.download_inputs_file(inputs_file)
    return inputs_file
inputs_file = property(get_inputs_file, None, None, "Inputs filename (read only)")

1481 

1482 # Topology filename ------------ 

1483 

def get_input_topology_filepath(self) -> Optional[str]:
    """Get the input topology filepath from the inputs or try to guess it.

    Sources, in priority order: the command-line argument, then the
    inputs file. Either value may be a URL (noted for a later download)
    or a glob pattern (resolved here to exactly one match).

    If the input topology filepath is a 'no' flag then we consider there is no topology at all.
    So far we extract atom charges and atom bonds from the topology file.
    In this scenario we can keep working but there are some consequences:

    1. Analysis using atom charges such as 'energies' will be skipped
    2. The standard topology file will not include atom charges
    3. Bonds will be guessed
    """
    # If we already have an internal value calculated then return it
    if self._input_topology_filepath is not None:
        return self._input_topology_filepath

    def parse(filepath: str) -> str:
        """Resolve URL or glob notation into a concrete filepath."""
        # If it is a URL then record it for the further download and
        # return the filename the download will produce
        if is_url(filepath):
            self._input_topology_url = filepath
            source_filename = url_to_source_filename(filepath)
            return source_filename
        if not is_glob(filepath):
            return filepath
        # If there is glob pattern then parse it
        parsed_filepaths = glob(filepath)
        if len(parsed_filepaths) == 0:
            # Warn the user in case they were trying to use glob syntax to download remote files
            if self.remote:
                warn('Spread syntax is not supported to download remote files')
            raise InputError(f'No topologies found with "{filepath}"')
        # Exactly one match is required since a single topology is expected
        if len(parsed_filepaths) > 1:
            raise InputError(f'Multiple topologies found with "{filepath}": {", ".join(parsed_filepaths)}')
        return parsed_filepaths[0]
    # If this value was passed through command line then it would be set as the internal value already
    if self.arg_input_topology_filepath:
        # A 'no'-like flag means deliberately running without a topology
        if self.arg_input_topology_filepath.lower() in {'no', 'not', 'na'}:
            self._input_topology_filepath = MISSING_TOPOLOGY
            return self._input_topology_filepath
        self._input_topology_filepath = parse(self.arg_input_topology_filepath)
        # Update the input topology filepath in the inputs file, in case it is not matching
        self.update_inputs('input_topology_filepath', relpath(self._input_topology_filepath, self.directory))
        return self._input_topology_filepath
    # Check if the inputs file has the value
    if self.is_inputs_file_available():
        # Get the input value, whose key must exist
        inputs_value = self.get_input('input_topology_filepath')
        # If there is a valid input then use it
        if inputs_value is not None:
            # WARNING: the yaml parser automatically converts 'no' to False
            if inputs_value is False or inputs_value.lower() in {'no', 'not', 'na'}:
                self._input_topology_filepath = MISSING_TOPOLOGY
                return self._input_topology_filepath
            parsed_input_value = parse(inputs_value)
            # Inputs-file paths are relative to the project directory
            self._input_topology_filepath = self.pathify(parsed_input_value)
            return self._input_topology_filepath
    # If nothing worked then surrender
    raise InputError('Missing input topology file path. Please provide a topology file using the "-top" argument.\n' +
        ' Note that you may run the workflow without a topology file. To do so, use the "-top no" argument.\n' +
        ' However this has implications since we usually mine atom charges and bonds from the topology file.\n' +
        ' Some analyses such us the interaction energies will be skiped')

1545 

def get_input_topology_file(self) -> Optional[File]:
    """Get the input topology file.
    If the file is not found try to download it.
    """
    # If we already have a value then return it
    if self._input_topology_file is not None:
        return self._input_topology_file
    # Set the input topology filepath
    input_topology_filepath = self.get_input_topology_filepath()
    # If the input filepath is the missing flag then we must proceed without a topology
    if input_topology_filepath == MISSING_TOPOLOGY:
        self._input_topology_file = MISSING_TOPOLOGY
        return self._input_topology_file
    # Set the file
    input_topology_file = File(input_topology_filepath)
    # If there is a topology URL then we must download the topology first
    # (the URL was recorded while resolving the filepath)
    input_topology_url = self._input_topology_url
    if input_topology_url and not input_topology_file.exists:
        original_filename = input_topology_url.split('/')[-1]
        # If there is a remote then use it
        if self.remote:
            # If the topology filename is the standard topology filename then use the topology endpoint instead
            if original_filename == STANDARD_TOPOLOGY_FILENAME:
                self.remote.download_standard_topology(input_topology_file)
            # Otherwise download the input topology file by its filename
            else:
                self.remote.download_file(original_filename, input_topology_file)
            # In case the topology is a '.top' file we consider it is a Gromacs topology
            # It may come with additional itp files we must download as well
            if input_topology_file.format == 'top':
                # Find available .itp files and download each of them
                itp_filenames = [filename for filename in self.remote.available_files if filename[-4:] == '.itp']
                for itp_filename in itp_filenames:
                    itp_filepath = self.pathify(itp_filename)
                    itp_file = File(itp_filepath)
                    self.remote.download_file(itp_file.filename, itp_file)
        # Otherwise use the URL as is
        else:
            # Without a remote we cannot enumerate sibling itp files, so just warn
            if input_topology_file.format == 'top':
                warn('Automatic download of itp files is not supported without the "-proj" argument.' +
                    ' Thus if the topology has associated itp files they will not be downloaded.')
            download_file(input_topology_url, input_topology_file)
    # If the file already exists then we are done
    if input_topology_file.exists:
        self._input_topology_file = input_topology_file
        return self._input_topology_file
    raise InputError(f'Missing input topology file "{input_topology_file.filename}"')
input_topology_file = property(get_input_topology_file, None, None, "Input topology file (read only)")

1595 

def get_input_structure_file(self) -> File:
    """Return the input structure file of the reference MD.

    When calling this function make sure all MDs have the file or try
    to download it.
    """
    # The reference MD owns the actual input structure file
    reference = self.reference_md
    return reference._input_structure_file
input_structure_file = property(get_input_structure_file, None, None, "Input structure filename for each MD (read only)")

1601 

def get_input_trajectory_files(self) -> list[File]:
    """Return the input trajectory file(s) of the reference MD.

    Missing files are downloaded by the MD itself.
    """
    reference = self.reference_md
    return reference._input_trajectory_files
input_trajectory_files = property(get_input_trajectory_files, None, None, "Input trajectory filenames for each MD (read only)")

1608 

def get_populations_file(self) -> Optional[File]:
    """Return the MSM equilibrium populations file, or None when unavailable."""
    populations = self._populations_file
    # get_file checks existence and may download; a falsy result means not available
    if self.get_file(populations):
        return populations
    return None
populations_file = property(get_populations_file, None, None, "MSM equilibrium populations file (read only)")

1615 

def get_transitions_file(self) -> Optional[File]:
    """Return the MSM transition probabilities file, or None when unavailable."""
    transitions = self._transitions_file
    # get_file checks existence and may download; a falsy result means not available
    if self.get_file(transitions):
        return transitions
    return None
transitions_file = property(get_transitions_file, None, None, "MSM transition probabilities file (read only)")

1622 

def get_aiida_data_file(self) -> Optional[File]:
    """Return the AiiDA data file, or None when it was not declared or is unavailable."""
    aiida_file = self._aiida_data_file
    # The AiiDA data file is optional: it may not have been declared at all
    if not aiida_file:
        return None
    # get_file checks existence and may download; a falsy result means not available
    if self.get_file(aiida_file):
        return aiida_file
    return None
aiida_data_file = property(get_aiida_data_file, None, None, "AiiDA data file (read only)")

1629 

1630 # --------------------------------- 

1631 

def get_file(self, target_file: File) -> bool:
    """Check if a file exists, trying a database download when it does not.

    A file missing from the database is fine: the method just reports
    False without warning. Nowadays this is used for populations and
    transitions files, which are not common.
    """
    # Delegate to the reference MD, which knows how to reach the database
    return self.reference_md.get_file(target_file)

1639 

1640 # Input file values ----------------------------------------- 

1641 

1642 # First of all, set the inputs themselves 

1643 

def get_inputs(self) -> dict:
    """Load (once) and return the content of the inputs file as a dict."""
    # Cached after the first successful load
    if self._inputs:
        return self._inputs
    # Some placeholders are replaced on the fly while reading the file
    replaces = [
        ('$DIR', self.directory_name)
    ]
    # Load according to the file format
    inputs_data = None
    if self.inputs_file.format == 'json':
        inputs_data = load_json(self.inputs_file.path, replaces)
    elif self.inputs_file.format == 'yaml':
        inputs_data = load_yaml(self.inputs_file.path, replaces)
    else:
        raise InputError('Input file format is not supported. Please use json or yaml files.')
    if not inputs_data:
        raise InputError('Input file is empty')
    self._inputs = inputs_data
    # LEGACY: mirror the old 'pdbIds' field under its current 'pdb_ids' name
    old_pdb_ids = self._inputs.get('pdbIds', None)
    if old_pdb_ids:
        self._inputs['pdb_ids'] = old_pdb_ids
    return self._inputs
inputs = property(get_inputs, None, None, "Inputs from the inputs file (read only)")

1671 

def update_inputs(self, nested_key: str, new_value):
    """Permanently store *new_value* under *nested_key* in the inputs file.

    Used when command-line inputs do not match file inputs. Does nothing
    when there is no inputs file or when the value already matches.
    """
    # Without an inputs file there is nothing to update
    if not self.is_inputs_file_available(): return
    # Skip when the stored value already matches
    current_value = read_ndict(self.inputs, nested_key, MISSING_INPUT_EXCEPTION)
    if current_value == new_value: return
    # Update the in-memory inputs
    write_ndict(self.inputs, nested_key, new_value)
    # Re-check availability right before persisting anything to disk
    if not self.is_inputs_file_available(): return
    print(f'* Field "{nested_key}" in the inputs file will be permanently modified')
    # Persist the updated inputs, honoring the file format
    if self.inputs_file.format == 'json':
        save_json(self.inputs, self.inputs_file.path)
    elif self.inputs_file.format == 'yaml':
        # Comments in the original YAML file will not be kept
        save_yaml(self.inputs, self.inputs_file.path)
    else:
        raise InputError('Input file format is not supported. Please use json or yaml files.')

1695 # Then set getters for every value in the inputs file 

def get_input(self, name: str) -> Any:
    """Return the value of the input called *name*.

    Priority: command-line forced value, then the inputs file, then the
    default from DEFAULT_INPUT_VALUES (warning the user in that case).
    """
    # Values forced from the command line take precedence
    forced = self.forced_inputs
    if name in forced:
        return forced[name]
    # Then try the inputs file
    value = self.inputs.get(name, MISSING_INPUT_EXCEPTION)
    if value != MISSING_INPUT_EXCEPTION:
        return value
    # Fall back to the default value and warn the user about it
    default_value = DEFAULT_INPUT_VALUES.get(name, None)
    warn(f'Missing input "{name}" -> Using default value: {default_value}')
    return default_value

1711 

def input_property(name: str, doc: str = ""):
    """Build a read-only property that proxies get_input(*name*).

    The returned property object is assigned in the class body below;
    its getter defers to self.get_input so forced values and defaults
    are honored.
    """
    return property(lambda self: self.get_input(name), doc=doc)

1719 

# Assign the getters
# Each attribute below is a read-only property whose getter proxies
# get_input() for the named inputs-file field (see input_property above)
input_interactions = input_property('interactions', "Interactions to be analyzed (read only)")
input_protein_references = input_property('forced_references', "Uniprot IDs to be used first when aligning protein sequences (read only)")
input_pdb_ids = input_property('pdb_ids', "Protein Data Bank IDs used for the setup of the system (read only)")
input_type = input_property('type', "Set if its a trajectory or an ensemble (read only)")
input_mds = input_property('mds', "Input MDs configuration (read only)")
input_ligands = input_property('ligands', "Input ligand references (read only)")
input_force_fields = input_property('ff', "Input force fields (read only)")
input_collections = input_property('collections', "Input collections (read only)")
input_chain_names = input_property('chainnames', "Input chain names (read only)")
input_framestep = input_property('framestep', "Input framestep (read only)")
input_name = input_property('name', "Input name (read only)")
input_description = input_property('description', "Input description (read only)")
input_authors = input_property('authors', "Input authors (read only)")
input_groups = input_property('groups', "Input groups (read only)")
input_contact = input_property('contact', "Input contact (read only)")
input_program = input_property('program', "Input program (read only)")
input_version = input_property('version', "Input version (read only)")
input_method = input_property('method', "Input method (read only)")
input_license = input_property('license', "Input license (read only)")
input_linkcense = input_property('linkcense', "Input license link (read only)")
input_citation = input_property('citation', "Input citation (read only)")
input_thanks = input_property('thanks', "Input acknowledgements (read only)")
input_links = input_property('links', "Input links (read only)")
input_timestep = input_property('timestep', "Input timestep (read only)")
input_temperature = input_property('temp', "Input temperature (read only)")
input_ensemble = input_property('ensemble', "Input ensemble (read only)")
input_water = input_property('wat', "Input water force field (read only)")
input_boxtype = input_property('boxtype', "Input boxtype (read only)")
input_pbc_selection = input_property('pbc_selection', "Input Periodic Boundary Conditions (PBC) selection (read only)")
input_cg_selection = input_property('cg_selection', "Input Coarse Grained (CG) selection (read only)")
input_customs = input_property('customs', "Input custom representations (read only)")
input_orientation = input_property('orientation', "Input orientation (read only)")
input_multimeric = input_property('multimeric', "Input multimeric labels (read only)")
# Additional topic-specific inputs
input_cv19_unit = input_property('cv19_unit', "Input Covid-19 Unit (read only)")
input_cv19_startconf = input_property('cv19_startconf', "Input Covid-19 starting conformation (read only)")
input_cv19_abs = input_property('cv19_abs', "Input Covid-19 antibodies (read only)")
input_cv19_nanobs = input_property('cv19_nanobs', "Input Covid-19 nanobodies (read only)")

1759 

def get_input_pbc_selection(self) -> Optional[str]:
    """Return the PBC atom selection: console value first, inputs file second."""
    # A console-provided (cached) value has priority
    if self._input_pbc_selection:
        return self._input_pbc_selection
    # Early processing steps may ask for this value before any inputs file
    # exists, so do not force the inputs file to be fetched here
    if not self.is_inputs_file_available():
        return None
    # Fall back to the inputs file value and cache it
    self._input_pbc_selection = self.get_input('pbc_selection')
    return self._input_pbc_selection
input_pbc_selection = property(get_input_pbc_selection, None, None, "Selection of atoms which are still in periodic boundary conditions (read only)")

1776 

def get_input_cg_selection(self) -> Optional[str]:
    """Return the CG atom selection: console value first, inputs file second."""
    # A console-provided (cached) value has priority
    if self._input_cg_selection:
        return self._input_cg_selection
    # Early processing steps may ask for this value before any inputs file
    # exists, so do not force the inputs file to be fetched here
    if not self.is_inputs_file_available():
        return None
    # Fall back to the inputs file value and cache it
    self._input_cg_selection = self.get_input('cg_selection')
    return self._input_cg_selection
input_cg_selection = property(get_input_cg_selection, None, None, "Selection of atoms which are not acutal atoms but Coarse Grained beads (read only)")

1793 

1794 # Set additional values inferred from the input values 

1795 

def check_is_time_dependent(self) -> bool:
    """Tell whether MD frames are time dependent.

    'trajectory' means time dependent, 'ensemble' means not; any other
    value raises an InputError.
    """
    input_type = self.input_type
    if input_type == 'trajectory':
        return True
    if input_type == 'ensemble':
        return False
    raise InputError(f'Not supported input "type" value: {self.input_type}. It must be "trajectory" or "ensemble"')
is_time_dependent = property(check_is_time_dependent, None, None, "Check if trajectory frames are time dependent (read only)")

1804 

1805 # Processed files ---------------------------------------------------- 

1806 

def inherit_topology_filename(self) -> Optional[str]:
    """Derive the processed topology filename from the input topology.

    Topology formats are conserved. Returns MISSING_TOPOLOGY when there
    is no input topology at all.
    """
    source = self.input_topology_file
    if source == MISSING_TOPOLOGY:
        return MISSING_TOPOLOGY
    filename = source.filename
    if not filename: raise RuntimeError('Unnamed file?')
    # Raw charges files keep their special name untouched
    if filename == RAW_CHARGES_FILENAME:
        return filename
    # Otherwise use the canonical name with the same format
    return 'topology.' + source.format

1819 

def get_topology_filepath(self) -> str:
    """Return the processed topology file path, resolving it on first access."""
    if not self._topology_filepath:
        # Derive the filename from the input topology; it may be the missing flag
        inherited = self.inherit_topology_filename()
        self._topology_filepath = (
            MISSING_TOPOLOGY if inherited == MISSING_TOPOLOGY
            else self.pathify(inherited)
        )
    return self._topology_filepath
topology_filepath = property(get_topology_filepath, None, None, "Topology file path (read only)")

1833 

def get_topology_file(self) -> 'File':
    """Return the processed topology file, resolved through the reference MD."""
    reference = self.reference_md
    # The input files processing task knows where the output topology lives
    resolve = reference.input_files_processing.get_output_file_getter('output_topology_file')
    return resolve(reference)
topology_file = property(get_topology_file, None, None, "Topology file (read only)")

1839 

def get_structure_file(self) -> 'File':
    """Return the processed structure file, taken from the reference MD."""
    reference = self.reference_md
    return reference.structure_file
structure_file = property(get_structure_file, None, None, "Structure filename from the reference MD (read only)")

1844 

def get_trajectory_file(self) -> 'File':
    """Return the processed trajectory file, taken from the reference MD."""
    reference = self.reference_md
    return reference.trajectory_file
trajectory_file = property(get_trajectory_file, None, None, "Trajectory filename from the reference MD (read only)")

1849 

1850 # --------------------------------------------------------------------------------- 

1851 # Others values which may be found/calculated and files to be generated on demand 

1852 # --------------------------------------------------------------------------------- 

1853 

def get_structure(self) -> 'Structure':
    """Return the parsed structure, taken from the reference MD."""
    reference = self.reference_md
    return reference.structure
structure = property(get_structure, None, None, "Parsed structure from the reference MD (read only)")

1858 

def get_pbc_residues(self) -> list[int]:
    """Return indices of residues in periodic boundary conditions (from the reference MD)."""
    reference = self.reference_md
    return reference.pbc_residues
pbc_residues = property(get_pbc_residues, None, None, "Indices of residues in periodic boundary conditions (read only)")

1863 

def get_cg_residues(self) -> list[int]:
    """Return the indices of coarse-grain residues, delegating to the reference MD."""
    reference = self.reference_md
    return reference.cg_residues
cg_residues = property(get_cg_residues, None, None, "Indices of residues in coarse grain (read only)")

1868 

def get_snapshots(self) -> int:
    """Return the snapshot count, delegating to the reference MD."""
    reference = self.reference_md
    return reference.snapshots
snapshots = property(get_snapshots, None, None, "Reference MD snapshots (read only)")

1873 

def get_universe(self) -> 'Universe':
    """Get the MDAnalysis Universe from the reference MD.

    Note: the return type was previously mis-annotated as ``int``; this
    delegates to the reference MD's universe (an MDAnalysis Universe).
    """
    return self.reference_md.universe
universe = property(get_universe, None, None, "MDAnalysis Universe object (read only)")

1878 

def get_processed_interactions(self) -> dict:
    """Return the processed interactions.

    They are taken from the reference replica since they are the same
    for all replicas.
    """
    reference = self.reference_md
    return reference.interactions
interactions = property(get_processed_interactions, None, None, "Processed interactions (read only)")

1883 

def get_check_stable_bonds(self) -> bool:
    """Tell whether the stable bonds check must be run."""
    # If this analysis has already passed then we can trust structure bonds
    if self.register.tests.get(STABLE_BONDS_FLAG, None) is True:
        return False
    # Otherwise run the check unless the user explicitly trusts stable bonds
    return STABLE_BONDS_FLAG not in self.trust
must_check_stable_bonds = property(get_check_stable_bonds, None, None, "Check if we must check stable bonds (read only)")

1893 

# Reference bonds
# Task computing the atom bonds which can be trusted (delegates to find_safe_bonds)
get_reference_bonds = Task('refbonds', 'Reference bonds', find_safe_bonds)
reference_bonds = property(get_reference_bonds, None, None, "Atom bonds to be trusted (read only)")

# Atom charges
# NOTE(review): the Task wraps the 'get_charges' function referenced here
# (presumably imported above); this assignment shadows that name in the class namespace
get_charges = Task('charges', 'Getting atom charges', get_charges)
charges = property(get_charges, None, None, "Atom charges (read only)")

1901 

def get_topology_reader(self) -> 'Topology':
    """Get the topology data reader, instantiating it lazily."""
    # Build and cache the reader on first access only
    if not self._topology_reader:
        self._topology_reader = Topology(self.topology_file)
    return self._topology_reader
topology_reader = property(get_topology_reader, None, None, "Topology reader (read only)")

1910 

def get_dihedrals(self) -> list[dict]:
    """Get the topology dihedrals, computing them lazily."""
    # Compute and cache the dihedrals on first access only
    if not self._dihedrals:
        self._dihedrals = self.topology_reader.get_dihedrals_data()
    return self._dihedrals
dihedrals = property(get_dihedrals, None, None, "Topology dihedrals (read only)")

1919 

def get_populations(self) -> Optional[list[float]]:
    """Get the equilibrium populations from a MSM, reading them lazily."""
    # Read and cache the populations on first access
    if not self._populations:
        # Without a populations file there is nothing to read
        if not self.populations_file:
            return None
        self._populations = read_file(self.populations_file)
    return self._populations
populations = property(get_populations, None, None, "Equilibrium populations from a MSM (read only)")

1931 

def get_transitions(self) -> Optional[list[list[float]]]:
    """Get the transition probabilities from a MSM, reading them lazily."""
    # Read and cache the transitions on first access
    if not self._transitions:
        # Without a transitions file there is nothing to read
        if not self.transitions_file:
            return None
        self._transitions = read_file(self.transitions_file)
    return self._transitions
transitions = property(get_transitions, None, None, "Transition probabilities from a MSM (read only)")

1943 

def get_pdb_ids(self) -> list[str]:
    """Get the tested and standardized PDB ids.

    Input ids are validated against the expected PDB id format and
    upper-cased. The result is cached after the first call.

    Raises:
        InputError: when an input id does not look like a PDB id.
    """
    # If we already have a stored value then return it
    if self._pdb_ids is not None:
        return self._pdb_ids
    # Otherwise test and standardize input PDB ids
    self._pdb_ids = []
    # If there are no input pdb ids (may be None) then stop here
    if not self.input_pdb_ids:
        return []
    # If input PDB ids is a string instead of a list then fix it
    # Use isinstance rather than a type() identity check (also accepts str subclasses)
    input_pdb_ids = self.input_pdb_ids
    if isinstance(input_pdb_ids, str):
        input_pdb_ids = [input_pdb_ids]
    # Iterate input PDB ids
    for input_pdb_id in input_pdb_ids:
        # First make sure this is a PDB id
        if not re.match(PDB_ID_FORMAT, input_pdb_id):
            raise InputError(f'Input PDB id "{input_pdb_id}" does not look like a PDB id')
        # Make letters upper
        self._pdb_ids.append(input_pdb_id.upper())
    return self._pdb_ids
pdb_ids = property(get_pdb_ids, None, None, "Tested and standarized PDB ids (read only)")

1968 

# Prepare the PDB references file to be uploaded to the database
get_pdb_references = Task('pdbs', 'Prepare PDB references',
    prepare_pdb_references, output_filenames={'pdb_references_file': PDB_REFERENCES_FILENAME})
# NOTE(review): unlike the attributes below, this is bound directly to the output
# file getter rather than wrapped in a property — confirm this is intentional
pdb_references_file = get_pdb_references.get_output_file_getter('pdb_references_file')

# Map the structure aminoacids sequences against the Uniprot reference sequences
get_protein_map = Task('protmap', 'Protein residues mapping', generate_protein_mapping,
    output_filenames={'protein_references_file': PROTEIN_REFERENCES_FILENAME})
protein_map = property(get_protein_map, None, None, "Protein residues mapping (read only)")

# Define the output file of the protein mapping including protein references
get_protein_references_file = get_protein_map.get_output_file_getter('protein_references_file')
protein_references_file = property(get_protein_references_file, None, None, "File including protein refereces data mined from UniProt (read only)")

# Prepare the chain references file
get_chain_references = Task('chains', 'Chain references', prepare_chain_references,
    output_filenames={'chains_references_file': OUTPUT_CHAINS_FILENAME})

# Map residues to their InChI keys
get_inchikeys = Task('inchikeys', 'InChI keys residues mapping', generate_inchikeys)
inchikeys = property(get_inchikeys, None, None, "InChI keys (read only)")

# Mine lipid reference data
get_lipid_references = Task('lipmap', 'Lipid references', generate_lipid_references)
lipid_references = property(get_lipid_references, None, None, "Lipid references (read only)")

# Map membrane residues
get_membrane_map = Task('memmap', 'Membrane residues mapping', generate_membrane_mapping,
    output_filenames={'output_file': MEMBRANE_MAPPING_FILENAME})
membrane_map = property(get_membrane_map, None, None, "Membrane mapping (read only)")

# Map ligand residues
get_ligand_references = Task('ligmap', 'Ligand residues mapping', generate_ligand_references)
ligand_references = property(get_ligand_references, None, None, "Ligand references (read only)")

# Mine InChI reference data
get_inchi_references = Task('inchimap', 'InChI references', generate_inchi_references,
    output_filenames={'output_file': INCHIKEY_REFERENCES_FILENAME})
inchikey_map = property(get_inchi_references, None, None, "InChI references (read only)")

# Build the residue map from both proteins and ligands maps
# This is formatted as both the standard topology and metadata producers expect them
get_residue_map = Task('resmap', 'Residue mapping', generate_residue_mapping)
residue_map = property(get_residue_map, None, None, "Residue map (read only)")

# Prepare the project metadata file to be uploaded to the database
prepare_metadata = Task('pmeta', 'Prepare project metadata',
    prepare_project_metadata, output_filenames={'output_file': OUTPUT_METADATA_FILENAME})

# Prepare the standard topology file to be uploaded to the database
prepare_standard_topology = Task('stopology', 'Standard topology file',
    generate_topology, output_filenames={'output_file': STANDARD_TOPOLOGY_FILENAME})
get_standard_topology_file = prepare_standard_topology.get_output_file_getter('output_file')
standard_topology_file = property(get_standard_topology_file, None, None, "Standard topology filename (read only)")

# Get a screenshot of the system
get_screenshot_filename = Task('screenshot', 'Screenshot file',
    get_screenshot, output_filenames={'output_file': OUTPUT_SCREENSHOT_FILENAME})
screenshot_filename = property(get_screenshot_filename, None, None, "Screenshot filename (read only)")

# Provenance data
# NOTE(review): the Task wraps the 'produce_provenance' function referenced here
# (presumably imported above); this assignment shadows that name in the class namespace
produce_provenance = Task('aiidata', 'Produce provenance', produce_provenance)

2025 

def get_warnings(self) -> list:
    """Return the project warnings, read live from the register.

    The warnings list should not be reassigned, but it was back in the day.
    Reading it from the register on every access avoids silent bugs.
    """
    return self.register.warnings
warnings = property(get_warnings, None, None, "Project warnings to be written in metadata")

2034 

2035 

# AUXILIARY FUNCTIONS ---------------------------------------------------------------------------

2037 

# DANI: Once the Markov (MSM) file format is settled this function will no longer be needed

def read_file(target_file: 'File') -> 'dict | list | numpy.ndarray | None':
    """Read a data file which may come in different formats.

    Args:
        target_file (File): file whose extension decides the parser.

    Returns:
        A numpy array for '.npy' files, the loaded JSON data for '.json'
        files, or None for any other (unsupported) format.
    """
    # The format is taken from the file extension
    file_format = target_file.filename.split('.')[-1]
    # Read numpy files
    if file_format == 'npy':
        return numpy.load(target_file.path)
    # Read JSON files
    if file_format == 'json':
        return load_json(target_file.path)
    # Unsupported formats are tolerated for now: be explicit about returning None
    return None

2049 

2050 

def name_2_directory(name: str) -> str:
    """Convert an MD name into an equivalent MD directory name."""
    # White spaces become underscores
    directory = name.replace(' ', '_')
    # Forbidden characters are simply dropped
    for forbidden_character in FORBIDDEN_DIRECTORY_CHARACTERS:
        directory = directory.replace(forbidden_character, '')
    return directory

2059 

2060 

def check_directory(directory: str) -> None:
    """Check for problematic characters in a directory path.

    Raises:
        InputError: if the path contains any forbidden character.
    """
    # Raise on problematic characters (nothing is removed here)
    directory_characters = set(directory)
    for character in FORBIDDEN_DIRECTORY_CHARACTERS:
        if character in directory_characters:
            raise InputError(f'Directory path "{directory}" includes the forbidden character "{character}"')

2068 

2069 

def directory_2_name(directory: str) -> str:
    """Convert an MD directory into an equivalent MD name."""
    # Keep only the last path component, then turn underscores back into spaces
    last_component = directory.rsplit('/', 1)[-1]
    return last_component.replace('_', ' ')

2076 

2077 

# Project input files
# Console flags mapped to the Project-level input file getters
project_input_files = {
    'itopology': Project.get_input_topology_file,
    'inputs': Project.get_inputs_file,
    'populations': Project.get_populations_file,
    'transitions': Project.get_transitions_file
}
# MD input files
# Console flags mapped to the MD-level input file getters
md_input_files = {
    'istructure': MD.get_input_structure_file,
    'itrajectory': MD.get_input_trajectory_files
}
# Both project and MD input files
input_files = {**project_input_files, **md_input_files}

# Project processed files
project_processed_files = {
    'topology': Project.get_topology_file
}
# MD processed files
md_processed_files = {
    'structure': MD.get_structure_file,
    'trajectory': MD.get_trajectory_file
}
# Both project and MD processed files
processed_files = {**project_processed_files, **md_processed_files}

2104 

# Project requestable tasks
project_requestables = {
    **project_input_files,
    **project_processed_files,
}
# Add every Task declared on the Project class to the project requestables
# (the loop variable was renamed: it used to shadow the builtin 'callable')
for attribute in vars(Project).values():
    if isinstance(attribute, Task):
        project_requestables[attribute.flag] = attribute
# MD requestable tasks
md_requestables = {
    **md_input_files,
    **md_processed_files,
}
# Add every Task declared on the MD class to the MD requestables
for attribute in vars(MD).values():
    if isinstance(attribute, Task):
        md_requestables[attribute.flag] = attribute
# Requestables for the console
# Note that this constant is global
requestables = {**project_requestables, **md_requestables}

2124 

# Set groups of dependencies to be requested together using only one flag
# Keys are pseudo-flags accepted by include/exclude; values are the actual task flags they expand to
DEPENDENCY_FLAGS = {
    'download': list(input_files.keys()),
    'setup': list(processed_files.keys()),
    'meta': ['pmeta', 'mdmeta'],
    'network': ['resmap', 'ligmap', 'lipmap', 'chains', 'pdbs', 'memmap'],
    'minimal': ['pmeta', 'mdmeta', 'stopology'],
    'interdeps': ['inter', 'pairwise', 'hbonds', 'energies', 'perres', 'clusters', 'dist'],
    'membs': ['memmap', 'density', 'thickness', 'apl', 'lorder', 'linter', 'channels']
}

# Set the default analyses to be run when no task is specified
DEFAULT_ANALYSES = ['clusters', 'dist', 'energies', 'hbonds', 'pca', 'pockets',
    'rgyr', 'rmsds', 'perres', 'pairwise', 'rmsf', 'sas', 'tmscore', 'density',
    'thickness', 'apl', 'lorder', 'linter']

2140 

2141 

def _expand_dependency_flags(tasks: list[str]):
    """Expand special group flags found in *tasks* into their task flags, in place."""
    for flag, dependencies in DEPENDENCY_FLAGS.items():
        if flag not in tasks: continue
        # Replace the group flag by its dependencies
        # Make sure not to duplicate a dependency if it was already included
        tasks.remove(flag)
        for dependency in dependencies:
            if dependency not in tasks:
                tasks.append(dependency)


def _resolve_tasks(download: bool, setup: bool,
    include: Optional[list[str]], exclude: Optional[list[str]]) -> list[str]:
    """Build the list of task flags to be run from the user request."""
    # If the deprecated download argument is passed then just make sure input files are available
    if download:
        warn('The "-d" or "--download" argument is deprecated. Please use "-i download" instead.')
        return list(input_files.keys())
    # If the deprecated setup argument is passed then just process input files
    if setup:
        warn('The "-s" or "--setup" argument is deprecated. Please use "-i setup" instead.')
        return list(processed_files.keys())
    # If the include argument was passed then run only the specified tasks
    if include:
        tasks = [*include]
        _expand_dependency_flags(tasks)
        return tasks
    # Set the default tasks otherwise
    tasks = [
        # Project tasks
        'stopology',
        'screenshot',
        'pmeta',
        'pdbs',
        'chains',
        # MD tasks
        'mdmeta',
        'inter',
        *DEFAULT_ANALYSES,
    ]
    # If the exclude parameter was passed then remove excluded tasks from the default tasks
    # Note that excluding a dependency of another requested task will not prevent it from running
    if exclude:
        excluded_dependencies = [*exclude]
        _expand_dependency_flags(excluded_dependencies)
        tasks = [name for name in tasks if name not in excluded_dependencies]
    return tasks


def _resolve_overwritables(overwrite: Optional[list[str] | bool], tasks: list[str]) -> set:
    """Build the set of task flags whose previous outputs must be regenerated.

    Raises:
        InputError: when a task requested for overwrite is not going to be run.
    """
    overwritables = set()
    if not overwrite:
        return overwritables
    # A bare True means every requested task is to be overwritten
    if isinstance(overwrite, bool):
        overwritables.update(tasks)
    # A list names the specific tasks to be overwritten
    elif isinstance(overwrite, list):
        for task in overwrite:
            # Make sure the task to be overwriten is among the tasks to be run
            if task not in tasks:
                raise InputError(f'Task "{task}" is to be overwriten but it is not in the tasks list. Either include it or do not exclude it')
            overwritables.add(task)
    else: raise ValueError('Not supported overwrite type')
    return overwritables


def workflow(
    project_parameters: Optional[dict] = None,
    working_directory: str = '.',
    download: bool = False,
    setup: bool = False,
    include: Optional[list[str]] = None,
    exclude: Optional[list[str]] = None,
    overwrite: Optional[list[str] | bool] = None,
):
    """Run the MDDB workflow.

    Args:
        project_parameters (dict | None): Parameters to initiate the project.
        working_directory (str): Working directory where the project is located.
        download (bool): (Deprecated: use -i download) If passed, only download required files. Then exits.
        setup (bool): (Deprecated: use -i setup) If passed, only download required files and run mandatory dependencies. Then exits.
        include (list[str] | None): Set the unique analyses or tools to be run. All other steps will be skipped.
        exclude (list[str] | None): List of analyses or tools to be skipped. All other steps will be run. Note that if we exclude a dependency of something else then it will be run anyway.
        overwrite (list[str] | bool | None): List of tasks that will be re-run, overwriting previous output files. Use this flag alone to overwrite everything.
    """
    # Use None as the default to avoid a shared mutable default dict
    if project_parameters is None:
        project_parameters = {}

    # Check there are no input errors

    # Include and exclude are not compatible
    # This is to protect the user from doing something which makes no sense
    if include and exclude:
        raise InputError('Include (-i) and exclude (-e) are not compatible. Use one of these options.')

    # Save the directory where the workflow is called from so we can come back at the very end
    workflow_call_directory = getcwd()

    # Make sure the working directory exists
    if not exists(working_directory):
        raise InputError(f'Working directory "{working_directory}" does not exist')

    # Make sure the working directory is actually a directory
    if not isdir(working_directory):
        raise InputError(f'Working directory "{working_directory}" is actually not a directory')

    # Move the current directory to the working directory
    chdir(working_directory)
    current_directory_name = basename(getcwd())
    git_version = get_git_version()
    print(f'\n{CYAN_HEADER}Running MDDB workflow ({git_version}) for project at {current_directory_name}{COLOR_END}')

    # Initiate the project
    project = Project(**project_parameters)
    print(f' {len(project.mds)} MDs are to be run')

    # Set the tasks to be run
    tasks = _resolve_tasks(download, setup, include, exclude)

    # If the user requested to overwrite something, make sure it is in the tasks list
    # Update the overwritable variable with the requested overwrites
    overwritables = _resolve_overwritables(overwrite, tasks)

    # Get project tasks
    project_tasks = [task for task in tasks if task in project_requestables]
    # Get the MD tasks
    md_tasks = [task for task in tasks if task in md_requestables]

    # Set project overwritables
    project.overwritables = {task for task in project_tasks if task in overwritables}
    # Set MD overwritables
    # Note that this must be done before running project tasks
    # Some project tasks rely on MD tasks
    for md in project.mds:
        md.overwritables = {task for task in md_tasks if task in overwritables}

    # Run the project tasks now
    for task in project_tasks:
        # Get the function to be called and call it
        getter = requestables[task]
        getter(project)

    # If there are no MD tasks then we are done already
    if len(md_tasks) == 0:
        print("Finished!")
        return

    # Now iterate over the different MDs
    for md in project.mds:
        print(f'\n{CYAN_HEADER} Processing MD at {md.directory}{COLOR_END}')
        # Run the MD tasks
        for task in md_tasks:
            # Get the function to be called and call it
            getter = requestables[task]
            getter(md)
        # Remove gromacs backups and other trash files from this MD
        remove_trash(md.directory)

    # Remove gromacs backups and other trash files from the project
    remove_trash(project.directory)

    # Return back to the place where the workflow was called originally
    # This is not important for many applications
    # But if you call the workflow function from a python script then this is important
    chdir(workflow_call_directory)

    print("Done!")