Coverage for mddb_workflow / mwf.py: 79%

1155 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-03 18:45 +0000

1# This is the starter script 

2 

3from os import chdir, walk, mkdir, getcwd 

4from os.path import exists, isdir, isabs, relpath, normpath, split, basename 

5import sys 

6import io 

7import re 

8import numpy 

9from glob import glob 

10 

11# Constants 

12# Importing constants first is important 

13from mddb_workflow.utils.constants import * 

14 

15# Import local utils 

16from mddb_workflow.utils.auxiliar import InputError, MISSING_TOPOLOGY 

17from mddb_workflow.utils.auxiliar import warn, load_json, save_json, load_yaml, save_yaml 

18from mddb_workflow.utils.auxiliar import is_glob, parse_glob, is_url, url_to_source_filename 

19from mddb_workflow.utils.auxiliar import read_ndict, write_ndict, get_git_version, download_file 

20from mddb_workflow.utils.auxiliar import is_standard_topology 

21from mddb_workflow.utils.register import Register 

22from mddb_workflow.utils.cache import Cache 

23from mddb_workflow.utils.structures import Structure 

24from mddb_workflow.utils.topologies import Topology 

25from mddb_workflow.utils.file import File 

26from mddb_workflow.utils.database import Database 

27from mddb_workflow.utils.pyt_spells import get_frames_count, get_average_structure 

28from mddb_workflow.utils.selections import Selection 

29from mddb_workflow.utils.mda_spells import get_mda_universe 

30from mddb_workflow.utils.tasks import Task 

31from mddb_workflow.utils.type_hints import * 

32 

33# Import local tools 

34from mddb_workflow.tools.get_first_frame import get_first_frame 

35from mddb_workflow.tools.get_bonds import find_safe_bonds, get_bonds_reference_frame 

36from mddb_workflow.tools.process_interactions import process_interactions 

37from mddb_workflow.tools.generate_metadata import prepare_project_metadata, generate_md_metadata 

38from mddb_workflow.tools.chains import prepare_chain_references 

39from mddb_workflow.tools.generate_pdb_references import prepare_pdb_references 

40from mddb_workflow.tools.generate_map import generate_protein_mapping 

41from mddb_workflow.tools.get_inchi_keys import generate_inchikeys, generate_inchi_references 

42from mddb_workflow.tools.get_lipids import generate_lipid_references 

43from mddb_workflow.tools.membrane_mapping import generate_membrane_mapping 

44from mddb_workflow.tools.get_ligands import generate_ligand_references 

45from mddb_workflow.tools.residue_mapping import generate_residue_mapping 

46from mddb_workflow.tools.generate_topology import generate_topology 

47from mddb_workflow.tools.get_charges import get_charges 

48from mddb_workflow.tools.remove_trash import remove_trash 

49from mddb_workflow.tools.get_screenshot import get_screenshot 

50from mddb_workflow.tools.process_input_files import process_input_files 

51from mddb_workflow.tools.provenance import produce_provenance 

52from mddb_workflow.tools.get_reduced_trajectory import calculate_frame_step 

53 

54# Import local analyses 

55from mddb_workflow.analyses.rmsds import rmsds 

56from mddb_workflow.analyses.tmscores import tmscores 

57from mddb_workflow.analyses.rmsf import rmsf 

58from mddb_workflow.analyses.rgyr import rgyr 

59from mddb_workflow.analyses.pca import pca 

60from mddb_workflow.analyses.density import density 

61from mddb_workflow.analyses.thickness import thickness 

62from mddb_workflow.analyses.area_per_lipid import area_per_lipid 

63from mddb_workflow.analyses.lipid_order import lipid_order 

64from mddb_workflow.analyses.lipid_interactions import lipid_interactions 

65from mddb_workflow.analyses.channels import channels 

66# from mddb_workflow.analyses.pca_contacts import pca_contacts 

67from mddb_workflow.analyses.rmsd_per_residue import rmsd_per_residue 

68from mddb_workflow.analyses.rmsd_pairwise import rmsd_pairwise 

69from mddb_workflow.analyses.clusters import clusters_analysis 

70from mddb_workflow.analyses.distance_per_residue import distance_per_residue 

71from mddb_workflow.analyses.hydrogen_bonds import hydrogen_bonds 

72from mddb_workflow.analyses.sasa import sasa 

73from mddb_workflow.analyses.energies import energies 

74from mddb_workflow.analyses.dihedral_energies import compute_dihedral_energies 

75from mddb_workflow.analyses.pockets import pockets 

76from mddb_workflow.analyses.rmsd_check import check_trajectory_integrity 

77from mddb_workflow.analyses.helical_parameters import helical_parameters 

78from mddb_workflow.analyses.markov import markov 

79 

# Make the system output stream to not be buffered
# Only when not in a Jupyter notebook or using pytest
# Check if we're in an interactive Python shell like Jupyter
# (interactive shells define sys.ps1; pytest registers itself in sys.modules)
if not hasattr(sys, 'ps1') and 'pytest' not in sys.modules:
    # This is useful to make prints work on time in Slurm
    # Otherwise, output logs are written after the script has fully run
    # Note that this fix affects all modules and built-ins
    # The stdout file descriptor is reopened fully unbuffered ('wb', 0) and
    # wrapped back into a text stream that flushes on every write
    unbuffered = io.TextIOWrapper(open(sys.stdout.fileno(), 'wb', 0), write_through=True)
    sys.stdout = unbuffered

# Set a special exception for missing inputs
# NOTE(review): this is used as a sentinel default (see MD.get_input), not raised;
# a plain object() sentinel compared with 'is' would be more conventional, but the
# name suggests it may be raised elsewhere in the file — confirm before changing
MISSING_INPUT_EXCEPTION = Exception('Missing input')

92 

93 

94class MD: 

95 """A Molecular Dynamics (MD) is the union of a structure and a trajectory. 

96 Having this data several analyses are possible. 

97 Note that an MD is always defined inside of a Project and thus it has additional topology and metadata. 

98 """ 

99 

    def __init__(self,
        project: 'Project',
        number: int,
        directory: str,
        input_structure_filepath: str,
        input_trajectory_filepaths: list[str],
    ):
        """Initialize the MD object.

        Args:
            project (Project): The parent project this MD belongs to.
            number (int): The number of the MD according to its accession.
            directory (str): The local directory where the MD takes place.
            input_structure_filepath (str): The input structure file path.
            input_trajectory_filepaths (list[str]): The input trajectory file paths.

        Raises:
            Exception: If no project is passed.
            InputError: If the MD directory equals the project directory.
        """
        # Save the inputs
        self.project = project
        if not project:
            raise Exception('Project is mandatory to instantiate a new MD')
        # Save the MD number and index
        self.number = number
        self.index = number - 1
        # Set the remote project handler for this specific MD
        # Only available when the project has both a database and an accession
        self.accession = None
        self.remote = None
        if self.project.database and self.project.accession:
            self.accession = f'{self.project.accession}.{self.number}'
            self.remote = self.project.database.get_remote_project(self.accession)
        # Save the directory
        self.directory = normpath(directory)
        # Now set the directory relative to the project
        self.directory = self.project.pathify(self.directory)
        if normpath(self.directory) == normpath(self.project.directory):
            raise InputError(f'MD {self.number} has the same directory as the project: {self.directory}')
        # Save the directory name alone apart
        self.directory_location, self.directory_name = split(self.directory)
        # If the directory does not exist then create it
        if not exists(self.directory):
            mkdir(self.directory)
        # Save the input structure filepath
        # They may be relative to the project directory (unique) or relative to the MD directory (one per MD)
        # If the path is absolute then it is considered unique
        # If the file does not exist and it is to be downloaded then it is downloaded for each MD
        # Prioritize the MD directory over the project directory
        self.arg_input_structure_filepath = input_structure_filepath
        self._input_structure_filepath = None
        self._input_structure_url = None
        # Set the internal variable for the input structure file, to be assigned later
        self._input_structure_file = None
        # Save the input trajectory filepaths
        self.arg_input_trajectory_filepaths = input_trajectory_filepaths
        self._input_trajectory_filepaths = None
        self._input_trajectory_urls = None
        # Set the internal variable for the input trajectory files, to be assigned later
        self._input_trajectory_files = None

        # Other values which may be found/calculated on demand
        self._md_inputs = None
        self._structure = None

        # Tests
        self._trajectory_integrity = None

        # Set a new MD specific register
        # In case the directory is the project directory itself, use the project register
        register_filepath = self.pathify(REGISTER_FILENAME)
        register_file = File(register_filepath)
        if register_file.path == self.project.register.file.path:
            self.register = self.project.register
        else:
            self.register = Register(register_file)

        # Set a new MD specific cache
        # In case the directory is the project directory itself, use the project cache
        cache_filepath = self.pathify(CACHE_FILENAME)
        cache_file = File(cache_filepath)
        if cache_file.path == self.project.cache.file.path:
            self.cache = self.project.cache
        else:
            self.cache = Cache(cache_file)

        # Set tasks whose output is to be overwritten
        self.overwritables = set()

        # Get MD inputs just to fill the inputs' "mds" value
        # Some functions may fail otherwise when its value is missing
        if self.project.is_inputs_file_available():
            self.get_md_inputs()

190 

191 def __repr__(self): 

192 """Return a string representation of the MD object.""" 

193 return 'MD' 

194 

195 def pathify(self, filename_or_relative_path: str) -> str: 

196 """Given a filename or relative path, add the MD directory path at the beginning.""" 

197 return normpath(self.directory + '/' + filename_or_relative_path) 

198 

199 # Input structure file ------------ 

200 

201 def get_input_structure_filepath(self) -> str: 

202 """Set a function to get input structure file path.""" 

203 # Return the internal value if it is already assigned 

204 if self._input_structure_filepath is not None: 

205 return self._input_structure_filepath 

206 

207 def relativize_and_parse_paths(input_path: str, may_not_exist: bool = False) -> Optional[str]: 

208 """Find out if a path is relative to MD directories or to the project directory. 

209 

210 To do so just check if the file exists in any of those. 

211 In case it exists in both or none then assume it is relative to MD directory. 

212 Parse glob notation in the process. 

213 """ 

214 # Check if it is a URL 

215 if is_url(input_path): 

216 self._input_structure_url = input_path 

217 # Set the paths for the further download 

218 source_filename = url_to_source_filename(input_path) 

219 md_relative_filepath = self.pathify(source_filename) 

220 return md_relative_filepath 

221 # Check if it is an absolute path 

222 if isabs(input_path): 

223 abs_glob_parse = parse_glob(input_path) 

224 # If we had multiple results then we complain 

225 if len(abs_glob_parse) > 1: 

226 raise InputError(f'Multiple structures found with "{input_path}": {", ".join(abs_glob_parse)}') 

227 # If we had no results then we complain 

228 if len(abs_glob_parse) == 0: 

229 if self.remote: 

230 warn('Spread syntax is not supported to download remote files') 

231 raise InputError(f'No structure found with "{input_path}"') 

232 abs_parsed_filepath = abs_glob_parse[0] 

233 return abs_parsed_filepath 

234 # Check the MD directory 

235 md_relative_filepath = self.pathify(input_path) 

236 md_glob_parse = parse_glob(md_relative_filepath) 

237 if len(md_glob_parse) > 1: 

238 raise InputError(f'Multiple structures found with "{input_path}": {", ".join(md_glob_parse)}') 

239 md_parsed_filepath = md_glob_parse[0] if len(md_glob_parse) == 1 else None 

240 if md_parsed_filepath and File(md_parsed_filepath).exists: 

241 return md_parsed_filepath 

242 # Check the project directory 

243 project_relative_filepath = self.project.pathify(input_path) 

244 project_glob_parse = parse_glob(project_relative_filepath) 

245 if len(project_glob_parse) > 1: 

246 raise InputError(f'Multiple structures found with "{input_path}": {", ".join(project_glob_parse)}') 

247 project_parsed_filepath = project_glob_parse[0] if len(project_glob_parse) == 1 else None 

248 if project_parsed_filepath and File(project_parsed_filepath).exists: 

249 return project_parsed_filepath 

250 # At this point we can conclude the input structure file does not exist 

251 # If we have no paths at all then it means a glob pattern was passed and it didn't match 

252 # Note that if a glob pattern existed then it would mean the file actually existed 

253 if len(md_glob_parse) == 0 and len(project_glob_parse) == 0: 

254 # Warn the user in case it was trying to use glob syntax to donwload remote files 

255 if self.remote: 

256 warn('Spread syntax is not supported to download remote files') 

257 raise InputError('No trajectory file was reached neither in the project directory or MD directories in path(s) ' + ', '.join(input_path)) 

258 # If the path does not exist anywhere then we asume it will be downloaded and set it relative to the MD 

259 # However make sure we have a remote 

260 # As an exception, if the 'may not exist' flag is passed then we return the result even if there is no remote 

261 if not may_not_exist and not self.remote: 

262 raise InputError(f'Cannot find a structure file by "{input_path}" anywhere') 

263 md_parsed_filepath = self.project.pathify(input_path) if f'{self.directory_name}/' in md_parsed_filepath else self.pathify(input_path) 

264 return md_parsed_filepath 

265 # If we have a value passed through command line 

266 if self.arg_input_structure_filepath: 

267 # Find out if it is relative to MD directories or to the project directory 

268 self._input_structure_filepath = relativize_and_parse_paths(self.arg_input_structure_filepath) 

269 # Save the parsed value in the inputs file 

270 self.project.update_inputs( 

271 f'mds.{self.index}.input_structure_filepath', 

272 self._input_structure_filepath) 

273 return self._input_structure_filepath 

274 # If we have a value passed through the inputs file has the value 

275 if self.project.is_inputs_file_available(): 

276 # Get the input value, whose key must exist 

277 inputs_value = self.get_input('input_structure_filepath') 

278 # If there is a valid input then use it 

279 if inputs_value: 

280 self._input_structure_filepath = relativize_and_parse_paths(inputs_value) 

281 return self._input_structure_filepath 

282 # If there is not input structure anywhere then use the input topology 

283 # We will extract the structure from it using a sample frame from the trajectory 

284 # Note that topology input filepath must exist and an input error will raise otherwise 

285 # However if we are using the standard topology file we can not extract the PDB from it (yet) 

286 if self.project.input_topology_file != MISSING_TOPOLOGY and \ 

287 not is_standard_topology(self.project.input_topology_file): 

288 return self.project.input_topology_file.path 

289 # If we can not use the topology either then surrender 

290 raise InputError('There is not input structure at all') 

291 

    def get_input_structure_file(self) -> 'File':
        """Get the input structure file from the inputs.

        If the file is not found locally then try to download it, either
        through the remote project handler or from a plain URL.

        Returns:
            File: The input structure file, guaranteed to exist on return.

        Raises:
            InputError: If the file cannot be found nor downloaded.
        """
        # If the input structure file is already defined then return it
        if self._input_structure_file:
            return self._input_structure_file
        # Otherwise we must set it
        # First set the input structure filepath
        input_structure_filepath = self.get_input_structure_filepath()
        # Now set the input structure file
        input_structure_file = File(input_structure_filepath)
        # If there is a structure URL then we must download the structure first
        input_structure_url = self._input_structure_url
        if input_structure_url and not input_structure_file.exists:
            original_filename = input_structure_url.split('/')[-1]
            # If there is a remote then use it
            if self.remote:
                # If the structure filename is the standard structure filename then use the structure endpoint instead
                if original_filename == STRUCTURE_FILENAME:
                    self.remote.download_standard_structure(input_structure_file)
                # Otherwise download the input structure file by its filename
                else:
                    self.remote.download_file(original_filename, input_structure_file)
            # Otherwise use the URL as is
            else:
                download_file(input_structure_url, input_structure_file)
        # If the file already exists then return it
        if input_structure_file.exists:
            self._input_structure_file = input_structure_file
            return self._input_structure_file
        raise InputError(f'Missing input structure file "{input_structure_file.path}"')
    input_structure_file = property(get_input_structure_file, None, None, "Input structure filename (read only)")

325 

326 # Input trajectory filename ------------ 

327 

    def get_input_trajectory_filepaths(self) -> list[str]:
        """Get the input trajectory file paths.

        Resolves the paths from the command line argument or the inputs file,
        relativizing them to the MD or project directory and parsing glob
        notation in the process.

        Returns:
            list[str]: The resolved input trajectory file paths.

        Raises:
            InputError: If no input trajectory can be resolved at all.
        """
        # Return the internal value if it is already assigned
        if self._input_trajectory_filepaths is not None:
            return self._input_trajectory_filepaths

        def relativize_and_parse_paths(input_paths: list[str]) -> list[str]:
            """Check and fix input trajectory filepaths.

            Also relativize paths to the current MD directory and parse glob notation.
            """
            checked_paths = input_paths
            # Input trajectory filepaths may be both a list or a single string
            # However we must keep a list
            if type(checked_paths) is list:
                pass
            elif type(checked_paths) is str:
                checked_paths = [checked_paths]
            else:
                raise InputError('Input trajectory filepaths must be a list of strings or a string')
            # Make sure all or none of the trajectory paths are URLs
            url_count = sum([is_url(path) for path in checked_paths])
            if not (url_count == 0 or url_count == len(checked_paths)):
                raise InputError('All trajectory paths must be paths or URLs. Mixing is not supported')
            # In case trajectory paths are URLs
            if url_count > 0:
                self._input_trajectory_urls = checked_paths
                # Set the paths for the further download
                parsed_paths = []
                for path in checked_paths:
                    source_filename = url_to_source_filename(path)
                    md_relative_filepath = self.pathify(source_filename)
                    parsed_paths.append(md_relative_filepath)
                return parsed_paths
            # Make sure all or none of the trajectory paths are absolute
            abs_count = sum([isabs(path) for path in checked_paths])
            if not (abs_count == 0 or abs_count == len(checked_paths)):
                raise InputError('All trajectory paths must be relative or absolute. Mixing is not supported')

            def parse_all_glob(paths: list[str]) -> list[str]:
                """Glob-parse and merge all paths."""
                parsed_paths = []
                for path in paths:
                    parsed_paths += parse_glob(path)
                return parsed_paths
            # In case trajectory paths are absolute
            if abs_count > 0:
                absolute_parsed_paths = parse_all_glob(checked_paths)
                # Check we successfully defined some trajectory file
                if len(absolute_parsed_paths) == 0:
                    # Warn the user in case it was trying to use glob syntax to download remote files
                    if self.remote:
                        warn('Spread syntax is not supported to download remote files')
                    raise InputError('No trajectory file was reached neither in the project directory or MD directories in path(s) ' + ', '.join(input_paths))
                return absolute_parsed_paths
            # If trajectory paths are not absolute then check if they are relative to the MD directory
            # Get paths relative to the current MD directory
            md_relative_paths = [self.pathify(path) for path in checked_paths]
            # In case there are glob characters we must parse the paths
            md_parsed_paths = parse_all_glob(md_relative_paths)
            # Check we successfully defined some trajectory file
            if len(md_parsed_paths) > 0:
                # If so, check at least one of the files do actually exist
                if any([exists(path) for path in md_parsed_paths]):
                    return md_parsed_paths
            # If no trajectory files were found then assume they are relative to the project
            # Get paths relative to the project directory
            project_relative_paths = [self.project.pathify(path) for path in checked_paths]
            # In case there are glob characters we must parse the paths
            project_parsed_paths = parse_all_glob(project_relative_paths)
            # Check we successfully defined some trajectory file
            if len(project_parsed_paths) > 0:
                # If so, check at least one of the files do actually exist
                if any([exists(path) for path in project_parsed_paths]):
                    return project_parsed_paths
            # At this point we can conclude the input trajectory file does not exist
            # If we have no paths at all then it means a glob pattern was passed and it didn't match
            # Note that if a glob pattern existed then it would mean the file actually existed
            if len(md_parsed_paths) == 0 and len(project_parsed_paths) == 0:
                # Warn the user in case it was trying to use glob syntax to download remote files
                if self.remote:
                    warn('Spread syntax is not supported to download remote files')
                raise InputError('No trajectory file was reached neither in the project directory or MD directories in path(s) ' + ', '.join(input_paths))
            # If we have a path however it may be downloaded from the database if we have a remote
            if not self.remote:
                raise InputError(f'Cannot find anywhere a trajectory file with path(s) "{", ".join(input_paths)}"')
            # Note that if input path was not glob based it will be both as project relative and MD relative
            if len(md_parsed_paths) == 0: raise ValueError('This should never happen')
            # If file is to be downloaded then we must make sure the path is relative to the project
            project_relative_paths = [
                self.project.pathify(path) if f'{self.directory_name}/' in path else self.pathify(path) for path in checked_paths
            ]
            return project_relative_paths
        # If we have a value passed through command line
        if self.arg_input_trajectory_filepaths:
            self._input_trajectory_filepaths = relativize_and_parse_paths(self.arg_input_trajectory_filepaths)
            # Save the parsed value in the inputs file
            self.project.update_inputs(
                f'mds.{self.index}.input_trajectory_filepaths',
                self._input_trajectory_filepaths)
            return self._input_trajectory_filepaths
        # Check if the inputs file has the value
        if self.project.is_inputs_file_available():
            # Get the input value
            inputs_value = self.get_input('input_trajectory_filepaths')
            if inputs_value:
                self._input_trajectory_filepaths = relativize_and_parse_paths(inputs_value)
                return self._input_trajectory_filepaths
        # If there is no trajectory available then we surrender
        raise InputError('There is not input trajectory at all')

437 

    def get_input_trajectory_files(self) -> list['File']:
        """Get the input trajectory file(s) from the inputs.

        If file(s) are not found locally then try to download them, either
        through the remote project handler or from plain URLs.

        Returns:
            list[File]: The input trajectory files, guaranteed to exist on return.

        Raises:
            InputError: If any trajectory file cannot be found nor downloaded,
                or if the sample-trajectory option is combined with an unsupported source.
        """
        # If we already defined input trajectory files then return them
        if self._input_trajectory_files is not None:
            return self._input_trajectory_files
        # Otherwise we must set the input trajectory files
        input_trajectory_filepaths = self.get_input_trajectory_filepaths()
        input_trajectory_files = [File(path) for path in input_trajectory_filepaths]
        # If there are input trajectory URLs then download the trajectories first
        input_trajectory_urls = self._input_trajectory_urls
        if input_trajectory_urls:
            for trajectory_url, trajectory_file in zip(input_trajectory_urls, input_trajectory_files):
                # If the trajectory file already exists then skip it
                if trajectory_file.exists: continue
                original_filename = trajectory_url.split('/')[-1]
                # If there is a remote then use it
                if self.remote:
                    # If the trajectory filename is the standard trajectory filename then use the trajectory endpoint instead
                    if original_filename == TRAJECTORY_FILENAME:
                        frame_selection = None
                        # When sampling, download only a reduced selection of frames
                        if self.project.sample_trajectory:
                            remote_frames = self.remote.snapshots
                            maximum_desired_frames = self.project.sample_trajectory
                            step, final_frames = calculate_frame_step(remote_frames, maximum_desired_frames)
                            frame_selection = f'1:{remote_frames}:{step}'
                        self.remote.download_trajectory(trajectory_file, frame_selection=frame_selection, format='xtc')
                    # Otherwise download the input trajectory file by its filename
                    else:
                        if self.project.sample_trajectory:
                            raise InputError('The "-smp" argument is supported only when asking for the standard trajectory')
                        self.remote.download_file(original_filename, trajectory_file)
                # Otherwise use the URL as is
                else:
                    if self.project.sample_trajectory:
                        raise InputError('The "-smp" argument is supported only when using the "-proj" argument')
                    download_file(trajectory_url, trajectory_file)
        # Find missing trajectory files
        missing_input_trajectory_files = []
        for trajectory_file in input_trajectory_files:
            if not trajectory_file.exists:
                missing_input_trajectory_files.append(trajectory_file)
        # If all files already exist then we are done
        if len(missing_input_trajectory_files) == 0:
            self._input_trajectory_files = input_trajectory_files
            return self._input_trajectory_files
        missing_filepaths = [trajectory_file.path for trajectory_file in missing_input_trajectory_files]
        raise InputError('Missing input trajectory files: ' + ', '.join(missing_filepaths))
    input_trajectory_files = property(get_input_trajectory_files, None, None, "Input trajectory filenames (read only)")

488 

    def get_md_inputs(self) -> dict:
        """Get MD specific inputs.

        Matches this MD against the inputs file entries by directory. When no
        entry matches, a new one is created from this MD's directory name and
        persisted to the inputs file.

        Returns:
            dict: The inputs belonging to this specific MD.

        Raises:
            InputError: If an MD entry has neither name nor directory.
        """
        # If we already have a value stored then return it
        if self._md_inputs:
            return self._md_inputs
        # Otherwise we must find its value
        # If we have MD inputs in the inputs file then use them
        if self.project.input_mds:
            # Iterate over the different MD inputs to find out each directory
            # We must find the MD inputs which belong to this specific MD according to this directory
            for md in self.project.input_mds:
                # Get the directory according to the inputs
                directory = md.get(MD_DIRECTORY, None)
                if directory:
                    check_directory(directory)
                # If no directory is specified in the inputs then guess it from the MD name
                else:
                    name = md.get('name', None)
                    if not name: raise InputError('There is a MD with no name and no directory. Please define at least one of them.')
                    directory = name_2_directory(name)
                # If the directory matches then this is our MD inputs
                if self.project.pathify(directory) == self.directory:
                    self._md_inputs = md
                    return self._md_inputs
        # If this MD directory has no associated inputs then it means it was passed through command line
        # We set a new MD inputs for it
        new_md_name = directory_2_name(self.directory)
        self._md_inputs = {'name': new_md_name, 'mdir': self.directory}
        # Update the inputs file with the new MD inputs
        mds = self.project.inputs.get('mds', [])
        new_mds_inputs = [*mds, self._md_inputs]
        self.project.update_inputs('mds', new_mds_inputs)
        return self._md_inputs

    md_inputs = property(get_md_inputs, None, None, "MD specific inputs (read only)")

524 

525 def get_input(self, name: str): 

526 """Get a specific 'input' value from MD inputs.""" 

527 value = self.md_inputs.get(name, MISSING_INPUT_EXCEPTION) 

528 # If we had a value then return it 

529 if value != MISSING_INPUT_EXCEPTION: 

530 return value 

531 return self.project.get_input(name) 

532 

533 # --------------------------------- 

534 

535 def get_file(self, target_file: File) -> bool: 

536 """Check if a file exists. If not, try to download it from the database. 

537 If the file is not found in the database it is fine, we do not even warn the user. 

538 Note that this function is used to get populations and transitions files, which are not common. 

539 """ 

540 # If it exists we are done 

541 if target_file.exists: 

542 return True 

543 # Try to download the missing file 

544 # If we do not have the required parameters to download it then we surrender here 

545 if not self.remote: 

546 return False 

547 # Check if the file is among the available remote files 

548 # If it is no then stop here 

549 if target_file.filename not in self.remote.available_files: 

550 return False 

551 # Download the file 

552 self.remote.download_file(target_file.filename, target_file) 

553 return True 

554 

555 def print_tests_summary(self): 

556 """Make a summary of tests and their status.""" 

557 print('Tests summary:') 

558 for test_name in AVAILABLE_CHECKINGS: 

559 test_result = self.register.tests.get(test_name, None) 

560 # Print things pretty 

561 test_nice_name = NICE_NAMES[test_name] 

562 test_nice_result = None 

563 if test_result is None: 

564 test_nice_result = YELLOW_HEADER + 'Not run' + COLOR_END 

565 elif test_result is False: 

566 test_nice_result = RED_HEADER + 'Failed' + COLOR_END 

567 elif test_result is True: 

568 test_nice_result = GREEN_HEADER + 'Passed' + COLOR_END 

569 elif test_result == 'na': 

570 test_nice_result = BLUE_HEADER + 'Not applicable' + COLOR_END 

571 else: 

572 raise ValueError() 

573 

574 print(f' - {test_nice_name} -> {test_nice_result}') 

575 

576 # Issue some warnings if failed or never run tests are skipped 

577 # This is run after processing input files 

578 # RUBEN: mover a inpro? 

579 def _issue_required_test_warnings(self): 

580 for test_name in AVAILABLE_CHECKINGS: 

581 # If test was not skipped then proceed 

582 if test_name not in self.project.trust: continue 

583 # If test passed in a previous run the proceed 

584 test_result = self.register.tests.get(test_name) 

585 if test_result is True: continue 

586 # If test failed in a previous run we can also proceed 

587 # The failing warning must be among the inherited warnings, so there is no need to add more warnings here 

588 if test_result is False: continue 

589 # If the test has been always skipped then issue a warning 

590 if test_result is None: 

591 # Remove previous warnings 

592 self.register.remove_warnings(test_name) 

593 # Get test pretty name 

594 test_nice_name = NICE_NAMES[test_name] 

595 # Issue the corresponding warning 

596 self.register.add_warning(test_name, test_nice_name + ' was skipped and never run before') 

597 continue 

598 raise ValueError('Test value is not supported') 

599 

600 # Processed files ---------------------------------------------------- 

601 

602 def get_topology_filepath(self) -> str: 

603 """Get the processed topology file path.""" 

604 return self.project.get_topology_filepath() 

605 topology_filepath = property(get_topology_filepath, None, None, "Topology file path (read only)") 

606 

    # Run the actual processing to generate output processed files out of input raw files
    # And by "files" I mean structure, trajectory and topology
    input_files_processing = Task('inpro', 'Input files processing', process_input_files,
        output_filenames={
            'output_topology_file': get_topology_filepath,
            'output_structure_file': STRUCTURE_FILENAME,
            'output_trajectory_file': TRAJECTORY_FILENAME,
        })

    # Output main files
    # The Task generates getters so the processed files are produced on demand
    get_structure_file = input_files_processing.get_output_file_getter('output_structure_file')
    structure_file = property(get_structure_file, None, None, "Structure file (read only)")
    get_trajectory_file = input_files_processing.get_output_file_getter('output_trajectory_file')
    trajectory_file = property(get_trajectory_file, None, None, "Trajectory file (read only)")

621 

def get_topology_file(self) -> 'File':
    """Return the processed topology file handle, delegating to the project."""
    return self.project.get_topology_file()
topology_file = property(fget=get_topology_file,
    doc="Topology filename from the project (read only)")

627 

628 # --------------------------------------------------------------------------------- 

629 # Others values which may be found/calculated and files to be generated on demand 

630 # --------------------------------------------------------------------------------- 

631 

# Trajectory snapshots
# Number of frames in the processed trajectory, counted lazily through the Task machinery
get_snapshots = Task('frames', 'Count trajectory frames', get_frames_count)
snapshots = property(get_snapshots, None, None, "Trajectory snapshots (read only)")

635 

def get_reference_bonds(self) -> list[list[int]]:
    """Return the trusted atom bonds, as resolved by the project."""
    return self.project.reference_bonds
reference_bonds = property(fget=get_reference_bonds, doc="Atom bonds to be trusted (read only)")

640 

def get_structure(self) -> 'Structure':
    """Get the parsed structure.

    Lazily parses the processed structure (PDB) file and caches the result in
    ``self._structure``. When the stable bonds test did not pass — or never
    ran — or when there are coarse grain atoms, the structure bonds are
    replaced by the reference bonds so they stay coherent with the topology.

    Raises:
        ValueError: if the processed structure file does not exist yet.
    """
    # If we already have a stored value then return it
    if self._structure:
        return self._structure
    # Otherwise we must set the structure
    # Make sure the structure file exists at this point
    if not self.structure_file.exists:
        raise ValueError('Trying to set standard structure but file '
            f'{self.structure_file.path} does not exist yet. Are you trying '
            'to access the standard structure before processing input files?')
    # Note that this is not only the structure class, but it also contains additional logic
    self._structure = Structure.from_pdb_file(self.structure_file.path)
    # If the stable bonds test failed and we had mercy then it is sure our structure will have wrong bonds
    # In order to make it coherent with the topology we will mine topology bonds from here and force them in the structure
    # If we fail to get bonds from topology then just go along with the default structure bonds
    # NOTE: 'not ... get(..., None)' also matches a never-run (None) test result, not only a failed one
    if not self.register.tests.get(STABLE_BONDS_FLAG, None):
        self._structure.bonds = self.reference_bonds
    # Same procedure if we have coarse grain atoms
    elif self.cg_selection:
        self._structure.bonds = self.reference_bonds
    return self._structure
structure = property(get_structure, None, None, "Parsed structure (read only)")

664 

# First frame PDB file
# NOTE: this class attribute shadows the imported get_first_frame function, which is wrapped here
get_first_frame = Task('firstframe', 'Get first frame structure',
    get_first_frame, output_filenames={'output_file': FIRST_FRAME_FILENAME})
get_first_frame_file = get_first_frame.get_output_file_getter('output_file')
first_frame_file = property(get_first_frame_file, None, None, "First frame (read only)")

# Average structure filename
# NOTE: this class attribute shadows the imported get_average_structure function, which is wrapped here
get_average_structure = Task('average', 'Get average structure',
    get_average_structure, output_filenames={'output_file': AVERAGE_STRUCTURE_FILENAME})
get_average_structure_file = get_average_structure.get_output_file_getter('output_file')
average_structure_file = property(get_average_structure_file, None, None, "Average structure filename (read only)")

# Produce the MD metadata file to be uploaded to the database
prepare_metadata = Task('mdmeta', 'Prepare MD metadata',
    generate_md_metadata, output_filenames={'output_file': OUTPUT_METADATA_FILENAME})

# The processed interactions
get_processed_interactions = Task('inter', 'Interactions processing',
    process_interactions, {'frames_limit': 1000})
interactions = property(get_processed_interactions, None, None, "Processed interactions (read only)")

# MDAnalysis Universe object
# Not cached: presumably the universe holds open file handles which must not be reused — confirm
get_MDAnalysis_Universe = Task('mda_univ', 'MDAnalysis Universe object',
    get_mda_universe, use_cache=False)
universe = property(get_MDAnalysis_Universe, None, None, "MDAnalysis Universe object (read only)")

690 

def input_getter(name: str):
    """Build a getter for an input value which may be MD specific.

    The returned getter looks the value up in the MD inputs first and falls
    back to the project-level input (attribute ``input_<name>``) when the MD
    value is missing (None).
    """
    def getter(self):
        md_value = self.md_inputs.get(name, None)
        if md_value is None:
            # No MD-specific value -> fall back to the project input
            return getattr(self.project, f'input_{name}')
        return md_value
    return getter

704 

# Assign the MD input getters
# Each property resolves the value from the MD inputs first, falling back to the project inputs
input_interactions = property(input_getter('interactions'), None, None, "Interactions to be analyzed (read only)")
input_pbc_selection = property(input_getter('pbc_selection'), None, None, "Selection of atoms which are still in periodic boundary conditions (read only)")
input_cg_selection = property(input_getter('cg_selection'), None, None, "Selection of atoms which are not actual atoms but coarse grain beads (read only)")

709 

def _set_pbc_selection(self, reference_structure: 'Structure', verbose: bool = False) -> 'Selection':
    """Set the PBC (Periodic Boundary Conditions) atom selection.

    Parses the inputs file selection string when available, or guesses the
    selection automatically otherwise.

    Args:
        reference_structure (Structure): structure used to parse or guess the selection.
        verbose (bool): if True, print how the selection is being resolved.

    Returns:
        Selection: the parsed PBC atom selection (may be empty).

    Raises:
        InputError: if automatic guessing is requested on a CG system, or if
            the input selection string selects no atoms.
    """
    if verbose: print('Setting Periodic Boundary Conditions (PBC) atoms selection')
    selection_string = None
    # If there is an inputs file then get the input pbc selection
    if self.project.is_inputs_file_available():
        if verbose: print(' Using selection string in the inputs file')
        selection_string = self.input_pbc_selection
    # If there is no inputs file we guess PBC atoms automatically
    else:
        if verbose: print(' No inputs file -> Selection string will be set automatically')
        selection_string = 'auto'
    # Parse the selection string using the reference structure
    parsed_selection = None
    # If the input PBC selection is 'auto' then guess it automatically
    if selection_string == 'auto':
        # To guess PBC atoms (with the current implementation) we must make sure there is no CG
        if reference_structure.has_cg():
            raise InputError('We can not guess PBC atoms in CG systems. Please set PBC atoms manually.\n'
                ' Use the "-pbc" argument or set the inputs file "pbc_selection" field.')
        if verbose: print(' Guessing PBC atoms as solvent, counter ions and lipids')
        parsed_selection = reference_structure.select_pbc_guess()
    # If we have a valid input value then use it
    elif selection_string:
        if verbose: print(f' Selecting PBC atoms "{selection_string}"')
        parsed_selection = reference_structure.select(selection_string)
        if not parsed_selection:
            raise InputError(f'PBC selection "{selection_string}" selected no atoms')
    # If we have an input value but it is empty then we set an empty selection
    else:
        if verbose: print(' No PBC atoms selected')
        parsed_selection = Selection()
    # Log the selection size and a few of the selected residue names
    if verbose and parsed_selection:
        print(f' Parsed PBC selection has {len(parsed_selection)} atoms')
        selected_residues = reference_structure.get_selection_residues(parsed_selection)
        selected_residue_names = list(set([residue.name for residue in selected_residues]))
        limit = 3 # Show a maximum of 3 residue names
        example_residue_names = ', '.join(selected_residue_names[0:limit])
        if len(selected_residue_names) > limit: example_residue_names += ', etc.'
        print(' e.g. ' + example_residue_names)
    return parsed_selection

755 

def get_pbc_selection(self) -> 'Selection':
    """Return the PBC atom selection, computing and caching it on first access.

    The value is cached on the project so sibling MDs share it.
    """
    cached = self.project._pbc_selection
    if cached is not None:
        return cached
    # Not computed yet -> resolve it from the inputs (or guess it) and cache it
    self.project._pbc_selection = self._set_pbc_selection(self.structure)
    return self.project._pbc_selection
pbc_selection = property(get_pbc_selection, None, None, "Periodic boundary conditions atom selection (read only)")

765 

# WARNING: Do not inherit project pbc residues
# WARNING: It may trigger all the processing logic of the reference MD when there is no need
def get_pbc_residues(self) -> list[int]:
    """Get indices of residues in periodic boundary conditions.

    The result is cached on the project. The cache is checked against None
    explicitly: an empty list (no PBC residues) is a valid, final result and
    must not trigger a recomputation — and possibly the whole PBC selection
    machinery — on every access.
    """
    # If we already have a stored value then return it
    if self.project._pbc_residues is not None:
        return self.project._pbc_residues
    # If there is no PBC selection then assume there are no PBC residues
    if not self.pbc_selection:
        self.project._pbc_residues = []
        return self.project._pbc_residues
    # Otherwise we parse the selection and return the list of residue indices
    self.project._pbc_residues = self.structure.get_selection_residue_indices(self.pbc_selection)
    print(f'PBC residues "{self.input_pbc_selection}" -> {len(self.project._pbc_residues)} residues')
    return self.project._pbc_residues
pbc_residues = property(get_pbc_residues, None, None, "Indices of residues in periodic boundary conditions (read only)")

782 

# DANI: Someday we should try to automate this

def _set_cg_selection(self, reference_structure: 'Structure', verbose: bool = False) -> 'Selection':
    """Set the coarse grain (CG) atom selection.

    Args:
        reference_structure (Structure): structure used to parse the selection.
        verbose (bool): if True, print how the selection is being resolved.

    Returns:
        Selection: the parsed CG selection. An empty Selection is returned
        when there is no inputs file or the input selection string is empty.
    """
    if verbose: print('Setting Coarse Grained (CG) atoms selection')
    # If there is no inputs file then assume there is no CG selection
    if not self.project.is_inputs_file_available():
        if verbose: print(' No inputs file -> Asuming there is no CG at all')
        return Selection()
    # Otherwise we use the selection string from the inputs
    if verbose: print(' Using selection string in the inputs file')
    selection_string = self.input_cg_selection
    # If the selection is empty, again, assume there is no CG selection
    if not selection_string:
        if verbose: print(' Empty selection -> There is no CG at all')
        return Selection()
    # Otherwise parse the selection string against the reference structure
    # Note the previous 'elif/else' here was dead code: at this point the
    # selection string is guaranteed to be non-empty
    if verbose: print(f' Selecting CG atoms "{selection_string}"')
    parsed_selection = reference_structure.select(selection_string)
    # Log the parsed selection size
    if verbose: print(f' Parsed CG selection has {len(parsed_selection)} atoms')
    # Log a few of the selected residue names
    if verbose and parsed_selection:
        selected_residues = reference_structure.get_selection_residues(parsed_selection)
        selected_residue_names = list(set([residue.name for residue in selected_residues]))
        limit = 3 # Show a maximum of 3 residue names
        example_residue_names = ', '.join(selected_residue_names[0:limit])
        if len(selected_residue_names) > limit: example_residue_names += ', etc.'
        print(' e.g. ' + example_residue_names)
    return parsed_selection

818 

def get_cg_selection(self) -> 'Selection':
    """Get the coarse grain (CG) atom selection.

    The selection is computed once through _set_cg_selection and cached on the
    project. The cache is checked against None explicitly: an empty selection
    (no CG atoms at all) is a valid, final result and must not trigger a
    recomputation on every access.
    """
    # If we already have a stored value then return it
    if self.project._cg_selection is not None:
        return self.project._cg_selection
    # Otherwise we must set the CG selection
    self.project._cg_selection = self._set_cg_selection(self.structure)
    return self.project._cg_selection
cg_selection = property(get_cg_selection, None, None, "Coarse grain atom selection (read only)")

828 

# WARNING: Do not inherit project cg residues
# WARNING: It may trigger all the processing logic of the reference MD when there is no need
def get_cg_residues(self) -> list[int]:
    """Get indices of residues in coarse grain.

    The result is cached on the project. The cache is checked against None
    explicitly: an empty list (no CG residues) is a valid, final result and
    must not trigger a recomputation on every access.
    """
    # If we already have a stored value then return it
    if self.project._cg_residues is not None:
        return self.project._cg_residues
    # If there is no CG selection then assume there are no CG residues
    if not self.cg_selection:
        self.project._cg_residues = []
        return self.project._cg_residues
    # Otherwise we parse the selection and return the list of residue indices
    self.project._cg_residues = self.structure.get_selection_residue_indices(self.cg_selection)
    print(f'CG residues "{self.input_cg_selection}" -> {len(self.project._cg_residues)} residues')
    return self.project._cg_residues
cg_residues = property(get_cg_residues, None, None, "Indices of residues in coarse grain (read only)")

845 

def get_populations(self) -> list[float]:
    """Return the MSM equilibrium populations stored in the project."""
    return self.project.populations
populations = property(fget=get_populations, doc="Equilibrium populations from a MSM (read only)")

850 

def get_transitions(self) -> list[list[float]]:
    """Return the MSM transition probabilities stored in the project."""
    return self.project.transitions
transitions = property(fget=get_transitions, doc="Transition probabilities from a MSM (read only)")

855 

def get_protein_map(self) -> dict:
    """Return the protein residues mapping stored in the project."""
    return self.project.protein_map
protein_map = property(fget=get_protein_map, doc="Residues mapping (read only)")

860 

def get_charges(self) -> dict:
    """Return the charges stored in the project.

    Note the previous docstring wrongly said "residues mapping" (copy-paste
    from get_protein_map); this getter delegates to project.charges.
    """
    return self.project.charges
charges = property(fget=get_charges, doc="Residues charges (read only)")

865 

# Reference frame
# NOTE(review): computed by get_bonds_reference_frame — presumably the frame whose
# bonds are considered trustworthy; confirm against the get_bonds tool
get_reference_frame = Task('reframe', 'Reference frame', get_bonds_reference_frame)
reference_frame = property(get_reference_frame, None, None, "Reference frame to be used to represent the MD (read only)")

869 

870 # --------------------------------------------------------------------------------- 

871 # Tests 

872 # --------------------------------------------------------------------------------- 

873 

def is_trajectory_integral(self) -> Optional[bool]:
    """Run the trajectory integrity (RMSD sudden jumps) test.

    The check runs once over all atoms and the result is cached in
    ``self._trajectory_integrity``.

    Returns:
        Optional[bool]: the check result as returned by
        check_trajectory_integrity; cached only when not None.
    """
    # If we already have a stored value then return it
    if self._trajectory_integrity is not None:
        return self._trajectory_integrity
    # Otherwise we must find the value
    self._trajectory_integrity = check_trajectory_integrity(
        input_structure_filename=self.structure_file.path,
        input_trajectory_filename=self.trajectory_file.path,
        structure=self.structure,
        pbc_selection=self.pbc_selection,
        # mercy/trust decide whether a failure is tolerated or the test is skipped
        mercy=self.project.mercy,
        trust=self.project.trust,
        register=self.register,
        # time_length = self.time_length,
        check_selection=ALL_ATOMS,
        # Number of standard deviations away from the mean for an RMSD value to count as a jump
        standard_deviations_cutoff=self.project.rmsd_cutoff,
        snapshots=self.snapshots,
    )
    return self._trajectory_integrity

894 

895 # --------------------------------------------------------------------------------- 

896 # Analyses 

897 # --------------------------------------------------------------------------------- 

898 

# Analysis tasks
# NOTE(review): the optional dict after the analysis function appears to hold extra
# keyword arguments forwarded to it (e.g. frames_limit) — confirm against Task

# RMSDs analysis
run_rmsds_analysis = Task('rmsds', 'RMSDs analysis',
    rmsds, {'frames_limit': 5000})

# TM scores analysis
run_tmscores_analysis = Task('tmscore', 'TM scores analysis',
    tmscores, {'frames_limit': 200})

# RMSF, atom fluctuation analysis
run_rmsf_analysis = Task('rmsf', 'Fluctuation analysis', rmsf)

# Radius of gyration analysis
run_rgyr_analysis = Task('rgyr', 'Radius of gyration analysis',
    rgyr, {'frames_limit': 5000})

# PCA, principal component analysis
run_pca_analysis = Task('pca', 'Principal component analysis',
    pca, {'frames_limit': 2000, 'projection_frames': 20})

# PCA contacts
# DANI: It tries to use a lot of memory, this needs a review
# DANI: It may raise an error for failing to allocate so much memory
# DANI: It may eat all the RAM and end with a 'Terminado (killed)' error
# DANI: For now I am skipping it
# DANI: It has been unmaintained for a long time, several things must change to recover it
# run_pca_contacts('pcacons', 'PCA contacts', pca_contacts)

# RMSD per residue analysis
run_rmsd_perres_analysis = Task('perres', 'RMSD per residue analysis',
    rmsd_per_residue, {'frames_limit': 100})

# RMSD pairwise
# Perform an analysis for the overall structure and then one more analysis for each interaction
run_rmsd_pairwise_analysis = Task('pairwise', 'RMSD pairwise',
    rmsd_pairwise, {'frames_limit': 200, 'overall_selection': "name CA or name C5'"})

# Run the cluster analysis
run_clusters_analysis = Task('clusters', 'Clusters analysis',
    clusters_analysis, {'frames_limit': 1000, 'desired_n_clusters': 20})

# Calculate the distance mean and standard deviation of each pair of residues
run_dist_perres_analysis = Task('dist', 'Distance per residue',
    distance_per_residue, {'frames_limit': 200})

# Hydrogen bonds
run_hbonds_analysis = Task('hbonds', 'Hydrogen bonds analysis',
    hydrogen_bonds, {'time_splits': 100})

# SASA, solvent accessible surface analysis
run_sas_analysis = Task('sas', 'Solvent accessible surface analysis',
    sasa, {'frames_limit': 100})

# Perform the electrostatic and vdw energies analysis for each pair of interaction agents
run_energies_analysis = Task('energies', 'Energies analysis',
    energies, {'frames_limit': 100})

# Calculate torsions and then dihedral energies for every dihedral along the trajectory
run_dihedral_energies = Task('dihedrals', 'Dihedral energies analysis',
    compute_dihedral_energies, {'frames_limit': 100})

# Perform the pockets analysis
run_pockets_analysis = Task('pockets', 'Pockets analysis',
    pockets, {'frames_limit': 100, 'maximum_pockets_number': 10})

# Helical parameters
run_helical_analysis = Task('helical', 'Helical parameters', helical_parameters)

# Markov
run_markov_analysis = Task('markov', 'Markov', markov, {'rmsd_selection': PROTEIN_AND_NUCLEIC})

# Membrane density analysis
run_density_analysis = Task('density', 'Membrane density analysis',
    density, {'frames_limit': 1000})

# Membrane thickness analysis
run_thickness_analysis = Task('thickness', 'Membrane thickness analysis',
    thickness, {'frames_limit': 100})

# Area per lipid analysis
run_apl_analysis = Task('apl', 'Membrane area per lipid analysis', area_per_lipid)

# Calculate lipid order parameters for membranes
run_lipid_order_analysis = Task('lorder', 'Membrane lipid order analysis',
    lipid_order, {'frames_limit': 100})

# Lipid-protein interactions analysis
run_lipid_interactions_analysis = Task('linter', 'Membrane lipid-protein interactions analysis',
    lipid_interactions, {'frames_limit': 100})

# Membrane channels analysis
run_channels_analysis = Task('channels', 'Membrane channels analysis',
    channels, {'frames_limit': 10})

990 

def get_warnings(self) -> list:
    """Return the live warnings list held by the register.

    The list is deliberately read from the register on every access instead of
    being cached here: it used to be reassigned back in the day, and caching
    could silently hide such reassignments.
    """
    return self.register.warnings
warnings = property(fget=get_warnings, doc="MD warnings to be written in metadata")

999 

1000 

1001class Project: 

1002 """Class for the main project of an MDDB accession. 

1003 A project is a set of related MDs. 

1004 These MDs share all or most topology and metadata. 

1005 """ 

1006 

1007 def __init__(self, 

1008 directory: str = '.', 

1009 accession: Optional[str] = None, 

1010 database_url: str = DEFAULT_API_URL, 

1011 inputs_filepath: str = None, 

1012 input_topology_filepath: Optional[str] = None, 

1013 input_structure_filepath: Optional[str] = None, 

1014 input_trajectory_filepaths: Optional[list[str]] = None, 

1015 md_directories: Optional[list[str]] = None, 

1016 md_config: Optional[list[list[str]]] = None, 

1017 reference_md_index: Optional[int] = None, 

1018 forced_inputs: Optional[list[list[str]]] = None, 

1019 populations_filepath: str = DEFAULT_POPULATIONS_FILENAME, 

1020 transitions_filepath: str = DEFAULT_TRANSITIONS_FILENAME, 

1021 aiida_data_filepath: Optional[str] = None, 

1022 filter_selection: bool | str = False, 

1023 pbc_selection: Optional[str] = None, 

1024 cg_selection: Optional[str] = None, 

1025 image: bool = False, 

1026 fit: bool = False, 

1027 translation: list[float] = [0, 0, 0], 

1028 mercy: list[str] | bool = [], 

1029 trust: list[str] | bool = [], 

1030 faith: bool = False, 

1031 ssleep: bool = False, 

1032 pca_analysis_selection: str = PROTEIN_AND_NUCLEIC_BACKBONE, 

1033 pca_fit_selection: str = PROTEIN_AND_NUCLEIC_BACKBONE, 

1034 rmsd_cutoff: float = DEFAULT_RMSD_CUTOFF, 

1035 interaction_cutoff: float = DEFAULT_INTERACTION_CUTOFF, 

1036 interactions_auto: Optional[str] = None, 

1037 guess_bonds: bool = False, 

1038 ignore_bonds: bool = False, 

1039 sample_trajectory: Optional[int] = None, 

1040 ): 

1041 """Initialize a Project. 

1042 

1043 Args: 

1044 directory (str): 

1045 Local directory where the project takes place. 

1046 accession (Optional[str]): 

1047 Project accession to download missing input files from the database (if already uploaded). 

1048 database_url (str): 

1049 API URL to download missing data. when an accession is provided. 

1050 inputs_filepath (str): 

1051 Path to a file with inputs for metadata, simulation parameters and analysis config. 

1052 input_topology_filepath (Optional[str]): 

1053 Path to input topology file relative to the project directory. 

1054 input_structure_filepath (Optional[str]): 

1055 Path to input structure file. It may be relative to the project or to each MD directory. 

1056 If this value is not passed then the standard structure file is used as input by default. 

1057 input_trajectory_filepaths (Optional[list[str]]): 

1058 Paths to input trajectory files relative to each MD directory. 

1059 If this value is not passed then the standard trajectory file path is used as input by default. 

1060 md_directories (Optional[list[str]]): 

1061 Path to the different MD directories. 

1062 Each directory is to contain an independent trajectory and structure. 

1063 Several output files will be generated in every MD directory. 

1064 md_config (Optional[list]): 

1065 Configuration of a specific MD. You may declare as many as you want. 

1066 Every MD requires a directory name and at least one trajectory path. 

1067 The structure is -md <directory> <trajectory_1> <trajectory_2> ... 

1068 Note that all trajectories from the same MD will be merged. 

1069 For legacy reasons, you may also provide a specific structure for an MD. 

1070 e.g. -md <directory> <structure> <trajectory_1> <trajectory_2> ... 

1071 reference_md_index (Optional[int]): 

1072 Index of the reference MD (used by project-level functions; defaults to first MD). 

1073 forced_inputs (Optional[list]): 

1074 Force a specific input through the command line. 

1075 Inputs passed through command line have priority over the ones from the inputs file. 

1076 In fact, these values will overwritten or be appended in the inputs file. 

1077 Every forced input requires an input name and a value. 

1078 The structure is -fi <input name> <new input value> 

1079 populations_filepath (str): 

1080 Path to equilibrium populations file (Markov State Model only) 

1081 transitions_filepath (str): 

1082 Path to transition probabilities file (Markov State Model only). 

1083 aiida_data_filepath (Optional[str]): 

1084 Path to the AiiDA data file. 

1085 This file may be generated by the aiida-gromacs plugin and contains provenance data. 

1086 filter_selection (bool|str): 

1087 Atoms selection to be filtered in VMD format. 

1088 If the argument is passed alone (i.e. with no selection) then water and counter ions are filtered. 

1089 pbc_selection (Optional[str]): 

1090 Selection of atoms which stay in Periodic Boundary Conditions even after imaging the trajectory. 

1091 e.g. remaining solvent, ions, membrane lipids, etc. 

1092 Selection passed through console overrides the one in inputs file. 

1093 cg_selection (Optional[str]): 

1094 Selection of atoms which are not actual atoms but Coarse Grained beads. 

1095 Selection passed through console overrides the one in inputs file. 

1096 image (bool): 

1097 Set if the trajectory is to be imaged so atoms stay in the PBC box. See -pbc for more information. 

1098 fit (bool): 

1099 Set if the trajectory is to be fitted (both rotation and translation) to minimize the RMSD to PROTEIN_AND_NUCLEIC_BACKBONE selection. 

1100 translation (list[float]): 

1101 Set the x y z translation for the imaging process. 

1102 e.g. -trans 0.5 -1 0 

1103 mercy (list[str]|bool): 

1104 Failures to be tolerated (or boolean to set all/none). 

1105 trust (list[str]|bool): 

1106 Tests to skip/trust (or boolean to set all/none). 

1107 faith (bool): 

1108 If True, require input files to match expected output files and skip processing. 

1109 ssleep (bool): 

1110 If True, SSL certificate authentication is skipped when downloading data from an API. 

1111 pca_analysis_selection (str): 

1112 Atom selection for PCA analysis in VMD syntax. 

1113 pca_fit_selection (str): 

1114 Atom selection for the PCA fitting in VMD syntax. 

1115 rmsd_cutoff (float): 

1116 Set the cutoff for the RMSD sudden jumps analysis to fail. 

1117 This cutoff stands for the number of standard deviations away from the mean an RMSD value is to be. 

1118 interaction_cutoff (float): 

1119 Set the cutoff for the interactions analysis to fail. 

1120 This cutoff stands for percent of the trajectory where the interaction happens (from 0 to 1). 

1121 interactions_auto (Optional[str]): 

1122 Guess input interactions automatically. A VMD selection may be passed to limit guessed interactions to a specific subset of atoms. 

1123 guess_bonds (bool): 

1124 Force the workflow to guess atom bonds based on distance and atom radii in different frames along the trajectory instead of mining topology bonds. 

1125 ignore_bonds (bool): 

1126 Force the workflow to ignore atom bonds. This will result in many check-ins being skipped 

1127 sample_trajectory (Optional[int]): 

1128 If passed, download the first 10 (by default) frames from the trajectory. 

1129 You can specify a different number by providing an integer value. 

1130 

1131 """ 

1132 # Save input parameters 

1133 # RUBEN: directory nunca se usa, eliminar? 

1134 self.directory = normpath(directory) 

1135 # If it is an absolute path then make it relative to the project 

1136 if isabs(self.directory): 

1137 self.directory = relpath(self.directory) 

1138 # Save the directory name alone apart 

1139 if self.directory == '.': 

1140 self.directory_name = basename(getcwd()) 

1141 else: 

1142 self.directory_name = basename(self.directory) 

1143 

1144 # Set the database handler 

1145 self.ssleep = ssleep 

1146 self.database = Database(database_url, self.ssleep) if database_url else None 

1147 # Set the remote project handler 

1148 self.accession = accession 

1149 self.remote = None 

1150 if self.database and self.accession: 

1151 self.remote = self.database.get_remote_project(self.accession) 

1152 

1153 # Set the inputs file 

1154 # Set the expected default name in case there is no inputs file since it may be downloaded 

1155 self._inputs_file = File(self.pathify(DEFAULT_INPUTS_FILENAME)) 

1156 # If there is an input filepath then use it 

1157 if inputs_filepath: 

1158 self._inputs_file = File(inputs_filepath) 

1159 # Otherwise guess the inputs file using the accepted filenames 

1160 else: 

1161 for filename in ACCEPTED_INPUT_FILENAMES: 

1162 if exists(filename): 

1163 self._inputs_file = File(filename) 

1164 break 

1165 # Set the input topology file 

1166 # Note that even if the input topology path is passed we do not check it exists 

1167 # Never forget we can donwload some input files from the database on the fly 

1168 self.arg_input_topology_filepath = input_topology_filepath 

1169 self._input_topology_filepath = None 

1170 self._input_topology_file = None 

1171 self._input_topology_url = None 

1172 # Input structure and trajectory filepaths 

1173 # Do not parse them to files yet, let this to the MD class 

1174 self.input_structure_filepath = input_structure_filepath 

1175 self.input_trajectory_filepaths = input_trajectory_filepaths 

1176 

1177 # Make sure the new MD configuration (-md) was not passed as well as old MD inputs (-mdir, -traj) 

1178 if md_config and (md_directories or input_trajectory_filepaths): 

1179 raise InputError('MD configurations (-md) is not compatible with old MD inputs (-mdir, -traj)') 

1180 # Save the MD configurations 

1181 self.md_config = md_config 

1182 # Make sure MD configuration has the correct format 

1183 if self.md_config: 

1184 # Make sure all MD configurations have at least 3 values each 

1185 for mdc in self.md_config: 

1186 if len(mdc) < 2: 

1187 raise InputError('Wrong MD configuration: the patter is -md <directory> <trajectory> <trajectory 2> ...') 

1188 # Make sure there are no duplictaed MD directories 

1189 md_directories = [mdc[0] for mdc in self.md_config] 

1190 if len(md_directories) > len(set(md_directories)): 

1191 raise InputError('There are duplicated MD directories') 

1192 

1193 # Input populations and transitions for MSM 

1194 self.populations_filepath = populations_filepath 

1195 self._populations_file = File(self.populations_filepath) 

1196 self.transitions_filepath = transitions_filepath 

1197 self._transitions_file = File(self.transitions_filepath) 

1198 # Input AiiDA data 

1199 self.aiida_data_filepath = aiida_data_filepath 

1200 self._aiida_data_file = File(self.aiida_data_filepath) if aiida_data_filepath else None 

1201 

1202 # Set the processed topology filepath, which depends on the input topology filename 

1203 # Note that this file is different from the standard topology, although it may be standard as well 

1204 self._topology_filepath = None 

1205 

1206 # Set the standard topology file 

1207 self._standard_topology_file = None 

1208 

1209 # Set the MD directories 

1210 self._md_directories = md_directories 

1211 # Check input MDs are correct to far 

1212 if self._md_directories: 

1213 self.check_md_directories() 

1214 

1215 # Set the reference MD 

1216 self._reference_md = None 

1217 self._reference_md_index = reference_md_index 

1218 

1219 # Set the rest of inputs 

1220 # Note that the filter selection variable is not handled here at all 

1221 # This is just pased to the filtering function which knows how to handle the default 

1222 self.filter_selection = filter_selection 

1223 # PBC selection may come from the console or from the inputs 

1224 self._input_pbc_selection = pbc_selection 

1225 self._input_cg_selection = cg_selection 

1226 self.image = image 

1227 self.fit = fit 

1228 self.translation = translation 

1229 self.mercy = mercy 

1230 # Fix the mercy input, if needed 

1231 # If a boolean is passed instead of a list then we set its corresponding value 

1232 if type(mercy) is bool: 

1233 if mercy: 

1234 self.mercy = AVAILABLE_FAILURES 

1235 else: 

1236 self.mercy = [] 

1237 self.trust = trust 

1238 # Fix the trust input, if needed 

1239 # If a boolean is passed instead of a list then we set its corresponding value 

1240 if type(trust) is bool: 

1241 if trust: 

1242 self.trust = AVAILABLE_CHECKINGS 

1243 else: 

1244 self.trust = [] 

1245 self.faith = faith 

1246 self.pca_analysis_selection = pca_analysis_selection 

1247 self.pca_fit_selection = pca_fit_selection 

1248 self.rmsd_cutoff = rmsd_cutoff 

1249 self.interaction_cutoff = interaction_cutoff 

1250 self.sample_trajectory = sample_trajectory 

1251 self.interactions_auto = interactions_auto 

1252 self.guess_bonds = guess_bonds 

1253 self.ignore_bonds = ignore_bonds 

1254 # Set the inputs, where values from the inputs file will be stored 

1255 self._inputs = None 

1256 

1257 # Other values which may be found/calculated on demand 

1258 self._pbc_selection = None 

1259 self._pbc_residues = None 

1260 self._cg_selection = None 

1261 self._cg_residues = None 

1262 self._reference_bonds = None 

1263 self._topology_reader = None 

1264 self._dihedrals = None 

1265 self._populations = None 

1266 self._transitions = None 

1267 self._pdb_ids = None 

1268 self._mds = None 

1269 

1270 # Save forced inputs and use them when necessary 

1271 self.forced_inputs = {} 

1272 if forced_inputs: 

1273 # Make sure the format is respected 

1274 for forced_input in forced_inputs: 

1275 n_values = len(forced_input) 

1276 if n_values == 0: 

1277 raise InputError('There is an empty "-fin". Please remove it from the command line.') 

1278 if n_values == 1: 

1279 only_value = forced_input[0] 

1280 raise InputError(f'There is a "-fin {only_value}" which is missing the new input value.') 

1281 if n_values > 2: 

1282 suggested_fix = f'{forced_input[0]} "{" ".join(forced_input[1:])}"' 

1283 raise InputError(f'Too many values in "-fin {" ".join(forced_input)}".\n' + 

1284 ' Note that only two values are expected: -fin <input name> <new input value>\n' + 

1285 f' Did you forget the quotes maybe? Try this: -fin {suggested_fix}') 

1286 

1287 # Save forced inputs as a dict 

1288 self.forced_inputs = {name: value for name, value in forced_inputs} 

1289 

1290 # Check that forced inputs exist 

1291 # This is to prevent the user from losing a lot of time over a silly typo 

1292 template_inputs = load_yaml(INPUTS_TEMPLATE_FILEPATH) 

1293 for input_name in self.forced_inputs.keys(): 

1294 if input_name in template_inputs: continue 

1295 available_inputs = ', '.join(template_inputs.keys()) 

1296 raise InputError(f'Unrecognized forced input "{input_name}". Available inputs: {available_inputs}') 

1297 

1298 # Overwrite input file values 

1299 for input_name, input_value in self.forced_inputs.items(): 

1300 self.update_inputs(input_name, input_value) 

1301 

1302 # Force a couple of extraordinary files which is generated if atoms are resorted 

1303 self.resorted_bonds_file = File(self.pathify(RESORTED_BONDS_FILENAME)) 

1304 self.resorted_charges_file = File(self.pathify(RESORTED_CHARGES_FILENAME)) 

1305 

1306 # Set a new entry for the register 

1307 # This is useful to track previous workflow runs and problems 

1308 register_filepath = self.pathify(REGISTER_FILENAME) 

1309 register_file = File(register_filepath) 

1310 self.register = Register(register_file) 

1311 

1312 # Set the cache 

1313 cache_filepath = self.pathify(CACHE_FILENAME) 

1314 cache_file = File(cache_filepath) 

1315 self.cache = Cache(cache_file) 

1316 

1317 # Set tasks whose output is to be overwritten 

1318 self.overwritables = set() 

1319 

def __repr__(self):
    """Short textual identifier for this Project instance."""
    # Every project renders the same way; no per-instance state is shown
    return 'Project'

1323 

def pathify(self, filename_or_relative_path: str) -> str:
    """Prefix *filename_or_relative_path* with the project directory.

    The joined path is normalized (redundant separators and '..'
    segments are collapsed by normpath).
    """
    joined = self.directory + '/' + filename_or_relative_path
    return normpath(joined)

1327 

def check_md_directories(self):
    """Validate the declared MD directories.

    Raises InputError when the list is empty or contains duplicates.
    """
    directories = self._md_directories
    # The workflow needs at least one MD to work on
    if len(directories) < 1:
        raise InputError('There must be at least one MD')
    # Duplicated directories would make two MDs collide
    if len(directories) != len(set(directories)):
        raise InputError('There are duplicated MD directories')

1338 

def get_md_directories(self) -> list:
    """Return the list of MD directories, resolving it on first access.

    Resolution order: an explicitly declared list, then the inputs file
    MDs, then a guess based on which subdirectories hold a register file.
    """
    # Explicitly declared (or previously resolved) directories win
    if self._md_directories:
        return self._md_directories
    self._md_directories = []
    # Prefer the MDs declared in the inputs file, when available
    if self.is_inputs_file_available() and self.input_mds:
        for input_md in self.input_mds:
            directory = input_md.get(MD_DIRECTORY, None)
            if directory:
                check_directory(directory)
            else:
                # No explicit directory: derive it from the MD name
                directory = name_2_directory(input_md['name'] or 'unnamed')
            self._md_directories.append(directory)
    # Otherwise guess: any subdirectory containing a register file counts as an MD
    else:
        for directory in sorted(next(walk(self.directory))[1]):
            if exists(directory + '/' + REGISTER_FILENAME):
                self._md_directories.append(directory)
        # An empty guess means MDs were never declared before
        if len(self._md_directories) == 0:
            raise InputError('Impossible to know which are the MD directories. '
                'You can either declare them using the "-mdir" option or by providing an inputs file')
    self.check_md_directories()
    return self._md_directories
md_directories = property(get_md_directories, None, None, "MD directories (read only)")

1373 

def get_reference_md_index(self) -> int:
    """Return the index of the reference MD, resolving it on first access.

    Resolution order:
      1. The cached/constructor-provided value.
      2. The 'mdref' field from the inputs file, when available.
      3. Index 0 (the first MD), warning the user about the default.
    """
    # Compare against None explicitly: 0 is a perfectly valid index and
    # must not trigger a recompute (the previous truthiness check did)
    if self._reference_md_index is not None:
        return self._reference_md_index
    # If the inputs file is available then it may declare the reference MD index
    if self.is_inputs_file_available():
        self._reference_md_index = self.get_input('mdref')
    # Otherwise we simply set the first MD as the reference and warn the user about this
    if self._reference_md_index is None:
        warn('No reference MD was specified. The first MD will be used as reference.')
        self._reference_md_index = 0
    return self._reference_md_index
reference_md_index = property(get_reference_md_index, None, None, "Reference MD index (read only)")

1389 

def get_reference_md(self) -> MD:
    """Return the MD object acting as the project reference."""
    # Resolve lazily from the MD list and cache the result
    if not self._reference_md:
        self._reference_md = self.mds[self.reference_md_index]
    return self._reference_md
reference_md: MD = property(get_reference_md, None, None, "Reference MD (read only)")

1399 

def get_mds(self) -> list:
    """Get the available MDs (read only).

    Lazily instantiates one MD object per declared MD and caches the
    resulting list. Two declaration systems are supported: the newer
    '-md' configurations and the older '-mdir'/'-stru'/'-traj' arguments.
    """
    # If MDs are already declared then return them
    if self._mds:
        return self._mds
    # Now instantiate a new MD for each declared MD and save the reference MD
    self._mds = []
    # New system with MD configurations (-md)
    if self.md_config:
        for n, config in enumerate(self.md_config, 1):
            # The first value of every config is the MD directory
            directory = config[0]
            # LEGACY
            # In a previous version, the md config argument also held the structure
            # This was the second argument, so we check if we have more than 2 arguments
            # If this is the case, then check if the second argument has different format
            # Note that PDB format is also a trajectory supported format
            has_structure = False
            if len(config) > 2:
                first_sample = File(config[1])
                second_sample = File(config[2])
                # Differing formats suggest the second value is a structure, not a trajectory
                if first_sample.format != second_sample.format:
                    has_structure = True
            # Finally set the input structure and trajectories
            input_structure_filepath = config[1] if has_structure else self.input_structure_filepath
            input_trajectory_filepaths = config[2:] if has_structure else config[1:]
            # Define the MD
            md = MD(
                project=self, number=n, directory=directory,
                input_structure_filepath=input_structure_filepath,
                input_trajectory_filepaths=input_trajectory_filepaths,
            )
            self._mds.append(md)
    # Old system (-mdir, -stru -traj)
    else:
        # Every MD shares the project-level structure and trajectory inputs
        for n, md_directory in enumerate(self.md_directories, 1):
            md = MD(
                project=self, number=n, directory=md_directory,
                input_structure_filepath=self.input_structure_filepath,
                input_trajectory_filepaths=self.input_trajectory_filepaths,
            )
            self._mds.append(md)
    return self._mds
mds: list[MD] = property(get_mds, None, None, "Available MDs (read only)")

1443 

1444 # Check input files exist when their filenames are read 

1445 # If they do not exist then try to download them 

1446 # If the download is not possible then raise an error 

1447 

1448 # Inputs filename ------------ 

1449 

def is_inputs_file_available(self) -> bool:
    """Tell whether the inputs file can be obtained.

    True when the file is already on disk, or when it can be downloaded
    through the remote. Note that asking for an unavailable inputs file
    leads to an input error elsewhere.
    """
    inputs_file = self._inputs_file
    # Without a declared filename the file is unreachable
    if not inputs_file:
        return False
    # Available when on disk already, or when downloadable from the remote
    return bool(inputs_file.exists or self.remote)

1464 

def get_inputs_file(self) -> File:
    """Return the inputs file, downloading it from the remote when needed.

    Raises InputError when no filename was declared, or when the file is
    neither on disk nor downloadable.
    """
    inputs_file = self._inputs_file
    # A filename must have been declared
    if not inputs_file:
        raise InputError('Not defined inputs filename')
    # Already on disk -> nothing else to do
    if inputs_file.exists:
        return inputs_file
    # Without remote access there is no way to obtain the missing file
    if not self.remote:
        raise InputError(f'Missing inputs file "{self._inputs_file.filename}"')
    # Download the inputs file from the remote database
    self.remote.download_inputs_file(inputs_file)
    return inputs_file
inputs_file = property(get_inputs_file, None, None, "Inputs filename (read only)")

1481 

1482 # Topology filename ------------ 

1483 

def get_input_topology_filepath(self) -> Optional[str]:
    """Get the input topology filepath from the inputs or try to guess it.

    Sources, in priority order: the command-line argument, then the
    inputs file. Either value may be a URL (noted for a later download)
    or a glob pattern (resolved here to exactly one match).

    If the input topology filepath is a 'no' flag then we consider there is no topology at all.
    So far we extract atom charges and atom bonds from the topology file.
    In this scenario we can keep working but there are some consequences:

    1. Analysis using atom charges such as 'energies' will be skipped
    2. The standard topology file will not include atom charges
    3. Bonds will be guessed
    """
    # If we already have an internal value calculated then return it
    if self._input_topology_filepath is not None:
        return self._input_topology_filepath

    def parse(filepath: str) -> str:
        """Resolve URL or glob notation into a concrete filepath."""
        # If it is a URL then record it for the further download and
        # return the filename the download will produce
        if is_url(filepath):
            self._input_topology_url = filepath
            source_filename = url_to_source_filename(filepath)
            return source_filename
        if not is_glob(filepath):
            return filepath
        # If there is glob pattern then parse it
        parsed_filepaths = glob(filepath)
        if len(parsed_filepaths) == 0:
            # Warn the user in case they were trying to use glob syntax to download remote files
            if self.remote:
                warn('Spread syntax is not supported to download remote files')
            raise InputError(f'No topologies found with "{filepath}"')
        # Exactly one match is required since a single topology is expected
        if len(parsed_filepaths) > 1:
            raise InputError(f'Multiple topologies found with "{filepath}": {", ".join(parsed_filepaths)}')
        return parsed_filepaths[0]
    # If this value was passed through command line then it would be set as the internal value already
    if self.arg_input_topology_filepath:
        # A 'no'-like flag means deliberately running without a topology
        if self.arg_input_topology_filepath.lower() in {'no', 'not', 'na'}:
            self._input_topology_filepath = MISSING_TOPOLOGY
            return self._input_topology_filepath
        self._input_topology_filepath = parse(self.arg_input_topology_filepath)
        # Update the input topology filepath in the inputs file, in case it is not matching
        self.update_inputs('input_topology_filepath', relpath(self._input_topology_filepath, self.directory))
        return self._input_topology_filepath
    # Check if the inputs file has the value
    if self.is_inputs_file_available():
        # Get the input value, whose key must exist
        inputs_value = self.get_input('input_topology_filepath')
        # If there is a valid input then use it
        if inputs_value is not None:
            # WARNING: the yaml parser automatically converts 'no' to False
            if inputs_value is False or inputs_value.lower() in {'no', 'not', 'na'}:
                self._input_topology_filepath = MISSING_TOPOLOGY
                return self._input_topology_filepath
            parsed_input_value = parse(inputs_value)
            # Inputs-file paths are relative to the project directory
            self._input_topology_filepath = self.pathify(parsed_input_value)
            return self._input_topology_filepath
    # If nothing worked then surrender
    raise InputError('Missing input topology file path. Please provide a topology file using the "-top" argument.\n' +
        ' Note that you may run the workflow without a topology file. To do so, use the "-top no" argument.\n' +
        ' However this has implications since we usually mine atom charges and bonds from the topology file.\n' +
        ' Some analyses such us the interaction energies will be skiped')

1545 

def get_input_topology_file(self) -> Optional[File]:
    """Get the input topology file.
    If the file is not found try to download it.
    """
    # If we already have a value then return it
    if self._input_topology_file is not None:
        return self._input_topology_file
    # Set the input topology filepath
    input_topology_filepath = self.get_input_topology_filepath()
    # If the input filepath is the missing flag then we must proceed without a topology
    if input_topology_filepath == MISSING_TOPOLOGY:
        self._input_topology_file = MISSING_TOPOLOGY
        return self._input_topology_file
    # Set the file
    input_topology_file = File(input_topology_filepath)
    # If there is a topology URL then we must download the topology first
    # (the URL was recorded while resolving the filepath)
    input_topology_url = self._input_topology_url
    if input_topology_url and not input_topology_file.exists:
        original_filename = input_topology_url.split('/')[-1]
        # If there is a remote then use it
        if self.remote:
            # If the topology filename is the standard topology filename then use the topology endpoint instead
            if original_filename == STANDARD_TOPOLOGY_FILENAME:
                self.remote.download_standard_topology(input_topology_file)
            # Otherwise download the input topology file by its filename
            else:
                self.remote.download_file(original_filename, input_topology_file)
            # In case the topology is a '.top' file we consider it is a Gromacs topology
            # It may come with additional itp files we must download as well
            if input_topology_file.format == 'top':
                # Find available .itp files and download each of them
                itp_filenames = [filename for filename in self.remote.available_files if filename[-4:] == '.itp']
                for itp_filename in itp_filenames:
                    itp_filepath = self.pathify(itp_filename)
                    itp_file = File(itp_filepath)
                    self.remote.download_file(itp_file.filename, itp_file)
        # Otherwise use the URL as is
        else:
            # Without a remote we cannot enumerate sibling itp files, so just warn
            if input_topology_file.format == 'top':
                warn('Automatic download of itp files is not supported without the "-proj" argument.' +
                    ' Thus if the topology has associated itp files they will not be downloaded.')
            download_file(input_topology_url, input_topology_file)
    # If the file already exists then we are done
    if input_topology_file.exists:
        self._input_topology_file = input_topology_file
        return self._input_topology_file
    raise InputError(f'Missing input topology file "{input_topology_file.filename}"')
input_topology_file = property(get_input_topology_file, None, None, "Input topology file (read only)")

1595 

def get_input_structure_file(self) -> File:
    """Return the input structure file of the reference MD.

    When calling this function make sure all MDs have the file or try
    to download it.
    """
    # The reference MD owns the actual input structure file
    reference = self.reference_md
    return reference._input_structure_file
input_structure_file = property(get_input_structure_file, None, None, "Input structure filename for each MD (read only)")

1601 

def get_input_trajectory_files(self) -> list[File]:
    """Return the input trajectory file(s) of the reference MD.

    Missing files are downloaded by the MD itself.
    """
    reference = self.reference_md
    return reference._input_trajectory_files
input_trajectory_files = property(get_input_trajectory_files, None, None, "Input trajectory filenames for each MD (read only)")

1608 

def get_populations_file(self) -> Optional[File]:
    """Return the MSM equilibrium populations file, or None when unavailable."""
    populations = self._populations_file
    # get_file checks existence and may download; a falsy result means not available
    if self.get_file(populations):
        return populations
    return None
populations_file = property(get_populations_file, None, None, "MSM equilibrium populations file (read only)")

1615 

def get_transitions_file(self) -> Optional[File]:
    """Return the MSM transition probabilities file, or None when unavailable."""
    transitions = self._transitions_file
    # get_file checks existence and may download; a falsy result means not available
    if self.get_file(transitions):
        return transitions
    return None
transitions_file = property(get_transitions_file, None, None, "MSM transition probabilities file (read only)")

1622 

def get_aiida_data_file(self) -> Optional[File]:
    """Return the AiiDA data file, or None when it was not declared or is unavailable."""
    aiida_file = self._aiida_data_file
    # The AiiDA data file is optional: it may not have been declared at all
    if not aiida_file:
        return None
    # get_file checks existence and may download; a falsy result means not available
    if self.get_file(aiida_file):
        return aiida_file
    return None
aiida_data_file = property(get_aiida_data_file, None, None, "AiiDA data file (read only)")

1629 

1630 # --------------------------------- 

1631 

def get_file(self, target_file: File) -> bool:
    """Check if a file exists, trying a database download when it does not.

    A file missing from the database is fine: the method just reports
    False without warning. Nowadays this is used for populations and
    transitions files, which are not common.
    """
    # Delegate to the reference MD, which knows how to reach the database
    return self.reference_md.get_file(target_file)

1639 

1640 # Input file values ----------------------------------------- 

1641 

1642 # First of all, set the inputs themselves 

1643 

def get_inputs(self) -> dict:
    """Load (once) and return the content of the inputs file as a dict."""
    # Cached after the first successful load
    if self._inputs:
        return self._inputs
    # Some placeholders are replaced on the fly while reading the file
    replaces = [
        ('$DIR', self.directory_name)
    ]
    # Load according to the file format
    inputs_data = None
    if self.inputs_file.format == 'json':
        inputs_data = load_json(self.inputs_file.path, replaces)
    elif self.inputs_file.format == 'yaml':
        inputs_data = load_yaml(self.inputs_file.path, replaces)
    else:
        raise InputError('Input file format is not supported. Please use json or yaml files.')
    if not inputs_data:
        raise InputError('Input file is empty')
    self._inputs = inputs_data
    # LEGACY: mirror the old 'pdbIds' field under its current 'pdb_ids' name
    old_pdb_ids = self._inputs.get('pdbIds', None)
    if old_pdb_ids:
        self._inputs['pdb_ids'] = old_pdb_ids
    return self._inputs
inputs = property(get_inputs, None, None, "Inputs from the inputs file (read only)")

1671 

def update_inputs(self, nested_key: str, new_value):
    """Permanently store *new_value* under *nested_key* in the inputs file.

    Used when command-line inputs do not match file inputs. Does nothing
    when there is no inputs file or when the value already matches.
    """
    # Without an inputs file there is nothing to update
    if not self.is_inputs_file_available(): return
    # Skip when the stored value already matches
    current_value = read_ndict(self.inputs, nested_key, MISSING_INPUT_EXCEPTION)
    if current_value == new_value: return
    # Update the in-memory inputs
    write_ndict(self.inputs, nested_key, new_value)
    # Re-check availability right before persisting anything to disk
    if not self.is_inputs_file_available(): return
    print(f'* Field "{nested_key}" in the inputs file will be permanently modified')
    # Persist the updated inputs, honoring the file format
    if self.inputs_file.format == 'json':
        save_json(self.inputs, self.inputs_file.path)
    elif self.inputs_file.format == 'yaml':
        # Comments in the original YAML file will not be kept
        save_yaml(self.inputs, self.inputs_file.path)
    else:
        raise InputError('Input file format is not supported. Please use json or yaml files.')

1695 # Then set getters for every value in the inputs file 

def get_input(self, name: str) -> Any:
    """Return the value of the input called *name*.

    Priority: command-line forced value, then the inputs file, then the
    default from DEFAULT_INPUT_VALUES (warning the user in that case).
    """
    # Values forced from the command line take precedence
    forced = self.forced_inputs
    if name in forced:
        return forced[name]
    # Then try the inputs file
    value = self.inputs.get(name, MISSING_INPUT_EXCEPTION)
    if value != MISSING_INPUT_EXCEPTION:
        return value
    # Fall back to the default value and warn the user about it
    default_value = DEFAULT_INPUT_VALUES.get(name, None)
    warn(f'Missing input "{name}" -> Using default value: {default_value}')
    return default_value

1711 

def input_property(name: str, doc: str = ""):
    """Build a read-only property that proxies get_input(*name*).

    The returned property object is assigned in the class body below;
    its getter defers to self.get_input so forced values and defaults
    are honored.
    """
    return property(lambda self: self.get_input(name), doc=doc)

1719 

# Assign the getters
# Each attribute below is a read-only property whose getter proxies
# get_input() for the named inputs-file field (see input_property above)
input_interactions = input_property('interactions', "Interactions to be analyzed (read only)")
input_protein_references = input_property('forced_references', "Uniprot IDs to be used first when aligning protein sequences (read only)")
input_pdb_ids = input_property('pdb_ids', "Protein Data Bank IDs used for the setup of the system (read only)")
input_type = input_property('type', "Set if its a trajectory or an ensemble (read only)")
input_mds = input_property('mds', "Input MDs configuration (read only)")
input_ligands = input_property('ligands', "Input ligand references (read only)")
input_force_fields = input_property('ff', "Input force fields (read only)")
input_collections = input_property('collections', "Input collections (read only)")
input_chain_names = input_property('chainnames', "Input chain names (read only)")
input_framestep = input_property('framestep', "Input framestep (read only)")
input_name = input_property('name', "Input name (read only)")
input_description = input_property('description', "Input description (read only)")
input_authors = input_property('authors', "Input authors (read only)")
input_groups = input_property('groups', "Input groups (read only)")
input_contact = input_property('contact', "Input contact (read only)")
input_program = input_property('program', "Input program (read only)")
input_version = input_property('version', "Input version (read only)")
input_method = input_property('method', "Input method (read only)")
input_license = input_property('license', "Input license (read only)")
input_linkcense = input_property('linkcense', "Input license link (read only)")
input_citation = input_property('citation', "Input citation (read only)")
input_thanks = input_property('thanks', "Input acknowledgements (read only)")
input_links = input_property('links', "Input links (read only)")
input_timestep = input_property('timestep', "Input timestep (read only)")
input_temperature = input_property('temp', "Input temperature (read only)")
input_ensemble = input_property('ensemble', "Input ensemble (read only)")
input_water = input_property('wat', "Input water force field (read only)")
input_boxtype = input_property('boxtype', "Input boxtype (read only)")
input_pbc_selection = input_property('pbc_selection', "Input Periodic Boundary Conditions (PBC) selection (read only)")
input_cg_selection = input_property('cg_selection', "Input Coarse Grained (CG) selection (read only)")
input_customs = input_property('customs', "Input custom representations (read only)")
input_orientation = input_property('orientation', "Input orientation (read only)")
input_multimeric = input_property('multimeric', "Input multimeric labels (read only)")
# Additional topic-specific inputs
input_cv19_unit = input_property('cv19_unit', "Input Covid-19 Unit (read only)")
input_cv19_startconf = input_property('cv19_startconf', "Input Covid-19 starting conformation (read only)")
input_cv19_abs = input_property('cv19_abs', "Input Covid-19 antibodies (read only)")
input_cv19_nanobs = input_property('cv19_nanobs', "Input Covid-19 nanobodies (read only)")

1759 

def get_input_pbc_selection(self) -> Optional[str]:
    """Return the PBC atom selection: console value first, inputs file second."""
    # A console-provided (cached) value has priority
    if self._input_pbc_selection:
        return self._input_pbc_selection
    # Early processing steps may ask for this value before any inputs file
    # exists, so do not force the inputs file to be fetched here
    if not self.is_inputs_file_available():
        return None
    # Fall back to the inputs file value and cache it
    self._input_pbc_selection = self.get_input('pbc_selection')
    return self._input_pbc_selection
input_pbc_selection = property(get_input_pbc_selection, None, None, "Selection of atoms which are still in periodic boundary conditions (read only)")

1776 

def get_input_cg_selection(self) -> Optional[str]:
    """Return the CG atom selection: console value first, inputs file second."""
    # A console-provided (cached) value has priority
    if self._input_cg_selection:
        return self._input_cg_selection
    # Early processing steps may ask for this value before any inputs file
    # exists, so do not force the inputs file to be fetched here
    if not self.is_inputs_file_available():
        return None
    # Fall back to the inputs file value and cache it
    self._input_cg_selection = self.get_input('cg_selection')
    return self._input_cg_selection
input_cg_selection = property(get_input_cg_selection, None, None, "Selection of atoms which are not acutal atoms but Coarse Grained beads (read only)")

1793 

1794 # Set additional values inferred from the input values 

1795 

def check_is_time_dependent(self) -> bool:
    """Tell whether MD frames are time dependent.

    'trajectory' means time dependent, 'ensemble' means not; any other
    value raises an InputError.
    """
    input_type = self.input_type
    if input_type == 'trajectory':
        return True
    if input_type == 'ensemble':
        return False
    raise InputError(f'Not supported input "type" value: {self.input_type}. It must be "trajectory" or "ensemble"')
is_time_dependent = property(check_is_time_dependent, None, None, "Check if trajectory frames are time dependent (read only)")

1804 

1805 # Processed files ---------------------------------------------------- 

1806 

def inherit_topology_filename(self) -> Optional[str]:
    """Derive the processed topology filename from the input topology.

    Topology formats are conserved. Returns MISSING_TOPOLOGY when there
    is no input topology at all.
    """
    source = self.input_topology_file
    if source == MISSING_TOPOLOGY:
        return MISSING_TOPOLOGY
    filename = source.filename
    if not filename: raise RuntimeError('Unnamed file?')
    # Raw charges files keep their special name untouched
    if filename == RAW_CHARGES_FILENAME:
        return filename
    # Otherwise use the canonical name with the same format
    return 'topology.' + source.format

1819 

def get_topology_filepath(self) -> str:
    """Return the processed topology file path, resolving it on first access."""
    if not self._topology_filepath:
        # Derive the filename from the input topology; it may be the missing flag
        inherited = self.inherit_topology_filename()
        self._topology_filepath = (
            MISSING_TOPOLOGY if inherited == MISSING_TOPOLOGY
            else self.pathify(inherited)
        )
    return self._topology_filepath
topology_filepath = property(get_topology_filepath, None, None, "Topology file path (read only)")

1833 

def get_topology_file(self) -> 'File':
    """Return the processed topology file, resolved through the reference MD."""
    reference = self.reference_md
    # The input files processing task knows where the output topology lives
    resolve = reference.input_files_processing.get_output_file_getter('output_topology_file')
    return resolve(reference)
topology_file = property(get_topology_file, None, None, "Topology file (read only)")

1839 

def get_structure_file(self) -> 'File':
    """Return the processed structure file, taken from the reference MD."""
    reference = self.reference_md
    return reference.structure_file
structure_file = property(get_structure_file, None, None, "Structure filename from the reference MD (read only)")

1844 

def get_trajectory_file(self) -> 'File':
    """Return the processed trajectory file, taken from the reference MD."""
    reference = self.reference_md
    return reference.trajectory_file
trajectory_file = property(get_trajectory_file, None, None, "Trajectory filename from the reference MD (read only)")

1849 

1850 # --------------------------------------------------------------------------------- 

1851 # Others values which may be found/calculated and files to be generated on demand 

1852 # --------------------------------------------------------------------------------- 

1853 

def get_structure(self) -> 'Structure':
    """Return the parsed structure, taken from the reference MD."""
    reference = self.reference_md
    return reference.structure
structure = property(get_structure, None, None, "Parsed structure from the reference MD (read only)")

1858 

def get_pbc_residues(self) -> list[int]:
    """Return indices of residues in periodic boundary conditions (from the reference MD)."""
    reference = self.reference_md
    return reference.pbc_residues
pbc_residues = property(get_pbc_residues, None, None, "Indices of residues in periodic boundary conditions (read only)")

1863 

def get_cg_residues(self) -> list[int]:
    """Return the indices of coarse-grain residues, delegating to the reference MD."""
    reference = self.reference_md
    return reference.cg_residues
cg_residues = property(get_cg_residues, None, None, "Indices of residues in coarse grain (read only)")

1868 

def get_snapshots(self) -> int:
    """Return the snapshot count, delegating to the reference MD."""
    reference = self.reference_md
    return reference.snapshots
snapshots = property(get_snapshots, None, None, "Reference MD snapshots (read only)")

1873 

def get_universe(self) -> 'Universe':
    """Get the MDAnalysis Universe from the reference MD.

    Note: the return type was previously mis-annotated as ``int``; this
    delegates to the reference MD's universe (an MDAnalysis Universe).
    """
    return self.reference_md.universe
universe = property(get_universe, None, None, "MDAnalysis Universe object (read only)")

1878 

def get_processed_interactions(self) -> dict:
    """Return the processed interactions.

    They are taken from the reference replica since they are the same
    for all replicas.
    """
    reference = self.reference_md
    return reference.interactions
interactions = property(get_processed_interactions, None, None, "Processed interactions (read only)")

1883 

def get_check_stable_bonds(self) -> bool:
    """Tell whether the stable bonds check must be run."""
    # If this analysis has already passed then we can trust structure bonds
    if self.register.tests.get(STABLE_BONDS_FLAG, None) is True:
        return False
    # Otherwise run the check unless the user explicitly trusts stable bonds
    return STABLE_BONDS_FLAG not in self.trust
must_check_stable_bonds = property(get_check_stable_bonds, None, None, "Check if we must check stable bonds (read only)")

1893 

# Reference bonds
# Task computing the atom bonds which can be trusted (delegates to find_safe_bonds)
get_reference_bonds = Task('refbonds', 'Reference bonds', find_safe_bonds)
reference_bonds = property(get_reference_bonds, None, None, "Atom bonds to be trusted (read only)")

# Atom charges
# NOTE(review): the Task wraps the 'get_charges' function referenced here
# (presumably imported above); this assignment shadows that name in the class namespace
get_charges = Task('charges', 'Getting atom charges', get_charges)
charges = property(get_charges, None, None, "Atom charges (read only)")

1901 

def get_topology_reader(self) -> 'Topology':
    """Get the topology data reader, instantiating it lazily."""
    # Build and cache the reader on first access only
    if not self._topology_reader:
        self._topology_reader = Topology(self.topology_file)
    return self._topology_reader
topology_reader = property(get_topology_reader, None, None, "Topology reader (read only)")

1910 

def get_dihedrals(self) -> list[dict]:
    """Get the topology dihedrals, computing them lazily."""
    # Compute and cache the dihedrals on first access only
    if not self._dihedrals:
        self._dihedrals = self.topology_reader.get_dihedrals_data()
    return self._dihedrals
dihedrals = property(get_dihedrals, None, None, "Topology dihedrals (read only)")

1919 

def get_populations(self) -> Optional[list[float]]:
    """Get the equilibrium populations from a MSM, reading them lazily."""
    # Read and cache the populations on first access
    if not self._populations:
        # Without a populations file there is nothing to read
        if not self.populations_file:
            return None
        self._populations = read_file(self.populations_file)
    return self._populations
populations = property(get_populations, None, None, "Equilibrium populations from a MSM (read only)")

1931 

def get_transitions(self) -> Optional[list[list[float]]]:
    """Get the transition probabilities from a MSM, reading them lazily."""
    # Read and cache the transitions on first access
    if not self._transitions:
        # Without a transitions file there is nothing to read
        if not self.transitions_file:
            return None
        self._transitions = read_file(self.transitions_file)
    return self._transitions
transitions = property(get_transitions, None, None, "Transition probabilities from a MSM (read only)")

1943 

def get_pdb_ids(self) -> list[str]:
    """Get the tested and standardized PDB ids.

    Input ids are validated against the expected PDB id format and
    upper-cased. The result is cached after the first call.

    Raises:
        InputError: when an input id does not look like a PDB id.
    """
    # If we already have a stored value then return it
    if self._pdb_ids is not None:
        return self._pdb_ids
    # Otherwise test and standardize input PDB ids
    self._pdb_ids = []
    # If there are no input pdb ids (may be None) then stop here
    if not self.input_pdb_ids:
        return []
    # If input PDB ids is a string instead of a list then fix it
    # Use isinstance rather than a type() identity check (also accepts str subclasses)
    input_pdb_ids = self.input_pdb_ids
    if isinstance(input_pdb_ids, str):
        input_pdb_ids = [input_pdb_ids]
    # Iterate input PDB ids
    for input_pdb_id in input_pdb_ids:
        # First make sure this is a PDB id
        if not re.match(PDB_ID_FORMAT, input_pdb_id):
            raise InputError(f'Input PDB id "{input_pdb_id}" does not look like a PDB id')
        # Make letters upper
        self._pdb_ids.append(input_pdb_id.upper())
    return self._pdb_ids
pdb_ids = property(get_pdb_ids, None, None, "Tested and standarized PDB ids (read only)")

1968 

# Prepare the PDB references file to be uploaded to the database
get_pdb_references = Task('pdbs', 'Prepare PDB references',
    prepare_pdb_references, output_filenames={'pdb_references_file': PDB_REFERENCES_FILENAME})
# NOTE(review): unlike the attributes below, this is bound directly to the output
# file getter rather than wrapped in a property — confirm this is intentional
pdb_references_file = get_pdb_references.get_output_file_getter('pdb_references_file')

# Map the structure aminoacids sequences against the Uniprot reference sequences
get_protein_map = Task('protmap', 'Protein residues mapping', generate_protein_mapping,
    output_filenames={'protein_references_file': PROTEIN_REFERENCES_FILENAME})
protein_map = property(get_protein_map, None, None, "Protein residues mapping (read only)")

# Define the output file of the protein mapping including protein references
get_protein_references_file = get_protein_map.get_output_file_getter('protein_references_file')
protein_references_file = property(get_protein_references_file, None, None, "File including protein refereces data mined from UniProt (read only)")

# Prepare the chain references file
get_chain_references = Task('chains', 'Chain references', prepare_chain_references,
    output_filenames={'chains_references_file': OUTPUT_CHAINS_FILENAME})

# Map residues to their InChI keys
get_inchikeys = Task('inchikeys', 'InChI keys residues mapping', generate_inchikeys)
inchikeys = property(get_inchikeys, None, None, "InChI keys (read only)")

# Mine lipid reference data
get_lipid_references = Task('lipmap', 'Lipid references', generate_lipid_references)
lipid_references = property(get_lipid_references, None, None, "Lipid references (read only)")

# Map membrane residues
get_membrane_map = Task('memmap', 'Membrane residues mapping', generate_membrane_mapping,
    output_filenames={'output_file': MEMBRANE_MAPPING_FILENAME})
membrane_map = property(get_membrane_map, None, None, "Membrane mapping (read only)")

# Map ligand residues
get_ligand_references = Task('ligmap', 'Ligand residues mapping', generate_ligand_references)
ligand_references = property(get_ligand_references, None, None, "Ligand references (read only)")

# Mine InChI reference data
get_inchi_references = Task('inchimap', 'InChI references', generate_inchi_references,
    output_filenames={'output_file': INCHIKEY_REFERENCES_FILENAME})
inchikey_map = property(get_inchi_references, None, None, "InChI references (read only)")

# Build the residue map from both proteins and ligands maps
# This is formatted as both the standard topology and metadata producers expect them
get_residue_map = Task('resmap', 'Residue mapping', generate_residue_mapping)
residue_map = property(get_residue_map, None, None, "Residue map (read only)")

# Prepare the project metadata file to be uploaded to the database
prepare_metadata = Task('pmeta', 'Prepare project metadata',
    prepare_project_metadata, output_filenames={'output_file': OUTPUT_METADATA_FILENAME})

# Prepare the standard topology file to be uploaded to the database
prepare_standard_topology = Task('stopology', 'Standard topology file',
    generate_topology, output_filenames={'output_file': STANDARD_TOPOLOGY_FILENAME})
get_standard_topology_file = prepare_standard_topology.get_output_file_getter('output_file')
standard_topology_file = property(get_standard_topology_file, None, None, "Standard topology filename (read only)")

# Get a screenshot of the system
get_screenshot_filename = Task('screenshot', 'Screenshot file',
    get_screenshot, output_filenames={'output_file': OUTPUT_SCREENSHOT_FILENAME})
screenshot_filename = property(get_screenshot_filename, None, None, "Screenshot filename (read only)")

# Provenance data
# NOTE(review): the Task wraps the 'produce_provenance' function referenced here
# (presumably imported above); this assignment shadows that name in the class namespace
produce_provenance = Task('aiidata', 'Produce provenance', produce_provenance)

2025 

def get_warnings(self) -> list:
    """Return the project warnings, read live from the register.

    The warnings list should not be reassigned, but it was back in the day.
    Reading it from the register on every access avoids silent bugs.
    """
    return self.register.warnings
warnings = property(get_warnings, None, None, "Project warnings to be written in metadata")

2034 

2035 

# AUXILIARY FUNCTIONS ---------------------------------------------------------------------------

2037 

# DANI: Once the Markov (MSM) file format is settled this function will no longer be needed

def read_file(target_file: 'File') -> 'dict | list | numpy.ndarray | None':
    """Read a data file which may come in different formats.

    Args:
        target_file (File): file whose extension decides the parser.

    Returns:
        A numpy array for '.npy' files, the loaded JSON data for '.json'
        files, or None for any other (unsupported) format.
    """
    # The format is taken from the file extension
    file_format = target_file.filename.split('.')[-1]
    # Read numpy files
    if file_format == 'npy':
        return numpy.load(target_file.path)
    # Read JSON files
    if file_format == 'json':
        return load_json(target_file.path)
    # Unsupported formats are tolerated for now: be explicit about returning None
    return None

2049 

2050 

def name_2_directory(name: str) -> str:
    """Convert an MD name into an equivalent MD directory name."""
    # White spaces become underscores
    directory = name.replace(' ', '_')
    # Forbidden characters are simply dropped
    for forbidden_character in FORBIDDEN_DIRECTORY_CHARACTERS:
        directory = directory.replace(forbidden_character, '')
    return directory

2059 

2060 

def check_directory(directory: str) -> None:
    """Check for problematic characters in a directory path.

    Raises:
        InputError: if the path contains any forbidden character.
    """
    # Raise on problematic characters (nothing is removed here)
    directory_characters = set(directory)
    for character in FORBIDDEN_DIRECTORY_CHARACTERS:
        if character in directory_characters:
            raise InputError(f'Directory path "{directory}" includes the forbidden character "{character}"')

2068 

2069 

def directory_2_name(directory: str) -> str:
    """Convert an MD directory into an equivalent MD name."""
    # Keep only the last path component, then turn underscores back into spaces
    last_component = directory.rsplit('/', 1)[-1]
    return last_component.replace('_', ' ')

2076 

2077 

# Project input files
# Console flags mapped to the Project-level input file getters
project_input_files = {
    'itopology': Project.get_input_topology_file,
    'inputs': Project.get_inputs_file,
    'populations': Project.get_populations_file,
    'transitions': Project.get_transitions_file
}
# MD input files
# Console flags mapped to the MD-level input file getters
md_input_files = {
    'istructure': MD.get_input_structure_file,
    'itrajectory': MD.get_input_trajectory_files
}
# Both project and MD input files
input_files = {**project_input_files, **md_input_files}

# Project processed files
project_processed_files = {
    'topology': Project.get_topology_file
}
# MD processed files
md_processed_files = {
    'structure': MD.get_structure_file,
    'trajectory': MD.get_trajectory_file
}
# Both project and MD processed files
processed_files = {**project_processed_files, **md_processed_files}

2104 

# Project requestable tasks
project_requestables = {
    **project_input_files,
    **project_processed_files,
}
# Add every Task declared on the Project class to the project requestables
# (the loop variable was renamed: it used to shadow the builtin 'callable')
for attribute in vars(Project).values():
    if isinstance(attribute, Task):
        project_requestables[attribute.flag] = attribute
# MD requestable tasks
md_requestables = {
    **md_input_files,
    **md_processed_files,
}
# Add every Task declared on the MD class to the MD requestables
for attribute in vars(MD).values():
    if isinstance(attribute, Task):
        md_requestables[attribute.flag] = attribute
# Requestables for the console
# Note that this constant is global
requestables = {**project_requestables, **md_requestables}

2124 

# Set groups of dependencies to be requested together using only one flag
# Keys are pseudo-flags accepted by include/exclude; values are the actual task flags they expand to
DEPENDENCY_FLAGS = {
    'download': list(input_files.keys()),
    'setup': list(processed_files.keys()),
    'meta': ['pmeta', 'mdmeta'],
    'network': ['resmap', 'ligmap', 'lipmap', 'chains', 'pdbs', 'memmap'],
    'minimal': ['pmeta', 'mdmeta', 'stopology'],
    'interdeps': ['inter', 'pairwise', 'hbonds', 'energies', 'perres', 'clusters', 'dist'],
    'membs': ['memmap', 'density', 'thickness', 'apl', 'lorder', 'linter', 'channels']
}

# Set the default analyses to be run when no task is specified
DEFAULT_ANALYSES = ['clusters', 'dist', 'energies', 'hbonds', 'pca', 'pockets',
    'rgyr', 'rmsds', 'perres', 'pairwise', 'rmsf', 'sas', 'tmscore', 'density',
    'thickness', 'apl', 'lorder', 'linter']

2140 

2141 

def _expand_dependency_flags(tasks: list[str]):
    """Expand special group flags found in *tasks* into their task flags, in place."""
    for flag, dependencies in DEPENDENCY_FLAGS.items():
        if flag not in tasks: continue
        # Replace the group flag by its dependencies
        # Make sure not to duplicate a dependency if it was already included
        tasks.remove(flag)
        for dependency in dependencies:
            if dependency not in tasks:
                tasks.append(dependency)


def _resolve_tasks(download: bool, setup: bool,
    include: Optional[list[str]], exclude: Optional[list[str]]) -> list[str]:
    """Build the list of task flags to be run from the user request."""
    # If the deprecated download argument is passed then just make sure input files are available
    if download:
        warn('The "-d" or "--download" argument is deprecated. Please use "-i download" instead.')
        return list(input_files.keys())
    # If the deprecated setup argument is passed then just process input files
    if setup:
        warn('The "-s" or "--setup" argument is deprecated. Please use "-i setup" instead.')
        return list(processed_files.keys())
    # If the include argument was passed then run only the specified tasks
    if include:
        tasks = [*include]
        _expand_dependency_flags(tasks)
        return tasks
    # Set the default tasks otherwise
    tasks = [
        # Project tasks
        'stopology',
        'screenshot',
        'pmeta',
        'pdbs',
        'chains',
        # MD tasks
        'mdmeta',
        'inter',
        *DEFAULT_ANALYSES,
    ]
    # If the exclude parameter was passed then remove excluded tasks from the default tasks
    # Note that excluding a dependency of another requested task will not prevent it from running
    if exclude:
        excluded_dependencies = [*exclude]
        _expand_dependency_flags(excluded_dependencies)
        tasks = [name for name in tasks if name not in excluded_dependencies]
    return tasks


def _resolve_overwritables(overwrite: Optional[list[str] | bool], tasks: list[str]) -> set:
    """Build the set of task flags whose previous outputs must be regenerated.

    Raises:
        InputError: when a task requested for overwrite is not going to be run.
    """
    overwritables = set()
    if not overwrite:
        return overwritables
    # A bare True means every requested task is to be overwritten
    if isinstance(overwrite, bool):
        overwritables.update(tasks)
    # A list names the specific tasks to be overwritten
    elif isinstance(overwrite, list):
        for task in overwrite:
            # Make sure the task to be overwriten is among the tasks to be run
            if task not in tasks:
                raise InputError(f'Task "{task}" is to be overwriten but it is not in the tasks list. Either include it or do not exclude it')
            overwritables.add(task)
    else: raise ValueError('Not supported overwrite type')
    return overwritables


def workflow(
    project_parameters: Optional[dict] = None,
    working_directory: str = '.',
    download: bool = False,
    setup: bool = False,
    include: Optional[list[str]] = None,
    exclude: Optional[list[str]] = None,
    overwrite: Optional[list[str] | bool] = None,
):
    """Run the MDDB workflow.

    Args:
        project_parameters (dict | None): Parameters to initiate the project.
        working_directory (str): Working directory where the project is located.
        download (bool): (Deprecated: use -i download) If passed, only download required files. Then exits.
        setup (bool): (Deprecated: use -i setup) If passed, only download required files and run mandatory dependencies. Then exits.
        include (list[str] | None): Set the unique analyses or tools to be run. All other steps will be skipped.
        exclude (list[str] | None): List of analyses or tools to be skipped. All other steps will be run. Note that if we exclude a dependency of something else then it will be run anyway.
        overwrite (list[str] | bool | None): List of tasks that will be re-run, overwriting previous output files. Use this flag alone to overwrite everything.
    """
    # Use None as the default to avoid a shared mutable default dict
    if project_parameters is None:
        project_parameters = {}

    # Check there are no input errors

    # Include and exclude are not compatible
    # This is to protect the user from doing something which makes no sense
    if include and exclude:
        raise InputError('Include (-i) and exclude (-e) are not compatible. Use one of these options.')

    # Save the directory where the workflow is called from so we can come back at the very end
    workflow_call_directory = getcwd()

    # Make sure the working directory exists
    if not exists(working_directory):
        raise InputError(f'Working directory "{working_directory}" does not exist')

    # Make sure the working directory is actually a directory
    if not isdir(working_directory):
        raise InputError(f'Working directory "{working_directory}" is actually not a directory')

    # Move the current directory to the working directory
    chdir(working_directory)
    current_directory_name = basename(getcwd())
    git_version = get_git_version()
    print(f'\n{CYAN_HEADER}Running MDDB workflow ({git_version}) for project at {current_directory_name}{COLOR_END}')

    # Initiate the project
    project = Project(**project_parameters)
    print(f' {len(project.mds)} MDs are to be run')

    # Set the tasks to be run
    tasks = _resolve_tasks(download, setup, include, exclude)

    # If the user requested to overwrite something, make sure it is in the tasks list
    # Update the overwritable variable with the requested overwrites
    overwritables = _resolve_overwritables(overwrite, tasks)

    # Get project tasks
    project_tasks = [task for task in tasks if task in project_requestables]
    # Get the MD tasks
    md_tasks = [task for task in tasks if task in md_requestables]

    # Set project overwritables
    project.overwritables = {task for task in project_tasks if task in overwritables}
    # Set MD overwritables
    # Note that this must be done before running project tasks
    # Some project tasks rely on MD tasks
    for md in project.mds:
        md.overwritables = {task for task in md_tasks if task in overwritables}

    # Run the project tasks now
    for task in project_tasks:
        # Get the function to be called and call it
        getter = requestables[task]
        getter(project)

    # If there are no MD tasks then we are done already
    if len(md_tasks) == 0:
        print("Finished!")
        return

    # Now iterate over the different MDs
    for md in project.mds:
        print(f'\n{CYAN_HEADER} Processing MD at {md.directory}{COLOR_END}')
        # Run the MD tasks
        for task in md_tasks:
            # Get the function to be called and call it
            getter = requestables[task]
            getter(md)
        # Remove gromacs backups and other trash files from this MD
        remove_trash(md.directory)

    # Remove gromacs backups and other trash files from the project
    remove_trash(project.directory)

    # Return back to the place where the workflow was called originally
    # This is not important for many applications
    # But if you call the workflow function from a python script then this is important
    chdir(workflow_call_directory)

    print("Done!")