Coverage for mddb_workflow/tools/generate_metadata.py: 81% (95 statements)
coverage.py v7.12.0, created at 2025-12-03 18:45 +0000

from mddb_workflow.tools.get_box_size import get_box_size
from mddb_workflow.tools.get_atoms_count import get_atoms_count
from mddb_workflow.tools.generate_map import get_sequence_metadata
from mddb_workflow.utils.auxiliar import InputError, save_json
from mddb_workflow.utils.constants import MD_DIRECTORY
from mddb_workflow.utils.type_hints import *

# Input fields + interaction type
METADATA_INTERACTION_FIELDS = { "name", "agent_1", "agent_2", "selection_1", "selection_2", "type" }
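
# Illustrative example (hypothetical values): a processed interaction such as
#   { 'name': 'prot-lig', 'agent_1': 'protein', 'agent_2': 'ligand',
#     'selection_1': 'chain A', 'selection_2': 'resname LIG',
#     'type': 'protein-ligand', ...other processed fields... }
# is reduced to only the six fields above before being written to the metadata file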

def prepare_project_metadata (
    structure_file : 'File',
    trajectory_file : 'File',
    output_file : 'File',
    structure : 'Structure',
    residue_map : dict,
    protein_references_file : 'File',
    pdb_ids : list[str],
    ligand_references : dict,
    input_protein_references : list[str] | dict,
    input_ligands : list[dict],
    interactions : list[dict],
    warnings : dict,
    # All the inputs below are loaded as they are
    input_force_fields : list[str],
    input_collections : list[str],
    input_chain_names : dict,
    input_type : str,
    input_framestep : float,
    input_name : str,
    input_description : str,
    input_authors : list[str],
    input_groups : list[str],
    input_contact : str,
    input_program : str,
    input_version : str,
    input_method : str,
    input_license : str,
    input_linkcense : str,
    input_citation : str,
    input_thanks : str,
    input_links : list[dict],
    input_timestep : float,
    input_temperature : float,
    input_ensemble : str,
    input_water : str,
    input_boxtype : str,
    input_pbc_selection : str,
    input_cg_selection : str,
    input_customs : list[dict],
    input_orientation : list[float],
    input_multimeric : list[str],
    # Additional topic-specific inputs
    input_cv19_unit : str,
    input_cv19_startconf : str,
    input_cv19_abs : bool,
    input_cv19_nanobs : bool,
):
60 """ Prepare a JSON file with all project metadata. """
62 # Find out the box size (x, y and z)
63 (boxsizex, boxsizey, boxsizez) = get_box_size(
64 structure_file.path, trajectory_file.path)
66 # Count different types of atoms and residues
67 (system_atoms, system_residues, protein_atoms, protein_residues,
68 nucleic_atoms, nucleic_residues, lipid_atoms, lipid_residues,
69 carbohydrates_atoms, carbohydrates_residues, solvent_atoms, solvent_residues,
70 counter_cations, counter_anions, counter_ions) = get_atoms_count(structure)

    # Get protein, ligand and InChIKey references from the residues map
    protein_references = []
    ligand_references = []  # note: this overrides the ligand_references input argument
    inchikey_references = []
    references = residue_map['references']
    if references:
        for ref, ref_type in zip(references, residue_map['reference_types']):
            if ref_type == 'protein':
                protein_references.append(ref)
            elif ref_type == 'ligand':
                ligand_references.append(ref)
            elif ref_type == 'inchikey':
                inchikey_references.append(ref)

    # Get the forced ligand names, if any
    forced_ligand_names = {
        lig['name']: lig['forced_name'] for lig in ligand_references if lig.get('forced_name', False) }
    if len(forced_ligand_names) == 0:
        forced_ligand_names = None
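
    # Illustrative example (hypothetical values): a ligand reference such as
    # { 'name': 'STI', 'forced_name': 'Imatinib', ... } yields
    # forced_ligand_names == { 'STI': 'Imatinib' }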

    # Make the force fields a list in case a single string was passed
    forcefields = input_force_fields
    if isinstance(forcefields, str):
        forcefields = [forcefields]

    # Collections must be an empty list when there are no collections
    collections = input_collections
    if not collections:
        collections = []

    # Get additional metadata related to the amino acid sequence
    sequence_metadata = get_sequence_metadata(structure, protein_references_file, residue_map)

    # Find the PTMs
    # Save only their names for now
    # DANI: This is temporary; for now it is only meant to be an easy-to-query parameter
    # DANI: When this is more mature we will also store, at the very least, the affected residue
    ptms = structure.find_ptms()
    ptm_names = list(set([ ptm['name'] for ptm in ptms ]))

    # Check that the input chain names actually exist in the structure
    structure_chains = set([ chain.name for chain in structure.chains ])
    chainnames = input_chain_names
    if chainnames:
        for chain in chainnames.keys():
            if chain not in structure_chains:
                raise InputError(f'Chain {chain} from chainnames does not exist in the structure')

    # Get the MD type
    md_type = input_type
    # In case this is an ensemble, and not a time-related trajectory, the framestep may be missing
    framestep = None if md_type == 'ensemble' else input_framestep

    # Metadata interactions are the input interactions combined with the interaction types
    # Thus we take the processed interactions and remove the fields we are not interested in
    metadata_interactions = []
    if interactions is not None:
        for interaction in interactions:
            metadata_interaction = { k: v for k, v in interaction.items() if k in METADATA_INTERACTION_FIELDS }
            metadata_interactions.append(metadata_interaction)

    # Make sure links are correct
    links = input_links
    if links is not None:
        if not isinstance(links, list): links = [ links ]
        for link in links:
            if not isinstance(link, dict): raise InputError('Links must be a list of objects')
            if link.get('name', None) is None: raise InputError('Links must have a name')
            if link.get('url', None) is None: raise InputError('Links must have a URL')

    # Write the metadata file
    # Metadata keys must be in CAPS, as they are in the client
    metadata = {
        'NAME': input_name,
        'DESCRIPTION': input_description,
        'AUTHORS': input_authors,
        'GROUPS': input_groups,
        'CONTACT': input_contact,
        'PROGRAM': input_program,
        'VERSION': input_version,
        'TYPE': md_type,
        'METHOD': input_method,
        'LICENSE': input_license,
        'LINKCENSE': input_linkcense,
        'CITATION': input_citation,
        'THANKS': input_thanks,
        'LINKS': links,
        'PDBIDS': pdb_ids,
        'FORCED_REFERENCES': input_protein_references,
        'REFERENCES': protein_references,
        'INPUT_LIGANDS': input_ligands,
        # TODO: Ligands are now InChIKeys only; remove this key once we confirm that removing it does not break the client
        'LIGANDS': [],
        'LIGANDNAMES': forced_ligand_names,
        'INCHIKEYS': inchikey_references,
        'PROTSEQ': sequence_metadata['protein_sequences'],
        'NUCLSEQ': sequence_metadata['nucleic_sequences'],
        'DOMAINS': sequence_metadata['domains'],
        'FRAMESTEP': framestep,
        'TIMESTEP': input_timestep,
        'TEMP': input_temperature,
        'ENSEMBLE': input_ensemble,
        'FF': forcefields,
        'WAT': input_water,
        'BOXTYPE': input_boxtype,
        'SYSTATS': system_atoms,
        'SYSTRES': system_residues,
        'PROTATS': protein_atoms,
        'PROTRES': protein_residues,
        'NUCLATS': nucleic_atoms,
        'NUCLRES': nucleic_residues,
        'LIPIATS': lipid_atoms,
        'LIPIRES': lipid_residues,
        'CARBATS': carbohydrates_atoms,
        'CARBRES': carbohydrates_residues,
        'SOLVATS': solvent_atoms,
        'SOLVRES': solvent_residues,
        'COUNCAT': counter_cations,
        'COUNANI': counter_anions,
        'COUNION': counter_ions,
        'INTERACTIONS': metadata_interactions,
        'PBC_SELECTION': input_pbc_selection,
        'CG_SELECTION': input_cg_selection,
        'CHAINNAMES': chainnames,
        'CUSTOMS': input_customs,
        'ORIENTATION': input_orientation,
        'PTM': ptm_names,
        'MULTIMERIC': input_multimeric,
        'COLLECTIONS': collections,
        'WARNINGS': warnings,
    }
    # Add box sizes only if none of them is 0
    if boxsizex > 0 and boxsizey > 0 and boxsizez > 0:
        metadata['BOXSIZEX'] = boxsizex
        metadata['BOXSIZEY'] = boxsizey
        metadata['BOXSIZEZ'] = boxsizez
    # Add collection specific fields
    if 'cv19' in collections:
        cv19_unit = input_cv19_unit
        cv19_startconf = input_cv19_startconf
        cv19_abs = input_cv19_abs
        cv19_nanobs = input_cv19_nanobs
        cv19_variant = sequence_metadata['cv19_variant']

        if cv19_unit is not None:
            metadata['CV19_UNIT'] = cv19_unit

        if cv19_startconf is not None:
            metadata['CV19_STARTCONF'] = cv19_startconf

        if cv19_abs is not None:
            metadata['CV19_ABS'] = cv19_abs

        if cv19_nanobs is not None:
            metadata['CV19_NANOBS'] = cv19_nanobs

        if cv19_variant is not None:
            metadata['CV19_VARIANT'] = cv19_variant

    # Write metadata to a file
    save_json(metadata, output_file.path)
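
# Metadata fields which an individual MD may also provide through its own inputs;
# generate_md_metadata below matches them against the lower-cased MD input names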
metadata_fields = set([ 'NAME', 'DESCRIPTION', 'AUTHORS', 'GROUPS', 'CONTACT', 'PROGRAM', 'VERSION',
    'TYPE', 'METHOD', 'LICENSE', 'LINKCENSE', 'CITATION', 'THANKS', 'LINKS', 'PDBIDS', 'FORCED_REFERENCES',
    'REFERENCES', 'INPUT_LIGANDS', 'LIGANDS', 'LIGANDNAMES', 'PROTSEQ', 'NUCLSEQ', 'DOMAINS', 'FRAMESTEP', 'TIMESTEP',
    'TEMP', 'ENSEMBLE', 'FF', 'WAT', 'BOXTYPE', 'SYSTATS', 'PROTATS', 'PROT', 'DPPC', 'SOL', 'NA', 'CL',
    'INTERACTIONS', 'PBC_SELECTION', 'CHAINNAMES', 'MEMBRANES', 'CUSTOMS', 'ORIENTATION', 'PTM',
    'MULTIMERIC', 'COLLECTIONS', 'WARNINGS', 'BOXSIZEX', 'BOXSIZEY', 'BOXSIZEZ', 'CV19_UNIT', 'CV19_STARTCONF',
    'CV19_ABS', 'CV19_NANOBS', 'CV19_VARIANT'
])

def generate_md_metadata (
    md_inputs : dict,
    structure : 'Structure',
    snapshots : int,
    reference_frame : int,
    warnings : dict,
    output_file : 'File'
):
    """ Produce the MD metadata file to be uploaded to the database. """

    # Mine the name and the directory from the MD inputs
    name = md_inputs.get('name', None)
    directory = md_inputs.get(MD_DIRECTORY, None)

    # Write the metadata file
    md_metadata = {
        'name': name,
        'frames': snapshots,
        'atoms': len(structure.atoms), # Should always be the same, but we better have explicit confirmation
        'refframe': reference_frame,
        'warnings': warnings,
    }

    # Get the MD inputs other than the name and the directory
    other_md_inputs = { k: v for k, v in md_inputs.items() }
    # Remove the name from the MD inputs so it does not further overwrite project metadata
    if name:
        del other_md_inputs['name']
    # Remove the directory from the MD inputs since it is not to be uploaded to the database
    if directory:
        del other_md_inputs[MD_DIRECTORY]

    # Inherit all metadata fields
    metadata = {}
    for field in metadata_fields:
        input_field = field.lower()
        field_value = other_md_inputs.get(input_field, None)
        if field_value:
            metadata[field] = field_value
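
    # Illustrative example (hypothetical value): an MD input { 'temp': 310.0 } matches
    # the 'TEMP' field above and is inherited as metadata['TEMP'] = 310.0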

    # Add the metadata field only if there is at least one value
    if len(metadata) > 0:
        md_metadata['metadata'] = metadata

    # Write metadata to a file
    save_json(md_metadata, output_file.path)
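
A minimal usage sketch for generate_md_metadata (illustrative only: the structure and output_file objects, and the extra 'temp' input, are assumptions for the example rather than values taken from this module):

    md_inputs = {
        'name': 'replica 1',        # becomes the MD name in the metadata file
        MD_DIRECTORY: 'replica_1',  # stripped from the inputs and never uploaded
        'temp': 300.0,              # inherited into the 'metadata' block as 'TEMP'
    }
    generate_md_metadata(
        md_inputs = md_inputs,
        structure = structure,            # a parsed 'Structure' instance
        snapshots = 1000,                 # number of frames in the trajectory
        reference_frame = 0,              # frame used as the structural reference
        warnings = {},
        output_file = output_file,        # a 'File' whose path is the output JSON
    )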