Coverage for mddb_workflow/tools/generate_metadata.py: 73%
95 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-29 15:48 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-29 15:48 +0000
1from mddb_workflow.tools.get_box_size import get_box_size
2from mddb_workflow.tools.get_atoms_count import get_atoms_count
3from mddb_workflow.tools.generate_map import get_sequence_metadata
4from mddb_workflow.utils.auxiliar import InputError, save_json
5from mddb_workflow.utils.constants import MD_DIRECTORY
6from mddb_workflow.utils.type_hints import *
# Keys of a processed interaction which are kept in the project metadata:
# the input interaction fields plus the interaction type
METADATA_INTERACTION_FIELDS = {
    "name",
    "agent_1",
    "agent_2",
    "selection_1",
    "selection_2",
    "type",
}
def _normalize_links (input_links):
    """ Return the input links as a validated list of dicts, or None if there are no links.

    A single link (not wrapped in a list) is accepted and wrapped in a list.
    Raises InputError if any link is not a dict or lacks a 'name' or a 'url'.
    """
    if input_links is None:
        return None
    links = input_links if isinstance(input_links, list) else [ input_links ]
    for link in links:
        if not isinstance(link, dict): raise InputError('Links must be a list of objects')
        if link.get('name', None) is None: raise InputError('Links must have a name')
        if link.get('url', None) is None: raise InputError('Links must have a URL')
    return links

def prepare_project_metadata (
    structure_file : 'File',
    trajectory_file : 'File',
    output_filepath : str,
    structure : 'Structure',
    residue_map : dict,
    protein_references_file : 'File',
    pdb_ids : list[str],
    ligand_map : list[dict],
    input_protein_references : list[str] | dict,
    input_ligands : list[dict],
    interactions : list[dict],
    warnings : dict,
    # Set all inputs to be loaded as they are
    input_force_fields : list[str],
    input_collections : list[str],
    input_chain_names : list[str] | dict,
    input_type : str,
    input_framestep : float,
    input_name : str,
    input_description : str,
    input_authors : list[str],
    input_groups : list[str],
    input_contact : str,
    input_program : str,
    input_version : str,
    input_method : str,
    input_license : str,
    input_linkcense : str,
    input_citation : str,
    input_thanks : str,
    input_links : list[dict],
    input_timestep : float,
    input_temperature : float,
    input_ensemble : str,
    input_water : str,
    input_boxtype : str,
    input_pbc_selection : str,
    input_cg_selection : str,
    input_customs : list[dict],
    input_orientation : list[float],
    input_multimeric : list[str],
    # Additional topic-specific inputs
    input_cv19_unit : str,
    input_cv19_startconf : str,
    input_cv19_abs : bool,
    input_cv19_nanobs : bool,
    ):
    """ Prepare a JSON file with all project metadata.

    Computed values (box size, atom/residue counts, references, sequences, PTM names)
    are combined with the user inputs and written to 'output_filepath' using save_json.

    Args:
        structure_file, trajectory_file: files whose paths are passed to get_box_size.
        output_filepath: path of the JSON file to be written.
        structure: structure used to count atoms, find PTMs and check chain names.
        residue_map: dict with 'references' and 'reference_types' entries, read in parallel.
        protein_references_file: passed as is to get_sequence_metadata.
        pdb_ids: stored as is under 'PDBIDS'.
        ligand_map: iterable of dicts with a 'name' and an optional 'forced_name'.
        interactions: processed interactions, or None. Only the fields listed in
            METADATA_INTERACTION_FIELDS are kept.
        warnings: stored as is under 'WARNINGS'.
        input_force_fields: a list or a single string, which is wrapped in a list.
        input_collections: any falsy value is stored as an empty list.
        input_chain_names: chain names to be checked against the structure chains.
        input_links: list of dicts (or a single dict), each with 'name' and 'url'.
        input_cv19_*: only written when 'cv19' is among the collections and the value is not None.
        Remaining input_* arguments are stored in the metadata as they are.

    Raises:
        InputError: if a chain name does not exist in the structure or a link is malformed.
    """

    # Find out the box size (x, y and z)
    (boxsizex, boxsizey, boxsizez) = get_box_size(
        structure_file.path, trajectory_file.path)

    # Count different types of atoms and residues
    (system_atoms, system_residues, protein_atoms, protein_residues,
     nucleic_atoms, nucleic_residues, lipid_atoms, lipid_residues,
     carbohydrates_atoms, carbohydrates_residues, solvent_atoms, solvent_residues,
     counter_cations, counter_anions, counter_ions) = get_atoms_count(structure)

    # Get protein, ligand and InChIKey references from the residues map
    # References of any other type are not reported
    protein_references = []
    ligand_references = []
    inchikey_references = []
    references = residue_map['references']
    if references:
        for ref, ref_type in zip(references, residue_map['reference_types']):
            if ref_type == 'protein':
                protein_references.append(ref)
            elif ref_type == 'ligand':
                ligand_references.append(ref)
            elif ref_type == 'inchikey':
                inchikey_references.append(ref)

    # Get forced ligand names, if any
    # This must be null in case there are no forced names
    forced_ligand_names = {
        lig['name']: lig['forced_name'] for lig in ligand_map if lig.get('forced_name', False) } or None

    # Make the forcefields a list in case it is a single string
    forcefields = input_force_fields
    if isinstance(forcefields, str):
        forcefields = [forcefields]

    # Collections must be an empty list in case there are no collections
    collections = input_collections or []

    # Get additional metadata related to the aminoacids sequence
    sequence_metadata = get_sequence_metadata(structure, protein_references_file, residue_map)

    # Find the PTMs
    # Save only their names for now
    # DANI: This is temporary and for now it is only meant to be an easy-to-query parameter
    # DANI: Once it is more mature we will also store the affected residue, at least
    ptms = structure.find_ptms()
    ptm_names = list({ ptm['name'] for ptm in ptms })

    # Check chainnames to actually exist in the structure
    # Plain iteration yields the chain names both for a dict (its keys) and for a list
    structure_chains = { chain.name for chain in structure.chains }
    chainnames = input_chain_names
    if chainnames:
        for chain in chainnames:
            if chain not in structure_chains:
                raise InputError(f'Chain {chain} from chainnames does not exist in the structure')

    # Get the MD type
    md_type = input_type
    # In case this is an ensemble and not a time related trajectory, the framestep is not reported
    framestep = None if md_type == 'ensemble' else input_framestep

    # Metadata interactions are input interactions and the interaction types combined
    # Thus we take the processed interactions and remove the fields we are not interested in
    metadata_interactions = []
    if interactions is not None:
        metadata_interactions = [
            { k: v for k, v in interaction.items() if k in METADATA_INTERACTION_FIELDS }
            for interaction in interactions ]

    # Make sure links are correct
    # Validate and store the normalized links, so a single link is handled as a list of one link
    links = _normalize_links(input_links)

    # Write the metadata file
    # Metadata keys must be in CAPS, as they are in the client
    metadata = {
        'NAME': input_name,
        'DESCRIPTION': input_description,
        'AUTHORS': input_authors,
        'GROUPS': input_groups,
        'CONTACT': input_contact,
        'PROGRAM': input_program,
        'VERSION': input_version,
        'TYPE': md_type,
        'METHOD': input_method,
        'LICENSE': input_license,
        'LINKCENSE': input_linkcense,
        'CITATION': input_citation,
        'THANKS': input_thanks,
        'LINKS': links,
        'PDBIDS': pdb_ids,
        'FORCED_REFERENCES': input_protein_references,
        'REFERENCES': protein_references,
        'INPUT_LIGANDS': input_ligands,
        'LIGANDS': ligand_references,
        'LIGANDNAMES': forced_ligand_names,
        'INCHIKEYS': inchikey_references,
        'PROTSEQ': sequence_metadata['protein_sequences'],
        'NUCLSEQ': sequence_metadata['nucleic_sequences'],
        'DOMAINS': sequence_metadata['domains'],
        'FRAMESTEP': framestep,
        'TIMESTEP': input_timestep,
        'TEMP': input_temperature,
        'ENSEMBLE': input_ensemble,
        'FF': forcefields,
        'WAT': input_water,
        'BOXTYPE': input_boxtype,
        'SYSTATS': system_atoms,
        'SYSTRES': system_residues,
        'PROTATS': protein_atoms,
        'PROTRES': protein_residues,
        'NUCLATS': nucleic_atoms,
        'NUCLRES': nucleic_residues,
        'LIPIATS': lipid_atoms,
        'LIPIRES': lipid_residues,
        'CARBATS': carbohydrates_atoms,
        'CARBRES': carbohydrates_residues,
        'SOLVATS': solvent_atoms,
        'SOLVRES': solvent_residues,
        'COUNCAT': counter_cations,
        'COUNANI': counter_anions,
        'COUNION': counter_ions,
        'INTERACTIONS': metadata_interactions,
        'PBC_SELECTION': input_pbc_selection,
        'CG_SELECTION': input_cg_selection,
        'CHAINNAMES': chainnames,
        'CUSTOMS': input_customs,
        'ORIENTATION': input_orientation,
        'PTM': ptm_names,
        'MULTIMERIC' : input_multimeric,
        'COLLECTIONS': collections,
        'WARNINGS': warnings,
    }
    # Add boxsizes only if all of them are greater than 0
    if boxsizex > 0 and boxsizey > 0 and boxsizez > 0:
        metadata['BOXSIZEX'] = boxsizex
        metadata['BOXSIZEY'] = boxsizey
        metadata['BOXSIZEZ'] = boxsizez
    # Add collection specific fields
    # Fields whose value is missing (None) are not written
    if 'cv19' in collections:
        cv19_fields = {
            'CV19_UNIT': input_cv19_unit,
            'CV19_STARTCONF': input_cv19_startconf,
            'CV19_ABS': input_cv19_abs,
            'CV19_NANOBS': input_cv19_nanobs,
            'CV19_VARIANT': sequence_metadata['cv19_variant'],
        }
        for field, value in cv19_fields.items():
            if value is not None:
                metadata[field] = value

    # Write metadata to a file
    save_json(metadata, output_filepath)
# Project metadata fields which may be inherited from the MD inputs
# generate_md_metadata looks up each of them in lowercase among the MD inputs
# NOTE(review): this list does not match the keys written by prepare_project_metadata:
# some written keys (e.g. INCHIKEYS, CG_SELECTION, SYSTRES) are missing here, while
# PROT, DPPC, SOL, NA, CL and MEMBRANES are listed but not written there. Confirm if intended
metadata_fields = {
    'NAME', 'DESCRIPTION', 'AUTHORS', 'GROUPS', 'CONTACT', 'PROGRAM', 'VERSION',
    'TYPE', 'METHOD', 'LICENSE', 'LINKCENSE', 'CITATION', 'THANKS', 'LINKS',
    'PDBIDS', 'FORCED_REFERENCES', 'REFERENCES', 'INPUT_LIGANDS', 'LIGANDS',
    'LIGANDNAMES', 'PROTSEQ', 'NUCLSEQ', 'DOMAINS', 'FRAMESTEP', 'TIMESTEP',
    'TEMP', 'ENSEMBLE', 'FF', 'WAT', 'BOXTYPE', 'SYSTATS', 'PROTATS',
    'PROT', 'DPPC', 'SOL', 'NA', 'CL',
    'INTERACTIONS', 'PBC_SELECTION', 'CHAINNAMES', 'MEMBRANES', 'CUSTOMS',
    'ORIENTATION', 'PTM', 'MULTIMERIC', 'COLLECTIONS', 'WARNINGS',
    'BOXSIZEX', 'BOXSIZEY', 'BOXSIZEZ',
    'CV19_UNIT', 'CV19_STARTCONF', 'CV19_ABS', 'CV19_NANOBS', 'CV19_VARIANT',
}
def generate_md_metadata (
    md_inputs : dict,
    structure : 'Structure',
    snapshots : int,
    reference_frame : int,
    warnings : dict,
    output_filepath : str
    ):
    """Build the metadata of a single MD and save it as a JSON file.

    The output always includes the MD name, the number of frames, the number of atoms,
    the reference frame and the warnings. Any MD input matching a project metadata field
    (in lowercase) and having a truthy value is additionally stored under 'metadata'.
    """

    # The name and the directory get a special treatment so we read them first
    md_name = md_inputs.get('name', None)
    md_directory = md_inputs.get(MD_DIRECTORY, None)

    # Set the base MD metadata
    md_metadata = {
        'name': md_name,
        'frames': snapshots,
        # Should be always the same but we better have explicit confirmation
        'atoms': len(structure.atoms),
        'refframe': reference_frame,
        'warnings': warnings,
    }

    # Work on a copy of the MD inputs without the name and the directory
    # The name is removed to not further overwrite project metadata
    # The directory is removed since it is not to be uploaded to the database
    inheritable_inputs = dict(md_inputs)
    for key, value in (('name', md_name), (MD_DIRECTORY, md_directory)):
        if value:
            del inheritable_inputs[key]

    # Inherit every metadata field which has a value among the MD inputs
    inherited_metadata = {
        field: inheritable_inputs[field.lower()]
        for field in metadata_fields
        if inheritable_inputs.get(field.lower(), None)
    }

    # Add the metadata field only if there is at least one value
    if inherited_metadata:
        md_metadata['metadata'] = inherited_metadata

    # Write metadata to a file
    save_json(md_metadata, output_filepath)