Coverage for model_workflow/tools/generate_metadata.py: 72%
93 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
1from model_workflow.tools.get_box_size import get_box_size
2from model_workflow.tools.get_atoms_count import get_atoms_count
3from model_workflow.tools.generate_map import get_sequence_metadata
4from model_workflow.utils.auxiliar import InputError, save_json
5from model_workflow.utils.constants import MD_DIRECTORY
6from model_workflow.utils.type_hints import *
def prepare_project_metadata (
    structure_file : 'File',
    trajectory_file : 'File',
    output_filepath : str,
    structure : 'Structure',
    residue_map : dict,
    protein_references_file : 'File',
    pdb_ids : List[str],
    ligand_map : List[dict],
    input_protein_references : Union[ List[str], dict ],
    input_ligands : List[dict],
    input_interactions : list,
    interaction_types : dict,
    warnings : dict,
    # Set all inputs to be loaded as they are
    input_force_fields : Union[ List[str], str ],
    input_collections : List[str],
    input_chain_names : dict,
    input_type : str,
    input_framestep : float,
    input_name : str,
    input_description : str,
    input_authors : List[str],
    input_groups : List[str],
    input_contact : str,
    input_program : str,
    input_version : str,
    input_method : str,
    input_license : str,
    input_linkcense : str,
    input_citation : str,
    input_thanks : str,
    input_links : Union[ List[dict], dict ],
    input_timestep : float,
    input_temperature : float,
    input_ensemble : str,
    input_water : str,
    input_boxtype : str,
    input_pbc_selection : str,
    input_cg_selection : str,
    input_customs : List[dict],
    input_orientation : List[float],
    input_multimeric : List[str],
    # Additional topic-specific inputs
    input_cv19_unit : str,
    input_cv19_startconf : str,
    input_cv19_abs : bool,
    input_cv19_nanobs : bool,
    ):
    """Prepare a JSON file with all project metadata.

    Values are gathered from three places: data mined from the structure and
    trajectory (box size, atom/residue counts, sequences, PTMs), data taken
    from the residue and ligand maps (references, forced ligand names) and
    user inputs, most of which are stored as they are. The resulting dict is
    written to 'output_filepath' with save_json. Nothing is returned.

    Arguments whose handling is not a plain copy:
        structure_file, trajectory_file: their '.path' is passed to get_box_size.
        structure: passed to get_atoms_count and get_sequence_metadata; its
            'find_ptms()' and 'chains' are also used here.
        residue_map: must contain 'references' and 'reference_types', which are
            read in parallel to split protein and ligand references.
        ligand_map: iterable of dicts with a 'name' and an optional 'forced_name'.
        input_force_fields: a single string is wrapped in a list.
        input_collections: any falsy value is stored as an empty list.
        input_chain_names: dict whose keys must be chain names in the structure.
        input_type: when it is 'ensemble' the framestep is stored as None.
        input_interactions: each interaction is copied and given a 'type' taken
            from 'interaction_types' by the interaction name.
        input_links: a single link object is wrapped in a list; every link
            must be a dict with a 'name' and a 'url'.
        input_cv19_*: only written when 'cv19' is among the collections and
            the value is not None.

    Raises:
        InputError: if a chain name is not in the structure or a link is
            not a dict with both 'name' and 'url'.
    """

    # Find out the box size (x, y and z)
    (boxsizex, boxsizey, boxsizez) = get_box_size(
        structure_file.path, trajectory_file.path)

    # Count different types of atoms and residues
    (system_atoms, system_residues, protein_atoms, protein_residues,
        nucleic_atoms, nucleic_residues, lipid_atoms, lipid_residues,
        carbohydrates_atoms, carbohydrates_residues, solvent_atoms, solvent_residues,
        counter_cations, counter_anions, counter_ions) = get_atoms_count(structure)

    # Get protein references from the residues map
    # Get ligand references from the residues map
    protein_references = []
    ligand_references = []
    references = residue_map['references']
    if references:
        for ref, ref_type in zip(references, residue_map['reference_types']):
            if ref_type == 'protein':
                protein_references.append(ref)
            elif ref_type == 'ligand':
                ligand_references.append(ref)

    # Get ligand names if any
    # The field is stored as None when no ligand has a forced name
    forced_ligand_names = {
        lig['name']: lig['forced_name'] for lig in ligand_map if lig.get('forced_name', False) }
    if not forced_ligand_names:
        forced_ligand_names = None

    # Make the forcefields a list in case it is a single string
    forcefields = input_force_fields
    if isinstance(forcefields, str):
        forcefields = [forcefields]

    # Collections must be an empty list in case there are no collections
    collections = input_collections or []

    # Get additional metadata related to the aminoacids sequence
    sequence_metadata = get_sequence_metadata(structure, protein_references_file, residue_map)

    # Find the PTMs
    # Save only their names for now
    # DANI: This is temporary and for now it is only meant to be an easy-to-query parameter
    # DANI: Once it is more mature we will also store the affected residue, at least
    ptms = structure.find_ptms()
    ptm_names = list({ ptm['name'] for ptm in ptms })

    # Check chainnames to actually exist in the structure
    structure_chains = { chain.name for chain in structure.chains }
    chainnames = input_chain_names
    if chainnames:
        for chain in chainnames:
            if chain not in structure_chains:
                raise InputError(f'Chain {chain} from chainnames does not exist in the structure')

    # Get the MD type
    md_type = input_type
    # In case this is an ensemble and not a time related trajectory, the framestep may be missing
    framestep = None if md_type == 'ensemble' else input_framestep

    # Metadata interactions are input interactions and the interaction types combined
    metadata_interactions = []
    if input_interactions is not None:
        for interaction in input_interactions:
            # Copy the interaction so the input object is not modified
            metadata_interaction = dict(interaction)
            interaction_name = metadata_interaction['name']
            metadata_interaction['type'] = interaction_types[interaction_name]
            metadata_interactions.append(metadata_interaction)

    # Make sure links are correct
    # A single link object is accepted and wrapped in a list
    # Note that the normalized list is both the one validated and the one stored
    links = input_links
    if links is not None:
        if not isinstance(links, list):
            links = [ links ]
        for link in links:
            if not isinstance(link, dict):
                raise InputError('Links must be a list of objects')
            if link.get('name', None) is None:
                raise InputError('Links must have a name')
            if link.get('url', None) is None:
                raise InputError('Links must have a URL')

    # Write the metadata file
    # Metadata keys must be in CAPS, as they are in the client
    metadata = {
        'NAME': input_name,
        'DESCRIPTION': input_description,
        'AUTHORS': input_authors,
        'GROUPS': input_groups,
        'CONTACT': input_contact,
        'PROGRAM': input_program,
        'VERSION': input_version,
        'TYPE': md_type,
        'METHOD': input_method,
        'LICENSE': input_license,
        'LINKCENSE': input_linkcense,
        'CITATION': input_citation,
        'THANKS': input_thanks,
        'LINKS': links,
        'PDBIDS': pdb_ids,
        'FORCED_REFERENCES': input_protein_references,
        'REFERENCES': protein_references,
        'INPUT_LIGANDS': input_ligands,
        'LIGANDS': ligand_references,
        'LIGANDNAMES': forced_ligand_names,
        'PROTSEQ': sequence_metadata['protein_sequences'],
        'NUCLSEQ': sequence_metadata['nucleic_sequences'],
        'DOMAINS': sequence_metadata['domains'],
        'FRAMESTEP': framestep,
        'TIMESTEP': input_timestep,
        'TEMP': input_temperature,
        'ENSEMBLE': input_ensemble,
        'FF': forcefields,
        'WAT': input_water,
        'BOXTYPE': input_boxtype,
        'SYSTATS': system_atoms,
        'SYSTRES': system_residues,
        'PROTATS': protein_atoms,
        'PROTRES': protein_residues,
        'NUCLATS': nucleic_atoms,
        'NUCLRES': nucleic_residues,
        'LIPIATS': lipid_atoms,
        'LIPIRES': lipid_residues,
        'CARBATS': carbohydrates_atoms,
        'CARBRES': carbohydrates_residues,
        'SOLVATS': solvent_atoms,
        'SOLVRES': solvent_residues,
        'COUNCAT': counter_cations,
        'COUNANI': counter_anions,
        'COUNION': counter_ions,
        'INTERACTIONS': metadata_interactions,
        'PBC_SELECTION': input_pbc_selection,
        'CG_SELECTION': input_cg_selection,
        'CHAINNAMES': chainnames,
        'CUSTOMS': input_customs,
        'ORIENTATION': input_orientation,
        'PTM': ptm_names,
        'MULTIMERIC' : input_multimeric,
        'COLLECTIONS': collections,
        'WARNINGS': warnings,
    }
    # Add boxsizes only if all of them are greater than 0
    if boxsizex > 0 and boxsizey > 0 and boxsizez > 0:
        metadata['BOXSIZEX'] = boxsizex
        metadata['BOXSIZEY'] = boxsizey
        metadata['BOXSIZEZ'] = boxsizez
    # Add collection specific fields
    # Every field is written only when its value is not None
    if 'cv19' in collections:
        cv19_fields = {
            'CV19_UNIT': input_cv19_unit,
            'CV19_STARTCONF': input_cv19_startconf,
            'CV19_ABS': input_cv19_abs,
            'CV19_NANOBS': input_cv19_nanobs,
            'CV19_VARIANT': sequence_metadata['cv19_variant'],
        }
        for cv19_key, cv19_value in cv19_fields.items():
            if cv19_value is not None:
                metadata[cv19_key] = cv19_value

    # Write metadata to a file
    save_json(metadata, output_filepath)
# Names (in CAPS) of the project metadata fields
# generate_md_metadata searches the MD inputs for the lowercase version of every name in here
# NOTE(review): this set is not the same as the keys written by prepare_project_metadata
# NOTE(review): e.g. 'SYSTRES' or 'CG_SELECTION' are missing in here while 'PROT', 'DPPC',
# NOTE(review): 'SOL', 'NA', 'CL' and 'MEMBRANES' are not written there - confirm this is intended
metadata_fields = {
    # Project description
    'NAME', 'DESCRIPTION', 'AUTHORS', 'GROUPS', 'CONTACT',
    'PROGRAM', 'VERSION', 'TYPE', 'METHOD',
    'LICENSE', 'LINKCENSE', 'CITATION', 'THANKS', 'LINKS',
    # References, ligands and sequences
    'PDBIDS', 'FORCED_REFERENCES', 'REFERENCES',
    'INPUT_LIGANDS', 'LIGANDS', 'LIGANDNAMES',
    'PROTSEQ', 'NUCLSEQ', 'DOMAINS',
    # Simulation parameters
    'FRAMESTEP', 'TIMESTEP', 'TEMP', 'ENSEMBLE', 'FF', 'WAT', 'BOXTYPE',
    # Counts
    'SYSTATS', 'PROTATS', 'PROT', 'DPPC', 'SOL', 'NA', 'CL',
    # Selections and other structure related fields
    'INTERACTIONS', 'PBC_SELECTION', 'CHAINNAMES', 'MEMBRANES',
    'CUSTOMS', 'ORIENTATION', 'PTM', 'MULTIMERIC',
    # Collections and warnings
    'COLLECTIONS', 'WARNINGS',
    # Box size
    'BOXSIZEX', 'BOXSIZEY', 'BOXSIZEZ',
    # Collection specific fields
    'CV19_UNIT', 'CV19_STARTCONF', 'CV19_ABS', 'CV19_NANOBS', 'CV19_VARIANT',
}
def generate_md_metadata (
    md_inputs : dict,
    structure : 'Structure',
    snapshots : int,
    reference_frame : int,
    warnings : dict,
    output_filepath : str
    ):
    """Build the metadata of a single MD and save it as a JSON file.

    The output always has the MD name, the number of frames ('snapshots'),
    the number of atoms in the structure, the reference frame and the warnings.
    In addition, any MD input whose key is the lowercase version of a name in
    'metadata_fields' and whose value is truthy is stored under a 'metadata'
    field, using the name in CAPS as key.
    The result is written to 'output_filepath' with save_json. Nothing is returned.
    """

    # Read the MD name and the MD directory from the inputs
    md_name = md_inputs.get('name', None)
    md_directory = md_inputs.get(MD_DIRECTORY, None)

    # Set the fields which are always written
    md_metadata = {}
    md_metadata['name'] = md_name
    md_metadata['frames'] = snapshots
    # Should be always the same but we better have explicit confirmation
    md_metadata['atoms'] = len(structure.atoms)
    md_metadata['refframe'] = reference_frame
    md_metadata['warnings'] = warnings

    # Work with a copy of the inputs so the original dict is not modified
    remaining_inputs = dict(md_inputs.items())
    # The name is removed so it is never inherited as a project metadata field
    if md_name:
        del remaining_inputs['name']
    # The directory is removed since it is not to be uploaded to the database
    if md_directory:
        del remaining_inputs[MD_DIRECTORY]

    # Pair every metadata field with the value of its lowercase input, if any
    candidates = (
        (field, remaining_inputs.get(field.lower(), None)) for field in metadata_fields )
    # Keep only fields with a value
    # NOTE(review): falsy values such as 0, False or an empty list are discarded as well
    inherited = { field: value for field, value in candidates if value }

    # Add the metadata field only if there is at least one value
    if inherited:
        md_metadata['metadata'] = inherited

    # Write metadata to a file
    save_json(md_metadata, output_filepath)