Coverage for mddb_workflow / tools / generate_metadata.py: 81%

95 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-03 18:45 +0000

1from mddb_workflow.tools.get_box_size import get_box_size 

2from mddb_workflow.tools.get_atoms_count import get_atoms_count 

3from mddb_workflow.tools.generate_map import get_sequence_metadata 

4from mddb_workflow.utils.auxiliar import InputError, save_json 

5from mddb_workflow.utils.constants import MD_DIRECTORY 

6from mddb_workflow.utils.type_hints import * 

7 

# Keys kept from every processed interaction when it is written to the project metadata:
# the interaction input fields plus the interaction type
METADATA_INTERACTION_FIELDS = {
    "name",
    "agent_1",
    "agent_2",
    "selection_1",
    "selection_2",
    "type",
}

10 

11 

def _split_references (residue_map : dict) -> tuple:
    """Split the residue map references by type.

    Returns a tuple with three lists: protein, ligand and inchikey references.
    References of any other type are ignored.
    """
    protein_references = []
    ligand_references = []
    inchikey_references = []
    references = residue_map['references']
    # Reference types are only read when there is at least one reference
    if references:
        for ref, ref_type in zip(references, residue_map['reference_types']):
            if ref_type == 'protein':
                protein_references.append(ref)
            elif ref_type == 'ligand':
                ligand_references.append(ref)
            elif ref_type == 'inchikey':
                inchikey_references.append(ref)
    return protein_references, ligand_references, inchikey_references


def _normalize_links (input_links):
    """Return the input links as a validated list, or None if there are no links.

    A single link which is not wrapped in a list is wrapped here.
    Raises InputError if any link is not a dict or lacks a 'name' or a 'url'.
    """
    if input_links is None:
        return None
    links = input_links if isinstance(input_links, list) else [ input_links ]
    # Validate the normalized list, not the raw input:
    # iterating a bare dict would yield its keys instead of the link itself
    for link in links:
        if not isinstance(link, dict):
            raise InputError('Links must be a list of objects')
        if link.get('name', None) is None:
            raise InputError('Links must have a name')
        if link.get('url', None) is None:
            raise InputError('Links must have a URL')
    return links


def prepare_project_metadata (
    structure_file : 'File',
    trajectory_file : 'File',
    output_file : 'File',
    structure : 'Structure',
    residue_map : dict,
    protein_references_file : 'File',
    pdb_ids : list[str],
    ligand_references : dict,
    input_protein_references : list[str] | dict,
    input_ligands : list[dict],
    interactions : list[dict],
    warnings : dict,
    # Set all inputs to be loaded as they are
    input_force_fields : list[str] | str,
    input_collections : list[str],
    input_chain_names : dict,
    input_type : str,
    input_framestep : float,
    input_name : str,
    input_description : str,
    input_authors : list[str],
    input_groups : list[str],
    input_contact : str,
    input_program : str,
    input_version : str,
    input_method : str,
    input_license : str,
    input_linkcense : str,
    input_citation : str,
    input_thanks : str,
    input_links : list[dict] | dict,
    input_timestep : float,
    input_temperature : float,
    input_ensemble : str,
    input_water : str,
    input_boxtype : str,
    input_pbc_selection : str,
    input_cg_selection : str,
    input_customs : list[dict],
    input_orientation : list[float],
    input_multimeric : list[str],
    # Additional topic-specific inputs
    input_cv19_unit : str,
    input_cv19_startconf : str,
    input_cv19_abs : bool,
    input_cv19_nanobs : bool,
    ):
    """Prepare a JSON file with all project metadata.

    The metadata combines values computed here (box size, atom and residue counts,
    references found in the residue map, sequence metadata, PTM names) with the
    'input_*' arguments, most of which are written as they are.
    The result is saved with save_json at output_file.path. Nothing is returned.

    Notes on some arguments:
        residue_map: must contain 'references' and, when there are references, 'reference_types'.
        ligand_references: currently unused. Ligand references are taken from the residue map.
        input_force_fields: a single string is accepted and wrapped in a list.
        input_chain_names: mapping whose keys are chain names. Every key must exist in the structure.
        input_links: a list of dicts with 'name' and 'url'. A single dict is wrapped in a list.
        input_cv19_*: written only when 'cv19' is among the collections and the value is not None.

    Raises:
        InputError: if a chain name does not exist in the structure or a link is malformed.
    """

    # Find out the box size (x, y and z)
    (boxsizex, boxsizey, boxsizez) = get_box_size(
        structure_file.path, trajectory_file.path)

    # Count different types of atoms and residues
    (system_atoms, system_residues, protein_atoms, protein_residues,
        nucleic_atoms, nucleic_residues, lipid_atoms, lipid_residues,
        carbohydrates_atoms, carbohydrates_residues, solvent_atoms, solvent_residues,
        counter_cations, counter_anions, counter_ions) = get_atoms_count(structure)

    # Get protein, ligand and inchikey references from the residues map
    # NOTE(review): the 'ligand_references' argument used to be shadowed by a local list
    # and it is still not read. Confirm whether the argument can be dropped by callers.
    protein_references, map_ligand_references, inchikey_references = _split_references(residue_map)

    # Get forced ligand names if any, otherwise leave the field as None
    forced_ligand_names = {
        lig['name']: lig['forced_name'] for lig in map_ligand_references if lig.get('forced_name', False) }
    if not forced_ligand_names:
        forced_ligand_names = None

    # Make the forcefields a list in case it is a single string
    forcefields = input_force_fields
    if isinstance(forcefields, str):
        forcefields = [forcefields]

    # Collections default to an empty list when there are no collections
    collections = input_collections or []

    # Get additional metadata related to the aminoacids sequence
    sequence_metadata = get_sequence_metadata(structure, protein_references_file, residue_map)

    # Find the PTMs
    # Save only their names for now
    # DANI: This is temporary and for now it is only meant to be an easy-to-query parameter
    # DANI: Once it is more mature we will also store the affected residue, at least
    ptms = structure.find_ptms()
    ptm_names = list({ ptm['name'] for ptm in ptms })

    # Check chainnames to actually exist in the structure
    structure_chains = { chain.name for chain in structure.chains }
    chainnames = input_chain_names
    if chainnames:
        for chain in chainnames.keys():
            if chain not in structure_chains:
                raise InputError(f'Chain {chain} from chainnames does not exist in the structure')

    # Get the MD type
    md_type = input_type
    # In case this is an ensemble and not a time related trajectory, the framestep is not set
    framestep = None if md_type == 'ensemble' else input_framestep

    # Metadata interactions are input interactions and the interaction types combined
    # Thus we take the processed interactions and remove the fields we are not interested in
    metadata_interactions = []
    if interactions is not None:
        metadata_interactions = [
            { k: v for k, v in interaction.items() if k in METADATA_INTERACTION_FIELDS }
            for interaction in interactions
        ]

    # Make sure links are correct
    links = _normalize_links(input_links)

    # Write the metadata file
    # Metadata keys must be in CAPS, as they are in the client
    metadata = {
        'NAME': input_name,
        'DESCRIPTION': input_description,
        'AUTHORS': input_authors,
        'GROUPS': input_groups,
        'CONTACT': input_contact,
        'PROGRAM': input_program,
        'VERSION': input_version,
        'TYPE': md_type,
        'METHOD': input_method,
        'LICENSE': input_license,
        'LINKCENSE': input_linkcense,
        'CITATION': input_citation,
        'THANKS': input_thanks,
        # Store the validated links, so a single link is always written as a list
        'LINKS': links,
        'PDBIDS': pdb_ids,
        'FORCED_REFERENCES': input_protein_references,
        'REFERENCES': protein_references,
        'INPUT_LIGANDS': input_ligands,
        # TODO: Ligands are now inchikeys only, remove after checking it does not break the client removing this
        'LIGANDS': [],
        'LIGANDNAMES': forced_ligand_names,
        'INCHIKEYS': inchikey_references,
        'PROTSEQ': sequence_metadata['protein_sequences'],
        'NUCLSEQ': sequence_metadata['nucleic_sequences'],
        'DOMAINS': sequence_metadata['domains'],
        'FRAMESTEP': framestep,
        'TIMESTEP': input_timestep,
        'TEMP': input_temperature,
        'ENSEMBLE': input_ensemble,
        'FF': forcefields,
        'WAT': input_water,
        'BOXTYPE': input_boxtype,
        'SYSTATS': system_atoms,
        'SYSTRES': system_residues,
        'PROTATS': protein_atoms,
        'PROTRES': protein_residues,
        'NUCLATS': nucleic_atoms,
        'NUCLRES': nucleic_residues,
        'LIPIATS': lipid_atoms,
        'LIPIRES': lipid_residues,
        'CARBATS': carbohydrates_atoms,
        'CARBRES': carbohydrates_residues,
        'SOLVATS': solvent_atoms,
        'SOLVRES': solvent_residues,
        'COUNCAT': counter_cations,
        'COUNANI': counter_anions,
        'COUNION': counter_ions,
        'INTERACTIONS': metadata_interactions,
        'PBC_SELECTION': input_pbc_selection,
        'CG_SELECTION': input_cg_selection,
        'CHAINNAMES': chainnames,
        'CUSTOMS': input_customs,
        'ORIENTATION': input_orientation,
        'PTM': ptm_names,
        'MULTIMERIC': input_multimeric,
        'COLLECTIONS': collections,
        'WARNINGS': warnings,
    }
    # Add box sizes only if all of them are greater than 0
    if boxsizex > 0 and boxsizey > 0 and boxsizez > 0:
        metadata['BOXSIZEX'] = boxsizex
        metadata['BOXSIZEY'] = boxsizey
        metadata['BOXSIZEZ'] = boxsizez
    # Add collection specific fields
    if 'cv19' in collections:
        cv19_fields = {
            'CV19_UNIT': input_cv19_unit,
            'CV19_STARTCONF': input_cv19_startconf,
            'CV19_ABS': input_cv19_abs,
            'CV19_NANOBS': input_cv19_nanobs,
            'CV19_VARIANT': sequence_metadata['cv19_variant'],
        }
        # Fields with no value are left out of the metadata
        for field, value in cv19_fields.items():
            if value is not None:
                metadata[field] = value

    # Write metadata to a file
    save_json(metadata, output_file.path)

234 

# Upper-case names of the metadata fields which generate_md_metadata searches,
# in lower case, among the MD inputs
metadata_fields = {
    'NAME', 'DESCRIPTION', 'AUTHORS', 'GROUPS', 'CONTACT', 'PROGRAM', 'VERSION',
    'TYPE', 'METHOD', 'LICENSE', 'LINKCENSE', 'CITATION', 'THANKS', 'LINKS',
    'PDBIDS', 'FORCED_REFERENCES', 'REFERENCES', 'INPUT_LIGANDS', 'LIGANDS', 'LIGANDNAMES',
    'PROTSEQ', 'NUCLSEQ', 'DOMAINS',
    'FRAMESTEP', 'TIMESTEP', 'TEMP', 'ENSEMBLE', 'FF', 'WAT', 'BOXTYPE',
    'SYSTATS', 'PROTATS', 'PROT', 'DPPC', 'SOL', 'NA', 'CL',
    'INTERACTIONS', 'PBC_SELECTION', 'CHAINNAMES', 'MEMBRANES', 'CUSTOMS', 'ORIENTATION',
    'PTM', 'MULTIMERIC', 'COLLECTIONS', 'WARNINGS',
    'BOXSIZEX', 'BOXSIZEY', 'BOXSIZEZ',
    'CV19_UNIT', 'CV19_STARTCONF', 'CV19_ABS', 'CV19_NANOBS', 'CV19_VARIANT',
}

243 

def generate_md_metadata (
    md_inputs : dict,
    structure : 'Structure',
    snapshots : int,
    reference_frame : int,
    warnings : dict,
    output_file : 'File'
    ):
    """Build the metadata of a single MD and save it as JSON at output_file.path.

    The output always includes the MD name, the number of frames (snapshots),
    the number of atoms in the structure, the reference frame and the warnings.
    Any MD input whose key is the lower-cased name of a metadata field and whose
    value is truthy is also stored, under the 'metadata' key.
    """

    # Read the MD name and directory from the MD inputs
    md_name = md_inputs.get('name', None)
    md_directory = md_inputs.get(MD_DIRECTORY, None)

    # Set the base MD metadata
    md_metadata = {
        'name': md_name,
        'frames': snapshots,
        'atoms': len(structure.atoms), # Should be always the same but we better have explicit confirmation
        'refframe': reference_frame,
        'warnings': warnings,
    }

    # The name is excluded to not further overwrite project metadata
    # The directory is excluded since it is not to be uploaded to the database
    # Note that each of them is excluded only when it has a truthy value
    excluded_keys = set()
    if md_name:
        excluded_keys.add('name')
    if md_directory:
        excluded_keys.add(MD_DIRECTORY)
    remaining_inputs = {
        key: value for key, value in md_inputs.items() if key not in excluded_keys }

    # Inherit every metadata field which has a value among the remaining MD inputs
    inherited = {}
    for field in metadata_fields:
        value = remaining_inputs.get(field.lower(), None)
        if not value:
            continue
        inherited[field] = value

    # Add the metadata field only if there is at least one value
    if inherited:
        md_metadata['metadata'] = inherited

    # Write metadata to a file
    save_json(md_metadata, output_file.path)