Coverage for mddb_workflow/tools/generate_metadata.py: 73%

95 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-29 15:48 +0000

1from mddb_workflow.tools.get_box_size import get_box_size 

2from mddb_workflow.tools.get_atoms_count import get_atoms_count 

3from mddb_workflow.tools.generate_map import get_sequence_metadata 

4from mddb_workflow.utils.auxiliar import InputError, save_json 

5from mddb_workflow.utils.constants import MD_DIRECTORY 

6from mddb_workflow.utils.type_hints import * 

7 

# Interaction keys kept in the project metadata:
# the interaction input fields plus the interaction type
METADATA_INTERACTION_FIELDS = {
    "name",
    "agent_1",
    "agent_2",
    "selection_1",
    "selection_2",
    "type",
}

10 

def prepare_project_metadata(
    structure_file: 'File',
    trajectory_file: 'File',
    output_filepath: str,
    structure: 'Structure',
    residue_map: dict,
    protein_references_file: 'File',
    pdb_ids: list[str],
    ligand_map: list[dict],
    input_protein_references: list[str] | dict,
    input_ligands: list[dict],
    interactions: list[dict] | None,
    warnings: dict,
    # Set all inputs to be loaded as they are
    input_force_fields: list[str] | str,
    input_collections: list[str],
    input_chain_names: dict,
    input_type: str,
    input_framestep: float,
    input_name: str,
    input_description: str,
    input_authors: list[str],
    input_groups: list[str],
    input_contact: str,
    input_program: str,
    input_version: str,
    input_method: str,
    input_license: str,
    input_linkcense: str,
    input_citation: str,
    input_thanks: str,
    input_links: list[dict] | dict | None,
    input_timestep: float,
    input_temperature: float,
    input_ensemble: str,
    input_water: str,
    input_boxtype: str,
    input_pbc_selection: str,
    input_cg_selection: str,
    input_customs: list[dict],
    input_orientation: list[float],
    input_multimeric: list[str],
    # Additional topic-specific inputs
    input_cv19_unit: str,
    input_cv19_startconf: str,
    input_cv19_abs: bool,
    input_cv19_nanobs: bool,
):
    """Prepare a JSON file with all project metadata.

    Box size, atom/residue counts, sequence metadata and PTM names are computed
    from the structure and trajectory. References are read from the residue map.
    Every other ``input_*`` value is stored as provided unless noted below.
    The resulting dict is written to ``output_filepath`` with ``save_json``.

    Args:
        structure_file: File whose ``path`` is passed to ``get_box_size``.
        trajectory_file: File whose ``path`` is passed to ``get_box_size``.
        output_filepath: Path of the JSON file to be written.
        structure: Structure used to count atoms, find PTMs and list chains.
        residue_map: Dict with a 'references' list and, when that list is not
            empty, a parallel 'reference_types' list.
        protein_references_file: Passed to ``get_sequence_metadata``.
        pdb_ids: Stored as 'PDBIDS'.
        ligand_map: Iterable of dicts with a 'name' and an optional 'forced_name'.
        input_protein_references: Stored as 'FORCED_REFERENCES'.
        input_ligands: Stored as 'INPUT_LIGANDS'.
        interactions: Processed interactions; only the keys listed in
            METADATA_INTERACTION_FIELDS are kept. May be None.
        warnings: Stored as 'WARNINGS'.
        input_force_fields: A single string is wrapped in a list.
        input_collections: Any falsy value is replaced by an empty list.
        input_chain_names: Mapping keyed by chain name; every key must be the
            name of a chain in the structure.
        input_type: MD type; when it is 'ensemble' the framestep is stored as None.
        input_links: A list of dicts with 'name' and 'url'; a single dict is
            wrapped in a list. May be None.
        input_cv19_unit, input_cv19_startconf, input_cv19_abs, input_cv19_nanobs:
            Stored only when 'cv19' is among the collections and the value is not None.

    Raises:
        InputError: If a chain in ``input_chain_names`` is not in the structure,
            or if a link is not a dict or lacks a 'name' or a 'url'.
    """

    # Find out the box size (x, y and z)
    (boxsizex, boxsizey, boxsizez) = get_box_size(
        structure_file.path, trajectory_file.path)

    # Count different types of atoms and residues
    (system_atoms, system_residues, protein_atoms, protein_residues,
     nucleic_atoms, nucleic_residues, lipid_atoms, lipid_residues,
     carbohydrates_atoms, carbohydrates_residues, solvent_atoms, solvent_residues,
     counter_cations, counter_anions, counter_ions) = get_atoms_count(structure)

    # Split the residue map references by their type
    # References of any other type are ignored
    protein_references = []
    ligand_references = []
    inchikey_references = []
    references_by_type = {
        'protein': protein_references,
        'ligand': ligand_references,
        'inchikey': inchikey_references,
    }
    references = residue_map['references']
    if references:
        for ref, ref_type in zip(references, residue_map['reference_types']):
            type_references = references_by_type.get(ref_type)
            if type_references is not None:
                type_references.append(ref)

    # Get forced ligand names if any, otherwise store a null
    forced_ligand_names = {
        lig['name']: lig['forced_name']
        for lig in ligand_map if lig.get('forced_name', False)
    } or None

    # Make the forcefields a list in case it is a single string
    forcefields = input_force_fields
    if isinstance(forcefields, str):
        forcefields = [forcefields]

    # Collections default to an empty list in case there are no collections
    collections = input_collections or []

    # Get additional metadata related to the aminoacids sequence
    sequence_metadata = get_sequence_metadata(structure, protein_references_file, residue_map)

    # Find the PTMs
    # Save only their (unique) names for now
    # DANI: This is temporary and for now it is only meant to be an easy-to-query parameter
    # DANI: When it is more mature we will also store the affected residue, at least
    ptms = structure.find_ptms()
    ptm_names = list({ptm['name'] for ptm in ptms})

    # Check chainnames to actually exist in the structure
    structure_chains = {chain.name for chain in structure.chains}
    chainnames = input_chain_names
    if chainnames:
        for chain in chainnames:
            if chain not in structure_chains:
                raise InputError(f'Chain {chain} from chainnames does not exist in the structure')

    # Get the MD type
    md_type = input_type
    # In case this is an ensemble and not a time related trajectory, the framestep may be missing
    framestep = None if md_type == 'ensemble' else input_framestep

    # Metadata interactions are input interactions and the interaction types combined
    # Thus we take the processed interactions and remove the fields we are not interested in
    metadata_interactions = []
    if interactions is not None:
        metadata_interactions = [
            {k: v for k, v in interaction.items() if k in METADATA_INTERACTION_FIELDS}
            for interaction in interactions
        ]

    # Make sure links are correct
    # A single link is wrapped in a list, and it is this normalized list
    # which is both validated and stored in the metadata
    links = input_links
    if links is not None:
        if not isinstance(links, list):
            links = [links]
        for link in links:
            if not isinstance(link, dict):
                raise InputError('Links must be a list of objects')
            if link.get('name', None) is None:
                raise InputError('Links must have a name')
            if link.get('url', None) is None:
                raise InputError('Links must have a URL')

    # Write the metadata file
    # Metadata keys must be in CAPS, as they are in the client
    metadata = {
        'NAME': input_name,
        'DESCRIPTION': input_description,
        'AUTHORS': input_authors,
        'GROUPS': input_groups,
        'CONTACT': input_contact,
        'PROGRAM': input_program,
        'VERSION': input_version,
        'TYPE': md_type,
        'METHOD': input_method,
        'LICENSE': input_license,
        'LINKCENSE': input_linkcense,
        'CITATION': input_citation,
        'THANKS': input_thanks,
        'LINKS': links,
        'PDBIDS': pdb_ids,
        'FORCED_REFERENCES': input_protein_references,
        'REFERENCES': protein_references,
        'INPUT_LIGANDS': input_ligands,
        'LIGANDS': ligand_references,
        'LIGANDNAMES': forced_ligand_names,
        'INCHIKEYS': inchikey_references,
        'PROTSEQ': sequence_metadata['protein_sequences'],
        'NUCLSEQ': sequence_metadata['nucleic_sequences'],
        'DOMAINS': sequence_metadata['domains'],
        'FRAMESTEP': framestep,
        'TIMESTEP': input_timestep,
        'TEMP': input_temperature,
        'ENSEMBLE': input_ensemble,
        'FF': forcefields,
        'WAT': input_water,
        'BOXTYPE': input_boxtype,
        'SYSTATS': system_atoms,
        'SYSTRES': system_residues,
        'PROTATS': protein_atoms,
        'PROTRES': protein_residues,
        'NUCLATS': nucleic_atoms,
        'NUCLRES': nucleic_residues,
        'LIPIATS': lipid_atoms,
        'LIPIRES': lipid_residues,
        'CARBATS': carbohydrates_atoms,
        'CARBRES': carbohydrates_residues,
        'SOLVATS': solvent_atoms,
        'SOLVRES': solvent_residues,
        'COUNCAT': counter_cations,
        'COUNANI': counter_anions,
        'COUNION': counter_ions,
        'INTERACTIONS': metadata_interactions,
        'PBC_SELECTION': input_pbc_selection,
        'CG_SELECTION': input_cg_selection,
        'CHAINNAMES': chainnames,
        'CUSTOMS': input_customs,
        'ORIENTATION': input_orientation,
        'PTM': ptm_names,
        'MULTIMERIC': input_multimeric,
        'COLLECTIONS': collections,
        'WARNINGS': warnings,
    }

    # Add box sizes only if all of them are greater than 0
    if boxsizex > 0 and boxsizey > 0 and boxsizez > 0:
        metadata['BOXSIZEX'] = boxsizex
        metadata['BOXSIZEY'] = boxsizey
        metadata['BOXSIZEZ'] = boxsizez

    # Add collection specific fields, skipping those which are missing
    if 'cv19' in collections:
        cv19_fields = {
            'CV19_UNIT': input_cv19_unit,
            'CV19_STARTCONF': input_cv19_startconf,
            'CV19_ABS': input_cv19_abs,
            'CV19_NANOBS': input_cv19_nanobs,
            'CV19_VARIANT': sequence_metadata['cv19_variant'],
        }
        for field, value in cv19_fields.items():
            if value is not None:
                metadata[field] = value

    # Write metadata to a file
    save_json(metadata, output_filepath)

232 

# Project metadata fields which may be set for an individual MD
# generate_md_metadata looks for the lower-cased name of each of these fields among the MD inputs
# NOTE(review): this list is maintained by hand and does not match one-to-one the keys written
# by prepare_project_metadata (e.g. 'INCHIKEYS' and 'CG_SELECTION' are missing here, while
# 'PROT', 'DPPC', 'SOL', 'NA', 'CL' and 'MEMBRANES' are not written there) - confirm this is intended
metadata_fields = {
    'NAME', 'DESCRIPTION', 'AUTHORS', 'GROUPS', 'CONTACT', 'PROGRAM', 'VERSION',
    'TYPE', 'METHOD', 'LICENSE', 'LINKCENSE', 'CITATION', 'THANKS', 'LINKS', 'PDBIDS', 'FORCED_REFERENCES',
    'REFERENCES', 'INPUT_LIGANDS', 'LIGANDS', 'LIGANDNAMES', 'PROTSEQ', 'NUCLSEQ', 'DOMAINS', 'FRAMESTEP', 'TIMESTEP',
    'TEMP', 'ENSEMBLE', 'FF', 'WAT', 'BOXTYPE', 'SYSTATS', 'PROTATS', 'PROT', 'DPPC', 'SOL', 'NA', 'CL',
    'INTERACTIONS', 'PBC_SELECTION', 'CHAINNAMES', 'MEMBRANES', 'CUSTOMS', 'ORIENTATION', 'PTM',
    'MULTIMERIC', 'COLLECTIONS', 'WARNINGS', 'BOXSIZEX', 'BOXSIZEY', 'BOXSIZEZ', 'CV19_UNIT', 'CV19_STARTCONF',
    'CV19_ABS', 'CV19_NANOBS', 'CV19_VARIANT',
}

241 

def generate_md_metadata(
    md_inputs: dict,
    structure: 'Structure',
    snapshots: int,
    reference_frame: int,
    warnings: dict,
    output_filepath: str
):
    """Produce the MD metadata file to be uploaded to the database.

    The file always contains the MD name, the number of frames, the number of
    atoms in the structure, the reference frame and the warnings. In addition,
    any MD input whose key is the lower-cased name of a field in
    ``metadata_fields`` and whose value is truthy is stored under 'metadata',
    keyed by the upper-case field name.

    Args:
        md_inputs: Inputs of this MD, including its 'name' and its directory.
        structure: Structure whose ``atoms`` are counted.
        snapshots: Stored as 'frames'.
        reference_frame: Stored as 'refframe'.
        warnings: Stored as 'warnings'.
        output_filepath: Path of the JSON file to be written.
    """

    # Write the metadata file
    md_metadata = {
        'name': md_inputs.get('name', None),
        'frames': snapshots,
        'atoms': len(structure.atoms),  # Should be always the same but we better have explicit confirmation
        'refframe': reference_frame,
        'warnings': warnings,
    }

    # Get other MD inputs than the name and the directory
    # Work on a copy so the MD inputs passed by the caller are not modified
    other_md_inputs = dict(md_inputs)
    # Remove name from MD inputs to not further overwrite project metadata
    other_md_inputs.pop('name', None)
    # Remove the directory name from MD inputs since it is not to be uploaded to the database
    other_md_inputs.pop(MD_DIRECTORY, None)

    # Inherit all metadata fields which have a (truthy) value among the MD inputs
    # Note that falsy values such as 0, False or empty lists are thus not inherited
    metadata = {}
    for field in metadata_fields:
        field_value = other_md_inputs.get(field.lower(), None)
        if field_value:
            metadata[field] = field_value

    # Add the metadata field only if there is at least one value
    if metadata:
        md_metadata['metadata'] = metadata

    # Write metadata to a file
    save_json(md_metadata, output_filepath)