Coverage for model_workflow/tools/generate_metadata.py: 72%

93 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-23 10:54 +0000

1from model_workflow.tools.get_box_size import get_box_size 

2from model_workflow.tools.get_atoms_count import get_atoms_count 

3from model_workflow.tools.generate_map import get_sequence_metadata 

4from model_workflow.utils.auxiliar import InputError, save_json 

5from model_workflow.utils.constants import MD_DIRECTORY 

6from model_workflow.utils.type_hints import * 

7 

def prepare_project_metadata (
    structure_file : 'File',
    trajectory_file : 'File',
    output_filepath : str,
    structure : 'Structure',
    residue_map : dict,
    protein_references_file : 'File',
    pdb_ids : List[str],
    ligand_map : List[dict],
    input_protein_references : Union[ List[str], dict ],
    input_ligands : List[dict],
    input_interactions : list,
    interaction_types : dict,
    warnings : dict,
    # Set all inputs to be loaded as they are
    input_force_fields : Union[ str, List[str] ],
    input_collections : List[str],
    input_chain_names : dict,
    input_type : str,
    input_framestep : float,
    input_name : str,
    input_description : str,
    input_authors : List[str],
    input_groups : List[str],
    input_contact : str,
    input_program : str,
    input_version : str,
    input_method : str,
    input_license : str,
    input_linkcense : str,
    input_citation : str,
    input_thanks : str,
    input_links : Union[ dict, List[dict] ],
    input_timestep : float,
    input_temperature : float,
    input_ensemble : str,
    input_water : str,
    input_boxtype : str,
    input_pbc_selection : str,
    input_cg_selection : str,
    input_customs : List[dict],
    input_orientation : List[float],
    input_multimeric : List[str],
    # Additional topic-specific inputs
    input_cv19_unit : str,
    input_cv19_startconf : str,
    input_cv19_abs : bool,
    input_cv19_nanobs : bool,
    ):
    """Prepare a JSON file with all project metadata.

    Most `input_*` arguments are written to the metadata as they are.
    The arguments which are processed before being written are:

    - structure_file, trajectory_file: only their `path` is read, to find the box size.
    - structure: used to count atoms/residues, find PTMs and list the chain names.
    - residue_map: its 'references' and 'reference_types' are read in parallel to
      split references between proteins and ligands.
    - ligand_map: iterable of dicts; those with a truthy 'forced_name' are saved
      as a { name: forced_name } dict, or None if there is none.
    - input_force_fields: a single string is wrapped in a list.
    - input_collections: any falsy value is replaced by an empty list.
    - input_chain_names: if truthy, every key must be the name of a structure chain.
    - input_type / input_framestep: the framestep is saved as None for 'ensemble' types.
    - input_interactions / interaction_types: every interaction is copied and its
      'type' is set from interaction_types, using the interaction 'name' as key.
    - input_links: a single link is wrapped in a list; every link must be a dict
      with both 'name' and 'url'.
    - input_cv19_*: written only when 'cv19' is among the collections and the value is not None.

    The result is written to output_filepath using save_json. Nothing is returned.

    Raises:
        InputError: if a chain name is not in the structure or if links are malformed.
    """

    # Find out the box size (x, y and z)
    (boxsizex, boxsizey, boxsizez) = get_box_size(
        structure_file.path, trajectory_file.path)

    # Count different types of atoms and residues
    (system_atoms, system_residues, protein_atoms, protein_residues,
     nucleic_atoms, nucleic_residues, lipid_atoms, lipid_residues,
     carbohydrates_atoms, carbohydrates_residues, solvent_atoms, solvent_residues,
     counter_cations, counter_anions, counter_ions) = get_atoms_count(structure)

    # Get both protein and ligand references from the residues map
    # References of any other type are not saved
    protein_references = []
    ligand_references = []
    references = residue_map['references']
    if references:
        for ref, ref_type in zip(references, residue_map['reference_types']):
            if ref_type == 'protein':
                protein_references.append(ref)
            elif ref_type == 'ligand':
                ligand_references.append(ref)

    # Get forced ligand names, if any
    # Note that this is saved as None (not as an empty dict) when there are no forced names
    forced_ligand_names = {
        lig['name']: lig['forced_name'] for lig in ligand_map if lig.get('forced_name', False)
    } or None

    # Make the forcefields a list in case it is a single string
    forcefields = input_force_fields
    if isinstance(forcefields, str):
        forcefields = [ forcefields ]

    # Collections must be an empty list in case there are no collections
    collections = input_collections or []

    # Get additional metadata related to the aminoacids sequence
    sequence_metadata = get_sequence_metadata(structure, protein_references_file, residue_map)

    # Find the PTMs
    # Save only their names for now
    # DANI: This is temporary and for now it is only meant to be an easily queryable parameter
    # DANI: Once this is more mature we will also store the affected residue, at least
    ptms = structure.find_ptms()
    ptm_names = list({ ptm['name'] for ptm in ptms })

    # Check chainnames to actually exist in the structure
    structure_chains = { chain.name for chain in structure.chains }
    chainnames = input_chain_names
    if chainnames:
        for chain in chainnames:
            if chain not in structure_chains:
                raise InputError(f'Chain {chain} from chainnames does not exist in the structure')

    # Get the MD type
    md_type = input_type
    # In case this is an ensemble and not a time related trajectory, there is no framestep
    framestep = None if md_type == 'ensemble' else input_framestep

    # Metadata interactions are input interactions and the interaction types combined
    # Input interactions are copied, so the inputs themselves are not modified
    metadata_interactions = []
    if input_interactions is not None:
        for interaction in input_interactions:
            metadata_interaction = dict(interaction)
            interaction_name = metadata_interaction['name']
            metadata_interaction['type'] = interaction_types[interaction_name]
            metadata_interactions.append(metadata_interaction)

    # Make sure links are correct
    # Validate the normalized links, so a single link object is accepted
    # Otherwise we would be iterating over the keys of the link object itself
    links = input_links
    if links is not None:
        if not isinstance(links, list):
            links = [ links ]
        for link in links:
            if not isinstance(link, dict):
                raise InputError('Links must be a list of objects')
            if link.get('name', None) is None:
                raise InputError('Links must have a name')
            if link.get('url', None) is None:
                raise InputError('Links must have a URL')

    # Write the metadata file
    # Metadata keys must be in CAPS, as they are in the client
    metadata = {
        'NAME': input_name,
        'DESCRIPTION': input_description,
        'AUTHORS': input_authors,
        'GROUPS': input_groups,
        'CONTACT': input_contact,
        'PROGRAM': input_program,
        'VERSION': input_version,
        'TYPE': md_type,
        'METHOD': input_method,
        'LICENSE': input_license,
        'LINKCENSE': input_linkcense,
        'CITATION': input_citation,
        'THANKS': input_thanks,
        # Save the normalized links, which are always either a list or None
        'LINKS': links,
        'PDBIDS': pdb_ids,
        'FORCED_REFERENCES': input_protein_references,
        'REFERENCES': protein_references,
        'INPUT_LIGANDS': input_ligands,
        'LIGANDS': ligand_references,
        'LIGANDNAMES': forced_ligand_names,
        'PROTSEQ': sequence_metadata['protein_sequences'],
        'NUCLSEQ': sequence_metadata['nucleic_sequences'],
        'DOMAINS': sequence_metadata['domains'],
        'FRAMESTEP': framestep,
        'TIMESTEP': input_timestep,
        'TEMP': input_temperature,
        'ENSEMBLE': input_ensemble,
        'FF': forcefields,
        'WAT': input_water,
        'BOXTYPE': input_boxtype,
        'SYSTATS': system_atoms,
        'SYSTRES': system_residues,
        'PROTATS': protein_atoms,
        'PROTRES': protein_residues,
        'NUCLATS': nucleic_atoms,
        'NUCLRES': nucleic_residues,
        'LIPIATS': lipid_atoms,
        'LIPIRES': lipid_residues,
        'CARBATS': carbohydrates_atoms,
        'CARBRES': carbohydrates_residues,
        'SOLVATS': solvent_atoms,
        'SOLVRES': solvent_residues,
        'COUNCAT': counter_cations,
        'COUNANI': counter_anions,
        'COUNION': counter_ions,
        'INTERACTIONS': metadata_interactions,
        'PBC_SELECTION': input_pbc_selection,
        'CG_SELECTION': input_cg_selection,
        'CHAINNAMES': chainnames,
        'CUSTOMS': input_customs,
        'ORIENTATION': input_orientation,
        'PTM': ptm_names,
        'MULTIMERIC' : input_multimeric,
        'COLLECTIONS': collections,
        'WARNINGS': warnings,
    }
    # Add box sizes only if all of them are greater than 0
    if boxsizex > 0 and boxsizey > 0 and boxsizez > 0:
        metadata['BOXSIZEX'] = boxsizex
        metadata['BOXSIZEY'] = boxsizey
        metadata['BOXSIZEZ'] = boxsizez
    # Add collection specific fields
    # Every field is added only if it has a value
    if 'cv19' in collections:
        cv19_fields = {
            'CV19_UNIT': input_cv19_unit,
            'CV19_STARTCONF': input_cv19_startconf,
            'CV19_ABS': input_cv19_abs,
            'CV19_NANOBS': input_cv19_nanobs,
            'CV19_VARIANT': sequence_metadata['cv19_variant'],
        }
        for field_name, field_value in cv19_fields.items():
            if field_value is not None:
                metadata[field_name] = field_value

    # Write metadata to a file
    save_json(metadata, output_filepath)

227 

# Names of the project metadata fields
# generate_md_metadata looks for the lowercased version of each of these names among the MD inputs
metadata_fields = {
    # General project description
    'NAME', 'DESCRIPTION', 'AUTHORS', 'GROUPS', 'CONTACT', 'PROGRAM', 'VERSION',
    'TYPE', 'METHOD', 'LICENSE', 'LINKCENSE', 'CITATION', 'THANKS', 'LINKS',
    # References, ligands and sequences
    'PDBIDS', 'FORCED_REFERENCES', 'REFERENCES', 'INPUT_LIGANDS', 'LIGANDS', 'LIGANDNAMES',
    'PROTSEQ', 'NUCLSEQ', 'DOMAINS',
    # Simulation parameters
    'FRAMESTEP', 'TIMESTEP', 'TEMP', 'ENSEMBLE', 'FF', 'WAT', 'BOXTYPE',
    # System composition
    'SYSTATS', 'PROTATS', 'PROT', 'DPPC', 'SOL', 'NA', 'CL',
    # Selections, annotations and others
    'INTERACTIONS', 'PBC_SELECTION', 'CHAINNAMES', 'MEMBRANES', 'CUSTOMS', 'ORIENTATION', 'PTM',
    'MULTIMERIC', 'COLLECTIONS', 'WARNINGS',
    # Box size
    'BOXSIZEX', 'BOXSIZEY', 'BOXSIZEZ',
    # Collection specific fields
    'CV19_UNIT', 'CV19_STARTCONF', 'CV19_ABS', 'CV19_NANOBS', 'CV19_VARIANT',
}

236 

def generate_md_metadata (
    md_inputs : dict,
    structure : 'Structure',
    snapshots : int,
    reference_frame : int,
    warnings : dict,
    output_filepath : str
    ):
    """Produce the MD metadata file to be uploaded to the database.

    The output always includes the MD 'name' (None if missing in the inputs), the number
    of 'frames' (snapshots), the number of 'atoms' in the structure, the reference frame
    ('refframe') and the 'warnings'.

    In addition, MD inputs whose key is the lowercased name of a field in metadata_fields
    and whose value is truthy are saved under the 'metadata' key, using the field name in caps.
    The 'metadata' key is omitted when no MD input matches.

    The result is written to output_filepath using save_json. Nothing is returned.
    """

    # Write the metadata file
    md_metadata = {
        'name': md_inputs.get('name', None),
        'frames': snapshots,
        'atoms': len(structure.atoms), # Should be always the same but we better have explicit confirmation
        'refframe': reference_frame,
        'warnings': warnings,
    }

    # Get other MD inputs than the name and the directory
    # The name is excluded to not further overwrite the project metadata name
    # The directory is excluded since it is not to be uploaded to the database
    excluded_keys = { 'name', MD_DIRECTORY }
    other_md_inputs = { k: v for k, v in md_inputs.items() if k not in excluded_keys }

    # Inherit all metadata fields
    # Fields are iterated in sorted order so the output is the same between runs
    # Note that the iteration order of a set of strings is not stable between runs
    # NOTE(review): falsy values (e.g. False or 0) are skipped as well as missing ones
    # NOTE(review): confirm this is intended for boolean and numeric fields
    metadata = {}
    for field in sorted(metadata_fields):
        field_value = other_md_inputs.get(field.lower(), None)
        if field_value:
            metadata[field] = field_value

    # Add the metadata field only if there is at least one value
    if metadata:
        md_metadata['metadata'] = metadata

    # Write metadata to a file
    save_json(md_metadata, output_filepath)