Coverage for mddb_workflow/tools/structure_corrector.py: 49%

151 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-29 15:48 +0000

1from os import remove 

2from mddb_workflow.tools.get_bonds import find_safe_bonds, do_bonds_match, get_bonds_reference_frame 

3from mddb_workflow.tools.get_bonds import get_excluded_atoms_selection 

4from mddb_workflow.tools.get_pdb_frames import get_pdb_frame 

5from mddb_workflow.utils.auxiliar import InputError, TestFailure 

6from mddb_workflow.utils.auxiliar import get_new_letter, save_json, warn 

7from mddb_workflow.utils.constants import CORRECT_ELEMENTS, STABLE_BONDS_FLAG, COHERENT_BONDS_FLAG 

8from mddb_workflow.utils.structures import Structure 

9from mddb_workflow.utils.type_hints import * 

10 

11 

12def structure_corrector ( 

13 # Note that this is an early provisional structure 

14 structure : 'Structure', 

15 input_trajectory_file : Optional['File'], 

16 input_topology_file : Union['File', Exception], 

17 output_structure_file : 'File', 

18 output_trajectory_file : Optional['File'], 

19 MD : 'MD', 

20 # Note that this is an early provisional atom selection 

21 pbc_selection : 'Selection', 

22 snapshots : int, 

23 register : 'Register', 

24 mercy : list[str], 

25 trust : list[str], 

26 guess_bonds : bool 

27) -> dict: 

28 """ 

29 Analyze the structure looking for irregularities and then modify the structure to standarize the format. 

30 

31 Supported cases: 

32 

33 * Missing/Non-standard elements -> Atom elements are guessed when missing and standarized (e.g. ZN -> Zn) 

34 * Unstable atom bonds: A bond is formed / broken because its 2 atoms are too close / far in the structure 

35 -> Coordinates in the pdb file are replaced by those of the first frame in the trajectory where bonds are stable 

36 * Incoherent atom bonds -> If an atom has unexpected bonds then the simulation may be wrong 

37 -> Stop the workflow and warn the user 

38 * Missing chains -> Chains are added through VMD 

39 * Splitted chain -> Kill the process and warn the user. This should never happen by just reading a pdb, but after correcting 

40 * Repeated chains -> Chains are renamed (e.g. A, G, B, G, C, G -> A, G, B, H, C, I) 

41 * Splitted residues -> Atoms are sorted together by residue. Trajectory coordinates are also sorted 

42 * Repeated residues -> Residues are renumerated (e.g. 1, 2, 3, 1, 2, 1, 2 -> 1, 2, 3, 4, 5, 6, 7) 

43 * Repeated atoms -> Atoms are renamed with their numeration (e.g. C, C, C, O, O -> C1, C2, C3, O1, O2) 

44 

45 Note that the 'mercy' flag may be passed for critical checkings to not kill the process on fail. 

46 Note that the 'trust' flag may be passed for critical checkings to skip them. 

47 

48 This function also sets some values in the passed MD. 

49 """ 

50 

51 # Write the inital output structure file which will be overwritten several times further 

52 print(' The structure file has been copied -> ' + output_structure_file.filename) 

53 structure.generate_pdb_file(output_structure_file.path) 

54 

55 # ------------------------------------------------------------------------------------------ 

56 # Missing/Non-standard elements ------------------------------------------------------------ 

57 # ------------------------------------------------------------------------------------------ 

58 

59 # It is important to fix elements before trying to fix bonds, since elements have an impact on bonds 

60 # VMD logic to find bonds relies in the atom element to set the covalent bond distance cutoff 

61 if structure.fix_atom_elements(trust=(CORRECT_ELEMENTS in trust)): 

62 # Update the structure file using the corrected structure 

63 print(' The structure file has been modified (fix atom elements) -> ' + output_structure_file.filename) 

64 structure.generate_pdb_file(output_structure_file.path) 

65 

66 # ------------------------------------------------------------------------------------------ 

67 # Unstable atom bonds ---------------------------------------------------------------------- 

68 # ------------------------------------------------------------------------------------------ 

69 

70 # Set if stable bonds have to be checked 

71 # Note that we must not skip this even if the test already passed 

72 # It may be a corrected structure the one which passed the structure, while this structure comes from the raw input 

73 must_check_stable_bonds = STABLE_BONDS_FLAG not in trust 

74 

75 # Get safe bonds 

76 # Use topology bonds if possible 

77 # Otherwise guess bonds by guessing bonds according to coordinates and atom radius for 10 frames along the trajectory 

78 safe_bonds = find_safe_bonds( 

79 input_topology_file, 

80 output_structure_file, 

81 input_trajectory_file, 

82 must_check_stable_bonds, 

83 snapshots, 

84 structure, 

85 guess_bonds 

86 ) 

87 # If safe bonds do not match structure bonds then we have to fix it 

88 def check_stable_bonds (): 

89 # Save the current structure bonds to further compare with the safe bonds 

90 current_bonds = structure.bonds 

91 # Now force safe bonds in the structure even if they "match" already 

92 # Note that some bonds are excluded from the check and they may be wrong in the structure 

93 # e.g. coarse grain atom bonds 

94 structure.bonds = safe_bonds 

95 # If we have been requested to skip this test then we are done 

96 if not must_check_stable_bonds: return 

97 # Reset warnings related to this analysis 

98 register.remove_warnings(STABLE_BONDS_FLAG) 

99 # Set some atoms which are to be skipped from these test given their "fake" nature 

100 excluded_atoms_selection = get_excluded_atoms_selection(structure, pbc_selection) 

101 # If bonds match from the begining we are done as well 

102 print(f'Checking default structure bonds ({STABLE_BONDS_FLAG})') 

103 if do_bonds_match(current_bonds, safe_bonds, excluded_atoms_selection, verbose=True, atoms=structure.atoms): 

104 register.update_test(STABLE_BONDS_FLAG, True) 

105 print(' They are good') 

106 return 

107 print(' They are wrong') 

108 # Find the first frame in the whole trajectory where safe bonds are respected 

109 bonds_reference_frame = get_bonds_reference_frame( 

110 structure_file = output_structure_file, 

111 trajectory_file = input_trajectory_file, 

112 snapshots = snapshots, 

113 reference_bonds = safe_bonds, 

114 structure = structure, 

115 pbc_selection = pbc_selection 

116 ) 

117 MD.get_reference_frame._set_parent_output(MD, bonds_reference_frame) 

118 # If there is no reference frame then stop here since there must be a problem 

119 if bonds_reference_frame == None: 

120 print('There is no reference frame for the safe bonds. Is the trajectory not imaged?') 

121 must_be_killed = STABLE_BONDS_FLAG not in mercy 

122 if must_be_killed: 

123 raise TestFailure('Failed to find stable bonds') 

124 register.update_test(STABLE_BONDS_FLAG, False) 

125 register.add_warning(STABLE_BONDS_FLAG, ('Could not find a frame in the trajectory respecting all bonds if bonds were guessed according to atom coordinates and radius.\n' 

126 'The main PDB structure is a default structure and it would be considered to have wrong bonds if they were predicted as previously stated.')) 

127 return 

128 # Set also the safe bonds frame structure to mine its coordinates 

129 safe_bonds_frame_filename = get_pdb_frame(output_structure_file.path, input_trajectory_file.path, bonds_reference_frame) 

130 safe_bonds_frame_structure = Structure.from_pdb_file(safe_bonds_frame_filename) 

131 # Set all coordinates in the main structure by copying the safe bonds frame coordinates 

132 for atom_1, atom_2 in zip(structure.atoms, safe_bonds_frame_structure.atoms): 

133 atom_1.coords = atom_2.coords 

134 # Remove the safe bonds frame since it is not required anymore 

135 remove(safe_bonds_frame_filename) 

136 # Set the modified variable as true since we have changes the structure 

137 # Update the structure file using the corrected structure 

138 print(' The structure file has been modified (stable bonds) -> ' + output_structure_file.filename) 

139 structure.generate_pdb_file(output_structure_file.path) 

140 # Set the test as passed 

141 register.update_test(STABLE_BONDS_FLAG, True) 

142 check_stable_bonds() 

143 

144 # Write safe bonds back to the MD 

145 MD.project.get_reference_bonds._set_parent_output(MD.project, safe_bonds) 

146 

147 # ------------------------------------------------------------------------------------------ 

148 # Incoherent residue bonds --------------------------------------------------------------- 

149 # ------------------------------------------------------------------------------------------ 

150 

151 # Make sure there are no disconnected groups of atoms in every residue 

152 for residue in structure.residues: 

153 # If the residue is missing bonds then we can not check if it is coherent 

154 # We assume it is and continue 

155 if residue.is_missing_any_bonds(): continue 

156 # If the residue is coherent then continue 

157 if residue.is_coherent(): continue 

158 # Otherwise we report the problem 

159 residue_selection = residue.get_selection() 

160 fragments = list(structure.find_fragments(residue_selection)) 

161 if len(fragments) == 0: raise RuntimeError('Do we have an empty residue?') 

162 if len(fragments) == 1: raise RuntimeError('Test failed but should not') 

163 warn(f'Multiple fragments in residue {residue.index}: {residue}') 

164 for f, fragment in enumerate(fragments, 1): 

165 fragment_atoms = [ structure.atoms[index] for index in fragment.atom_indices ] 

166 atom_names = [ atom.label for atom in fragment_atoms ] 

167 print(f' Fragment {f}: {", ".join(atom_names)}') 

168 raise TestFailure(f'Residue {residue.index}: {residue} is not coherent: some atoms are disconnected') 

169 

170 # ------------------------------------------------------------------------------------------ 

171 # Incoherent atom bonds --------------------------------------------------------------- 

172 # ------------------------------------------------------------------------------------------ 

173 

174 # Set if coherent bonds have to be checked 

175 must_check_coherent_bonds = COHERENT_BONDS_FLAG not in trust 

176 

177 # If this analysis has been already passed then we skip the process 

178 if register.tests.get(COHERENT_BONDS_FLAG, None) == True: 

179 must_check_stable_bonds = False 

180 

181 # Run the coherent bonds analysis if necessary 

182 if must_check_coherent_bonds: 

183 # Reset warnings related to this analysis 

184 register.remove_warnings(COHERENT_BONDS_FLAG) 

185 # If the test is not passed then report it 

186 if structure.check_incoherent_bonds(): 

187 print('FAIL: Uncoherent bonds were found') 

188 must_be_killed = COHERENT_BONDS_FLAG not in mercy 

189 if must_be_killed: 

190 raise TestFailure('Failed to find coherent bonds') 

191 register.update_test(COHERENT_BONDS_FLAG, False) 

192 register.add_warning(COHERENT_BONDS_FLAG, 'Some atoms may have a higher or lower number of bonds than they should according to their element.') 

193 # Set the test as succeed if all was good 

194 else: 

195 register.update_test(COHERENT_BONDS_FLAG, True) 

196 

197 # ------------------------------------------------------------------------------------------ 

198 # Missing chains --------------------------------------------------------------------------- 

199 # ------------------------------------------------------------------------------------------ 

200 

201 # Check if chains are missing. If so, create a new chainned structure and set it as the reference 

202 chains = structure.chains 

203 

204 # In case there are not chains at all 

205 if len(chains) == 1 and ( chains[0].name == ' ' or chains[0].name == 'X' ): 

206 warn('Chains are missing and they will be added') 

207 # Stop here if we have bonds guessed from coarse grain (i.e. we have no topology) 

208 # Note that we rely in fragments (and thus in bonds) to guess chains 

209 if structure.is_missing_any_bonds(): 

210 raise InputError('We cannot guess chains with bonds guessed from coarse grain.\n' 

211 ' Please either provide a topology including bonds or set chains in the structure PDB file.') 

212 # Run the chainer 

213 structure.auto_chainer() 

214 # Update the structure file using the corrected structure 

215 print(' The structure file has been modified (no chains) -> ' + output_structure_file.filename) 

216 structure.generate_pdb_file(output_structure_file.path) 

217 

218 else: 

219 # In case there are some missing chains 

220 # Note that atoms with no chain are not a problem for the workflow but they are for the web client 

221 unlettered_chain = next((chain for chain in chains if chain.name == ' '), None) 

222 if unlettered_chain: 

223 current_letters = set([ chain.name for chain in chains ]) 

224 new_letter = get_new_letter(current_letters) 

225 # If we run out of letters there may be some problematic chain configuration 

226 # In this cases we cannot respect the original chains 

227 if new_letter == None: 

228 warn('No more letters in the alphabel to fill missing chains -> All chains will be assigned from scratch') 

229 structure.auto_chainer() 

230 else: 

231 warn(f'Some chains are missing -> Unchained regions will be chained as {new_letter}') 

232 unlettered_chain.name = new_letter 

233 # Update the structure file using the corrected structure 

234 print(f' The structure file has been modified (missing chains) -> {output_structure_file.filename}') 

235 structure.generate_pdb_file(output_structure_file.path) 

236 

237 # ------------------------------------------------------------------------------------------ 

238 # Splitted chains -------------------------------------------------------------------------- 

239 # ------------------------------------------------------------------------------------------ 

240 

241 # Note that this will never happen because of the pdb itself, duplicated chains are handled further 

242 # This will only happen if chains were missing and guessed 

243 # This may mean there is something wrong in the structure 

244 # Check fragments with the VMD and searh for wrong bonds 

245 if structure.check_splitted_chains(fix_chains = True, display_summary = True): 

246 # Update the structure file using the corrected structure 

247 print(' The structure file has been modified (splitted chains) -> ' + output_structure_file.filename) 

248 structure.generate_pdb_file(output_structure_file.path) 

249 

250 # ------------------------------------------------------------------------------------------ 

251 # Repeated chains -------------------------------------------------------------------------- 

252 # ------------------------------------------------------------------------------------------ 

253 

254 if structure.check_repeated_chains(fix_chains=True, display_summary=True): 

255 # Update the structure file using the corrected structure 

256 print(' The structure file has been modified (repeated chains) -> ' + output_structure_file.filename) 

257 structure.generate_pdb_file(output_structure_file.path) 

258 

259 # ------------------------------------------------------------------------------------------ 

260 # Coherent chains -------------------------------------------------------------------------- 

261 # ------------------------------------------------------------------------------------------ 

262 

263 # Makes sure polymers with sequence are all bonded 

264 # In other words, make sure there are not multiple proteins/nucleics labeled with the same chain 

265 # Note that their sequences otherwise will be merged silently, thus not reflecting reality 

266 # This would make protein mapping impossible for instance 

267 sequence_polymers_selection = structure.select_protein() + structure.select_nucleic() 

268 

269 # We also want to support having a solution of free aminoacids/nucleotides 

270 # So if residues in the chain are not bonded or bonded in very small fragments we accept it 

271 N_RESIDUES_CUTOFF = 3 

272 

273 # Save if we splitted chains 

274 had_to_split_chains = False 

275 

276 # Iterate sequence polymer chains 

277 for chain in structure.get_selection_chains(sequence_polymers_selection): 

278 # If the chain is missing bonds then we can not check if it is coherent 

279 # We assume it is and continue 

280 if chain.is_missing_any_bonds(): continue 

281 # If there is only one fragment we are good 

282 chain_selection = chain.get_selection() 

283 fragments = list(structure.find_fragments(chain_selection, atom_bonds=safe_bonds)) 

284 if len(fragments) == 0: raise ValueError(f'No fragments found in chain {chain.name}') 

285 if len(fragments) == 1: continue 

286 # We also want to support having a solution of free aminoacids/nucleotides 

287 # So if residues in the chain are not bonded or bonded in very small fragments we accept it 

288 # Make a single fragments out of all small fragments 

289 independent_fragments = [] 

290 merged_fragment = None 

291 for fragment in fragments: 

292 # If the fragments reaches the size cutoff then send it to the independent fragments list 

293 residues = structure.get_selection_residues(fragment) 

294 if len(residues) > N_RESIDUES_CUTOFF: 

295 independent_fragments.append(fragment) 

296 continue 

297 # Otherwise merge it with other small fragments 

298 if merged_fragment: merged_fragment += fragment 

299 else: merged_fragment = fragment 

300 # Add the merged fragment to the list as well 

301 if merged_fragment: 

302 independent_fragments.append(merged_fragment) 

303 # If we have only one fragment after the merging then we are good 

304 if len(independent_fragments) == 1: continue 

305 # Otherwise rename fragments 

306 warn(f'Chain {chain.name} has sequence polymer(s) but not all atoms are bonded.') 

307 # We can skip the first fragment, so it can keep the original chain name 

308 for independent_fragment in independent_fragments[1:]: 

309 print(f' A fragment of this chain including {len(independent_fragment)} atoms will be splitted appart.') 

310 had_to_split_chains = True 

311 # Make a new chain for the current fragment 

312 new_chain_name = structure.get_next_available_chain_name(chain.name) 

313 print(f' This fragment will be renamed as chain "{new_chain_name}"') 

314 structure.set_selection_chain_name(independent_fragment, new_chain_name) 

315 

316 # If we did any change then save the structure 

317 if had_to_split_chains: 

318 print(' The structure file has been modified (coherent chains) -> ' + output_structure_file.filename) 

319 structure.generate_pdb_file(output_structure_file.path) 

320 

321 # ------------------------------------------------------------------------------------------ 

322 # Merged residues ------------------------------------------------------------------------ 

323 # ------------------------------------------------------------------------------------------ 

324 

325 # Check if the structure is missing bonds 

326 missing_any_bonds = structure.is_missing_any_bonds() 

327 

328 # There may be residues which contain unconnected (unbonded) atoms. They are not allowed. 

329 # They may come from a wrong parsing and be indeed duplicated residues. 

330 # NEVER FORGET: Merged residues may be generated when calling the structure.auto_chainer 

331 # Note that we need bonds to check for merged residues 

332 # If bonds are missing then we skip this check 

333 if not missing_any_bonds and structure.check_merged_residues(fix_residues = True, display_summary = True): 

334 # Update the structure file using the corrected structure 

335 print(' The structure file has been modified (merged residues) -> ' + output_structure_file.filename) 

336 structure.generate_pdb_file(output_structure_file.path) 

337 

338 # ------------------------------------------------------------------------------------------ 

339 # Splitted residues ------------------------------------------------------------------------ 

340 # ------------------------------------------------------------------------------------------ 

341 

342 # Note that we need bonds to check for splitted residues 

343 # If bonds are missing then we skip this check 

344 if not missing_any_bonds and structure.check_repeated_residues(fix_residues=True, display_summary=True): 

345 # Update the structure file using the corrected structure 

346 print(' The structure file has been modified (repeated residues) -> ' + output_structure_file.filename) 

347 structure.generate_pdb_file(output_structure_file.path) 

348 

349 # Sort trajectory coordinates in case atoms were sorted 

350 if input_trajectory_file.path and structure.trajectory_atom_sorter: 

351 # Save a warning in the register 

352 warn('Atoms have been sorted to solve splitted residues') 

353 print('Creating resorted files for atom bonds and charges') 

354 # Bonds are already resorted 

355 save_json(safe_bonds, MD.project.resorted_bonds_file.path, indent=4) 

356 # Charges are to be resorted 

357 charges = MD.project.get_charges._get_parent_output(MD.project) 

358 resorted_charges = [ charges[index] for index in structure.new_atom_order ] 

359 MD.project.get_charges._set_parent_output(MD.project, resorted_charges) 

360 save_json(resorted_charges, MD.project.resorted_charges_file.path, indent=4) 

361 print('Sorting trajectory coordinates to fit the new structure atom sort...') 

362 structure.trajectory_atom_sorter(output_structure_file, input_trajectory_file, output_trajectory_file) 

363 

364 # ------------------------------------------------------------------------------------------ 

365 # Repeated atoms --------------------------------------------------------------------------- 

366 # ------------------------------------------------------------------------------------------ 

367 

368 if structure.check_repeated_atoms(fix_atoms=True, display_summary=True): 

369 # Update the structure file using the corrected structure 

370 print(' The structure file has been modified (repeated atoms) -> ' + output_structure_file.filename) 

371 structure.generate_pdb_file(output_structure_file.path)