Coverage for mddb_workflow / utils / structures.py: 84%

1746 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-03 18:45 +0000

1# Main handler of the toolbelt 

2import os 

3import re 

4import math 

5import pytraj 

6import MDAnalysis 

7import numpy as np 

8import networkx as nx 

9from bisect import bisect 

10 

11from mddb_workflow.utils.file import File 

12from mddb_workflow.utils.selections import Selection 

13from mddb_workflow.utils.vmd_spells import get_vmd_selection_atom_indices, get_covalent_bonds 

14from mddb_workflow.utils.mdt_spells import sort_trajectory_atoms 

15from mddb_workflow.utils.gmx_spells import make_index, read_ndx 

16from mddb_workflow.utils.auxiliar import InputError, MISSING_BONDS 

17from mddb_workflow.utils.auxiliar import is_imported, residue_name_to_letter, otherwise, warn 

18from mddb_workflow.utils.constants import SUPPORTED_ION_ELEMENTS, SUPPORTED_ELEMENTS 

19from mddb_workflow.utils.constants import STANDARD_COUNTER_CATION_ATOM_NAMES, STANDARD_COUNTER_ANION_ATOM_NAMES 

20from mddb_workflow.utils.constants import STANDARD_SOLVENT_RESIDUE_NAMES, STANDARD_COUNTER_ION_ATOM_NAMES 

21from mddb_workflow.utils.constants import STANDARD_DUMMY_ATOM_NAMES, DUMMY_ATOM_ELEMENT, CG_ATOM_ELEMENT 

22from mddb_workflow.utils.constants import PROTEIN_RESIDUE_NAME_LETTERS, NUCLEIC_RESIDUE_NAME_LETTERS 

23from mddb_workflow.utils.constants import DNA_RESIDUE_NAME_LETTERS, RNA_RESIDUE_NAME_LETTERS 

24from mddb_workflow.utils.constants import FATTY_RESIDUE_NAMES, STEROID_RESIDUE_NAMES 

25from mddb_workflow.utils.type_hints import * 

26 

27 

28# Set all available chains according to pdb standards 

29AVAILABLE_CAPS = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') 

30AVAILABLE_LOWS = list('abcdefghijklmnopqrstuvwxyz') 

31AVAILABLE_LETTERS = AVAILABLE_CAPS + AVAILABLE_LOWS 

32 

33# Set letters to be found in alphanumerical bases 

34hexadecimal_letters = set(AVAILABLE_CAPS[:6] + AVAILABLE_LOWS[:6]) 

35alphanumerical_letters = set(AVAILABLE_CAPS[6:] + AVAILABLE_LOWS[6:]) 

36 

37# Set the expected number of bonds for each atom according to its element 

38coherent_bonds_with_hydrogen = { 

39 'H': { 'min': 1, 'max': 1 }, 

40 'O': { 'min': 1, 'max': 2 }, 

41 'N': { 'min': 1, 'max': 4 }, 

42 'C': { 'min': 2, 'max': 4 }, 

43 'S': { 'min': 1, 'max': 6 }, 

44 'P': { 'min': 2, 'max': 6 }, 

45} 

46# WARNING: Not deeply tested 

47coherent_bonds_without_hydrogen = { 

48 'O': { 'min': 0, 'max': 2 }, 

49 'N': { 'min': 1, 'max': 3 }, 

50 'C': { 'min': 1, 'max': 3 }, 

51 'S': { 'min': 1, 'max': 4 }, 

52 'P': { 'min': 2, 'max': 4 }, 

53} 

54 

55# Set the limit of residue numbers according to PDB format (4 characters, starts count at 1) 

56# This means the last number is 9999 and it is equivalent to index 9998 

57pdb_last_decimal_residue_index = 9998 

58 

59class Atom: 

60 """An atom class.""" 

61 

62 def __init__ (self, 

63 name : Optional[str] = None, 

64 element : Optional[str] = None, 

65 coords : Optional[Coords] = None, 

66 ): 

67 self.name = name 

68 self.element = element 

69 self.coords = tuple(coords) if coords else None 

70 # Set variables to store references to other related instances 

71 # These variables will be set further by the structure 

72 self._structure = None 

73 self._index = None 

74 self._residue_index = None 

75 

76 def __repr__ (self): 

77 return '<Atom ' + self.name + '>' 

78 

79 def __eq__ (self, other): 

80 if type(self) != type(other): 

81 return False 

82 return self._residue_index == other._residue_index and self.name == other.name 

83 

84 def __hash__ (self): 

85 return hash((self.index)) 

86 

87 def get_structure (self) -> Optional['Structure']: 

88 """The parent structure (read only). 

89 This value is set by the structure itself.""" 

90 return self._structure 

91 structure: 'Structure' = property(get_structure, None, None, "The parent structure (read only)") 

92 

93 def get_index (self) -> Optional[int]: 

94 """The residue index according to parent structure residues (read only). 

95 This value is set by the structure itself.""" 

96 return self._index 

97 index = property(get_index, None, None, "The residue index according to parent structure residues (read only)") 

98 

99 def get_residue_index (self) -> int: 

100 """The atom residue index according to parent structure residues. 

101 If residue index is set then make changes in all the structure to make this change coherent.""" 

102 return self._residue_index 

103 def set_residue_index (self, new_residue_index : int): 

104 # If the new residue index is the current residue index then do nothing 

105 # WARNING: It is important to stop this here or it could delete a residue which is not to be deleted 

106 if new_residue_index == self.residue_index: 

107 return 

108 # If there is not strucutre yet it means the residue is beeing set before the structure 

109 # We just save the residue index and wait for the structure to be set 

110 if not self.structure: 

111 self._residue_index = new_residue_index 

112 return 

113 # Relational indices are updated through a top-down hierarchy 

114 # Affected residues are the ones to update this atom internal residue index 

115 new_residue = self.structure.residues[new_residue_index] 

116 new_residue.add_atom(self) 

117 residue_index = property(get_residue_index, set_residue_index, None, 

118 "The atom residue index according to parent structure residues") 

119 

120 def get_residue (self) -> Optional['Residue']: 

121 """The atom residue. 

122 If residue is set then make changes in all the structure to make this change coherent.""" 

123 # If there is not strucutre yet it means the atom is beeing set before the structure 

124 # In this case it is not possible to get related residues in the structure 

125 # It also may happend that the residue is missing because the atom has been set rawly 

126 if not self.structure or self.residue_index == None: 

127 return None 

128 # Get the residue in the structure according to the residue index 

129 return self.structure.residues[self.residue_index] 

130 def set_residue (self, new_residue : 'Residue'): 

131 """Find the new residue index and set it as the atom residue index. 

132 Note that the residue must be set in the structure already.""" 

133 new_residue_index = new_residue.index 

134 if new_residue_index == None: 

135 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure') 

136 self.set_residue_index(new_residue_index) 

137 residue = property(get_residue, set_residue, None, "The atom residue") 

138 

139 def get_chain_index (self) -> Optional[int]: 

140 """Get the atom chain index according to parent structure chains.""" 

141 # The residue may be missing if the atom has been set rawly 

142 if not self.residue: 

143 return None 

144 return self.residue.chain_index 

145 chain_index = property(get_chain_index, None, None, 

146 "The atom chain index according to parent structure chains (read only)") 

147 

148 def get_chain (self) -> Optional['Chain']: 

149 """The atom chain (read only). 

150 In order to change the chain it must be changed in the atom residue.""" 

151 # If there is not strucutre yet it means the atom is beeing set before the structure 

152 # In this case it is not possible to get related residues in the structure 

153 # It also may happend that the chain is missing because the atom has been set rawly 

154 if not self.structure or self.chain_index == None: 

155 return None 

156 # Get the chain in the structure according to the chain index 

157 return self.structure.chains[self.chain_index] 

158 chain: 'Chain' = property(get_chain, None, None, "The atom chain (read only)") 

159 

160 def get_bonds (self, skip_ions : bool = False, skip_dummies : bool = False) -> Optional[ list[int] ]: 

161 """Get indices of other atoms in the structure which are covalently bonded to this atom.""" 

162 if not self.structure: 

163 raise ValueError('The atom has not a structure defined') 

164 if self.index == None: 

165 raise ValueError('The atom has not an index defined') 

166 bonds = self.structure.bonds[self.index] 

167 # If the skip ions argument is passed then remove atom indices of supported ion atoms 

168 if skip_ions: 

169 bonds = list(set(bonds) - self.structure.ion_atom_indices) 

170 if skip_dummies: 

171 bonds = list(set(bonds) - self.structure.dummy_atom_indices) 

172 return bonds 

173 

174 # Atoms indices of atoms in the structure which are covalently bonded to this atom 

175 bonds = property(get_bonds, None, None, 

176 'Atoms indices of atoms in the structure which are covalently bonded to this atom') 

177 

178 def get_bonded_atoms (self) -> list['Atom']: 

179 """Get bonded atoms.""" 

180 return [ self.structure.atoms[atom_index] for atom_index in self.bonds ] 

181 

182 def get_selection (self) -> 'Selection': 

183 """Generate a selection for this atom.""" 

184 return Selection([self.index]) 

185 

186 def copy (self) -> 'Atom': 

187 """Make a copy of the current atom.""" 

188 atom_copy = Atom(self.name, self.element, self.coords) 

189 atom_copy._structure = self._structure 

190 atom_copy._index = self._index 

191 atom_copy._residue_index = self._residue_index 

192 return atom_copy 

193 

194 def is_fatty_candidate (self) -> bool: 

195 """ 

196 Check if this atom meets specific criteria: 

197 1. It is a carbon 

198 2. It is connected only to other carbons and hydrogens 

199 3. It is connected to 1 or 2 carbons 

200 """ 

201 # Ignore non carbon atoms 

202 if self.element != 'C': 

203 return False 

204 # Get bonded atom elements 

205 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ] 

206 # Check only carbon and hydrogen atoms to be bonded 

207 if any((element != 'C' and element != 'H') for element in bonded_atoms_elements): 

208 return False 

209 # Check it is connected to 1 or 2 carbons 

210 connected_carbons_count = bonded_atoms_elements.count('C') 

211 if connected_carbons_count != 1 and connected_carbons_count != 2: 

212 return False 

213 return True 

214 

215 def is_carbohydrate_candidate (self) -> bool: 

216 """ 

217 Check if this atom meets specific criteria: 

218 1. It is a carbon 

219 2. It is connected only to other carbons, hydrogens or oxygens 

220 3. It is connected to 1 or 2 carbons 

221 4. It is connected to 1 oxygen 

222 """ 

223 # Ignore non carbon atoms 

224 if self.element != 'C': 

225 return False 

226 # Get bonded atom elements 

227 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ] 

228 # Check only carbon and hydrogen atoms to be bonded 

229 if any((element != 'C' and element != 'H' and element != 'O') for element in bonded_atoms_elements): 

230 return False 

231 # Check it is connected to 1 or 2 carbons 

232 connected_carbons_count = bonded_atoms_elements.count('C') 

233 if connected_carbons_count != 1 and connected_carbons_count != 2: 

234 return False 

235 # Check it is connected to 1 oxygen 

236 connected_oxygens_count = bonded_atoms_elements.count('O') 

237 if connected_oxygens_count != 1: 

238 return False 

239 return True 

240 

241 def is_ion (self) -> bool: 

242 """Check if it is an ion by checking if it has no bonds with other atoms.""" 

243 return len(self.bonds) == 0 

244 

245 def guess_element (self) -> str: 

246 """Guess an atom element from its name and number of bonds.""" 

247 # If the atom name is among the known dummy atoms then return a standard element for dummy atoms 

248 if self.name.upper() in STANDARD_DUMMY_ATOM_NAMES: 

249 return DUMMY_ATOM_ELEMENT 

250 # If the element is already the "coarse grained" flag then return this very same flag 

251 # WARNING: This has to be preset before guessing or the guess may fail 

252 if self.is_cg(): 

253 return CG_ATOM_ELEMENT 

254 # If the name is SOD and it is a lonely atom then it is clearly sodium, not sulfur 

255 if self.name.upper() == 'SOD' and self.residue.atom_count == 1: 

256 return 'Na' 

257 # If the name is POT and it is a lonely atom then it is clearly potassium, not phosphor 

258 if self.name.upper() == 'POT' and self.residue.atom_count == 1: 

259 return 'K' 

260 # Find a obvios element name in the atom name 

261 element = self.get_name_suggested_element() 

262 # CA may refer to calcium or alpha carbon, so the number of atoms in the residue is decisive 

263 if element == 'Ca' and self.residue.atom_count >= 2: 

264 return 'C' 

265 # NA may refer to sodium or some ligand nitrogen, so the number of atoms in the residue is decisive 

266 if element == 'Na' and self.residue.atom_count >= 2: 

267 return 'N' 

268 return element 

269 

270 def get_name_suggested_element (self) -> str: 

271 """Guess an atom element from its name only.""" 

272 # Get the atom name and its characters length 

273 name = self.name 

274 length = len(name) 

275 next_character = None 

276 for i, character in enumerate(name): 

277 # Get the next character, since element may be formed by 2 letters 

278 if i < length - 1: 

279 next_character = name[i+1] 

280 # If next character is not a string then ignore it 

281 if not next_character.isalpha(): 

282 next_character = None 

283 # Try to get all possible matches between the characters and the supported atoms 

284 # First letter is always caps 

285 character = character.upper() 

286 # First try to match both letters together 

287 if next_character: 

288 # Start with the second letter in caps 

289 next_character = next_character.upper() 

290 both = character + next_character 

291 if both in SUPPORTED_ELEMENTS: 

292 return both 

293 # Continue with the second letter in lowers 

294 next_character = next_character.lower() 

295 both = character + next_character 

296 if both in SUPPORTED_ELEMENTS: 

297 return both 

298 # Finally, try with the first character alone 

299 if character in SUPPORTED_ELEMENTS: 

300 return character 

301 raise InputError(f"Cannot guess atom element from atom name '{name}'") 

302 

303 def get_label (self) -> str: 

304 """Get a standard label.""" 

305 return f'{self.residue.label}.{self.name} (index {self.index})' 

306 # The atom standard label (read only) 

307 label = property(get_label, None, None) 

308 

309 # Ask if the current atom is in coarse grain 

310 def is_cg (self) -> bool: 

311 return self.element == CG_ATOM_ELEMENT 

312 

313class Residue: 

314 """A residue class.""" 

315 def __init__ (self, 

316 name : Optional[str] = None, 

317 number : Optional[int] = None, 

318 icode : Optional[str] = None, 

319 ): 

320 self.name = name 

321 self.number = number 

322 self.icode = icode 

323 # Set variables to store references to other related instaces 

324 # These variables will be set further by the structure 

325 self._structure = None 

326 self._index = None 

327 self._atom_indices = [] 

328 self._chain_index = None 

329 self._classification = None 

330 

331 def __repr__ (self): 

332 return f'<Residue {self.label}>' 

333 

334 def __eq__ (self, other): 

335 if type(self) != type(other): 

336 return False 

337 return ( 

338 self._chain_index == other._chain_index and 

339 #self.name == other.name and 

340 self.number == other.number and 

341 self.icode == other.icode 

342 ) 

343 

344 def __hash__ (self): 

345 # WARNING: This is susceptible to duplicated residues 

346 return hash((self._chain_index, self.number, self.icode)) 

347 # WARNING: This is not susceptible to duplicated residues 

348 #return hash(tuple(self._atom_indices)) 

349 

350 def get_structure (self) -> Optional['Structure']: 

351 """Get the parent structure (read only). 

352 This value is set by the structure itself.""" 

353 return self._structure 

354 structure: 'Structure' = property(get_structure, None, None, 

355 "The parent structure (read only)") 

356 

357 def get_index (self) -> Optional[int]: 

358 """Get the residue index according to parent structure residues (read only). 

359 This value is set by the structure itself.""" 

360 return self._index 

361 def set_index (self, index): 

362 # Update residue atoms 

363 for atom in self.atoms: 

364 atom._residue_index = index 

365 # Update residue chain 

366 chain_residue_index = self.chain._residue_indices.index(self._index) 

367 self.chain._residue_indices[chain_residue_index] = index 

368 # Finally update self index 

369 self._index = index 

370 index = property(get_index, set_index, None, 

371 "The residue index according to parent structure residues (read only)") 

372 

373 def get_atom_indices (self) -> list[int]: 

374 """Get the atom indices according to parent structure atoms for atoms in this residue. 

375 If atom indices are set then make changes in all the structure to make this change coherent.""" 

376 return self._atom_indices 

377 def set_atom_indices (self, new_atom_indices : list[int]): 

378 # If there is not strucutre yet it means the residue is beeing set before the structure 

379 # We just save atom indices and wait for the structure to be set 

380 if not self.structure: 

381 self._atom_indices = new_atom_indices 

382 return 

383 # Update the current atoms 

384 for atom in self.atoms: 

385 atom._residue_index = None 

386 # Update the new atoms 

387 for index in new_atom_indices: 

388 atom = self.structure.atoms[index] 

389 atom._residue_index = self.index 

390 # Now new indices are coherent and thus we can save them 

391 self._atom_indices = new_atom_indices 

392 atom_indices = property(get_atom_indices, set_atom_indices, None, 

393 "The atom indices according to parent structure atoms for atoms in this residue") 

394 

395 def get_atoms (self) -> list['Atom']: 

396 """Get the atoms in this residue. 

397 If atoms are set then make changes in all the structure to make this change coherent.""" 

398 # If there is not strucutre yet it means the chain is beeing set before the structure 

399 # In this case it is not possible to get related atoms in the structure 

400 if not self.structure: 

401 return [] 

402 # Get atoms in the structure according to atom indices 

403 atoms = self.structure.atoms 

404 return [ atoms[atom_index] for atom_index in self.atom_indices ] 

405 def set_atoms (self, new_atoms : list['Atom']): 

406 # Find indices for new atoms and set their indices as the new atom indices 

407 # Note that atoms must be set in the structure already 

408 new_atom_indices = [] 

409 for new_atom in new_atoms: 

410 new_atom_index = new_atom.index 

411 if new_atom_index == None: 

412 raise ValueError('Atom ' + str(new_atom) + ' is not set in the structure') 

413 new_atom_indices.append(new_atom_index) 

414 self.set_atom_indices(new_atom_indices) 

415 atoms: list['Atom'] = property(get_atoms, set_atoms, None, "The atoms in this residue") 

416 

417 # Get the number of atoms in the residue 

418 def get_atom_count (self) -> int: 

419 return len(self.atom_indices) 

420 atom_count = property(get_atom_count, None, None, "The number of atoms in the residue (read only)") 

421 

422 def add_atom (self, new_atom : 'Atom'): 

423 """Add an atom to the residue.""" 

424 # Remove the atom from its previous residue owner 

425 if new_atom.residue_index: 

426 new_atom.residue.remove_atom(new_atom) 

427 # Insert the new atom index in the list of atom indices keeping the order 

428 new_atom_index = new_atom.index 

429 sorted_atom_index = bisect(self.atom_indices, new_atom_index) 

430 self.atom_indices.insert(sorted_atom_index, new_atom_index) 

431 # Update the atom internal index 

432 new_atom._residue_index = self.index 

433 

434 def remove_atom (self, current_atom : 'Atom'): 

435 """Remove an atom from the residue.""" 

436 # Remove the current atom index from the atom indices list 

437 self.atom_indices.remove(current_atom.index) # This index MUST be in the list 

438 # If we removed the last atom then this residue must be removed from its structure 

439 if len(self.atom_indices) == 0 and self.structure: 

440 self.structure.purge_residue(self) 

441 # Update the atom internal index 

442 current_atom._residue_index = None 

443 

444 # The residue chain index according to parent structure chains 

445 # If chain index is set then make changes in all the structure to make this change coherent 

446 def get_chain_index (self) -> int: 

447 return self._chain_index 

448 def set_chain_index (self, new_chain_index : int): 

449 # If the new chain index is the current chain index do nothing 

450 # WARNING: It is important to stop this here or it could delete a chain which is not to be deleted 

451 if new_chain_index == self.chain_index: 

452 return 

453 # If there is not strucutre yet it means the chain is beeing set before the structure 

454 # We just save the chain index and wait for the structure to be set 

455 if not self.structure: 

456 self._chain_index = new_chain_index 

457 return 

458 # Relational indices are updated through a top-down hierarchy 

459 # Affected chains are the ones to update this residue internal chain index 

460 # WARNING: It is critical to find the new chain before removing/adding residues 

461 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged 

462 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed 

463 new_chain = self.structure.chains[new_chain_index] 

464 if self.chain: 

465 self.chain.remove_residue(self) 

466 new_chain.add_residue(self) 

467 chain_index = property(get_chain_index, set_chain_index, None, 

468 "The residue chain index according to parent structure chains") 

469 

470 # The residue chain 

471 # If chain is set then make changes in all the structure to make this change coherent 

472 def get_chain (self) -> Optional['Chain']: 

473 # If there is not strucutre yet then it means the residue is set before the structure 

474 # In this case it is not possible to get related chain in the structure 

475 if not self.structure or self.chain_index == None: 

476 return None 

477 # Get the chain in the structure according to the chain index 

478 return self.structure.chains[self.chain_index] 

479 def set_chain (self, new_chain : Union['Chain', str]): 

480 # In case the chain is just a string we must find/create the corresponding chain 

481 if type(new_chain) == str: 

482 letter = new_chain 

483 # Get the residue structure 

484 structure = self.structure 

485 if not structure: 

486 raise ValueError(f'Cannot find the corresponding {new_chain} chain without the structure') 

487 # Find if the letter belongs to an already existing chain 

488 new_chain = structure.get_chain_by_name(letter) 

489 # If the chain does not exist yet then create it 

490 if not new_chain: 

491 new_chain = Chain(name=letter) 

492 structure.set_new_chain(new_chain) 

493 # Find the new chain index and set it as the residue chain index 

494 # Note that the chain must be set in the structure already 

495 new_chain_index = new_chain.index 

496 if new_chain_index == None: 

497 raise ValueError(f'Chain {new_chain} is not set in the structure') 

498 self.set_chain_index(new_chain_index) 

499 chain: 'Chain' = property(get_chain, set_chain, None, "The residue chain") 

500 

501 def get_bonded_atom_indices (self) -> list[int]: 

502 """Get atom indices from atoms bonded to this residue.""" 

503 # Get all bonds among residue atoms 

504 all_bonds = [] 

505 for atom in self.atoms: 

506 all_bonds += atom.bonds 

507 # Remove self atom indices from the list 

508 external_bonds = set(all_bonds) - set(self.atom_indices) 

509 return list(external_bonds) 

510 

511 def get_bonded_atoms (self) -> list['Atom']: 

512 """Get atoms bonded to this residue.""" 

513 return [ self.structure.atoms[atom_index] for atom_index in self.get_bonded_atom_indices() ] 

514 

515 def get_bonded_residue_indices (self) -> list[int]: 

516 """Get residue indices from residues bonded to this residue.""" 

517 return list(set([ atom.residue_index for atom in self.get_bonded_atoms() ])) 

518 

519 def get_bonded_residues (self) -> list['Residue']: 

520 """Get residues bonded to this residue.""" 

521 return [ self.structure.residues[residue_index] for residue_index in self.get_bonded_residue_indices() ] 

522 

523 def is_bonded_with_residue (self, other : 'Residue') -> bool: 

524 """Given another residue, check if it is bonded with this residue.""" 

525 bonded_atom_indices = set(self.get_bonded_atom_indices()) 

526 if next((index for index in other.atom_indices if index in bonded_atom_indices), None) != None: return True 

527 return False 

528 

529 def is_missing_any_bonds (self) -> bool: 

530 return any(atom.bonds == MISSING_BONDS for atom in self.atoms) 

531 

532 def is_coherent (self) -> bool: 

533 """Make sure atoms within the residue are all bonded.""" 

534 # If bonds are missing then just say everything is 

535 if self.is_missing_any_bonds(): 

536 raise RuntimeError('Trying to check if residue with missing bonds is coherent') 

537 residue_selection = self.get_selection() 

538 residue_fragments = self.structure.find_fragments(residue_selection) 

539 first_residue_fragment = next(residue_fragments) 

540 return len(first_residue_fragment) == self.atom_count 

541 

542 def get_classification (self) -> str: 

543 """ 

544 Get the residue biochemistry classification. 

545 

546 WARNING: Note that this logic will not work in a structure without hydrogens. 

547 

548 Available classifications: 

549 - protein 

550 - dna 

551 - rna 

552 - carbohydrate 

553 - fatty 

554 - steroid 

555 - ion 

556 - solvent 

557 - acetyl 

558 - amide 

559 - other 

560 """ 

561 # Return the internal value, if any 

562 if self._classification: 

563 return self._classification 

564 # If this is a CG residue then we can not classify it 

565 if self.is_cg(): return self.get_classification_by_name() 

566 # If this residue is missing bonds then we can not classify it 

567 # Some parts of the logic like the find_rings logic rely on bonds 

568 if self.is_missing_any_bonds(): return self.get_classification_by_name() 

569 # If we dont have a value then we must classify the residue 

570 # ------------------------------------------------------------------------------------------------------- 

571 # Ions are single atom residues 

572 if self.atom_count == 1: 

573 self._classification = 'ion' 

574 return self._classification 

575 # ------------------------------------------------------------------------------------------------------- 

576 # At this point we need atoms to have elements fixed 

577 # WARNING: missing elements would result in silent failure to recognize some classifications 

578 if self.structure._fixed_atom_elements == False: 

579 self.structure.fix_atom_elements() 

580 # Solvent is water molecules 

581 # First rely on the residue name 

582 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES: 

583 self._classification = 'solvent' 

584 return self._classification 

585 # It may be water with a not known name 

586 # Literally check if its a molecule with 3 atoms: 2 hydrogens and 1 oxygen 

587 if self.atom_count == 3: 

588 atom_elements = [ atom.element for atom in self.atoms ] 

589 if atom_elements.count('H') == 2 and atom_elements.count('O') == 1: 

590 self._classification = 'solvent' 

591 return self._classification 

592 # ------------------------------------------------------------------------------------------------------- 

593 # Protein definition according to vmd: 

594 # a residue with atoms named C, N, CA, and O 

595 # In our case we accept OC1 and OC2 or OT1 and OT2 instead of O for terminal residues 

596 atom_names = set([ atom.name for atom in self.atoms ]) 

597 if ( all((name in atom_names) for name in ['C', 'N', 'CA']) and ( 

598 'O' in atom_names or { 'OC1', 'OC2' } <= atom_names or { 'OT1', 'OT2' } <= atom_names 

599 )): 

600 self._classification = 'protein' 

601 return self._classification 

602 # ------------------------------------------------------------------------------------------------------- 

603 # Nucleic acids definition according to vmd: 

604 # a residue with atoms named P, O1P, O2P and either O3’, C3’, C4’, C5’, O5’ or O3*, C3*, C4*, C5*, O5* 

605 # Apparently it has been fixed so now a residue does not need to be phosphorylated to be considered nucleic 

606 # LORE: This included the condition "all((name in atom_names) for name in ['P', 'OP1', 'OP2'])" 

607 if (( all((name in atom_names) for name in ["O3'", "C3'", "C4'", "C5'", "O5'"]) or 

608 all((name in atom_names) for name in ["O3*", "C3*", "C4*", "C5*", "O5*"]) )): 

609 # At this point we know it is nucleic 

610 # We must tell the difference between DNA and RNA 

611 if "O2'" in atom_names or "O2*" in atom_names: 

612 self._classification = 'rna' 

613 else: 

614 self._classification = 'dna' 

615 return self._classification 

616 # ------------------------------------------------------------------------------------------------------- 

617 # To define carbohydrates search for rings made of 1 oxygen and 'n' carbons 

618 # WARNING: This logic may fail for some very specific molecules such as furine 

619 # LIMITATION: This logic only aims for cyclical carbohydrates. The linear form of carbohydrates is not yet consider 

620 rings = self.find_rings(6) 

621 for ring in rings: 

622 ring_elements = [ atom.element for atom in ring ] 

623 # Check the ring has 1 oxygen 

624 oxygen_count = ring_elements.count('O') 

625 if oxygen_count != 1: 

626 continue 

627 # Check the rest are only carbon 

628 carbon_count = ring_elements.count('C') 

629 if carbon_count != len(ring) - 1: 

630 continue 

631 self._classification = 'carbohydrate' 

632 return self._classification 

633 # ------------------------------------------------------------------------------------------------------- 

634 # To define steroids search for 3 6-carbon rings and 1 5-carbon ring (at least) 

635 # WARNING: According to this logic some exotical non-steroid molecules may result in false positives 

636 num_6_carbon_rings = 0 

637 num_5_carbon_rings = 0 

638 for ring in rings: 

639 ring_elements = [ atom.element for atom in ring ] 

640 if any(element != 'C' for element in ring_elements): 

641 continue 

642 ring_lenght = len(ring) 

643 if ring_lenght == 5: 

644 num_5_carbon_rings += 1 

645 if ring_lenght == 6: 

646 num_6_carbon_rings += 1 

647 if num_6_carbon_rings >= 3 and num_5_carbon_rings >= 1: 

648 self._classification = 'steroid' 

649 return self._classification 

650 # ------------------------------------------------------------------------------------------------------- 

651 # To define fatty acids search for a series of carbon atoms connected together (3 at least) 

652 # These carbon atoms must be connected only to hydrogen in addition to 1-2 carbon 

653 # These carbons must not be bonded in a cycle so check atoms not be repeated in the series 

654 bonded_fatty_atoms = [] 

655 def get_bonded_fatty_atoms_recursive (current_atom : Atom, previous_atom : Optional[Atom] = None) -> bool: 

656 # Iterate over the input atom bonded atoms 

657 for bonded_atom in current_atom.get_bonded_atoms(): 

658 # Discard the previous atom to avoid an infinite loop 

659 if bonded_atom == previous_atom: 

660 continue 

661 # If we find an atom which is already in the fatty atoms list then it means we are cycling 

662 if bonded_atom in bonded_fatty_atoms: 

663 return False 

664 # If this is not a fatty atom then discard it 

665 if not bonded_atom.is_fatty_candidate(): 

666 continue 

667 # Add the current bonded fatty atom to the list 

668 bonded_fatty_atoms.append(bonded_atom) 

669 # Find more bonded fatty atoms and add them to list as well 

670 if get_bonded_fatty_atoms_recursive(current_atom=bonded_atom, previous_atom=current_atom) == False: 

671 return False 

672 return True 

673 # Now check all atoms and try to set the series until we find one which works 

674 already_checked_atoms = [] 

675 for atom in self.atoms: 

676 # If we already checked this atom trough the recursive logic then skip it 

677 if atom in already_checked_atoms: 

678 continue 

679 # If the atom is not a fatty candidate then skip it 

680 if not atom.is_fatty_candidate(): 

681 continue 

682 # Now that we have found a suitable candidate atom we start the series 

683 bonded_fatty_atoms = [atom] 

684 if get_bonded_fatty_atoms_recursive(atom) and len(bonded_fatty_atoms) >= 3: 

685 self._classification = 'fatty' 

686 return self._classification 

687 already_checked_atoms += bonded_fatty_atoms 

688 # ------------------------------------------------------------------------------------------------------- 

689 # Check if it is an acetylation capping terminal 

690 if self.name == 'ACE': 

691 self._classification = 'acetyl' 

692 return self._classification 

693 # ------------------------------------------------------------------------------------------------------- 

694 # Check if it is an N-methyl amide capping terminal 

695 if self.name == 'NME': 

696 self._classification = 'amide' 

697 return self._classification 

698 # ------------------------------------------------------------------------------------------------------- 

699 self._classification = 'other' 

700 return self._classification 

701 

702 def get_classification_by_name (self) -> str: 

703 """ 

704 Set an alternative function to "try" to classify the residues according only to its name. 

705 This is useful for corase grain residues whose atoms may not reflect the real atoms. 

706 WARNING: This logic is very limited and will return "unknown" most of the times. 

707 WARNING: This logic relies mostly in atom names, which may be not standard. 

708 """ 

709 if self.name in PROTEIN_RESIDUE_NAME_LETTERS: 

710 return 'protein' 

711 if self.name in DNA_RESIDUE_NAME_LETTERS: 

712 return 'dna' 

713 if self.name in RNA_RESIDUE_NAME_LETTERS: 

714 return 'rna' 

715 if self.name in NUCLEIC_RESIDUE_NAME_LETTERS: 

716 return 'nucleic' 

717 if self.name in FATTY_RESIDUE_NAMES: 

718 return 'fatty' 

719 if self.name in STEROID_RESIDUE_NAMES: 

720 return 'steroid' 

721 if self.name in STANDARD_COUNTER_ION_ATOM_NAMES: 

722 return 'ion' 

723 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES: 

724 return 'solvent' 

725 # If we do not know what it is 

726 return 'unknown' 

727 

728 # The residue biochemistry classification (read only) 

729 classification = property(get_classification, None, None) 

730 

731 def get_selection (self) -> 'Selection': 

732 """Generate a selection for this residue.""" 

733 return Selection(self.atom_indices) 

734 

735 def copy (self) -> 'Residue': 

736 """Make a copy of the current residue.""" 

737 residue_copy = Residue(self.name, self.number, self.icode) 

738 residue_copy._structure = self._structure 

739 residue_copy._index = self._index 

740 residue_copy._atom_indices = self._atom_indices 

741 residue_copy._chain_index = self._chain_index 

742 return residue_copy 

743 

744 def get_letter (self) -> str: 

745 """Get the residue equivalent single letter code. 

746 Note that this makes sense for aminoacids and nucleotides only. 

747 Non recognized residue names return 'X'.""" 

748 return residue_name_to_letter(self.name) 

749 

750 def get_formula (self) -> str: 

751 """Get the formula of the residue.""" 

752 elements = [ atom.element for atom in self.atoms ] 

753 unique_elements = list(set(elements)) 

754 formula = '' 

755 for unique_element in unique_elements: 

756 count = elements.count(unique_element) 

757 number = get_lower_numbers(str(count)) if count > 1 else '' 

758 formula += unique_element + number 

759 return formula 

760 

761 def find_rings (self, max_ring_size : int) -> list[ list[Atom] ]: 

762 """Find rings in the residue.""" 

763 # Create graph and add edges in one pass 

764 G = nx.Graph() 

765 G.add_edges_from( 

766 (atom.index, bonded_atom) 

767 for atom in self.atoms 

768 for bonded_atom in atom.bonds 

769 ) 

770 # Get cycles and convert to atoms in one comprehension 

771 cycles = list(nx.simple_cycles(G, max_ring_size)) 

772 # Create a mapping from structure atom indices to residue-local atom indices 

773 residue_atom_indices = {atom.index: i for i, atom in enumerate(self.atoms)} 

774 # Return the rings as lists of Atom objects 

775 return [ 

776 [self.atoms[residue_atom_indices[i]] for i in cycle] 

777 for cycle in cycles 

778 ] 

779 

780 def split (self, 

781 first_residue_atom_indices : list[int], 

782 second_residue_atom_indices : list[int], 

783 first_residue_name : Optional[str] = None, 

784 second_residue_name : Optional[str] = None, 

785 first_residue_number : Optional[int] = None, 

786 second_residue_number : Optional[int] = None, 

787 first_residue_icode : Optional[str] = None, 

788 second_residue_icode : Optional[str] = None, 

789 ) -> tuple['Residue', 'Residue']: 

790 """Split this residue in 2 residues and return them in a tuple. 

791 Keep things coherent in the structure (renumerate all residues below this one). 

792 Note that all residue atoms must be covered by the splits.""" 

793 # This function is expected to be called in a residue with an already set structure 

794 if not self.structure: 

795 raise InputError('The split function should be called when the residue has an already defined structure') 

796 # Make sure all atoms in the residue are covered between both the first and the second residue 

797 if set(self.atom_indices) != set(first_residue_atom_indices + second_residue_atom_indices): 

798 print('Residue atoms: ' + ', '.join([ str(v) for v in set(self.atom_indices) ])) 

799 print('Covered atoms: ' + ', '.join([ str(v) for v in set(first_residue_atom_indices + second_residue_atom_indices) ])) 

800 raise InputError('All atom indices must be covered between both the first and the second residue') 

801 # Reuse this first residue instead of creating a new one 

802 if first_residue_name: 

803 self.name = first_residue_name 

804 if first_residue_number: 

805 self.number = first_residue_number 

806 if first_residue_icode: 

807 self.icode = first_residue_icode 

808 # Set the new second residue 

809 _second_residue_name = second_residue_name if second_residue_name else self.name 

810 _second_residue_number = second_residue_number if second_residue_number else self.number 

811 _second_residue_icode = second_residue_icode if second_residue_icode else get_next_letter(self.icode) 

812 second_residue = Residue(_second_residue_name, _second_residue_number, _second_residue_icode) 

813 second_residue._structure = self.structure 

814 new_residue_index = self.index + 1 

815 second_residue._index = new_residue_index 

816 # Insert the second residue in the structure residues list right after this residue 

817 self.structure.residues.insert(new_residue_index, second_residue) 

818 # Set the second residue index 

819 # Update the index of all residues which have been shifted after the insertion 

820 for residue_index in range(new_residue_index + 1, len(self.structure.residues)): 

821 residue = self.structure.residues[residue_index] 

822 residue.index = residue_index 

823 # Now transfer atoms from residue 1 to residue 2 as it is specified 

824 # Note that this will automatically update every atom 

825 second_residue.atom_indices = second_residue_atom_indices 

826 # Now add the new residue to the chain 

827 self.chain.add_residue(second_residue) 

828 

829 def split_by_atom_names (self, 

830 first_residue_atom_names : list[str], 

831 second_residue_atom_names : list[str], 

832 first_residue_name : Optional[str] = None, 

833 second_residue_name : Optional[str] = None, 

834 first_residue_number : Optional[int] = None, 

835 second_residue_number : Optional[int] = None, 

836 first_residue_icode : Optional[str] = None, 

837 second_residue_icode : Optional[str] = None, 

838 ) -> tuple['Residue', 'Residue']: 

839 """Parse atom names to atom indices and then call the split function.""" 

840 # Check all atom names to exist in the residue 

841 input_atom_names = set(first_residue_atom_names + second_residue_atom_names) 

842 residue_atom_names = set([ atom.name for atom in self.atoms ]) 

843 if input_atom_names != residue_atom_names: 

844 print(self) 

845 print(self.atoms) 

846 print('Input atom names: ' + ', '.join(input_atom_names)) 

847 print('Residue atom names: ' + ', '.join(residue_atom_names)) 

848 raise InputError('All residue atoms must be covered between both the first and the second residue atom names') 

849 # Convert atom names to atom indices 

850 first_residue_atom_indices = [ self.get_atom_by_name(name).index for name in first_residue_atom_names ] 

851 second_residue_atom_indices = [ self.get_atom_by_name(name).index for name in second_residue_atom_names ] 

852 # Call the actual split logic 

853 return self.split(first_residue_atom_indices, second_residue_atom_indices, first_residue_name, 

854 second_residue_name, first_residue_number, second_residue_number, first_residue_icode, second_residue_icode) 

855 

856 def get_atom_by_name (self, atom_name : str) -> 'Atom': 

857 """Get a residue atom given its name.""" 

858 return next(( atom for atom in self.atoms if atom.name == atom_name ), None) 

859 

860 def get_label (self) -> str: 

861 """Get a standard label.""" 

862 chainname = self.chain.name if self.chain.name.strip() else '' 

863 return f'{chainname}{self.number}{self.icode}({self.name})' 

864 label = property(get_label, None, None, 

865 "The residue standard label (read only)") 

866 

867 def is_cg (self) -> bool: 

868 """Ask if the current residue is in coarse grain. 

869 Note that we assume there may be not hybrid aa/cg residues.""" 

870 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms) 

871 

872class Chain: 

873 """A chain of residues.""" 

874 def __init__ (self, 

875 name : Optional[str] = None, 

876 classification : Optional[str] = None, 

877 ): 

878 self.name = name 

879 self._classification = classification 

880 # Set variables to store references to other related instaces 

881 # These variables will be set further by the structure 

882 self._structure = None 

883 self._index = None 

884 self._residue_indices = [] 

885 

886 def __repr__ (self): 

887 return '<Chain ' + self.name + '>' 

888 

889 def __eq__ (self, other): 

890 return self.name == other.name 

891 

892 def __hash__ (self): 

893 return hash(self.name) 

894 

895 # The parent structure (read only) 

896 # This value is set by the structure itself 

897 def get_structure (self) -> Optional['Structure']: 

898 return self._structure 

899 structure: 'Structure' = property(get_structure, None, None, "The parent structure (read only)") 

900 

901 # The residue index according to parent structure residues (read only) 

902 # This value is set by the structure itself 

903 def get_index (self) -> Optional[int]: 

904 return self._index 

905 # When the index is set all residues are updated with the next chain index 

906 def set_index (self, index : int): 

907 for residue in self.residues: 

908 residue._chain_index = index 

909 self._index = index 

910 index = property(get_index, set_index, None, "The residue index according to parent structure residues (read only)") 

911 

912 # The residue indices according to parent structure residues for residues in this chain 

913 # If residue indices are set then make changes in all the structure to make this change coherent 

914 def get_residue_indices (self) -> list[int]: 

915 return self._residue_indices 

916 def set_residue_indices (self, new_residue_indices : list[int]): 

917 # If there is not strucutre yet it means the chain is beeing set before the structure 

918 # We just save residue indices and wait for the structure to be set 

919 if not self.structure: 

920 self._residue_indices = new_residue_indices 

921 return 

922 # Update the current residues 

923 for residue in self.residues: 

924 residue._chain_index = None 

925 # Update the new residues 

926 for index in new_residue_indices: 

927 residue = self.structure.residues[index] 

928 residue._chain_index = self.index 

929 # In case the new residue indices list is empty this chain must be removed from its structure 

930 if len(new_residue_indices) == 0: 

931 self.structure.purge_chain(self) 

932 # Now new indices are coherent and thus we can save them 

933 self._residue_indices = new_residue_indices 

934 residue_indices = property(get_residue_indices, set_residue_indices, None, "The residue indices according to parent structure residues for residues in this residue") 

935 

936 def get_residues (self) -> list['Residue']: 

937 """The residues in this chain. 

938 If residues are set then make changes in all the structure to make this change coherent.""" 

939 # If there is not strucutre yet it means the chain is beeing set before the structure 

940 # In this case it is not possible to get related residues in the structure 

941 if not self.structure: 

942 return [] 

943 # Get residues in the structure according to residue indices 

944 residues = self.structure.residues 

945 return [ residues[residue_index] for residue_index in self.residue_indices ] 

946 

947 def set_residues (self, new_residues : list['Residue']): 

948 """Find indices for new residues and set their indices as the new residue indices. 

949 Note that residues must be set in the structure already.""" 

950 new_residue_indices = [] 

951 for new_residue in new_residues: 

952 new_residue_index = new_residue.index 

953 if new_residue_index == None: 

954 raise ValueError(f'Residue {new_residue} is not set in the structure') 

955 new_residue_indices.append(new_residue_index) 

956 self.set_residue_indices(new_residue_indices) 

957 # Set the new residues chain index to this chain index 

958 for residue in new_residues: 

959 residue.chain_index = self.index 

960 residues: list['Residue'] = property(get_residues, set_residues, None, "The residues in this chain") 

961 

962 def add_residue (self, residue : 'Residue'): 

963 """Add a residue to the chain.""" 

964 # Insert the new residue index in the list of residue indices keeping the order 

965 sorted_residue_index = bisect(self.residue_indices, residue.index) 

966 self.residue_indices.insert(sorted_residue_index, residue.index) 

967 # Update the residue internal chain index 

968 residue._chain_index = self.index 

969 

970 def remove_residue (self, residue : 'Residue'): 

971 """Remove a residue from the chain. 

972 WARNING: Note that this function does not trigger the set_residue_indices.""" 

973 self.residue_indices.remove(residue.index) # This index MUST be in the list 

974 # If we removed the last residue then this chain must be removed from its structure 

975 if len(self.residue_indices) == 0 and self.structure: 

976 self.structure.purge_chain(self) 

977 # Update the residue internal chain index 

978 residue._chain_index = None 

979 

980 def get_atom_indices (self) -> list[int]: 

981 """Get atom indices for all atoms in the chain (read only). 

982 In order to change atom indices they must be changed in their corresponding residues.""" 

983 return sum([ residue.atom_indices for residue in self.residues ], []) 

984 atom_indices = property(get_atom_indices, None, None, "Atom indices for all atoms in the chain (read only)") 

985 

986 def get_atoms (self) -> list[int]: 

987 """Get the atoms in the chain (read only). 

988 In order to change atoms they must be changed in their corresponding residues.""" 

989 return sum([ residue.atoms for residue in self.residues ], []) 

990 atoms: list['Atom'] = property(get_atoms, None, None, "Atoms in the chain (read only)") 

991 

992 def get_atom_count (self) -> int: 

993 """Get the number of atoms in the chain (read only).""" 

994 return len(self.atom_indices) 

995 atom_count = property(get_atom_count, None, None, "Number of atoms in the chain (read only)") 

996 

997 def get_residue_count (self) -> int: 

998 """Get the number of residues in the chain (read only).""" 

999 return len(self._residue_indices) 

1000 residue_count = property(get_residue_count, None, None, "Number of residues in the chain (read only)") 

1001 

1002 def get_classification (self) -> str: 

1003 """Get the chain classification.""" 

1004 if self._classification: 

1005 return self._classification 

1006 self._classification = self.structure.get_selection_classification(self.get_selection()) 

1007 return self._classification 

1008 

1009 def set_classification (self, classification : str): 

1010 """Force the chain classification.""" 

1011 self._classification = classification 

1012 

1013 classification = property(get_classification, set_classification, None, 

1014 "Classification of the chain (manual or automatic)") 

1015 

1016 def get_sequence (self) -> str: 

1017 """Get the residues sequence in one-letter code.""" 

1018 return ''.join([ residue_name_to_letter(residue.name) for residue in self.residues ]) 

1019 

1020 def get_selection (self) -> 'Selection': 

1021 """Generate a selection for this chain.""" 

1022 return Selection(self.atom_indices) 

1023 

1024 def copy (self) -> 'Chain': 

1025 """Make a copy of the current chain.""" 

1026 chain_copy = Chain(self.name) 

1027 chain_copy._structure = self._structure 

1028 chain_copy._index = self._index 

1029 chain_copy.residue_indices = self.residue_indices 

1030 return chain_copy 

1031 

1032 def has_cg (self) -> bool: 

1033 """Ask if the current chain has at least one coarse grain atom/residue.""" 

1034 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms) 

1035 

1036 def is_missing_any_bonds (self) -> bool: 

1037 return any(atom.bonds == MISSING_BONDS for atom in self.atoms) 

1038 

1039 def find_residue (self, number : int, icode : str = '', index = None) -> Optional['Residue']: 

1040 """Find a residue by its number and insertion code.""" 

1041 # Iterate chain residues 

1042 for residue in self.residues: 

1043 if residue.number == number and residue.icode == icode and (index is None or residue.index == index): 

1044 return residue 

1045 return None 

1046 

1047class Structure: 

1048 """A structure is a group of atoms organized in chains and residues.""" 

1049 def __init__ (self, 

1050 atoms : list['Atom'] = [], 

1051 residues : list['Residue'] = [], 

1052 chains : list['Chain'] = [], 

1053 residue_numeration_base : int = 10, 

1054 ): 

1055 self.atoms: list['Atom'] = [] 

1056 self.residues: list['Residue'] = [] 

1057 self.chains: list['Chain'] = [] 

1058 # Set references between instances 

1059 for atom in atoms: 

1060 self.set_new_atom(atom) 

1061 for residue in residues: 

1062 self.set_new_residue(residue) 

1063 for chain in chains: 

1064 self.set_new_chain(chain) 

1065 # --- Set other internal variables --- 

1066 # Set bonds between atoms 

1067 self._bonds = None 

1068 # Set fragments of bonded atoms 

1069 self._fragments = None 

1070 # --- Set other auxiliar variables --- 

1071 # Trajectory atom sorter is a function used to sort coordinates in a trajectory file 

1072 # This function is generated when sorting indices in the structure 

1073 # Also the new atom order is saved 

1074 self.trajectory_atom_sorter = None 

1075 self.new_atom_order = None 

1076 # Track when atom elements have been fixed 

1077 self._fixed_atom_elements = False 

1078 # Save indices of supported ion atoms 

1079 self._ion_atom_indices = None 

1080 self._dummy_atom_indices = None 

1081 # Set the scale of the residue numbers 

1082 # It may be decimal (10), hexadecimal(16) or alphanumeric(36) 

1083 # Note that hybrid scales (1-9999 decimal and different further) are not explicitly supported 

1084 # However, the scale is guessed on read and conserved on write, so the original numeration would be conserved 

1085 # The default values is used only when the structure is not read from a PDB 

1086 self.residue_numeration_base = residue_numeration_base 

1087 

1088 def __repr__ (self): 

1089 return f'<Structure ({self.atom_count} atoms)>' 

1090 

1091 def get_bonds (self) -> list[ list[int] ]: 

1092 """Get the bonds between atoms.""" 

1093 # Return the stored value, if exists 

1094 if self._bonds: 

1095 return self._bonds 

1096 # If not, we must calculate the bonds using vmd 

1097 self._bonds = self.get_covalent_bonds() 

1098 return self._bonds 

1099 def set_bonds (self, bonds : list[ list[int] ]): 

1100 """Force specific bonds.""" 

1101 self._bonds = bonds 

1102 # Reset fragments 

1103 self._fragments = None 

1104 bonds = property(get_bonds, set_bonds, None, "The structure bonds") 

1105 

1106 def get_fragments (self) -> list['Selection']: 

1107 """Get the groups of atoms which are covalently bonded.""" 

1108 # Return the stored value, if exists 

1109 if self._fragments != None: 

1110 return self._fragments 

1111 # Otherwise, find fragments in all structure atoms 

1112 self._fragments = list(self.find_fragments()) 

1113 return self._fragments 

1114 # Fragments of covalently bonded atoms (read only) 

1115 fragments: list['Selection'] = property(get_fragments, None, None, "The structure fragments (read only)") 

1116 

1117 def find_fragments (self, 

1118 selection : Optional['Selection'] = None, 

1119 coherent : bool = True, 

1120 atom_bonds : Optional[list[list[int]]] = None, 

1121 ) -> Generator['Selection', None, None]: 

1122 """Find fragments in a selection of atoms. A fragment is a selection of 

1123 covalently bonded atoms. All atoms are searched if no selection is provided. 

1124 

1125 WARNING: Note that fragments generated from a specific selection may not 

1126 match the structure fragments. A selection including 2 separated regions of a 

1127 structure fragment will yield 2 fragments. 

1128 

1129 For convenience, bonds between non-consecutive residues are excluded from this logic. 

1130 This is useful to ignore disulfide bonds. 

1131 May also help to properly find chains in CG simulations where chains may be bonded. 

1132 """ 

1133 # If there is no selection we consider all atoms 

1134 if not selection: selection = self.select_all() 

1135 # Get/Find covalent bonds between atoms in a new object avoid further corruption (deletion) of the original list 

1136 safe_bonds = atom_bonds if atom_bonds else self.bonds 

1137 atom_indexed_covalent_bonds = { atom_index: [ *safe_bonds[atom_index] ] for atom_index in selection.atom_indices } 

1138 # Ignore bonds from non-consecutive residues 

1139 if coherent: 

1140 for atom_index, bonded_atom_indices in atom_indexed_covalent_bonds.items(): 

1141 atom = self.atoms[atom_index] 

1142 residue_index = atom.residue_index 

1143 consecutive_residue_index = residue_index + 1 

1144 self_bonded_atoms = [] 

1145 consecutive_bonded_atoms = [] 

1146 for bonded_atom_index in bonded_atom_indices: 

1147 bonded_atom = self.atoms[bonded_atom_index] 

1148 bonded_residue_index = bonded_atom.residue_index 

1149 if bonded_residue_index == residue_index: 

1150 self_bonded_atoms.append(bonded_atom_index) 

1151 continue 

1152 if bonded_residue_index == consecutive_residue_index: 

1153 consecutive_bonded_atoms.append(bonded_atom_index) 

1154 continue 

1155 new_bonded_atoms = self_bonded_atoms + consecutive_bonded_atoms 

1156 atom_indexed_covalent_bonds[atom_index] = new_bonded_atoms 

1157 # Group the connected atoms in "fragments" 

1158 while len(atom_indexed_covalent_bonds) > 0: 

1159 start_atom_index, bonds = next(iter(atom_indexed_covalent_bonds.items())) 

1160 del atom_indexed_covalent_bonds[start_atom_index] 

1161 fragment_atom_indices = [ start_atom_index ] 

1162 while len(bonds) > 0: 

1163 # Get the next bond atom and remove it from the bonds list 

1164 next_atom_index = bonds[0] 

1165 bonds.remove(next_atom_index) 

1166 next_bonds = atom_indexed_covalent_bonds.get(next_atom_index, None) 

1167 # If this atom is out of the selection then skip it 

1168 if next_bonds == None: continue 

1169 next_new_bonds = [ bond for bond in next_bonds if bond not in fragment_atom_indices + bonds ] 

1170 bonds += next_new_bonds 

1171 fragment_atom_indices.append(next_atom_index) 

1172 del atom_indexed_covalent_bonds[next_atom_index] 

1173 yield Selection(fragment_atom_indices) 

1174 

1175 def find_whole_fragments (self, selection : 'Selection') -> Generator['Selection', None, None]: 

1176 """Given a selection of atoms, find all whole structure fragments on them.""" 

1177 for fragment in self.fragments: 

1178 if selection & fragment: 

1179 yield fragment 

1180 

1181 def name_selection (self, selection : 'Selection') -> str: 

1182 """Name an atom selection depending on the chains it contains. This is used for debug purpouses.""" 

1183 atoms = [ self.atoms[index] for index in selection.atom_indices ] 

1184 # Count atoms per chain 

1185 atom_count_per_chain = { chain: 0 for chain in self.chains } 

1186 for atom in atoms: 

1187 atom_count_per_chain[atom.chain] += 1 

1188 # Set labels accoridng to the coverage of every chain 

1189 chain_labels = [] 

1190 for chain, atom_count in atom_count_per_chain.items(): 

1191 if atom_count == 0: continue 

1192 coverage = atom_count / chain.atom_count 

1193 label = f'chain {chain.name}' 

1194 if coverage < 1: 

1195 percent = round(coverage * 1000) / 10 

1196 label += f' ({percent}%)' 

1197 chain_labels.append(label) 

1198 return ', '.join(chain_labels) 

1199 

1200 def set_new_atom (self, atom : 'Atom'): 

1201 """Set a new atom in the structure.""" 

1202 atom._structure = self 

1203 new_atom_index = self.atom_count 

1204 self.atoms.append(atom) 

1205 atom._index = new_atom_index 

1206 

1207 def set_new_residue (self, residue : 'Residue'): 

1208 """ 

1209 Set a new residue in the structure. 

1210 WARNING: Atoms must be set already before setting residues. 

1211 """ 

1212 residue._structure = self 

1213 new_residue_index = len(self.residues) 

1214 self.residues.append(residue) 

1215 residue._index = new_residue_index 

1216 # In case the residue has atom indices, set relational indices on each atom 

1217 for atom_index in residue.atom_indices: 

1218 atom = self.atoms[atom_index] 

1219 atom._residue_index = new_residue_index 

1220 

1221 def purge_residue (self, residue : 'Residue'): 

1222 """ 

1223 Purge residue from the structure and its chain. 

1224 This can be done only when the residue has no atoms left in the structure. 

1225 Renumerate all residue indices which have been offsetted as a result of the purge. 

1226 """ 

1227 # Check the residue can be purged 

1228 if residue not in self.residues: 

1229 raise ValueError(f'{residue} is not in the structure already') 

1230 if len(residue.atom_indices) > 0: 

1231 raise ValueError(f'{residue} is still having atoms and thus it cannot be purged') 

1232 # Remove the purged residue from its chain 

1233 if residue.chain: residue.chain.remove_residue(residue) 

1234 # Get the current index of the residue to be purged 

1235 purged_index = residue.index 

1236 # Residues and their atoms below this index are not to be modified 

1237 # Residues and their atoms over this index must be renumerated 

1238 for affected_residue in self.residues[purged_index+1:]: 

1239 # Chaging the index automatically changes all residues atoms '_residue_index' values 

1240 # Chaging the index automatically changes its corresponding index in residue chain '_residue_indices' 

1241 affected_residue.index -= 1 

1242 # Finally, remove the current residue from the list of residues in the structure 

1243 del self.residues[purged_index] 

1244 

1245 def set_new_chain (self, chain : 'Chain'): 

1246 """Set a new chain in the structure. 

1247 WARNING: Residues and atoms must be set already before setting chains.""" 

1248 chain._structure = self 

1249 new_chain_index = len(self.chains) 

1250 self.chains.append(chain) 

1251 chain._index = new_chain_index 

1252 # In case the chain has residue indices, set relational indices on each residue 

1253 for residue_index in chain.residue_indices: 

1254 residue = self.residues[residue_index] 

1255 residue._chain_index = new_chain_index 

1256 

1257 def purge_chain (self, chain : 'Chain'): 

1258 """Purge chain from the structure. 

1259 This can be done only when the chain has no residues left in the structure. 

1260 Renumerate all chain indices which have been offsetted as a result of the purge.""" 

1261 # Check the chain can be purged 

1262 if chain not in self.chains: 

1263 raise ValueError(f'Chain {chain.name} is not in the structure already') 

1264 if len(chain.residue_indices) > 0: 

1265 raise ValueError(f'Chain {chain.name} is still having residues and thus it cannot be purged') 

1266 # Get the current index of the chain to be purged 

1267 purged_index = chain.index 

1268 # Chains and their residues below this index are not to be modified 

1269 # Chains and their residues over this index must be renumerated 

1270 for affected_chain in self.chains[purged_index+1:]: 

1271 # Chaging the index automatically changes all chain residues '_chain_index' values 

1272 affected_chain.index -= 1 

1273 # Finally, remove the current chain from the list of chains in the structure 

1274 del self.chains[purged_index] 

1275 

1276 @classmethod 

1277 def from_pdb (cls, pdb_content : str, model : Optional[int] = None, flexible_numeration : bool = True): 

1278 """Set the structure from a pdb file. 

1279 You may filter the PDB content for a specific model. 

1280 Some weird numeration systems are not supported and, when encountered, they are ignored. 

1281 In these cases we set our own numeration system. 

1282 Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead.""" 

1283 # Filter the PDB content in case a model was passed 

1284 filtered_pdb_content = filter_model(pdb_content, model) if model else pdb_content 

1285 # Split the PDB content in lines 

1286 pdb_lines = filtered_pdb_content.split('\n') 

1287 # Before we start, we must guess the numeration system 

1288 # To do so mine all residue numbers 

1289 all_residue_number_characters = set() 

1290 for line in pdb_lines: 

1291 # Parse atoms only 

1292 start = line[0:6] 

1293 is_atom = start == 'ATOM ' or start == 'HETATM' 

1294 if not is_atom: continue 

1295 # Mine all residue numbers 

1296 residue_number = line[22:26] 

1297 for character in residue_number: 

1298 all_residue_number_characters.add(character) 

1299 # Remove white spaces 

1300 all_residue_number_characters.discard(' ') 

1301 # If we find a non-numerical and non-alphabetical character then we assume it has a weird numeration system 

1302 # Since we can not support every scenario, in this cases we set the numeration totally ignoring the one in the PDB 

1303 weird_character = next((character for character in all_residue_number_characters if not (character.isalnum() or character == '-')), None) 

1304 if weird_character: 

1305 if flexible_numeration == False: raise InputError(f'Not supported numeration system including "{weird_character}" characters') 

1306 warn(f'Weird residue numeration including "{weird_character}" characters found in the PDB file. Ignoring it.') 

1307 residue_numeration_base = None 

1308 # Search among all resiude numbers any letters (non-numerical characters) 

1309 elif next((letter for letter in alphanumerical_letters if letter in all_residue_number_characters), None): 

1310 residue_numeration_base = 36 

1311 elif next((letter for letter in hexadecimal_letters if letter in all_residue_number_characters), None): 

1312 residue_numeration_base = 16 

1313 else: 

1314 residue_numeration_base = 10 

1315 # Read the pdb content line by line and set the parsed atoms, residues and chains 

1316 parsed_atoms = [] 

1317 parsed_residues = [] 

1318 parsed_chains = [] 

1319 # Save chains and residues also in dictionaries only to find them faster 

1320 name_chains = {} 

1321 label_residues = {} 

1322 # Keep track of the last issued atom and residue indices 

1323 atom_index = -1 

1324 residue_index = -1 

1325 for line in pdb_lines: 

1326 # Parse atoms only 

1327 start = line[0:6] 

1328 is_atom = start == 'ATOM ' or start == 'HETATM' 

1329 if not is_atom: 

1330 continue 

1331 # Mine all atom data 

1332 atom_name = line[11:16].strip() 

1333 residue_name = line[17:21].strip() 

1334 chain_name = line[21:22] 

1335 residue_number = line[22:26] 

1336 icode = line[26:27] 

1337 if icode == ' ': 

1338 icode = '' 

1339 x_coord = float(line[30:38]) 

1340 y_coord = float(line[38:46]) 

1341 z_coord = float(line[46:54]) 

1342 element = line[76:78].strip() 

1343 # Set the parsed atom 

1344 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord)) 

1345 # Add the parsed atom to the list and update the current atom index 

1346 parsed_atoms.append(parsed_atom) 

1347 # Get the parsed chain 

1348 # DANI: If we always check for an already existing chain there will never be repeated chains 

1349 # DANI: However we may create chains with unconnected atoms silently (they were separated in the PDB) 

1350 # DANI: For this reason we must only consider the last processed chain 

1351 parsed_chain = name_chains.get(chain_name, None) 

1352 if parsed_chain and parsed_chain != parsed_chains[-1]: 

1353 parsed_chain = None 

1354 # If the parsed chain was not yet instantiated then do it now 

1355 if not parsed_chain: 

1356 parsed_chain = Chain(name=chain_name) 

1357 parsed_chains.append(parsed_chain) 

1358 name_chains[chain_name] = parsed_chain 

1359 # Get the parsed residue 

1360 residue_label = (chain_name, residue_number, icode) 

1361 # DANI: If we always check for an already existing residue there will never be repeated residues 

1362 # DANI: However we may create residues with unconnected atoms silently (they were separated in the PDB) 

1363 # DANI: For this reason we must only consider the last processed residue 

1364 parsed_residue = label_residues.get(residue_label, None) 

1365 if parsed_residue and parsed_residue != parsed_residues[-1]: 

1366 parsed_residue = None 

1367 # If the parsed residue was not yet instantiated then do it now 

1368 if not parsed_residue: 

1369 # Parse the residue number if it is to be parsed 

1370 if residue_numeration_base: 

1371 parsed_residue_number = int(residue_number, residue_numeration_base) 

1372 # If we decided to ignore the numeration system then we just issue a new residue number 

1373 # Use the last residue number from the current chain as reference 

1374 else: 

1375 chain_last_residue_index = parsed_chain.residue_indices[-1] if len(parsed_chain.residue_indices) > 0 else None 

1376 chain_last_residue_number = parsed_residues[chain_last_residue_index].number if chain_last_residue_index != None else 0 

1377 parsed_residue_number = chain_last_residue_number + 1 

1378 # Instantiate the new parsed residue 

1379 parsed_residue = Residue(name=residue_name, number=parsed_residue_number, icode=icode) 

1380 parsed_residues.append(parsed_residue) 

1381 label_residues[residue_label] = parsed_residue 

1382 # Add current residue to the parsed chain 

1383 residue_index += 1 

1384 parsed_chain.residue_indices.append(residue_index) 

1385 # Add current atom to the parsed residue 

1386 atom_index += 1 

1387 parsed_residue.atom_indices.append(atom_index) 

1388 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains, residue_numeration_base=residue_numeration_base) 

1389 

1390 @classmethod 

1391 def from_pdb_file (cls, pdb_filepath : str, model : Optional[int] = None, flexible_numeration : bool = True): 

1392 """Set the structure from a pdb file. 

1393 You may filter the input PDB file for a specific model. 

1394 Some weird numeration systems are not supported and, when encountered, they are ignored. 

1395 In these cases we set our own numeration system. 

1396 Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead.""" 

1397 pdb_file = File(pdb_filepath) 

1398 if not pdb_file.exists: 

1399 raise InputError(f'File "{pdb_filepath}" not found') 

1400 if not pdb_file.format == 'pdb': 

1401 raise InputError(f'"{pdb_filepath}" is not a path for a pdb file') 

1402 # Read the pdb file 

1403 pdb_content = None 

1404 with open(pdb_filepath, 'r') as file: 

1405 pdb_content = file.read() 

1406 return cls.from_pdb(pdb_content, model, flexible_numeration) 

1407 

1408 # https://biopandas.github.io/biopandas/tutorials/Working_with_mmCIF_Structures_in_DataFrames/ 

1409 @classmethod 

1410 def from_mmcif (cls, mmcif_content : str, model : Optional[int] = None, author_notation : bool = False): 

1411 """Set the structure from mmcif. 

1412 You may filter the content for a specific model. 

1413 You may ask for the author notation instead of the standarized notation for legacy reasons. 

1414 This may have an effect in atom names, residue names, residue numbers and chain names. 

1415 Read the pdb content line by line and set the parsed atoms, residues and chains. 

1416 """ 

1417 parsed_atoms = [] 

1418 parsed_residues = [] 

1419 parsed_chains = [] 

1420 # Save chains and residues also in dictionaries only to find them faster 

1421 name_chains = {} 

1422 label_residues = {} 

1423 # Keep track of the last issued atom and residue indices 

1424 atom_index = -1 

1425 residue_index = -1 

1426 # Iterate the content line by line 

1427 lines = iter(mmcif_content.split('\n')) 

1428 # Now mine atoms data 

1429 atom_headers = { 'ATOM', 'HETATM', 'ANISOU' } 

1430 for line in lines: 

1431 # Values are separated by spaces 

1432 values = line.split() 

1433 # If this is not atom line then we are done 

1434 if len(values) == 0 or values[0] not in atom_headers: continue 

1435 # Get the atom model number if a model number was passed 

1436 # Note that then model is a number greater than 0 

1437 if model: 

1438 model_number = int(values[20]) 

1439 # If the model number odes not match then skip it 

1440 if model != model_number: continue 

1441 # Mine atom data 

1442 # Next value is just the atom number, we do not need it 

1443 # atom_number = values[1] 

1444 # Mine the atom element 

1445 element = values[2] 

1446 # Mine the atom name 

1447 atom_name = values[3].replace('"','') 

1448 # Next value is a place holder to indicate alternate conformation, according to the docs 

1449 # I don't know what this is but we do not need it 

1450 # idk = values[4] 

1451 # Mine the residue name 

1452 residue_name = values[5] 

1453 # Mine the chain name 

1454 chain_name = values[6] 

1455 # Next value is the chain number, we do not need it 

1456 # chain_number = values[7] 

1457 # Residue number is '.' for chains with only one residue 

1458 residue_number = 1 if values[8] == '.' else int(values[8]) 

1459 icode = '' if values[9] == '?' else values[9] 

1460 x_coord = float(values[10]) 

1461 y_coord = float(values[11]) 

1462 z_coord = float(values[12]) 

1463 # Next value is the occupancy, we do not need it 

1464 # occupancy = float(values[13]) 

1465 # Next value is the isotropic displacement, we do not need it 

1466 # isotropic = float(values[14]) 

1467 # Next value is the charge, we do not need it 

1468 # charge = None if values[15] == '?' else float(values[15]) 

1469 # The rest of values are alternative author values 

1470 if author_notation: 

1471 residue_number = int(values[16]) 

1472 residue_name = values[17] 

1473 chain_name = values[18] 

1474 atom_name = values[19].replace('"','') 

1475 # Set the parsed atom 

1476 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord)) 

1477 # Add the parsed atom to the list and update the current atom index 

1478 parsed_atoms.append(parsed_atom) 

1479 # Get the parsed chain 

1480 parsed_chain = name_chains.get(chain_name, None) 

1481 # If the parsed chain was not yet instantiated then do it now 

1482 if not parsed_chain: 

1483 parsed_chain = Chain(name=chain_name) 

1484 parsed_chains.append(parsed_chain) 

1485 name_chains[chain_name] = parsed_chain 

1486 # Get the parsed residue 

1487 residue_label = (chain_name, residue_number, icode) 

1488 parsed_residue = label_residues.get(residue_label, None) 

1489 # If the parsed residue was not yet instantiated then do it now 

1490 if not parsed_residue: 

1491 # Instantiate the new parsed residue 

1492 parsed_residue = Residue(name=residue_name, number=residue_number, icode=icode) 

1493 parsed_residues.append(parsed_residue) 

1494 label_residues[residue_label] = parsed_residue 

1495 # Add current residue to the parsed chain 

1496 residue_index += 1 

1497 parsed_chain.residue_indices.append(residue_index) 

1498 # Add current atom to the parsed residue 

1499 atom_index += 1 

1500 parsed_residue.atom_indices.append(atom_index) 

1501 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains) 

1502 

1503 @classmethod 

1504 def from_mmcif_file (cls, mmcif_filepath : str, model : Optional[int] = None, author_notation : bool = False): 

1505 """Set the structure from a mmcif file.""" 

1506 mmcif_file = File(mmcif_filepath) 

1507 if not mmcif_file.exists: 

1508 raise InputError(f'File "{mmcif_filepath}" not found') 

1509 if not mmcif_file.format == 'cif': 

1510 raise InputError(f'"{mmcif_filepath}" is not a path for a mmcif file') 

1511 # Read the mmcif file 

1512 mmcif_content = None 

1513 with open(mmcif_filepath, 'r') as file: 

1514 mmcif_content = file.read() 

1515 return cls.from_mmcif(mmcif_content, model, author_notation) 

1516 

1517 @classmethod 

1518 def from_mdanalysis (cls, mdanalysis_universe): 

1519 """Set the structure from an MD analysis object.""" 

1520 # Set the final list of atoms to be included in the structure 

1521 parsed_atoms = [] 

1522 parsed_residues = [] 

1523 parsed_chains = [] 

1524 # Setup atoms 

1525 for atom in mdanalysis_universe.atoms: 

1526 name = atom.name 

1527 # WARNING: MDAnalysis cannot handle reading an element of an atom with no element 

1528 element = atom.element if hasattr(atom, 'element') else '?' 

1529 #coords = atom.position # DANI: Esto da error si no hay coordenadas 

1530 parsed_atom = Atom(name=name, element=element) 

1531 parsed_atoms.append(parsed_atom) 

1532 # Setup residues 

1533 for residue in mdanalysis_universe.residues: 

1534 name = residue.resname 

1535 number = residue.resnum 

1536 icode = residue.icode if hasattr(residue, 'icode') else None # DANI: No se ha provado 

1537 parsed_residue = Residue(name=name, number=number, icode=icode) 

1538 atom_indices = [ atom.index for atom in residue.atoms ] 

1539 parsed_residue.atom_indices = atom_indices 

1540 parsed_residues.append(parsed_residue) 

1541 # Setup chains 

1542 for segment in mdanalysis_universe.segments: 

1543 name = segment.segid 

1544 parsed_chain = Chain(name=name) 

1545 residue_indices = [ residue.resindex for residue in segment.residues ] 

1546 parsed_chain.residue_indices = residue_indices 

1547 parsed_chains.append(parsed_chain) 

1548 # Setup the structure 

1549 structure = cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains) 

1550 # Add bonds and charges which are also available in a topology 

1551 atom_count = len(mdanalysis_universe.atoms) 

1552 atom_bonds = [ [] for i in range(atom_count) ] 

1553 for bond in mdanalysis_universe.bonds: 

1554 a,b = bond.indices 

1555 # Make sure atom indices are regular integers so they are JSON serializables 

1556 atom_bonds[a].append(int(b)) 

1557 atom_bonds[b].append(int(a)) 

1558 structure.bonds = atom_bonds 

1559 structure.charges = list(mdanalysis_universe._topology.charges.values) 

1560 return structure 

1561 

1562 @classmethod 

1563 def from_prmtop_file (cls, prmtop_filepath : str): 

1564 """Set the structure from a prmtop file.""" 

1565 # Make sure the input file exists and has the right format 

1566 prmtop_file = File(prmtop_filepath) 

1567 if not prmtop_file.exists: 

1568 raise InputError(f'File "{prmtop_filepath}" not found') 

1569 if not prmtop_file.format == 'prmtop': 

1570 raise InputError(f'"{prmtop_filepath}" is not a name for a prmtop file') 

1571 # In we do not have mdanalysis in our environment then we cannot proceed 

1572 if not is_imported('MDAnalysis'): 

1573 raise InputError('Missing dependency error: MDAnalysis') 

1574 # Parse the topology using MDanalysis 

1575 parser = MDAnalysis.topology.TOPParser.TOPParser(prmtop_filepath) 

1576 mdanalysis_topology = parser.parse() 

1577 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology) 

1578 return cls.from_mdanalysis(mdanalysis_universe) 

1579 

1580 @classmethod 

1581 def from_tpr_file (cls, tpr_filepath : str): 

1582 """Set the structure from a tpr file.""" 

1583 # Make sure the input file exists and has the right format 

1584 tpr_file = File(tpr_filepath) 

1585 if not tpr_file.exists: 

1586 raise InputError(f'File "{tpr_filepath}" not found') 

1587 if not tpr_file.format == 'tpr': 

1588 raise InputError(f'"{tpr_filepath}" is not a name for a tpr file') 

1589 # In we do not have mdanalysis in our environment then we cannot proceed 

1590 if not is_imported('MDAnalysis'): 

1591 raise InputError('Missing dependency error: MDAnalysis') 

1592 # Parse the topology using MDanalysis 

1593 parser = MDAnalysis.topology.TPRParser.TPRParser(tpr_filepath) 

1594 mdanalysis_topology = parser.parse() 

1595 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology) 

1596 return cls.from_mdanalysis(mdanalysis_universe) 

1597 

1598 @classmethod 

1599 def from_file (cls, mysterious_filepath : str): 

1600 """Set the structure from a file if the file format is supported.""" 

1601 mysterious_file = File(mysterious_filepath) 

1602 if mysterious_file.format == 'pdb': 

1603 return cls.from_pdb_file(mysterious_file.path) 

1604 if mysterious_file.format == 'cif': 

1605 return cls.from_mmcif_file(mysterious_file.path) 

1606 if mysterious_file.format == 'prmtop': 

1607 return cls.from_prmtop_file(mysterious_file.path) 

1608 if mysterious_file.format == 'tpr': 

1609 return cls.from_tpr_file(mysterious_file.path) 

1610 raise InputError(f'Not supported format ({mysterious_file.format}) to setup a Structure') 

1611 

1612 def get_atom_count (self) -> int: 

1613 """Get the number of atoms in the structure.""" 

1614 return len(self.atoms) 

1615 atom_count = property(get_atom_count, None, None, "The number of atoms in the structure (read only)") 

1616 

1617 def get_residue_count (self) -> int: 

1618 """Get the number of residues in the structure (read only).""" 

1619 return len(self.residues) 

1620 residue_count = property(get_residue_count, None, None, "Number of residues in the structure (read only)") 

1621 

1622 def get_chain_count (self) -> int: 

1623 """Get the number of chains in the structure.""" 

1624 return len(self.chains) 

1625 chain_count = property(get_chain_count, None, None, "Number of chains in the structure (read only)") 

1626 

1627 def fix_atom_elements (self, trust : bool = True) -> bool: 

1628 """Fix atom elements by guessing them when missing. 

1629 Set all elements with the first letter upper and the second (if any) lower. 

1630 Also check if atom elements are coherent with atom names. 

1631 

1632 Args: 

1633 trust (bool): If 'trust' is set as False then we impose elements according to what we can guess from the atom name. 

1634 

1635 Returns: 

1636 bool: 

1637 Return True if any element was modified or False if not. 

1638 """ 

1639 modified = False 

1640 added = False 

1641 # Save the wrong guesses for a final report 

1642 # This way we do not crowd the terminal with logs when a lot of atoms are affected 

1643 reports = {} 

1644 for atom in self.atoms: 

1645 # Make sure elements have the first letter cap and the second letter not cap 

1646 if atom.element: 

1647 new_element = first_cap_only(atom.element) 

1648 if atom.element != new_element: 

1649 atom.element = new_element 

1650 modified = True 

1651 # Check the element to match what we would guess from the atom name 

1652 # In case it does not just warn the user 

1653 guess = atom.guess_element() 

1654 if atom.element != guess: 

1655 report = (atom.name, atom.element, guess) 

1656 report_count = reports.get(report, 0) 

1657 reports[report] = report_count + 1 

1658 if not trust: 

1659 atom.element = guess 

1660 modified = True 

1661 # If elements are missing then guess them from atom names 

1662 else: 

1663 atom.element = atom.guess_element() 

1664 added = True 

1665 # Warn the user about anormalies 

1666 for report, count in reports.items(): 

1667 atom_name, atom_element, guess = report 

1668 warn(f"Suspicious element for atom {atom_name}: {atom_element} -> shoudn't it be {guess}? ({count} occurrences)") 

1669 # Warn the user that some elements were modified 

1670 if modified: warn('Atom elements have been modified') 

1671 if added: warn('Atom elements were missing and have been added') 

1672 # Set atom elements as fixed in order to avoid repeating this process 

1673 self._fixed_atom_elements = True 

1674 return modified or added 

1675 

1676 def set_new_coordinates (self, new_coordinates : list[Coords]): 

1677 """Set new coordinates.""" 

1678 # Make sure the list of coordinates is as long as the number of atoms 

1679 if len(new_coordinates) != self.atom_count: 

1680 raise ValueError(f'The number of coordinates ({len(new_coordinates)}) does not match the number of atoms ({self.atom_count})') 

1681 # Overwrite current coordinates with the new coordinates 

1682 for i, atom in enumerate(self.atoms): 

1683 atom.coords = tuple(new_coordinates[i]) 

1684 

1685 def get_ion_atom_indices (self) -> set: 

1686 """Get all supported ion atom indices together in a set.""" 

1687 # If we already did this then return the stored value 

1688 if self._ion_atom_indices != None: 

1689 return self._ion_atom_indices 

1690 # Find ion atom indices 

1691 indices = set() 

1692 for atom in self.atoms: 

1693 if atom.element in SUPPORTED_ION_ELEMENTS: 

1694 indices.add(atom.index) 

1695 self._ion_atom_indices = indices 

1696 return self._ion_atom_indices 

1697 ion_atom_indices = property(get_ion_atom_indices, None, None, "Atom indices for what we consider supported ions") 

1698 

1699 def get_dummy_atom_indices (self) -> set: 

1700 """Get all dummy atom indices together in a set.""" 

1701 # If we already did this then return the stored value 

1702 if self._dummy_atom_indices != None: 

1703 return self._dummy_atom_indices 

1704 # Find ion atom indices 

1705 indices = set() 

1706 for atom in self.atoms: 

1707 if atom.element == DUMMY_ATOM_ELEMENT: 

1708 indices.add(atom.index) 

1709 self._dummy_atom_indices = indices 

1710 return self._dummy_atom_indices 

1711 dummy_atom_indices = property(get_dummy_atom_indices, None, None, "Atom indices for what we consider dummy atoms") 

1712 

1713 def generate_pdb (self): 

1714 """Generate a pdb file content with the current structure.""" 

1715 content = 'REMARK workflow generated pdb file\n' 

1716 for a, atom in enumerate(self.atoms): 

1717 residue = atom.residue 

1718 index = str((a+1) % 100000).rjust(5) 

1719 name = ' ' + atom.name.ljust(3) if len(atom.name) < 4 else atom.name 

1720 residue_name = residue.name.ljust(4) if residue else 'XXX'.ljust(4) 

1721 chain = atom.chain 

1722 chain_name = chain.name if chain.name and len(chain.name) == 1 else 'X' 

1723 residue_number = str(residue.number).rjust(4) if residue else '0'.rjust(4) 

1724 # If residue number is longer than 4 characters then we must parse to hexadecimal 

1725 if len(residue_number) > 4: 

1726 residue_number = hex(residue.number)[2:].rjust(4) 

1727 icode = residue.icode if residue.icode and len(residue.icode) else ' ' 

1728 # Make sure we have atom coordinates 

1729 if atom.coords == None: 

1730 raise InputError('Trying to write a PDB file from a structure with atoms without coordinates') 

1731 x_coord, y_coord, z_coord = [ "{:.3f}".format(coord).rjust(8) for coord in atom.coords ] 

1732 occupancy = '1.00' # Just a placeholder 

1733 temp_factor = '0.00' # Just a placeholder 

1734 element = atom.element.rjust(2) 

1735 atom_line = ('ATOM ' + index + ' ' + name + ' ' + residue_name 

1736 + chain_name + residue_number + icode + ' ' + x_coord + y_coord + z_coord 

1737 + ' ' + occupancy + ' ' + temp_factor + ' ' + element).ljust(80) + '\n' 

1738 content += atom_line 

1739 return content 

1740 

1741 def generate_pdb_file (self, pdb_filepath : str): 

1742 """Generate a pdb file with current structure.""" 

1743 pdb_content = self.generate_pdb() 

1744 with open(pdb_filepath, "w") as file: 

1745 file.write(pdb_content) 

1746 

1747 def get_pytraj_topology (self): 

1748 """Get the structure equivalent pytraj topology.""" 

1749 # In we do not have pytraj in our environment then we cannot proceed 

1750 if not is_imported('pytraj'): 

1751 raise InputError('Missing dependency error: pytraj') 

1752 # Generate a pdb file from the current structure to feed pytraj 

1753 pdb_filepath = '.structure.pdb' 

1754 self.generate_pdb_file(pdb_filepath) 

1755 pytraj_topology = pytraj.load_topology(filename = pdb_filepath) 

1756 os.remove(pdb_filepath) 

1757 return pytraj_topology 

1758 

1759 SUPPORTED_SELECTION_SYNTAXES = { 'vmd', 'pytraj', 'gmx' } 

1760 def select (self, selection_string : str, syntax : str = 'vmd') -> Optional['Selection']: 

1761 """Select atoms from the structure thus generating an atom indices list. 

1762 Different tools may be used to make the selection: 

1763 - vmd (default) 

1764 - pytraj""" 

1765 if syntax == 'vmd': 

1766 # Generate a pdb for vmd to read it 

1767 auxiliar_pdb_filepath = '.structure.pdb' 

1768 self.generate_pdb_file(auxiliar_pdb_filepath) 

1769 # Use vmd to find atom indices 

1770 atom_indices = get_vmd_selection_atom_indices(auxiliar_pdb_filepath, selection_string) 

1771 os.remove(auxiliar_pdb_filepath) 

1772 if len(atom_indices) == 0: 

1773 return Selection() 

1774 return Selection(atom_indices) 

1775 if syntax == 'pytraj': 

1776 # In we do not have pytraj in our environment then we cannot proceed 

1777 if not is_imported('pytraj'): 

1778 raise InputError('Missing dependency error: pytraj') 

1779 pytraj_topology = self.get_pytraj_topology() 

1780 pytraj_selection = pytraj_topology[selection_string] 

1781 atom_indices = [ atom.index for atom in pytraj_selection.atoms ] 

1782 if len(atom_indices) == 0: 

1783 return Selection() 

1784 return Selection(atom_indices) 

1785 if syntax == 'gmx': 

1786 # Generate a pdb strucutre to feed gmx 

1787 auxiliar_pdb_filepath = '.structure.pdb' 

1788 self.generate_pdb_file(auxiliar_pdb_filepath) 

1789 auxiliar_pdb_file = File(auxiliar_pdb_filepath) 

1790 # Create the index file with the current atom selection 

1791 index_filepath = f'.structure.ndx' 

1792 index_file = File(index_filepath) 

1793 selection_name, selection_exists = make_index(auxiliar_pdb_file, index_file, selection_string) 

1794 if not selection_exists: 

1795 available_selections = ', '.join(list(index_groups.keys())) 

1796 raise InputError(f'Something was wrong with the selection. Available gromacs selections: {available_selections}') 

1797 # Read the index file to capture the selection of atoms 

1798 index_groups = read_ndx(index_file) 

1799 indices = index_groups.get(selection_name, None) 

1800 if indices == None: raise RuntimeError('Atom group must exist at this point') 

1801 auxiliar_pdb_file.remove() 

1802 index_file.remove() 

1803 # Return the parsed selection 

1804 return Selection(indices) 

1805 

1806 options = ', '.join(self.SUPPORTED_SELECTION_SYNTAXES) 

1807 raise InputError(f'Syntax "{syntax}" is not supported. Choose one of the following: {options}') 

1808 

1809 def select_atom_indices (self, atom_indices : list[int]) -> 'Selection': 

1810 """Set a function to make selections using atom indices.""" 

1811 # Check atom indices to be in the structure 

1812 atom_count = self.atom_count 

1813 for atom_index in atom_indices: 

1814 if atom_index >= atom_count: 

1815 raise InputError(f'Atom index {atom_index} is out of range ({atom_count})') 

1816 return Selection(atom_indices) 

1817 

1818 def select_residue_indices (self, residue_indices : list[int]) -> 'Selection': 

1819 """Set a function to make selections using residue indices.""" 

1820 # WARNING: The following line gets stucked sometimes, idk why 

1821 #atom_indices = sum([ self.residues[index].atom_indices for index in residue_indices ], []) 

1822 atom_indices = [] 

1823 for i, index in enumerate(residue_indices): 

1824 atom_indices += self.residues[index].atom_indices 

1825 return Selection(atom_indices) 

1826 

1827 def select_all (self) -> 'Selection': 

1828 """Get a selection with all atoms.""" 

1829 return Selection(list(range(self.atom_count))) 

1830 

1831 def select_by_classification (self, classification : str) -> 'Selection': 

1832 """Select atoms according to the classification of its residue.""" 

1833 atom_indices = [] 

1834 for residue in self.residues: 

1835 if residue.classification == classification: 

1836 atom_indices += residue.atom_indices 

1837 return Selection(atom_indices) 

1838 

1839 def select_water (self) -> 'Selection': 

1840 """Select water atoms. 

1841 WARNING: This logic is a bit guessy and it may fail for non-standard residue named structures 

1842 """ 

1843 return self.select_by_classification('solvent') 

1844 

1845 def select_ions (self) -> 'Selection': 

1846 """Select ions.""" 

1847 return self.select_by_classification('ion') 

1848 

1849 def select_counter_ions (self, charge : Optional[str] = None) -> 'Selection': 

1850 """Select counter ion atoms. 

1851 WARNING: This logic is a bit guessy and it may fail for non-standard atom named structures 

1852 """ 

1853 counter_ion_indices = [] 

1854 # Set the accepted names accoridng to the charge 

1855 if charge == None: 

1856 accepted_names = STANDARD_COUNTER_ION_ATOM_NAMES 

1857 elif charge == '+': 

1858 accepted_names = STANDARD_COUNTER_CATION_ATOM_NAMES 

1859 elif charge == '-': 

1860 accepted_names = STANDARD_COUNTER_ANION_ATOM_NAMES 

1861 else: 

1862 raise ValueError('Not supported charge') 

1863 # Iterate atoms 

1864 for atom in self.atoms: 

1865 # If the residue has not one single atom then it is not an ion 

1866 if len(atom.residue.atoms) != 1: continue 

1867 # Get a simplified version of the atom name 

1868 # Set all letters upper and remove non-letter characters (e.g. '+' and '-' symbols) 

1869 simple_atom_name = ''.join(filter(str.isalpha, atom.name.upper())) 

1870 if simple_atom_name in accepted_names: 

1871 counter_ion_indices.append(atom.index) 

1872 return Selection(counter_ion_indices) 

1873 

1874 def select_water_and_counter_ions (self) -> 'Selection': 

1875 """Select both water and counter ions.""" 

1876 return self.select_water() + self.select_counter_ions() 

1877 

1878 def select_heavy_atoms (self) -> 'Selection': 

1879 """Select heavy atoms.""" 

1880 atom_indices = [] 

1881 for atom in self.atoms: 

1882 # If the atom is not an hydrogen then add it to the list 

1883 if atom.element != 'H': 

1884 atom_indices.append(atom.index) 

1885 return Selection(atom_indices) 

1886 

1887 def select_protein (self) -> 'Selection': 

1888 """Select protein atoms. 

1889 WARNING: Note that there is a small difference between VMD protein and our protein. 

1890 This function is improved to consider terminal residues as proteins. 

1891 VMD considers protein any residue including N, C, CA and O while terminals may have OC1 and OC2 instead of O. 

1892 """ 

1893 return self.select_by_classification('protein') 

1894 

1895 def select_nucleic (self) -> 'Selection': 

1896 """Select nucleic atoms.""" 

1897 return self.select_by_classification('dna') + self.select_by_classification('rna') 

1898 

1899 def select_lipids (self) -> 'Selection': 

1900 """Select lipids.""" 

1901 return self.select_by_classification('fatty') + self.select_by_classification('steroid') 

1902 

1903 def select_carbohydrates (self) -> 'Selection': 

1904 """Select carbohydrates.""" 

1905 return self.select_by_classification('carbohydrate') 

1906 

1907 def select_pbc_guess (self) -> 'Selection': 

1908 """Return a selection of the typical PBC atoms: solvent, counter ions and lipids. 

1909 WARNING: This is just a guess.""" 

1910 return self.select_water() + self.select_counter_ions() + self.select_lipids() 

1911 

1912 def select_cg (self) -> 'Selection': 

1913 """Select coarse grain atoms.""" 

1914 return Selection([ atom.index for atom in self.atoms if atom.element == CG_ATOM_ELEMENT ]) 

1915 

1916 def select_missing_bonds (self) -> 'Selection': 

1917 return Selection([ index for index, bonds in enumerate(self.bonds) if bonds == MISSING_BONDS ]) 

1918 

1919 def select_cartoon (self, include_terminals : bool = False) -> 'Selection': 

1920 """Select cartoon representable regions for VMD. 

1921 

1922 Rules are: 

1923 1. Residues must be protein (i.e. must contain C, CA, N and O atoms) or nucleic (P, OP1, OP2, O3', C3', C4', C5', O5') 

1924 2. There must be at least 3 covalently bonded residues 

1925 

1926 It does not matter their chain, numeration or even index order as long as they are bonded. 

1927 * Note that we can represent cartoon while we display one residue alone, but it must be connected anyway. 

1928 Also, we have the option to include terminals in the cartoon selection although they are not representable. 

1929 This is helpful for the screenshot: terminals are better hidden than represented as ligands.""" 

1930 # Set fragments which are candidate to be cartoon representable 

1931 fragments = [] 

1932 # Get protein fragments according to VMD 

1933 protein_selection = self.select_protein() - self.select_cg() if include_terminals else self.select('protein', syntax='vmd') 

1934 if protein_selection: 

1935 fragments += list(self.find_fragments(protein_selection)) 

1936 # Get nucleic fragments according to VMD 

1937 nucleic_selection = self.select_nucleic() - self.select_cg() 

1938 if nucleic_selection: 

1939 fragments += list(self.find_fragments(nucleic_selection)) 

1940 # Set the final selection including all valid fragments 

1941 cartoon_selection = Selection() 

1942 # Iterate over every fragment 

1943 for fragment in fragments: 

1944 # Make sure the fragment has at least 3 residues before adding it to the cartoon selection 

1945 fragment_residue_indices = self.get_selection_residue_indices(fragment) 

1946 if len(fragment_residue_indices) >= 3: 

1947 cartoon_selection += fragment 

1948 return cartoon_selection 

1949 

1950 def invert_selection (self, selection : 'Selection') -> 'Selection': 

1951 """Invert a selection.""" 

1952 atom_indices = list(range(self.atom_count)) 

1953 for atom_index in selection.atom_indices: 

1954 atom_indices[atom_index] = None 

1955 return Selection([ atom_index for atom_index in atom_indices if atom_index != None ]) 

1956 

1957 def get_selection_residue_indices (self, selection : 'Selection') -> list[int]: 

1958 """Given a selection, get a list of residue indices for residues implicated. 

1959 Note that if a single atom from the residue is in the selection then the residue index is returned.""" 

1960 return list(set([ self.atoms[atom_index].residue_index for atom_index in selection.atom_indices ])) 

1961 

1962 def get_selection_residues (self, selection : 'Selection') -> list['Residue']: 

1963 """Given a selection, get a list of residues implicated. 

1964 Note that if a single atom from the residue is in the selection then the residue is returned.""" 

1965 residue_indices = self.get_selection_residue_indices(selection) 

1966 return [ self.residues[index] for index in residue_indices ] 

1967 

1968 def get_selection_chain_indices (self, selection : 'Selection') -> list[int]: 

1969 """Given a selection, get a list of chain indices for chains implicated. 

1970 Note that if a single atom from the chain is in the selection then the chain index is returned.""" 

1971 return list(set([ self.atoms[atom_index].chain_index for atom_index in selection.atom_indices ])) 

1972 

1973 def get_selection_chains (self, selection : 'Selection') -> list['Chain']: 

1974 """Given a selection, get a list of chains implicated. 

1975 Note that if a single atom from the chain is in the selection then the chain is returned.""" 

1976 chain_indices = self.get_selection_chain_indices(selection) 

1977 return [ self.chains[index] for index in chain_indices ] 

1978 

1979 def get_selection_classification (self, selection : 'Selection') -> str: 

1980 """Get type of the chain.""" 

1981 # Get selection residues 

1982 selection_residue_indices = self.get_selection_residue_indices(selection) 

1983 

1984 # Inicializar contadores para cada tipo de residuo 

1985 residue_counts = {} 

1986 

1987 # Count the residues of each type 

1988 for residue_index in selection_residue_indices: 

1989 residue = self.residues[residue_index] 

1990 res_class = residue.classification 

1991 if res_class in residue_counts: 

1992 residue_counts[res_class] += 1 

1993 else: 

1994 # If the classification is not recognized, count it as "other" 

1995 residue_counts[res_class] = 1 

1996 

1997 # Count the total number of residues in the selection 

1998 total_residues = sum(residue_counts.values()) 

1999 if total_residues == 0: raise ValueError('Should have residues at this point') 

2000 

2001 # Calculate the proportion of each type of residue 

2002 proportions = { k: v / total_residues for k, v in residue_counts.items() } 

2003 

2004 # If one type of residue dominates, return it 

2005 primary_type = max(proportions, key=proportions.get) 

2006 # We establish a threshold of 80% to consider a chain as a single type 

2007 if proportions[primary_type] >= 0.8: 

2008 return primary_type 

2009 

2010 # Special cases 

2011 relevant_threshold = 0.3 

2012 if proportions.get("dna", 0) > relevant_threshold and proportions.get("rna", 0) > relevant_threshold: 

2013 return "nucleic" 

2014 if proportions.get("carbohydrate", 0) > relevant_threshold and proportions.get("protein", 0) > relevant_threshold: 

2015 return "glycoprotein" 

2016 if proportions.get("fatty", 0) > relevant_threshold and proportions.get("steroid", 0) > relevant_threshold: 

2017 return "lipid" 

2018 

2019 # Any other combinations of different main proportions 

2020 main_proportions = { k: v for k, v in proportions.items() if v > relevant_threshold } 

2021 # Nothing is above the threshold 

2022 if len(main_proportions) == 0: 

2023 return "mix" 

2024 # There is only one thing above threshold 

2025 elif len(main_proportions) == 1: 

2026 return f"{primary_type}/mix" 

2027 # There are two things above threshold 

2028 elif len(main_proportions) == 2: 

2029 other_type = next(key for key in main_proportions.keys() if key != primary_type) 

2030 return f"{primary_type}/{other_type}" 

2031 # There are three things above the threshold (more is not possible) 

2032 else: 

2033 return "mix" 

2034 

2035 def filter (self, selection : Union['Selection', str], selection_syntax : str = 'vmd') -> 'Structure': 

2036 """Create a new structure from the current using a selection to filter atoms.""" 

2037 if not selection: raise RuntimeError('No selection was passed') 

2038 # In case the selection is not an actual Selection, but a string, parse the string into a Selection 

2039 if type(selection) == str: 

2040 selection = self.select(selection, selection_syntax) 

2041 new_atoms = [] 

2042 new_residues = [] 

2043 new_chains = [] 

2044 # Get the selected atoms 

2045 # Generate dictionaries with new indexes as keys and previous indexes as values for atoms, residues and chains 

2046 # This is done with this structure for the residues and chains further to find the new indices fast 

2047 new_atom_indices = {} 

2048 # Collect also original indices to related atom residues and chains 

2049 original_atom_residue_indices = [] 

2050 for new_index, original_index in enumerate(selection.atom_indices): 

2051 # Make a copy of the selected atom in order to not modify the original one 

2052 original_atom = self.atoms[original_index] 

2053 new_atom = Atom( 

2054 name=original_atom.name, 

2055 element=original_atom.element, 

2056 coords=original_atom.coords, 

2057 ) 

2058 new_atoms.append(new_atom) 

2059 # Save old and new indices in order to reconfigure all indices later 

2060 new_atom_indices[original_index] = new_index 

2061 original_atom_residue_indices.append(original_atom.residue_index) 

2062 # Find the selected residues 

2063 selected_residue_indices = list(set(original_atom_residue_indices)) 

2064 # Repeat the original/new indices backup we did before 

2065 new_residue_indices = {} 

2066 original_residue_atom_indices = [] 

2067 original_residue_chain_indices = [] 

2068 for new_index, original_index in enumerate(selected_residue_indices): 

2069 # Make a copy of the selected residue in order to not modify the original one 

2070 original_residue = self.residues[original_index] 

2071 new_residue = Residue( 

2072 name=original_residue.name, 

2073 number=original_residue.number, 

2074 icode=original_residue.icode, 

2075 ) 

2076 new_residues.append(new_residue) 

2077 # Save old and new indices in order to reconfigure all indices later 

2078 new_residue_indices[original_index] = new_index 

2079 original_residue_atom_indices.append(original_residue.atom_indices) 

2080 original_residue_chain_indices.append(original_residue.chain_index) 

2081 # Find the selected chains 

2082 selected_chain_indices = list(set(original_residue_chain_indices)) 

2083 # Repeat the original/new indices backup we did before 

2084 new_chain_indices = {} 

2085 original_chain_residue_indices = [] 

2086 for new_index, original_index in enumerate(selected_chain_indices): 

2087 # Make a copy of the selected chain in order to not modify the original one 

2088 original_chain = self.chains[original_index] 

2089 new_chain = Chain( 

2090 name=original_chain.name, 

2091 ) 

2092 new_chains.append(new_chain) 

2093 # Save old and new indices in order to reconfigure all indices later 

2094 new_chain_indices[original_index] = new_index 

2095 original_chain_residue_indices.append(original_chain.residue_indices) 

2096 # Finally, reset indices in all instances 

2097 for a, atom in enumerate(new_atoms): 

2098 atom.residue_index = new_residue_indices[ original_atom_residue_indices[a] ] 

2099 for r, residue in enumerate(new_residues): 

2100 atom_indices = [] 

2101 for original_index in original_residue_atom_indices[r]: 

2102 new_index = new_atom_indices.get(original_index, None) 

2103 if new_index != None: 

2104 atom_indices.append(new_index) 

2105 residue.atom_indices = atom_indices 

2106 residue.chain_index = new_chain_indices[ original_residue_chain_indices[r] ] 

2107 for c, chain in enumerate(new_chains): 

2108 residue_indices = [] 

2109 for original_index in original_chain_residue_indices[c]: 

2110 new_index = new_residue_indices.get(original_index, None) 

2111 if new_index != None: 

2112 residue_indices.append(new_index) 

2113 chain.residue_indices = residue_indices 

2114 return Structure(atoms=new_atoms, residues=new_residues, chains=new_chains) 

2115 

2116 def chainer (self, 

2117 selection : Optional['Selection'] = None, 

2118 letter : Optional[str] = None, 

2119 whole_fragments : bool = False): 

2120 """Set chains on demand. 

2121 If no selection is passed then the whole structure will be affected. 

2122 If no chain is passed then a "chain by fragment" logic will be applied.""" 

2123 # If there is no selection we consider all atoms 

2124 if selection == None: 

2125 selection = self.select_all() 

2126 # If the selection is empty then there is nothing to do here 

2127 if len(selection) == 0: return 

2128 # If a letter is specified then the logic is way simpler 

2129 if letter and not whole_fragments: 

2130 self.set_selection_chain_name(selection, letter) 

2131 return 

2132 # If a letter is not specified we run the "fragments" logic 

2133 fragment_getter = self.find_whole_fragments if whole_fragments else self.find_fragments 

2134 for fragment in fragment_getter(selection): 

2135 chain_name = letter if letter else self.get_available_chain_name() 

2136 self.set_selection_chain_name(fragment, chain_name) 

2137 

2138 def auto_chainer (self, verbose : bool = False): 

2139 """Smart function to set chains automatically. 

2140 Original chains will be overwritten.""" 

2141 if verbose: print('Running auto-chainer') 

2142 # Set all chains to X 

2143 self.chainer(letter='X') 

2144 # Set solvent and ions as a unique chain 

2145 ion_selection = self.select_ions() 

2146 if verbose: print(f' Ion atoms: {len(ion_selection)}') 

2147 solvent_selection = self.select_water() 

2148 if verbose: print(f' Solvent atoms: {len(solvent_selection)}') 

2149 ion_and_indices_selection = ion_selection + solvent_selection 

2150 self.chainer(selection=ion_and_indices_selection, letter='S') 

2151 # Set fatty acids and steroids as a unique chain 

2152 # RUBEN: con whole_fragments algunos carbohidratos se sobreescriben como glucolípidos 

2153 # DANI: Se podrían descartarían residuos que no pertenezcan a la membrana por proximidad 

2154 membrane_selection = self.select_lipids() 

2155 if verbose: print(f' Membrane atoms: {len(membrane_selection)}') 

2156 self.chainer(selection=membrane_selection, letter='M', whole_fragments=True) 

2157 # Set carbohydrates as a unique chain as well, just in case we have glycans 

2158 # Note that in case glycan atoms are mixed with protein atoms glycan chains will be overwritten 

2159 # However this is not a problem. It is indeed the best solution if we don't want ro resort atoms 

2160 carbohydrate_selection = self.select_carbohydrates() 

2161 if verbose: print(f' Carbohydrate atoms: {len(carbohydrate_selection)}') 

2162 self.chainer(selection=carbohydrate_selection, letter='H') 

2163 # Add a chain per fragment for both proteins and nucleic acids 

2164 protein_selection = self.select_protein() 

2165 if verbose: print(f' Protein atoms: {len(protein_selection)}') 

2166 nucleic_selection = self.select_nucleic() 

2167 if verbose: print(f' Nucleic acid atoms: {len(nucleic_selection)}') 

2168 protein_and_nucleic_selection = protein_selection + nucleic_selection 

2169 self.chainer(selection=protein_and_nucleic_selection) 

2170 # At this point we should have covered most of the molecules in the structure 

2171 # However, in case there are more molecules, we have already set them all as a single chain ('X') 

2172 # Here we do not apply the chain per fragment logic since it may be dangerous 

2173 # Note that we may have a lot of independent residues (and thus a los of small fragments) 

2174 # This would make us run out of letters in the alphabet and thus there would be no more chains 

2175 # As the last step, fix repeated chains 

2176 # RUBEN: sacar esta parte ya que se hace luego en el structure_corrector? 

2177 self.check_repeated_chains(fix_chains=True) 

2178 

2179 def raw_protein_chainer (self): 

2180 """This is an alternative system to find protein chains (anything else is chained as 'X'). 

2181 This system does not depend on VMD. 

2182 It totally overrides previous chains since it is expected to be used only when chains are missing.""" 

2183 current_chain = self.get_available_chain_name() 

2184 previous_alpha_carbon = None 

2185 for residue in self.residues: 

2186 alpha_carbon = next((atom for atom in residue.atoms if atom.name == 'CA'), None) 

2187 if not alpha_carbon: 

2188 residue.set_chain('X') 

2189 continue 

2190 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms 

2191 residues_are_connected = previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4 

2192 if not residues_are_connected: 

2193 current_chain = self.get_available_chain_name() 

2194 residue.set_chain(current_chain) 

2195 previous_alpha_carbon = alpha_carbon 

2196 

2197 def set_selection_chain_name (self, selection : 'Selection', letter : str): 

2198 """ 

2199 Given an atom selection, set the chain for all these atoms. 

2200 Note that the chain is changed in every whole residue, no 

2201 matter if only one atom was selected. 

2202 """ 

2203 # Find if the letter belongs to an already existing chain 

2204 chain = self.get_chain_by_name(letter) 

2205 if not chain: 

2206 chain = Chain(name=letter) 

2207 self.set_new_chain(chain) 

2208 # Get the selection residue indices 

2209 selection_residue_indices = self.get_selection_residue_indices(selection) 

2210 # Set the chain index of every residue to the new chain 

2211 # WARNING: 

2212 # This may have to be done atom by atom but we forgot why so we change 

2213 # it so see what was the problem. Doing it atom by atom is not efficient 

2214 # and was causing a bug where residues with same number could change their name 

2215 # to that of the first residue found with that number. 

2216 for residue_index in selection_residue_indices: 

2217 # WARNING: Chain index has to be read every iteration since it may change. Do not save it 

2218 residue = self.residues[residue_index] 

2219 residue.set_chain_index(chain.index) 

2220 

2221 def check_available_chains(self): 

2222 """Check if there are more chains than available letters.""" 

2223 n_chains = len(self.chains) 

2224 n_available_letters = len(AVAILABLE_LETTERS) 

2225 if n_chains <= n_available_letters: return 

2226 raise InputError(f'There are {n_chains} chains in the structure so far. If this is not expected then there may be a problem.\n' + 

2227 f' If this is expected then unfortunatelly there is a limit of {n_available_letters} available chain letters.\n' + 

2228 ' Please manually set the chains from scratch or merge together some chains to reduce the number.\n' + 

2229 ' Customize a PDB file using the command "mwf chainer". Then use the customized PDB as input structure (-stru).') 

2230 

2231 def get_available_chain_name (self) -> str: 

2232 """Get an available chain name. 

2233 Find alphabetically the first letter which is not yet used as a chain name. 

2234 If all letters in the alphabet are used already then raise an error.""" 

2235 self.check_available_chains() 

2236 current_chain_names = [ chain.name for chain in self.chains ] 

2237 next_available_chain_name = next((name for name in AVAILABLE_LETTERS if name not in current_chain_names), None) 

2238 return next_available_chain_name 

2239 

2240 def get_next_available_chain_name (self, anterior : str) -> str: 

2241 """ 

2242 Get the next available chain name. 

2243 

2244 Args: 

2245 anterior (str): The last chain name used, which is expected to be a single letter 

2246 

2247 Raises: 

2248 ValueError: If the anterior is not a letter or if there are more chains than available 

2249 """ 

2250 self.check_available_chains() 

2251 current_chain_names = set([ chain.name for chain in self.chains ]) 

2252 # If anterior is cap then try caps first, if anterior is lower then try lowers first 

2253 if anterior.isupper(): 

2254 first_group, second_group = AVAILABLE_CAPS, AVAILABLE_LOWS 

2255 elif anterior.islower(): 

2256 first_group, second_group = AVAILABLE_LOWS, AVAILABLE_CAPS 

2257 else: raise ValueError(f'Is "{anterior}" even a letter?') 

2258 # Reorder letters in the first group, so the anterior is the last letter 

2259 anterior_position = first_group.index(anterior) 

2260 next_position = anterior_position + 1 

2261 reordered_group = first_group[next_position:] + first_group[0:next_position] 

2262 next_letter = next((letter for letter in reordered_group if letter not in current_chain_names), None) 

2263 if next_letter: return next_letter 

2264 # If there is not available letters is the first group then return the first available in the second 

2265 next_letter = next((letter for letter in second_group if letter not in current_chain_names), None) 

2266 return next_letter 

2267 

2268 def get_chain_by_name (self, name : str) -> Optional['Chain']: 

2269 """Get a chain by its name.""" 

2270 return next((c for c in self.chains if c.name == name), None) 

2271 

2272 def find_residue (self, chain_name : str, number : int, icode : str = '' ) -> Optional['Residue']: 

2273 """Find a residue by its chain, number and insertion code.""" 

2274 # Get the corresponding chain 

2275 target_chain = self.get_chain_by_name(chain_name) 

2276 if not target_chain: return None 

2277 # Find the residue in the target chain 

2278 target_chain.find_residue(number, icode) 

2279 

2280 def display_summary (self): 

2281 """Get a summary of the structure.""" 

2282 print(f'Atoms: {self.atom_count}') 

2283 print(f'Residues: {len(self.residues)}') 

2284 print(f'Chains: {len(self.chains)}') 

2285 for chain in self.chains: 

2286 print(f'Chain {chain.name} ({len(chain.residue_indices)} residues)') 

2287 print(' -> ' + chain.get_sequence()) 

2288 

2289 def check_repeated_chains (self, fix_chains : bool = False, display_summary : bool = False) -> bool: 

2290 """ 

2291 There may be chains which are equal in the structure (i.e. same chain name). 

2292 This means we have a duplicated/splitted chain. 

2293 Repeated chains are usual and they are usually supported but with some problems. 

2294 Also, repeated chains usually come with repeated residues, which means more problems (see explanation below). 

2295 

2296 In the context of this structure class we may have 2 different problems with a different solution each: 

2297 1. There is more than one chain with the same letter (repeated chain) -> rename the duplicated chains 

2298 2. There is a chain with atom indices which are not consecutive (splitted chain) -> create new chains 

2299 

2300 Rename repeated chains or create new chains if the fix_chains argument is True. 

2301 

2302 WARNING: These fixes are possible only if there are less chains than the number of letters in the alphabet. 

2303 Although there is no limitation in this code for chain names, setting long chain names is not compatible with pdb format. 

2304 

2305 Check splitted chains (a chains with non consecutive residues) and try to fix them if requested. 

2306 Check repeated chains (two chains with the same name) and return True if there were any repeats. 

2307 """ 

2308 # Order chains according to their names 

2309 # Save also those chains which have a previous duplicate 

2310 name_chains = {} 

2311 repeated_chains = [] 

2312 for chain in self.chains: 

2313 chain_name = chain.name 

2314 current_name_chains = name_chains.get(chain_name, None) 

2315 if not current_name_chains: 

2316 name_chains[chain_name] = [chain] 

2317 else: 

2318 name_chains[chain_name].append(chain) 

2319 repeated_chains.append(chain) 

2320 # Display the summary of repeated chains if requested 

2321 if display_summary: 

2322 if len(repeated_chains) > 0: 

2323 warn('There are repeated chains:') 

2324 for chain_name, chains in name_chains.items(): 

2325 chains_count = len(chains) 

2326 if chains_count > 1: 

2327 print(f'- Chain {chain_name} has {chains_count} repeats') 

2328 # Rename repeated chains if requested 

2329 if len(repeated_chains) > 0 and fix_chains: 

2330 self.check_available_chains() 

2331 current_letters = list(name_chains.keys()) 

2332 for repeated_chain in repeated_chains: 

2333 last_chain_letter = repeated_chain.name 

2334 while last_chain_letter in current_letters: 

2335 last_chain_letter = get_next_letter(last_chain_letter) 

2336 repeated_chain.name = last_chain_letter 

2337 current_letters.append(last_chain_letter) 

2338 # Check if there are splitted chains 

2339 # RUBEN: sacar esta parte ya que esta self.check_splitted_chains 

2340 for chain in self.chains: 

2341 residue_indices = sorted(chain.residue_indices) 

2342 # Check if residue indices are consecutive 

2343 if residue_indices[-1] - residue_indices[0] + 1 == len(residue_indices): 

2344 continue 

2345 warn(f'Splitted chain {chain.name}') 

2346 # If indices are not consecutive then we must find ranges of consecutive residues and create new chains for them 

2347 previous_residue_index = residue_indices[0] 

2348 consecutive_residues = [previous_residue_index] 

2349 overall_consecutive_residues = [] 

2350 for residue_index in residue_indices[1:]: 

2351 # If next residue is consecutive 

2352 if residue_index == previous_residue_index + 1: 

2353 consecutive_residues.append(residue_index) 

2354 # If next residue is NOT consecutive 

2355 else: 

2356 overall_consecutive_residues.append(consecutive_residues) 

2357 consecutive_residues = [residue_index] 

2358 # Update the previous index 

2359 previous_residue_index = residue_index 

2360 # Add the last split 

2361 overall_consecutive_residues.append(consecutive_residues) 

2362 # Now create new chains and reasign residues 

2363 # Skip the first set of consecutive residues since they will stay in the original chain 

2364 for residues_indices in overall_consecutive_residues[1:]: 

2365 chain_name = self.get_available_chain_name() 

2366 residues_selection = self.select_residue_indices(residues_indices) 

2367 self.set_selection_chain_name(residues_selection, chain_name) 

2368 

2369 # Fix repeated chains if requested 

2370 return len(repeated_chains) > 0 

2371 

2372 def check_splitted_chains (self, fix_chains : bool = False, display_summary : bool = False) -> bool: 

2373 """Check if non-consecutive atoms belong to the same chain. 

2374 If so, separate pieces of non-consecuite atoms in different chains. 

2375 Note that the new chains will be duplicated, so you will need to run check_repeated_chains after. 

2376 

2377 Args: 

2378 fix_chains (bool): If True then the splitted chains will be fixed. 

2379 display_summary (bool): If True then a summary of the splitted chains will be displayed. 

2380 

2381 Returns: 

2382 bool: 

2383 True if we encountered splitted chains and false otherwise. 

2384 """ 

2385 splitted_fragments: list[tuple[str, list[int]]] = [] 

2386 # Keep track of already checked chains 

2387 checked_chains = set() 

2388 last_chain = None 

2389 last_fragment_start = None 

2390 for atom_index, atom in enumerate(self.atoms): 

2391 # Get the chain name 

2392 chain_name = atom.chain.name 

2393 # Skip the atom if it belong to the previous chain 

2394 if chain_name == last_chain: continue 

2395 # If we were in a splitted fragment then end it here 

2396 if last_fragment_start != None: 

2397 new_fragment_indices = list(range(last_fragment_start, atom_index)) 

2398 new_fragment = (last_chain, new_fragment_indices) 

2399 splitted_fragments.append(new_fragment) 

2400 last_fragment_start = None 

2401 # Check if the chain was already found 

2402 if chain_name in checked_chains: 

2403 # Start a new fragment 

2404 last_fragment_start = atom_index 

2405 # Update last chain 

2406 last_chain = chain_name 

2407 # Add the new chain to the set of already checked chains 

2408 checked_chains.add(chain_name) 

2409 

2410 # Make a summary of the splitted chains if requested 

2411 if display_summary and len(splitted_fragments) > 0: 

2412 warn(f'Found {len(splitted_fragments)} splitted fragments') 

2413 affected_chains = sorted(list(set([ fragment[0] for fragment in splitted_fragments ]))) 

2414 print(f' We are having splits in chains {", ".join(affected_chains)}') 

2415 

2416 # Fix chains if requested 

2417 if fix_chains: 

2418 for chain_name, fragment_atom_indices in splitted_fragments: 

2419 # Create a new chain 

2420 new_chain = Chain(name=chain_name) 

2421 self.set_new_chain(new_chain) 

2422 # Ge the selection residue indices for the fragment 

2423 fragment_selection = self.select_atom_indices(fragment_atom_indices) 

2424 fragment_residue_indices = self.get_selection_residue_indices(fragment_selection) 

2425 # Move atoms in the fragment to the new chain 

2426 for residue_index in fragment_residue_indices: 

2427 residue = self.residues[residue_index] 

2428 residue.set_chain_index(new_chain.index) 

2429 return len(splitted_fragments) > 0 

2430 

2431 def sort_residues (self): 

2432 """Coherently sort residues according to the indices of the atoms they hold.""" 

2433 # Set a function to sort atoms and residues by index 

2434 def by_first_atom_index (residue): 

2435 return min(residue.atom_indices) 

2436 # Sort residues according to their first atom index 

2437 sorted_residues = sorted(self.residues, key = by_first_atom_index) 

2438 # Iterate sorted residues letting them know their new index 

2439 for r, residue in enumerate(sorted_residues): 

2440 residue.index = r 

2441 # Finally update the structure's residues list 

2442 self.residues = sorted_residues 

2443 

2444 def check_merged_residues (self, fix_residues : bool = False, display_summary : bool = False) -> bool: 

2445 """ 

2446 There may be residues which contain unconnected (unbonded) atoms. They are not allowed. 

2447 They may come from a wrong parsing and be indeed duplicated residues. 

2448 

2449 Search for merged residues. 

2450 Create new residues for every group of connected atoms if the fix_residues argument is True. 

2451 Note that the new residues will be repeated, so you will need to run check_repeated_residues after. 

2452 Return True if there were any merged residues. 

2453 """ 

2454 # Get the list of merged residues we encounter 

2455 merged_residues = [] 

2456 # Iterate residues 

2457 for residue in self.residues: 

2458 residue_selection = residue.get_selection() 

2459 residue_fragments = list(self.find_fragments(residue_selection)) 

2460 if len(residue_fragments) <= 1: continue 

2461 # If current residue has more than 1 fragment then it is a merged residue 

2462 merged_residues.append(residue) 

2463 if not fix_residues: continue 

2464 # If the residue is to be fixed then let the first fragment as the current residue 

2465 # Then create a new residue for every other fragment 

2466 for extra_fragment in residue_fragments[1:]: 

2467 # Set a new residue identical to the current one 

2468 new_residue = Residue(residue.name, residue.number, residue.icode) 

2469 self.set_new_residue(new_residue) 

2470 # Add it to the same chain 

2471 residue.chain.add_residue(new_residue) 

2472 # Add atoms to it 

2473 for atom_index in extra_fragment.atom_indices: 

2474 atom = self.atoms[atom_index] 

2475 new_residue.add_atom(atom) 

2476 # Now that we have new resiudes, sort all residues to keep them coherent 

2477 self.sort_residues() 

2478 # Count how many merged residues we encountered 

2479 merged_residues_count = len(merged_residues) 

2480 # Log some details if the summary is requested 

2481 if display_summary and merged_residues_count > 0: 

2482 print(f'Found {merged_residues_count} merged residues') 

2483 print(f' e.g. {merged_residues[0]}') 

2484 # Return if we found merged residues 

2485 return merged_residues_count > 0 

2486 

2487 def check_repeated_residues (self, fix_residues : bool = False, display_summary : bool = False) -> bool: 

2488 """ 

2489 There may be residues which are equal in the structure (i.e. same chain, number and icode). 

2490 In case 2 residues in the structure are equal we must check distance between their atoms. 

2491 If atoms are far it means they are different residues with the same notation (duplicated residues). 

2492 If atoms are close it means they are indeed the same residue (splitted residue). 

2493 

2494 Splitted residues are found in some pdbs and they are supported by some tools. 

2495 These tools consider all atoms with the same 'record' as the same residue. 

2496 However, there are other tools which would consider the splitted residue as two different residues. 

2497 This causes inconsistency along different tools besides a long list of problems. 

2498 The only possible is fix is changing the order of atoms in the topology. 

2499 Note that this is a breaking change for associated trajectories, which must change the order of coordinates. 

2500 However here we provide tools to fix associates trajectories as well. 

2501 

2502 Duplicated residues are usual and they are usually supported but with some problems. 

2503 For example, pytraj analysis outputs use to sort results by residues and each residue is tagged. 

2504 If there are duplicated residues with the same tag it may be not possible to know which result belongs to each residue. 

2505 Another example are NGL selections once in the web client. 

2506 If you select residue ':A and 1' and there are multiple residues 1 in chain A all of them will be displayed. 

2507 

2508 Check residues to search for duplicated and splitted residues. 

2509 Renumerate repeated residues if the fix_residues argument is True. 

2510 Return True if there were any repeats. 

2511 """ 

2512 # Track if residues have to be changed or not 

2513 modified = False 

2514 # Group all residues in the structure according to their chain, number and icode 

2515 grouped_residues = {} 

2516 # Check repeated residues which are one after the other 

2517 # Note that these residues MUST have a different name 

2518 # Otherwise they would have not been considered different residues 

2519 # For these rare cases we use icodes to solve the problem 

2520 non_icoded_residues = [] 

2521 last_residue = None 

2522 for residue in self.residues: 

2523 # Check residue to be equal than the previous residue 

2524 if residue == last_residue: 

2525 non_icoded_residues.append(residue) 

2526 last_residue = residue 

2527 continue 

2528 last_residue = residue 

2529 # Add residue to the groups of repeated residues 

2530 current_residue_repeats = grouped_residues.get(residue, None) 

2531 if not current_residue_repeats: 

2532 grouped_residues[residue] = [ residue ] 

2533 else: 

2534 current_residue_repeats.append(residue) 

2535 # In case we have non icoded residues 

2536 if len(non_icoded_residues) > 0: 

2537 if display_summary: 

2538 print(f'There are non-icoded residues ({len(non_icoded_residues)})') 

2539 # Set new icodes for non icoded residues 

2540 if fix_residues: 

2541 print(' Non icoded residues will recieve an icode') 

2542 for residue in non_icoded_residues: 

2543 repeated_residues_group = grouped_residues[residue] 

2544 current_icodes = [ residue.icode for residue in repeated_residues_group if residue.icode ] 

2545 next_icode = next((cap for cap in AVAILABLE_CAPS if cap not in current_icodes), None) 

2546 if not next_icode: 

2547 raise ValueError('There are no more icodes available') 

2548 # print('Setting icode ' + next_icode + ' to residue ' + str(residue)) 

2549 residue.icode = next_icode 

2550 modified = True 

2551 # Grouped residues with more than 1 result are considered as repeated 

2552 repeated_residues = [ residues for residues in grouped_residues.values() if len(residues) > 1 ] 

2553 if len(repeated_residues) == 0: 

2554 return modified 

2555 # In case we have repeated residues... 

2556 if display_summary: 

2557 warn(f'There are {len(repeated_residues)} different groups of repeated residues') 

2558 print(f' e.g. {repeated_residues[0][0]}') 

2559 if len(repeated_residues) == 9999 or len(repeated_residues) == 10000: 

2560 print(' Probably you have more residues than the PDB numeration limit (1-9999)') 

2561 # Now for each repeated residue, find out which are splitted and which are duplicated 

2562 covalent_bonds = self.bonds 

2563 overall_splitted_residues = [] 

2564 overall_duplicated_residues = [] 

2565 for residues in repeated_residues: 

2566 # Iterate over repeated residues and check if residues are covalently bonded 

2567 # If any pair of residues are bonded add them both to the splitted residues list 

2568 # At the end, all non-splitted residues will be considered duplicated residues 

2569 # WARNING: Using a set here is not possible since repeated residues have the same hash 

2570 # WARNING: Also comparing residues themselves is not advisable, so we use indices at this point 

2571 splitted_residue_indices = set() 

2572 for residue, other_residues in otherwise(residues): 

2573 if residue.index in splitted_residue_indices: 

2574 continue 

2575 # Get atom indices for all atoms connected to the current residue 

2576 residue_bonds = sum([ covalent_bonds[index] for index in residue.atom_indices ], []) 

2577 for other_residue in other_residues: 

2578 # Get all atom indices for each other residue and collate with the current residue bonds 

2579 if any( index in residue_bonds for index in other_residue.atom_indices ): 

2580 splitted_residue_indices.add(residue.index) 

2581 splitted_residue_indices.add(other_residue.index) 

2582 # Finally obtain the splitted residues from its indices 

2583 splitted_residues = [ self.residues[index] for index in splitted_residue_indices ] 

2584 # Repeated residues which are not splitted are thus duplicated 

2585 duplicated_residues = [ residue for residue in residues if residue.index not in splitted_residue_indices ] 

2586 # Update the overall lists 

2587 if len(splitted_residues) > 0: 

2588 overall_splitted_residues.append(splitted_residues) 

2589 if len(duplicated_residues) > 0: 

2590 overall_duplicated_residues.append(duplicated_residues) 

2591 # In case we have splitted residues 

2592 if len(overall_splitted_residues) > 0: 

2593 if display_summary: 

2594 print(f' There are splitted residues ({len(overall_splitted_residues)})') 

2595 # Fix splitted residues 

2596 if fix_residues: 

2597 print(' Splitted residues will be merged') 

2598 # Set a function to sort atoms and residues by index 

2599 by_index = lambda v: v._index 

2600 # Merge splitted residues 

2601 # WARNING: Merging residues without sorting atoms is possible, but this would be lost after exporting to pdb 

2602 for splitted_residues in overall_splitted_residues: 

2603 # The first residue (i.e. the residue with the lower index) will be the one which remains 

2604 # It will absorb other residue atom indices 

2605 splitted_residues.sort(key=by_index) # Residues are not sorted by default, this is mandatory 

2606 first_residue = splitted_residues[0] 

2607 other_residues = splitted_residues[1:] 

2608 for residue in other_residues: 

2609 for atom in residue.atoms: 

2610 atom.residue = first_residue 

2611 print(' Atoms will be sorted to be together by residues') 

2612 print(' NEVER FORGET: This will break any associated trajectory if coordinates are not sorted as well') 

2613 # Sort atoms to group residue atoms together 

2614 # Note that each atom index must be updated 

2615 new_atom_indices = sum([ residue.atom_indices for residue in self.residues ], []) 

2616 for i, new_atom_index in enumerate(new_atom_indices): 

2617 atom = self.atoms[new_atom_index] 

2618 atom._index = i 

2619 self.atoms.sort(key=by_index) 

2620 # Also residue 'atom_indices' must be updated 

2621 for residue in self.residues: 

2622 residue._atom_indices = [ new_atom_indices.index(atom_index) for atom_index in residue._atom_indices ] 

2623 # Bonds must be reset since atom indices have changes 

2624 self._bonds = None 

2625 # Prepare the trajectory atom sorter which must be returned 

2626 # Include atom indices already so the user has to provide only the structure and trajectory filepaths 

2627 def trajectory_atom_sorter ( 

2628 input_structure_file : 'File', 

2629 input_trajectory_file : 'File', 

2630 output_trajectory_file : 'File' 

2631 ): 

2632 sort_trajectory_atoms( 

2633 input_structure_file, 

2634 input_trajectory_file, 

2635 output_trajectory_file, 

2636 new_atom_indices 

2637 ) 

2638 self.trajectory_atom_sorter = trajectory_atom_sorter 

2639 self.new_atom_order = new_atom_indices 

2640 modified = True 

2641 # In case we have duplicated residues 

2642 duplicated_residues_count = len(overall_duplicated_residues) 

2643 if duplicated_residues_count > 0: 

2644 if display_summary: 

2645 warn(f'There are {duplicated_residues_count} different groups of duplicated residues') 

2646 print(f' e.g. {overall_duplicated_residues[0][0]}') 

2647 # Renumerate duplicated residues if requested 

2648 if fix_residues: 

2649 # First of all, from each group of repeated residues, discard the first residue 

2650 # The first residue will remain as it is and the rest will be renumerated 

2651 # Join all residues to be renumerated in a single list 

2652 residue_to_renumerate = [] 

2653 for residues in overall_duplicated_residues: 

2654 residue_to_renumerate += residues[1:] 

2655 # Now group residues per chain since the renumeration is done by chains 

2656 chain_residues = {} 

2657 for residue in residue_to_renumerate: 

2658 chain = residue.chain 

2659 current_chain_residues = chain_residues.get(chain, None) 

2660 if current_chain_residues: current_chain_residues.append(residue) 

2661 else: chain_residues[chain] = [ residue ] 

2662 # Iterate residue chain groups 

2663 for chain, residues in chain_residues.items(): 

2664 # Find the last residue number in the current chain 

2665 maximum_chain_number = max([ residue.number for residue in chain.residues ]) 

2666 # Sort residues by index 

2667 residues.sort(key=lambda x: x.index, reverse=True) 

2668 for residue in residues: 

2669 # Set the number of the residue as the next available number 

2670 residue.number = maximum_chain_number + 1 

2671 # Update the maximum number 

2672 maximum_chain_number = residue.number 

2673 modified = True 

2674 return modified 

2675 

2676 # DANI: No recuerdo los problemas que daba tener átomos repetidos 

2677 def check_repeated_atoms (self, fix_atoms : bool = False, display_summary : bool = False) -> bool: 

2678 """Check atoms to search for repeated atoms. 

2679 Atoms with identical chain, residue and name are considered repeated atoms. 

2680 

2681 Args: 

2682 fix_atoms (bool): If True, rename repeated atoms. 

2683 display_summary (bool): If True, display a summary of repeated atoms. 

2684 

2685 Returns: 

2686 bool: True if there were any repeated atoms, False otherwise. 

2687 """ 

2688 # Set two trackers for display 

2689 repeated_atoms_count = 0 

2690 example = None 

2691 for residue in self.residues: 

2692 # Iterate over the residue atoms counting how many repeated names we have 

2693 current_names = [] 

2694 for atom in residue.atoms: 

2695 # We check if the atom name already exists. If not, go to the next atom 

2696 name = atom.name 

2697 if name not in current_names: 

2698 current_names.append(name) 

2699 continue 

2700 # When atom is repeated 

2701 repeated_atoms_count += 1 

2702 # If the fix was not requested we stop here 

2703 if not fix_atoms: 

2704 continue 

2705 # We set the name of the atom as the element letter + the count of this element 

2706 # If element is missing take the first character of the atom name 

2707 initial = atom.element 

2708 if not initial or initial == ' ': 

2709 initial = name[0] 

2710 number = 1 

2711 new_name = initial + str(number) 

2712 while new_name in current_names: 

2713 number += 1 

2714 if number > 999: 

2715 raise ValueError('There are more than 999 atoms with the same name in the residue') 

2716 new_name = initial + str(number) 

2717 # Save an example for the logs if there is none yet 

2718 if not example: 

2719 example = f'{atom.name} renamed as {new_name} in residue {residue}' 

2720 atom.name = new_name 

2721 current_names.append(new_name) 

2722 # Display the summary of repeated atoms if requested 

2723 if display_summary: 

2724 if repeated_atoms_count > 0: 

2725 warn(f'There are repeated atoms ({repeated_atoms_count})') 

2726 print(f' e.g. {example}') 

2727 return repeated_atoms_count > 0 

2728 

2729 def is_missing_any_bonds (self) -> bool: 

2730 return any(bond == MISSING_BONDS for bond in self.bonds) 

2731 

2732 def check_incoherent_bonds (self) -> bool: 

2733 """ Check bonds to be incoherent i.e. check atoms not to have more or less 

2734 bonds than expected according to their element. 

2735 Return True if any incoherent bond is found. """ 

2736 # Find out if there are hydrogens in the structure 

2737 # It may happen that we encounter an structure without hydrogens 

2738 has_hydrogen = next(( True for atom in self.atoms if atom.element == 'H' ), False) 

2739 coherent_bonds = coherent_bonds_with_hydrogen if has_hydrogen else coherent_bonds_without_hydrogen 

2740 # Check coherent bonds atom by atom 

2741 for atom in self.atoms: 

2742 # Do not check this atom bonds if this may be an ion 

2743 # Most authors "force" dummy bonds in these atoms to make them stable 

2744 if atom.element in SUPPORTED_ION_ELEMENTS: continue 

2745 # Ignore dummy atoms as well 

2746 if atom.element == DUMMY_ATOM_ELEMENT: continue 

2747 # Ignore coarse grain atoms as well 

2748 if atom.element == CG_ATOM_ELEMENT: continue 

2749 # Get actual number of bonds in the current atom both with and without ion bonds 

2750 # LORE: This was a problem since some ions are force-bonded but bonds are actually not real 

2751 # LORE: When an ion is forced we may end up with hyrogens with 2 bonds or carbons with 5 bonds 

2752 # LORE: When an ions is really bonded we can not discard it or we may end up with orphan carbons (e.g. ligands) 

2753 min_nbonds = len(atom.get_bonds(skip_ions = True, skip_dummies = True)) 

2754 max_nbonds = len(atom.get_bonds(skip_ions = False, skip_dummies = True)) 

2755 # Get the accepted range of number of bonds for the current atom according to its element 

2756 element = atom.element 

2757 element_coherent_bonds = coherent_bonds.get(element, None) 

2758 # If there are no specficiations for the current atom element then skip it 

2759 if not element_coherent_bonds: 

2760 continue 

2761 # Get the range of accepted number of bonds 

2762 min_allowed_bonds = element_coherent_bonds['min'] 

2763 max_allowed_bonds = element_coherent_bonds['max'] 

2764 # Check the actual number of bonds is insdie the accepted range 

2765 if max_nbonds < min_allowed_bonds or min_nbonds > max_allowed_bonds: 

2766 if min_nbonds == max_nbonds: 

2767 print(f' An atom with element {element} has {min_nbonds} bonds') 

2768 else: 

2769 print(f' An atom with element {element} has between {min_nbonds} bonds (withou ions) and {max_nbonds} bonds (with ions)') 

2770 if min_allowed_bonds == max_allowed_bonds: 

2771 plural_sufix = '' if min_allowed_bonds == 1 else 's' 

2772 print(f' It should have {min_allowed_bonds} bond{plural_sufix}') 

2773 else: 

2774 print(f' It should have between {min_allowed_bonds} and {max_allowed_bonds} bonds') 

2775 print(f' -> Atom {atom.label}') 

2776 bond_label = ', '.join([ self.atoms[atom].label for atom in atom.get_bonds(skip_ions = False) ]) 

2777 print(f' -> Bonds {bond_label}') 

2778 return True 

2779 return False 

2780 

2781 def get_covalent_bonds (self, selection : Optional['Selection'] = None) -> list[ list[int] ]: 

2782 """Get all atomic covalent (strong) bonds. 

2783 Bonds are defined as a list of atom indices for each atom in the structure. 

2784 Rely on VMD logic to do so.""" 

2785 # It is important to fix elements before trying to fix bonds, since elements have an impact on bonds 

2786 # VMD logic to find bonds relies in the atom element to set the covalent bond distance cutoff 

2787 self.fix_atom_elements() 

2788 # Generate a pdb strucutre to feed vmd 

2789 auxiliar_pdb_filepath = '.structure.pdb' 

2790 self.generate_pdb_file(auxiliar_pdb_filepath) 

2791 # Get covalent bonds between both residue atoms 

2792 covalent_bonds = get_covalent_bonds(auxiliar_pdb_filepath, selection) 

2793 # For every atom in CG, replace its bonds with a class which will raise and error when read 

2794 # Thus we make sure using these wrong bonds anywhere further will result in failure 

2795 for atom_index in self.select_cg().atom_indices: 

2796 covalent_bonds[atom_index] = MISSING_BONDS 

2797 # Remove the auxiliar pdb file 

2798 os.remove(auxiliar_pdb_filepath) 

2799 return covalent_bonds 

2800 

2801 def copy_bonds (self) -> list[list[int]]: 

2802 """Make a copy of the bonds list.""" 

2803 new_bonds = [] 

2804 for atom_bonds in self.bonds: 

2805 # Missing bonds coming from CG atoms are forwarded 

2806 if atom_bonds == MISSING_BONDS: 

2807 new_bonds.append(MISSING_BONDS) 

2808 continue 

2809 # Copy also the inner lists to avoid further mutation of the original structure 

2810 new_bonds.append([ atom_index for atom_index in atom_bonds ]) 

2811 return new_bonds 

2812 

2813 def copy (self) -> 'Structure': 

2814 """Make a copy of the current structure.""" 

2815 atom_copies = [ atom.copy() for atom in self.atoms ] 

2816 residue_copies = [ residue.copy() for residue in self.residues ] 

2817 chain_copies = [ chain.copy() for chain in self.chains ] 

2818 structure_copy = Structure(atom_copies, residue_copies, chain_copies) 

2819 structure_copy.bonds = self.copy_bonds() 

2820 return structure_copy 

2821 

2822 # DANI: No lo he testeado en profundidad 

2823 def merge (self, other : 'Structure') -> 'Structure': 

2824 """Merge current structure with another structure.""" 

2825 # Copy self atoms, residues and chains 

2826 self_atom_copies = [ atom.copy() for atom in self.atoms ] 

2827 self_residue_copies = [ residue.copy() for residue in self.residues ] 

2828 self_chain_copies = [ chain.copy() for chain in self.chains ] 

2829 # Copy other atoms, residues and chains 

2830 other_atom_copies = [ atom.copy() for atom in other.atoms ] 

2831 other_residue_copies = [ residue.copy() for residue in other.residues ] 

2832 other_chain_copies = [ chain.copy() for chain in other.chains ] 

2833 # Adapt indices in other atoms, residues and chains 

2834 atom_index_offset = len(self_atom_copies) 

2835 residue_index_offset = len(self_residue_copies) 

2836 chain_index_offset = len(self_chain_copies) 

2837 for atom in other_atom_copies: 

2838 atom._index += atom_index_offset 

2839 atom._residue_index += residue_index_offset 

2840 for residue in other_residue_copies: 

2841 residue._index += residue_index_offset 

2842 residue._atom_indices = [ i + atom_index_offset for i in residue._atom_indices ] 

2843 residue._chain_index += chain_index_offset 

2844 for chain in other_chain_copies: 

2845 chain._index += chain_index_offset 

2846 chain._residue_indices = [ i + residue_index_offset for i in chain._residue_indices ] 

2847 # Merge self with other atoms, residues and chains and build the new structure 

2848 merged_atoms = self_atom_copies + other_atom_copies 

2849 merged_residues = self_residue_copies + other_residue_copies 

2850 merged_chains = self_chain_copies + other_chain_copies 

2851 return Structure(merged_atoms, merged_residues, merged_chains) 

2852 

2853 def find_rings (self, max_ring_size : int, selection : Optional[Selection] = None) -> list[ list[Atom] ]: 

2854 """Find rings with a maximum specific size or less in the structure and yield them as they are found.""" 

2855 # Make sure the selection does not include regions without bonds 

2856 if selection & self.select_missing_bonds(): 

2857 raise RuntimeError('The find rings logic can not be used when we are missing bonds') 

2858 

2859 

2860 def get_selection_outer_bonds (self, selection : Selection) -> list[int]: 

2861 """Given an atom selection, get all bonds between these atoms and any other atom in the structure. 

2862 Note that inner bonds between atoms in the selection are discarded.""" 

2863 # Get bonds from all atoms in the selection 

2864 bonds = set() 

2865 for atom_index in selection.atom_indices: 

2866 atom_bonds = set(self.bonds[atom_index]) 

2867 bonds = bonds.union(atom_bonds) 

2868 # Discard selection atoms from the bonds list to discard inner bonds 

2869 bonds -= set(selection.atom_indices) 

2870 return list(bonds) 

2871 

2872 # Set the type of PTM according to the classification of the bonded residue 

2873 ptm_options = { 

2874 'protein': ValueError('A PTM residue must never be protein'), 

2875 'dna': 'DNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama 

2876 'rna': 'RNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama 

2877 'carbohydrate': 'Glycosilation', 

2878 'fatty': 'Lipidation', 

2879 'steroid': 'Steroid linkage', # DANI: Esto es posible aunque muy raro y no se como se llama 

2880 'ion': Warning('Ion is covalently bonded to protein'), # DANI: esto no es "correcto" pero si habitual 

2881 'solvent': Warning('Solvent is covalently bonded to protein'), # DANI: probablemente sea un error 

2882 'acetyl': 'Acetylation', # Typical N-capping terminals 

2883 'amide': 'Amidation', # Typical C-capping terminals 

2884 'other': Warning('Unknow type of PTM'), # Something not supported 

2885 } 

2886 

2887 def find_ptms (self) -> Generator[dict, None, None]: 

2888 """Find Post Translational Modifications (PTM) in the structure.""" 

2889 # Find bonds between protein and non-protein atoms 

2890 # First get all protein atoms with bonds 

2891 protein_selection = self.select_protein() - self.select_missing_bonds() 

2892 if not protein_selection: return 

2893 protein_atom_indices = set(protein_selection.atom_indices) # This is used later 

2894 protein_outer_bonds = set(self.get_selection_outer_bonds(protein_selection)) 

2895 non_protein_selection = self.invert_selection(protein_selection) 

2896 if not non_protein_selection: return 

2897 non_protein_atom_indices = set(non_protein_selection.atom_indices) 

2898 non_protein_atom_indices_bonded_to_protein = protein_outer_bonds.intersection(non_protein_atom_indices) 

2899 # Get each residue bonded to the protein and based on its 'classification' set the name of the PTM 

2900 for atom_index in non_protein_atom_indices_bonded_to_protein: 

2901 atom = self.atoms[atom_index] 

2902 residue = atom.residue 

2903 residue_classification = residue.get_classification() 

2904 ptm_classification = self.ptm_options[residue_classification] 

2905 # If we found something impossible then raise the error 

2906 if type(ptm_classification) == ValueError: 

2907 print(f'Problematic residue: {residue}') 

2908 raise ptm_classification 

2909 # If we do not know what it is then do tag it as a PTM but print a warning 

2910 if type(ptm_classification) == Warning: 

2911 warn(f'{ptm_classification} -> {residue}') 

2912 continue 

2913 # At this point we have found a valid PTM 

2914 # Find the protein residue linked to this PTM 

2915 atom_bonds = self.bonds[atom_index] 

2916 protein_bond = next((bond for bond in atom_bonds if bond in protein_atom_indices), None) 

2917 if protein_bond == None: 

2918 raise ValueError('There must be at least one protein bond to the current atom') 

2919 protein_residue_index = self.atoms[protein_bond].residue_index 

2920 # Set the PTM 

2921 yield { 'name': ptm_classification, 'residue_index': protein_residue_index } 

2922 

2923 def has_cg (self) -> bool: 

2924 """Ask if the structure has at least one coarse grain atom/residue.""" 

2925 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms) 

2926 

2927 

2928# ========================= 

2929# Related functions 

2930# ========================= 

2931 

2932def calculate_distance (atom_1 : Atom, atom_2 : Atom) -> float: 

2933 """Calculate the distance between two atoms.""" 

2934 squared_distances_sum = 0 

2935 for i in { 'x': 0, 'y': 1, 'z': 2 }.values(): 

2936 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i])**2 

2937 return math.sqrt(squared_distances_sum) 

2938 

2939def calculate_angle (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom) -> float: 

2940 """Calculate the angle between 3 atoms.""" 

2941 # From: https://stackoverflow.com/questions/35176451/python-code-to-calculate-angle-between-three-point-using-their-3d-coordinates 

2942 # Get coordinates in numpy arrays, which allows to easily calculate the difference 

2943 a = np.array(atom_1.coords) 

2944 b = np.array(atom_2.coords) 

2945 c = np.array(atom_3.coords) 

2946 # Set the two vectors which make the angle 

2947 ba = a - b 

2948 bc = c - b 

2949 # Calculate the angle between these 2 vectors 

2950 cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc)) 

2951 angle = np.arccos(cosine_angle) 

2952 return np.degrees(angle) 

2953 

2954def calculate_torsion (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom, atom_4 : Atom) -> float: 

2955 """Calculate the torsion between 4 atoms.""" 

2956 # From: https://stackoverflow.com/questions/20305272/dihedral-torsion-angle-from-four-points-in-cartesian-coordinates-in-python 

2957 p0 = np.array(atom_1.coords) 

2958 p1 = np.array(atom_2.coords) 

2959 p2 = np.array(atom_3.coords) 

2960 p3 = np.array(atom_4.coords) 

2961 # Get some vectors 

2962 b0 = -1.0*(p1 - p0) 

2963 b1 = p2 - p1 

2964 b2 = p3 - p2 

2965 # normalize b1 so that it does not influence magnitude of vector 

2966 # rejections that come next 

2967 b1 /= np.linalg.norm(b1) 

2968 # vector rejections 

2969 # v = projection of b0 onto plane perpendicular to b1 

2970 # = b0 minus component that aligns with b1 

2971 # w = projection of b2 onto plane perpendicular to b1 

2972 # = b2 minus component that aligns with b1 

2973 v = b0 - np.dot(b0, b1)*b1 

2974 w = b2 - np.dot(b2, b1)*b1 

2975 # angle between v and w in a plane is the torsion angle 

2976 # v and w may not be normalized but that's fine since tan is y/x 

2977 x = np.dot(v, w) 

2978 y = np.dot(np.cross(b1, v), w) 

2979 return float(np.degrees(np.arctan2(y, x))) 

2980 

2981# ========================= 

2982# Auxiliar functions 

2983# ========================= 

2984 

2985def get_next_letter (letter : str) -> str: 

2986 """Set a function to get the next letter from an input letter in alphabetic order.""" 

2987 if not letter: 

2988 return 'A' 

2989 if letter == 'z' or letter == 'Z': 

2990 raise InputError("Limit of chain letters has been reached") 

2991 next_letter = chr(ord(letter) + 1) 

2992 return next_letter 

2993 

2994def first_cap_only (name : str) -> str: 

2995 """Given a string with 1 or 2 characters, return a new string with 

2996 the first letter cap and the second letter not cap (if any)""" 

2997 if len(name) == 1: 

2998 return name.upper() 

2999 first_character = name[0].upper() 

3000 second_character = name[1].lower() 

3001 return first_character + second_character 

3002 

3003lower_numbers = { 

3004 '0': '₀', 

3005 '1': '₁', 

3006 '2': '₂', 

3007 '3': '₃', 

3008 '4': '₄', 

3009 '5': '₅', 

3010 '6': '₆', 

3011 '7': '₇', 

3012 '8': '₈', 

3013 '9': '₉', 

3014} 

3015def get_lower_numbers (numbers_text : str) -> str: 

3016 """Convert numbers to lower text characters (chr 8020-8029).""" 

3017 return ''.join([ lower_numbers[c] for c in numbers_text ]) 

3018 

3019def filter_model (pdb_content : str, model : int) -> str: 

3020 """Set a function to filter lines in PDB content for a specific model.""" 

3021 # Make sure the PDB content has multiple models 

3022 # If not, then return the whole PDB content ignoring the flag 

3023 generic_model_header_regex = r'\nMODEL\s*[0-9]*\s*\n' 

3024 if not re.search(generic_model_header_regex, pdb_content): 

3025 print(f'PDB content has no models at all so it will be loaded as is (ignored model "{model}").') 

3026 return pdb_content 

3027 # If a model was passed then it means we must filter the PDB content 

3028 filtered_pdb_content = '' 

3029 # Search PDB lines until we find our model's header 

3030 model_header_regex = rf'^MODEL\s*{model}' 

3031 pdb_lines = iter(pdb_content.split('\n')) 

3032 for line in pdb_lines: 

3033 if not re.match(model_header_regex, line): continue 

3034 # If we just found the header 

3035 filtered_pdb_content = line 

3036 break 

3037 # If we did not find the header then stop here 

3038 if not filtered_pdb_content: raise RuntimeError(f'Could not find model "{model}" header') 

3039 # Add every line to the filtered content until we find the tail 

3040 model_footer_regex = r'^ENDMDL' 

3041 for line in pdb_lines: 

3042 filtered_pdb_content += '\n' + line 

3043 if re.match(model_footer_regex, line): return filtered_pdb_content 

3044 # If we did not find the footer then stop here 

3045 raise RuntimeError(f'Could not find model "{model}" footer')