Coverage for model_workflow/utils/structures.py: 87%

1674 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-23 10:54 +0000

1# Main handler of the toolbelt 

2import os 

3import re 

4import math 

5import pytraj 

6import MDAnalysis 

7import numpy as np 

8from scipy.special import comb # DANI: Substituye al math.comb porque fué añadido en python 3.8 y nosotros seguimos en 3.7 

9from bisect import bisect 

10 

11from model_workflow.utils.file import File 

12from model_workflow.utils.selections import Selection 

13from model_workflow.utils.vmd_spells import get_vmd_selection_atom_indices, get_covalent_bonds 

14from model_workflow.utils.mdt_spells import sort_trajectory_atoms 

15from model_workflow.utils.auxiliar import InputError, MISSING_BONDS 

16from model_workflow.utils.auxiliar import is_imported, residue_name_to_letter, otherwise, warn 

17from model_workflow.utils.constants import SUPPORTED_ION_ELEMENTS, SUPPORTED_ELEMENTS 

18from model_workflow.utils.constants import STANDARD_COUNTER_CATION_ATOM_NAMES, STANDARD_COUNTER_ANION_ATOM_NAMES 

19from model_workflow.utils.constants import STANDARD_SOLVENT_RESIDUE_NAMES, STANDARD_COUNTER_ION_ATOM_NAMES 

20from model_workflow.utils.constants import STANDARD_DUMMY_ATOM_NAMES, DUMMY_ATOM_ELEMENT, CG_ATOM_ELEMENT 

21from model_workflow.utils.constants import PROTEIN_RESIDUE_NAME_LETTERS, NUCLEIC_RESIDUE_NAME_LETTERS 

22from model_workflow.utils.constants import DNA_RESIDUE_NAME_LETTERS, RNA_RESIDUE_NAME_LETTERS 

23from model_workflow.utils.constants import FATTY_RESIDUE_NAMES, STEROID_RESIDUE_NAMES 

24from model_workflow.utils.type_hints import * 

25Coords = Tuple[float, float, float] 

26 

27 

28# Set all available chains according to pdb standards 

29AVAILABLE_CAPS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 

30 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] 

31AVAILABLE_LOWS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 

32 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 

33AVAILABLE_LETTERS = AVAILABLE_CAPS + AVAILABLE_LOWS 

34 

35# Set letters to be found in alphanumerical bases 

36hexadecimal_letters = set(AVAILABLE_CAPS[0:6] + AVAILABLE_LOWS[0:6]) 

37alphanumerical_letters = set(AVAILABLE_CAPS[6:] + AVAILABLE_LOWS[6:]) 

38 

39# Set the expected number of bonds for each atom according to its element 

40coherent_bonds_with_hydrogen = { 

41 'H': { 'min': 1, 'max': 1 }, 

42 'O': { 'min': 1, 'max': 2 }, 

43 'N': { 'min': 1, 'max': 4 }, 

44 'C': { 'min': 2, 'max': 4 }, 

45 'S': { 'min': 1, 'max': 6 }, 

46 'P': { 'min': 2, 'max': 6 }, 

47} 

48# WARNING: Not deeply tested 

49coherent_bonds_without_hydrogen = { 

50 'O': { 'min': 0, 'max': 2 }, 

51 'N': { 'min': 1, 'max': 3 }, 

52 'C': { 'min': 1, 'max': 3 }, 

53 'S': { 'min': 1, 'max': 4 }, 

54 'P': { 'min': 2, 'max': 4 }, 

55} 

56 

57# Set the limit of residue numbers according to PDB format (4 characters, starts count at 1) 

58# This means the last number is 9999 and it is equivalent to index 9998 

59pdb_last_decimal_residue_index = 9998 

60 

61class Atom: 

62 """An atom class.""" 

63 

64 def __init__ (self, 

65 name : Optional[str] = None, 

66 element : Optional[str] = None, 

67 coords : Optional[Coords] = None, 

68 ): 

69 self.name = name 

70 self.element = element 

71 self.coords = tuple(coords) if coords else None 

72 # Set variables to store references to other related instances 

73 # These variables will be set further by the structure 

74 self._structure = None 

75 self._index = None 

76 self._residue_index = None 

77 

78 def __repr__ (self): 

79 return '<Atom ' + self.name + '>' 

80 

81 def __eq__ (self, other): 

82 if type(self) != type(other): 

83 return False 

84 return self._residue_index == other._residue_index and self.name == other.name 

85 

86 def __hash__ (self): 

87 return hash((self.index)) 

88 

89 # The parent structure (read only) 

90 # This value is set by the structure itself 

91 def get_structure (self) -> Optional['Structure']: 

92 return self._structure 

93 structure = property(get_structure, None, None, "The parent structure (read only)") 

94 

95 # The residue index according to parent structure residues (read only) 

96 # This value is set by the structure itself 

97 def get_index (self) -> Optional[int]: 

98 return self._index 

99 index = property(get_index, None, None, "The residue index according to parent structure residues (read only)") 

100 

101 # The atom residue index according to parent structure residues 

102 # If residue index is set then make changes in all the structure to make this change coherent 

103 def get_residue_index (self) -> int: 

104 return self._residue_index 

105 def set_residue_index (self, new_residue_index : int): 

106 # If the new residue index is the current residue index then do nothing 

107 # WARNING: It is important to stop this here or it could delete a residue which is not to be deleted 

108 if new_residue_index == self.residue_index: 

109 return 

110 # If there is not strucutre yet it means the residue is beeing set before the structure 

111 # We just save the residue index and wait for the structure to be set 

112 if not self.structure: 

113 self._residue_index = new_residue_index 

114 return 

115 # Relational indices are updated through a top-down hierarchy 

116 # Affected residues are the ones to update this atom internal residue index 

117 current_residue = self.residue 

118 current_residue.remove_atom(self) 

119 new_residue = self.structure.residues[new_residue_index] 

120 new_residue.add_atom(self) 

121 residue_index = property(get_residue_index, set_residue_index, None, "The atom residue index according to parent structure residues") 

122 

123 # The atom residue 

124 # If residue is set then make changes in all the structure to make this change coherent 

125 def get_residue (self) -> Optional['Residue']: 

126 # If there is not strucutre yet it means the atom is beeing set before the structure 

127 # In this case it is not possible to get related residues in the structure 

128 # It also may happend that the residue is missing because the atom has been set rawly 

129 if not self.structure or self.residue_index == None: 

130 return None 

131 # Get the residue in the structure according to the residue index 

132 return self.structure.residues[self.residue_index] 

133 def set_residue (self, new_residue : 'Residue'): 

134 # Find the new residue index and set it as the atom residue index 

135 # Note that the residue must be set in the structure already 

136 new_residue_index = new_residue.index 

137 if new_residue_index == None: 

138 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure') 

139 self.set_residue_index(new_residue_index) 

140 residue = property(get_residue, set_residue, None, "The atom residue") 

141 

142 def get_chain_index (self) -> Optional[int]: 

143 """Get the atom chain index according to parent structure chains.""" 

144 # The residue may be missing if the atom has been set rawly 

145 if not self.residue: 

146 return None 

147 return self.residue.chain_index 

148 

149 def set_chain_index (self, new_chain_index : int): 

150 """Set a new chain index. 

151 WARNING: You may create a new residue in this process.""" 

152 # If the new chain index is the current chain index do nothing 

153 # WARNING: It is important to stop this here or it could delete a chain which is not to be deleted 

154 if new_chain_index == self.chain_index: 

155 return 

156 # If there is not strucutre yet it means the chain is beeing set before the structure 

157 # We just save the chain index and wait for the structure to be set 

158 if not self.structure: 

159 self._chain_index = new_chain_index 

160 return 

161 # Relational indices are updated through a top-down hierarchy 

162 # Affected chains are the ones to update this residue internal chain index 

163 # WARNING: It is critical to find the new chain before removing/adding residues 

164 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged 

165 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed 

166 new_chain = self.structure.chains[new_chain_index] 

167 # set the new residue where this atom will be appended 

168 new_residue = new_chain.find_residue(self.residue.number, self.residue.icode) 

169 # If it does not exist yet then create it 

170 # It will be a copy of the current residue (name, number and icode) but in the new chain 

171 if not new_residue: 

172 new_residue = Residue(self.residue.name, self.residue.number, self.residue.icode) 

173 self.structure.set_new_residue(new_residue) 

174 new_chain.add_residue(new_residue) 

175 # Now remove this atom from its current residue and move it to the new chain residue 

176 self.residue.remove_atom(self) 

177 new_residue.add_atom(self) 

178 chain_index = property(get_chain_index, set_chain_index, None, "The atom chain index according to parent structure chains (read only)") 

179 

180 # The atom chain (read only) 

181 # In order to change the chain it must be changed in the atom residue 

182 def get_chain (self) -> Optional['Chain']: 

183 # If there is not strucutre yet it means the atom is beeing set before the structure 

184 # In this case it is not possible to get related residues in the structure 

185 # It also may happend that the chain is missing because the atom has been set rawly 

186 if not self.structure or self.chain_index == None: 

187 return None 

188 # Get the chain in the structure according to the chain index 

189 return self.structure.chains[self.chain_index] 

190 chain = property(get_chain, None, None, "The atom chain (read only)") 

191 

192 def get_bonds (self, skip_ions : bool = False, skip_dummies : bool = False) -> Optional[ List[int] ]: 

193 """Get indices of other atoms in the structure which are covalently bonded to this atom.""" 

194 if not self.structure: 

195 raise ValueError('The atom has not a structure defined') 

196 if self.index == None: 

197 raise ValueError('The atom has not an index defined') 

198 bonds = self.structure.bonds[self.index] 

199 # If the skip ions argument is passed then remove atom indices of supported ion atoms 

200 if skip_ions: 

201 bonds = list(set(bonds) - self.structure.ion_atom_indices) 

202 if skip_dummies: 

203 bonds = list(set(bonds) - self.structure.dummy_atom_indices) 

204 return bonds 

205 

206 # Atoms indices of atoms in the structure which are covalently bonded to this atom 

207 bonds = property(get_bonds, None, None, 'Atoms indices of atoms in the structure which are covalently bonded to this atom') 

208 

209 def get_bonded_atoms (self) -> List['Atom']: 

210 """Get bonded atoms.""" 

211 return [ self.structure.atoms[atom_index] for atom_index in self.bonds ] 

212 

213 def get_selection (self) -> 'Selection': 

214 """Generate a selection for this atom.""" 

215 return Selection([self.index]) 

216 

217 def copy (self) -> 'Atom': 

218 """Make a copy of the current atom.""" 

219 atom_copy = Atom(self.name, self.element, self.coords) 

220 atom_copy._structure = self._structure 

221 atom_copy._index = self._index 

222 atom_copy._residue_index = self._residue_index 

223 return atom_copy 

224 

225 def is_fatty_candidate (self) -> bool: 

226 """ 

227 Check if this atom meets specific criteria: 

228 1. It is a carbon 

229 2. It is connected only to other carbons and hydrogens 

230 3. It is connected to 1 or 2 carbons 

231 """ 

232 # Ignore non carbon atoms 

233 if self.element != 'C': 

234 return False 

235 # Get bonded atom elements 

236 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ] 

237 # Check only carbon and hydrogen atoms to be bonded 

238 if any((element != 'C' and element != 'H') for element in bonded_atoms_elements): 

239 return False 

240 # Check it is connected to 1 or 2 carbons 

241 connected_carbons_count = bonded_atoms_elements.count('C') 

242 if connected_carbons_count != 1 and connected_carbons_count != 2: 

243 return False 

244 return True 

245 

246 def is_carbohydrate_candidate (self) -> bool: 

247 """ 

248 Check if this atom meets specific criteria: 

249 1. It is a carbon 

250 2. It is connected only to other carbons, hydrogens or oxygens 

251 3. It is connected to 1 or 2 carbons 

252 4. It is connected to 1 oxygen 

253 """ 

254 # Ignore non carbon atoms 

255 if self.element != 'C': 

256 return False 

257 # Get bonded atom elements 

258 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ] 

259 # Check only carbon and hydrogen atoms to be bonded 

260 if any((element != 'C' and element != 'H' and element != 'O') for element in bonded_atoms_elements): 

261 return False 

262 # Check it is connected to 1 or 2 carbons 

263 connected_carbons_count = bonded_atoms_elements.count('C') 

264 if connected_carbons_count != 1 and connected_carbons_count != 2: 

265 return False 

266 # Check it is connected to 1 oxygen 

267 connected_oxygens_count = bonded_atoms_elements.count('O') 

268 if connected_oxygens_count != 1: 

269 return False 

270 return True 

271 

272 def is_ion (self) -> bool: 

273 """Check if it is an ion by checking if it has no bonds with other atoms.""" 

274 return len(self.bonds) == 0 

275 

276 def guess_element (self) -> str: 

277 """Guess an atom element from its name and number of bonds.""" 

278 # If the atom name is among the known dummy atoms then return a standard element for dummy atoms 

279 if self.name.upper() in STANDARD_DUMMY_ATOM_NAMES: 

280 return DUMMY_ATOM_ELEMENT 

281 # If the element is already the "coarse grained" flag then return this very same flag 

282 # WARNING: This has to be preset before guessing or the guess may fail 

283 if self.is_cg(): 

284 return CG_ATOM_ELEMENT 

285 # If the name is SOD and it is a lonely atom then it is clearly sodium, not sulfur 

286 if self.name.upper() == 'SOD' and self.residue.atom_count == 1: 

287 return 'Na' 

288 # Find a obvios element name in the atom name 

289 element = self.get_name_suggested_element() 

290 # CA may refer to calcium or alpha carbon, so the number of atoms in the residue is decisive 

291 if element == 'Ca' and self.residue.atom_count >= 2: 

292 return 'C' 

293 # NA may refer to sodium or some ligand nitrogen, so the number of atoms in the residue is decisive 

294 if element == 'Na' and self.residue.atom_count >= 2: 

295 return 'N' 

296 return element 

297 

298 def get_name_suggested_element (self) -> str: 

299 """Guess an atom element from its name only.""" 

300 # Get the atom name and its characters length 

301 name = self.name 

302 length = len(name) 

303 next_character = None 

304 for i, character in enumerate(name): 

305 # Get the next character, since element may be formed by 2 letters 

306 if i < length - 1: 

307 next_character = name[i+1] 

308 # If next character is not a string then ignore it 

309 if not next_character.isalpha(): 

310 next_character = None 

311 # Try to get all possible matches between the characters and the supported atoms 

312 # First letter is always caps 

313 character = character.upper() 

314 # First try to match both letters together 

315 if next_character: 

316 # Start with the second letter in caps 

317 next_character = next_character.upper() 

318 both = character + next_character 

319 if both in SUPPORTED_ELEMENTS: 

320 return both 

321 # Continue with the second letter in lowers 

322 next_character = next_character.lower() 

323 both = character + next_character 

324 if both in SUPPORTED_ELEMENTS: 

325 return both 

326 # Finally, try with the first character alone 

327 if character in SUPPORTED_ELEMENTS: 

328 return character 

329 raise InputError(f"Cannot guess atom element from atom name '{name}'") 

330 

331 # Get a standard label 

332 def get_label (self) -> str: 

333 return f'{self.residue.label}.{self.name} (index {self.index})' 

334 # The atom standard label (read only) 

335 label = property(get_label, None, None) 

336 

337 # Ask if the current atom is in coarse grain 

338 def is_cg (self) -> bool: 

339 return self.element == CG_ATOM_ELEMENT 

340 

341# A residue 

342class Residue: 

343 def __init__ (self, 

344 name : Optional[str] = None, 

345 number : Optional[int] = None, 

346 icode : Optional[str] = None, 

347 ): 

348 self.name = name 

349 self.number = number 

350 self.icode = icode 

351 # Set variables to store references to other related instaces 

352 # These variables will be set further by the structure 

353 self._structure = None 

354 self._index = None 

355 self._atom_indices = [] 

356 self._chain_index = None 

357 self._classification = None 

358 

359 def __repr__ (self): 

360 return '<Residue ' + self.name + str(self.number) + (self.icode if self.icode else '') + '>' 

361 

362 def __eq__ (self, other): 

363 if type(self) != type(other): 

364 return False 

365 return ( 

366 self._chain_index == other._chain_index and 

367 #self.name == other.name and 

368 self.number == other.number and 

369 self.icode == other.icode 

370 ) 

371 

372 def __hash__ (self): 

373 # WARNING: This is susceptible to duplicated residues 

374 return hash((self._chain_index, self.number, self.icode)) 

375 # WARNING: This is not susceptible to duplicated residues 

376 #return hash(tuple(self._atom_indices)) 

377 

378 # The parent structure (read only) 

379 # This value is set by the structure itself 

380 def get_structure (self) -> Optional['Structure']: 

381 return self._structure 

382 structure = property(get_structure, None, None, "The parent structure (read only)") 

383 

384 # The residue index according to parent structure residues (read only) 

385 # This value is set by the structure itself 

386 def get_index (self) -> Optional[int]: 

387 return self._index 

388 def set_index (self, index): 

389 # Update residue atoms 

390 for atom in self.atoms: 

391 atom._residue_index = index 

392 # Update residue chain 

393 chain_residue_index = self.chain._residue_indices.index(self._index) 

394 self.chain._residue_indices[chain_residue_index] = index 

395 # Finally update self index 

396 self._index = index 

397 index = property(get_index, set_index, None, "The residue index according to parent structure residues (read only)") 

398 

399 # The atom indices according to parent structure atoms for atoms in this residue 

400 # If atom indices are set then make changes in all the structure to make this change coherent 

401 def get_atom_indices (self) -> List[int]: 

402 return self._atom_indices 

403 def set_atom_indices (self, new_atom_indices : List[int]): 

404 # If there is not strucutre yet it means the residue is beeing set before the structure 

405 # We just save atom indices and wait for the structure to be set 

406 if not self.structure: 

407 self._atom_indices = new_atom_indices 

408 return 

409 # Update the current atoms 

410 for atom in self.atoms: 

411 atom._residue_index = None 

412 # Update the new atoms 

413 for index in new_atom_indices: 

414 atom = self.structure.atoms[index] 

415 atom._residue_index = self.index 

416 # Now new indices are coherent and thus we can save them 

417 self._atom_indices = new_atom_indices 

418 atom_indices = property(get_atom_indices, set_atom_indices, None, "The atom indices according to parent structure atoms for atoms in this residue") 

419 

420 # The atoms in this residue 

421 # If atoms are set then make changes in all the structure to make this change coherent 

422 def get_atoms (self) -> List['Atom']: 

423 # If there is not strucutre yet it means the chain is beeing set before the structure 

424 # In this case it is not possible to get related atoms in the structure 

425 if not self.structure: 

426 return [] 

427 # Get atoms in the structure according to atom indices 

428 atoms = self.structure.atoms 

429 return [ atoms[atom_index] for atom_index in self.atom_indices ] 

430 def set_atoms (self, new_atoms : List['Atom']): 

431 # Find indices for new atoms and set their indices as the new atom indices 

432 # Note that atoms must be set in the structure already 

433 new_atom_indices = [] 

434 for new_atom in new_atoms: 

435 new_atom_index = new_atom.index 

436 if new_atom_index == None: 

437 raise ValueError('Atom ' + str(new_atom) + ' is not set in the structure') 

438 new_atom_indices.append(new_atom_index) 

439 self.set_atom_indices(new_atom_indices) 

440 atoms = property(get_atoms, set_atoms, None, "The atoms in this residue") 

441 

442 # Get the number of atoms in the residue 

443 def get_atom_count (self) -> int: 

444 return len(self.atom_indices) 

445 atom_count = property(get_atom_count, None, None, "The number of atoms in the residue (read only)") 

446 

447 # Add an atom to the residue 

448 def add_atom (self, new_atom : 'Atom'): 

449 # Insert the new atom index in the list of atom indices keeping the order 

450 new_atom_index = new_atom.index 

451 sorted_atom_index = bisect(self.atom_indices, new_atom_index) 

452 self.atom_indices.insert(sorted_atom_index, new_atom_index) 

453 # Update the atom internal index 

454 new_atom._residue_index = self.index 

455 

456 # Remove an atom from the residue 

457 def remove_atom (self, current_atom : 'Atom'): 

458 # Remove the current atom index from the atom indices list 

459 self.atom_indices.remove(current_atom.index) # This index MUST be in the list 

460 # If we removed the last atom then this residue must be removed from its structure 

461 if len(self.atom_indices) == 0 and self.structure: 

462 self.structure.purge_residue(self) 

463 # Update the atom internal index 

464 current_atom._residue_index = None 

465 

466 # The residue chain index according to parent structure chains 

467 # If chain index is set then make changes in all the structure to make this change coherent 

468 def get_chain_index (self) -> int: 

469 return self._chain_index 

470 def set_chain_index (self, new_chain_index : int): 

471 # If the new chain index is the current chain index do nothing 

472 # WARNING: It is important to stop this here or it could delete a chain which is not to be deleted 

473 if new_chain_index == self.chain_index: 

474 return 

475 # If there is not strucutre yet it means the chain is beeing set before the structure 

476 # We just save the chain index and wait for the structure to be set 

477 if not self.structure: 

478 self._chain_index = new_chain_index 

479 return 

480 # Relational indices are updated through a top-down hierarchy 

481 # Affected chains are the ones to update this residue internal chain index 

482 # WARNING: It is critical to find the new chain before removing/adding residues 

483 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged 

484 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed 

485 new_chain = self.structure.chains[new_chain_index] 

486 if self.chain: 

487 self.chain.remove_residue(self) 

488 new_chain.add_residue(self) 

489 chain_index = property(get_chain_index, set_chain_index, None, "The residue chain index according to parent structure chains") 

490 

491 # The residue chain 

492 # If chain is set then make changes in all the structure to make this change coherent 

493 def get_chain (self) -> Optional['Chain']: 

494 # If there is not strucutre yet then it means the residue is set before the structure 

495 # In this case it is not possible to get related chain in the structure 

496 if not self.structure or self.chain_index == None: 

497 return None 

498 # Get the chain in the structure according to the chain index 

499 return self.structure.chains[self.chain_index] 

500 def set_chain (self, new_chain : Union['Chain', str]): 

501 # In case the chain is just a string we must find/create the corresponding chain 

502 if type(new_chain) == str: 

503 letter = new_chain 

504 # Get the residue structure 

505 structure = self.structure 

506 if not structure: 

507 raise ValueError(f'Cannot find the corresponding {new_chain} chain without the structure') 

508 # Find if the letter belongs to an already existing chain 

509 new_chain = structure.get_chain_by_name(letter) 

510 # If the chain does not exist yet then create it 

511 if not new_chain: 

512 new_chain = Chain(name=letter) 

513 structure.set_new_chain(new_chain) 

514 # Find the new chain index and set it as the residue chain index 

515 # Note that the chain must be set in the structure already 

516 new_chain_index = new_chain.index 

517 if new_chain_index == None: 

518 raise ValueError(f'Chain {new_chain} is not set in the structure') 

519 self.set_chain_index(new_chain_index) 

520 chain = property(get_chain, set_chain, None, "The residue chain") 

521 

522 # Get atom indices from atoms bonded to this residue 

523 def get_bonded_atom_indices (self) -> List[int]: 

524 # Get all bonds among residue atoms 

525 all_bonds = [] 

526 for atom in self.atoms: 

527 all_bonds += atom.bonds 

528 # Remove self atom indices from the list 

529 external_bonds = set(all_bonds) - set(self.atom_indices) 

530 return list(external_bonds) 

531 

532 def get_bonded_atoms (self) -> List['Atom']: 

533 """Get atoms bonded to this residue.""" 

534 return [ self.structure.atoms[atom_index] for atom_index in self.get_bonded_atom_indices() ] 

535 

536 def get_bonded_residue_indices (self) -> List[int]: 

537 """Get residue indices from residues bonded to this residue.""" 

538 return list(set([ atom.residue_index for atom in self.get_bonded_atoms() ])) 

539 

540 def get_bonded_residues (self) -> List['Residue']: 

541 """Get residues bonded to this residue.""" 

542 return [ self.structure.residues[residue_index] for residue_index in self.get_bonded_residue_indices() ] 

543 

544 def is_bonded_with_residue (self, other : 'Residue') -> bool: 

545 """Given another residue, check if it is bonded with this residue.""" 

546 bonded_atom_indices = set(self.get_bonded_atom_indices()) 

547 if next((index for index in other.atom_indices if index in bonded_atom_indices), None) != None: return True 

548 return False 

549 

550 def get_classification (self) -> str: 

551 """ 

552 Get the residue biochemistry classification. 

553 

554 WARNING: Note that this logic will not work in a structure without hydrogens. 

555 

556 Available classifications: 

557 - protein 

558 - dna 

559 - rna 

560 - carbohydrate 

561 - fatty 

562 - steroid 

563 - ion 

564 - solvent 

565 - acetyl 

566 - amide 

567 - other 

568 """ 

569 # Return the internal value, if any 

570 if self._classification: 

571 return self._classification 

572 # If this is a CG residue then we can not classify it 

573 if self.is_cg(): return self.get_classification_by_name() 

574 # If we dont have a value then we must classify the residue 

575 # ------------------------------------------------------------------------------------------------------- 

576 # Ions are single atom residues 

577 if self.atom_count == 1: 

578 self._classification = 'ion' 

579 return self._classification 

580 # ------------------------------------------------------------------------------------------------------- 

581 # At this point we need atoms to have elements fixed 

582 # WARNING: missing elements would result in silent failure to recognize some classifications 

583 if self.structure._fixed_atom_elements == False: 

584 self.structure.fix_atom_elements() 

585 # Solvent is water molecules 

586 # First rely on the residue name 

587 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES: 

588 self._classification = 'solvent' 

589 return self._classification 

590 # It may be water with a not known name 

591 # Literally check if its a molecule with 3 atoms: 2 hydrogens and 1 oxygen 

592 if self.atom_count == 3: 

593 atom_elements = [ atom.element for atom in self.atoms ] 

594 if atom_elements.count('H') == 2 and atom_elements.count('O') == 1: 

595 self._classification = 'solvent' 

596 return self._classification 

597 # ------------------------------------------------------------------------------------------------------- 

598 # Protein definition according to vmd: 

599 # a residue with atoms named C, N, CA, and O 

600 # In our case we accept OC1 and OC2 or OT1 and OT2 instead of O for terminal residues 

601 atom_names = set([ atom.name for atom in self.atoms ]) 

602 if ( all((name in atom_names) for name in ['C', 'N', 'CA']) and ( 

603 'O' in atom_names or { 'OC1', 'OC2' } <= atom_names or { 'OT1', 'OT2' } <= atom_names 

604 )): 

605 self._classification = 'protein' 

606 return self._classification 

607 # ------------------------------------------------------------------------------------------------------- 

608 # Nucleic acids definition according to vmd: 

609 # a residue with atoms named P, O1P, O2P and either O3’, C3’, C4’, C5’, O5’ or O3*, C3*, C4*, C5*, O5* 

610 # Apparently it has been fixed so now a residue does not need to be phosphorylated to be considered nucleic 

611 # LORE: This included the condition "all((name in atom_names) for name in ['P', 'OP1', 'OP2'])" 

612 if (( all((name in atom_names) for name in ["O3'", "C3'", "C4'", "C5'", "O5'"]) or 

613 all((name in atom_names) for name in ["O3*", "C3*", "C4*", "C5*", "O5*"]) )): 

614 # At this point we know it is nucleic 

615 # We must tell the difference between DNA and RNA 

616 if "O2'" in atom_names or "O2*" in atom_names: 

617 self._classification = 'rna' 

618 else: 

619 self._classification = 'dna' 

620 return self._classification 

621 # ------------------------------------------------------------------------------------------------------- 

622 # To define carbohydrates search for rings made of 1 oxygen and 'n' carbons 

623 # WARNING: This logic may fail for some very specific molecules such as furine 

624 # LIMITATION: This logic only aims for cyclical carbohydrates. The linear form of carbohydrates is not yet consider 

625 rings = list(self.find_rings()) 

626 for ring in rings: 

627 ring_elements = [ atom.element for atom in ring ] 

628 # Check the ring has 1 oxygen 

629 oxygen_count = ring_elements.count('O') 

630 if oxygen_count != 1: 

631 continue 

632 # Check the rest are only carbon 

633 carbon_count = ring_elements.count('C') 

634 if carbon_count != len(ring) - 1: 

635 continue 

636 self._classification = 'carbohydrate' 

637 return self._classification 

638 # ------------------------------------------------------------------------------------------------------- 

639 # To define steroids search for 3 6-carbon rings and 1 5-carbon ring (at least) 

640 # WARNING: According to this logic some exotical non-steroid molecules may result in false positives 

641 num_6_carbon_rings = 0 

642 num_5_carbon_rings = 0 

643 for ring in rings: 

644 ring_elements = [ atom.element for atom in ring ] 

645 if any(element != 'C' for element in ring_elements): 

646 continue 

647 ring_lenght = len(ring) 

648 if ring_lenght == 5: 

649 num_5_carbon_rings += 1 

650 if ring_lenght == 6: 

651 num_6_carbon_rings += 1 

652 if num_6_carbon_rings >= 3 and num_5_carbon_rings >= 1: 

653 self._classification = 'steroid' 

654 return self._classification 

655 # ------------------------------------------------------------------------------------------------------- 

656 # To define fatty acids search for a series of carbon atoms connected together (3 at least) 

657 # These carbon atoms must be connected only to hydrogen in addition to 1-2 carbon 

658 # These carbons must not be bonded in a cycle so check atoms not be repeated in the series 

659 bonded_fatty_atoms = [] 

660 def get_bonded_fatty_atoms_recursive (current_atom : Atom, previous_atom : Optional[Atom] = None) -> bool: 

661 # Iterate over the input atom bonded atoms 

662 for bonded_atom in current_atom.get_bonded_atoms(): 

663 # Discard the previous atom to avoid an infinite loop 

664 if bonded_atom == previous_atom: 

665 continue 

666 # If we find an atom which is already in the fatty atoms list then it means we are cycling 

667 if bonded_atom in bonded_fatty_atoms: 

668 return False 

669 # If this is not a fatty atom then discard it 

670 if not bonded_atom.is_fatty_candidate(): 

671 continue 

672 # Add the current bonded fatty atom to the list 

673 bonded_fatty_atoms.append(bonded_atom) 

674 # Find more bonded fatty atoms and add them to list as well 

675 if get_bonded_fatty_atoms_recursive(current_atom=bonded_atom, previous_atom=current_atom) == False: 

676 return False 

677 return True 

678 # Now check all atoms and try to set the series until we find one which works 

679 already_checked_atoms = [] 

680 for atom in self.atoms: 

681 # If we already checked this atom trough the recursive logic then skip it 

682 if atom in already_checked_atoms: 

683 continue 

684 # If the atom is not a fatty candidate then skip it 

685 if not atom.is_fatty_candidate(): 

686 continue 

687 # Now that we have found a suitable candidate atom we start the series 

688 bonded_fatty_atoms = [atom] 

689 if get_bonded_fatty_atoms_recursive(atom) and len(bonded_fatty_atoms) >= 3: 

690 self._classification = 'fatty' 

691 return self._classification 

692 already_checked_atoms += bonded_fatty_atoms 

693 # ------------------------------------------------------------------------------------------------------- 

694 # Check if it is an acetylation capping terminal 

695 if self.name == 'ACE': 

696 self._classification = 'acetyl' 

697 return self._classification 

698 # ------------------------------------------------------------------------------------------------------- 

699 # Check if it is an N-methyl amide capping terminal 

700 if self.name == 'NME': 

701 self._classification = 'amide' 

702 return self._classification 

703 # ------------------------------------------------------------------------------------------------------- 

704 self._classification = 'other' 

705 return self._classification 

706 

707 # Set an alternative function to "try" to classify the residues according only to its name 

708 # This is useful for corase grain residues whose atoms may not reflect the real atoms 

709 # WARNING: This logic is very limited and will return "unknown" most of the times 

710 # WARNING: This logic relies mostly in atom names, which may be not standard 

711 def get_classification_by_name (self) -> str: 

712 if self.name in PROTEIN_RESIDUE_NAME_LETTERS: 

713 return 'protein' 

714 if self.name in DNA_RESIDUE_NAME_LETTERS: 

715 return 'dna' 

716 if self.name in RNA_RESIDUE_NAME_LETTERS: 

717 return 'rna' 

718 if self.name in NUCLEIC_RESIDUE_NAME_LETTERS: 

719 return 'nucleic' 

720 if self.name in FATTY_RESIDUE_NAMES: 

721 return 'fatty' 

722 if self.name in STEROID_RESIDUE_NAMES: 

723 return 'steroid' 

724 if self.name in STANDARD_COUNTER_ION_ATOM_NAMES: 

725 return 'ion' 

726 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES: 

727 return 'solvent' 

728 # If we do not know what it is 

729 return 'unknown' 

730 

731 # The residue biochemistry classification (read only) 

732 classification = property(get_classification, None, None) 

733 

734 # Generate a selection for this residue 

735 def get_selection (self) -> 'Selection': 

736 return Selection(self.atom_indices) 

737 

738 # Make a copy of the current residue 

739 def copy (self) -> 'Residue': 

740 residue_copy = Residue(self.name, self.number, self.icode) 

741 residue_copy._structure = self._structure 

742 residue_copy._index = self._index 

743 residue_copy._atom_indices = self._atom_indices 

744 residue_copy._chain_index = self._chain_index 

745 return residue_copy 

746 

747 # Get the residue equivalent single letter code 

748 # Note that this makes sense for aminoacids and nucleotides only 

749 # Non recognized residue names return 'X' 

750 def get_letter (self) -> str: 

751 return residue_name_to_letter(self.name) 

752 

753 # Get the formula of the residue 

754 def get_formula (self) -> str: 

755 elements = [ atom.element for atom in self.atoms ] 

756 unique_elements = list(set(elements)) 

757 formula = '' 

758 for unique_element in unique_elements: 

759 count = elements.count(unique_element) 

760 number = get_lower_numbers(str(count)) if count > 1 else '' 

761 formula += unique_element + number 

762 return formula 

763 

764 # Find rings in the residue 

765 def find_rings (self) -> Generator[ List[Atom], None, None ]: 

766 return self.structure.find_rings(selection=self.get_selection()) 

767 

768 

769 def split (self, 

770 first_residue_atom_indices : List[int], 

771 second_residue_atom_indices : List[int], 

772 first_residue_name : Optional[str] = None, 

773 second_residue_name : Optional[str] = None, 

774 first_residue_number : Optional[int] = None, 

775 second_residue_number : Optional[int] = None, 

776 first_residue_icode : Optional[str] = None, 

777 second_residue_icode : Optional[str] = None, 

778 ) -> Tuple['Residue', 'Residue']: 

779 """ 

780 Split this residue in 2 residues and return them in a tuple. 

781 Keep things coherent in the structure (renumerate all residues below this one). 

782 Note that all residue atoms must be covered by the splits. 

783 """ 

784 # This function is expected to be called in a residue with an already set structure 

785 if not self.structure: 

786 raise InputError('The split function should be called when the residue has an already defined structure') 

787 # Make sure all atoms in the residue are covered between both the first and the second residue 

788 if set(self.atom_indices) != set(first_residue_atom_indices + second_residue_atom_indices): 

789 print('Residue atoms: ' + ', '.join([ str(v) for v in set(self.atom_indices) ])) 

790 print('Covered atoms: ' + ', '.join([ str(v) for v in set(first_residue_atom_indices + second_residue_atom_indices) ])) 

791 raise InputError('All atom indices must be covered between both the first and the second residue') 

792 # Reuse this first residue instead of creating a new one 

793 if first_residue_name: 

794 self.name = first_residue_name 

795 if first_residue_number: 

796 self.number = first_residue_number 

797 if first_residue_icode: 

798 self.icode = first_residue_icode 

799 # Set the new second residue 

800 _second_residue_name = second_residue_name if second_residue_name else self.name 

801 _second_residue_number = second_residue_number if second_residue_number else self.number 

802 _second_residue_icode = second_residue_icode if second_residue_icode else get_next_letter(self.icode) 

803 second_residue = Residue(_second_residue_name, _second_residue_number, _second_residue_icode) 

804 second_residue._structure = self.structure 

805 new_residue_index = self.index + 1 

806 second_residue._index = new_residue_index 

807 # Insert the second residue in the structure residues list right after this residue 

808 self.structure.residues.insert(new_residue_index, second_residue) 

809 # Set the second residue index 

810 # Update the index of all residues which have been shifted after the insertion 

811 for residue_index in range(new_residue_index + 1, len(self.structure.residues)): 

812 residue = self.structure.residues[residue_index] 

813 residue.index = residue_index 

814 # Now transfer atoms from residue 1 to residue 2 as it is specified 

815 # Note that this will automatically update every atom 

816 second_residue.atom_indices = second_residue_atom_indices 

817 # Now add the new residue to the chain 

818 self.chain.add_residue(second_residue) 

819 

820 # Parse atom names to atom indices and then call the split function 

821 def split_by_atom_names (self, 

822 first_residue_atom_names : List[str], 

823 second_residue_atom_names : List[str], 

824 first_residue_name : Optional[str] = None, 

825 second_residue_name : Optional[str] = None, 

826 first_residue_number : Optional[int] = None, 

827 second_residue_number : Optional[int] = None, 

828 first_residue_icode : Optional[str] = None, 

829 second_residue_icode : Optional[str] = None, 

830 ) -> Tuple['Residue', 'Residue']: 

831 # Check all atom names to exist in the residue 

832 input_atom_names = set(first_residue_atom_names + second_residue_atom_names) 

833 residue_atom_names = set([ atom.name for atom in self.atoms ]) 

834 if input_atom_names != residue_atom_names: 

835 print(self) 

836 print(self.atoms) 

837 print('Input atom names: ' + ', '.join(input_atom_names)) 

838 print('Residue atom names: ' + ', '.join(residue_atom_names)) 

839 raise InputError('All residue atoms must be covered between both the first and the second residue atom names') 

840 # Convert atom names to atom indices 

841 first_residue_atom_indices = [ self.get_atom_by_name(name).index for name in first_residue_atom_names ] 

842 second_residue_atom_indices = [ self.get_atom_by_name(name).index for name in second_residue_atom_names ] 

843 # Call the actual split logic 

844 return self.split(first_residue_atom_indices, second_residue_atom_indices, first_residue_name, 

845 second_residue_name, first_residue_number, second_residue_number, first_residue_icode, second_residue_icode) 

846 

847 # Get a residue atom given its name 

848 def get_atom_by_name (self, atom_name : str) -> 'Atom': 

849 return next(( atom for atom in self.atoms if atom.name == atom_name ), None) 

850 

851 # Get a standard label 

852 def get_label (self) -> str: 

853 chainname = self.chain.name if self.chain.name.strip() else '' 

854 return f'{chainname}{self.number}{self.icode}({self.name})' 

855 # The residue standard label (read only) 

856 label = property(get_label, None, None) 

857 

858 # Ask if the current residue is in coarse grain 

859 # Note that we asome there may be not hybrid aa/cg residues 

860 def is_cg (self) -> bool: 

861 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms) 

862 

863# A chain 

864class Chain: 

865 def __init__ (self, 

866 name : Optional[str] = None, 

867 classification : Optional[str] = None, 

868 ): 

869 self.name = name 

870 self._classification = classification 

871 # Set variables to store references to other related instaces 

872 # These variables will be set further by the structure 

873 self._structure = None 

874 self._index = None 

875 self._residue_indices = [] 

876 

877 def __repr__ (self): 

878 return '<Chain ' + self.name + '>' 

879 

880 def __eq__ (self, other): 

881 return self.name == other.name 

882 

883 def __hash__ (self): 

884 return hash(self.name) 

885 

886 # The parent structure (read only) 

887 # This value is set by the structure itself 

888 def get_structure (self) -> Optional['Structure']: 

889 return self._structure 

890 structure = property(get_structure, None, None, "The parent structure (read only)") 

891 

892 # The residue index according to parent structure residues (read only) 

893 # This value is set by the structure itself 

894 def get_index (self) -> Optional[int]: 

895 return self._index 

896 # When the index is set all residues are updated with the next chain index 

897 def set_index (self, index : int): 

898 for residue in self.residues: 

899 residue._chain_index = index 

900 self._index = index 

901 index = property(get_index, set_index, None, "The residue index according to parent structure residues (read only)") 

902 

903 # The residue indices according to parent structure residues for residues in this chain 

904 # If residue indices are set then make changes in all the structure to make this change coherent 

905 def get_residue_indices (self) -> List[int]: 

906 return self._residue_indices 

907 def set_residue_indices (self, new_residue_indices : List[int]): 

908 # If there is not strucutre yet it means the chain is beeing set before the structure 

909 # We just save residue indices and wait for the structure to be set 

910 if not self.structure: 

911 self._residue_indices = new_residue_indices 

912 return 

913 # Update the current residues 

914 for residue in self.residues: 

915 residue._chain_index = None 

916 # Update the new residues 

917 for index in new_residue_indices: 

918 residue = self.structure.residues[index] 

919 residue._chain_index = self.index 

920 # In case the new residue indices list is empty this chain must be removed from its structure 

921 if len(new_residue_indices) == 0: 

922 self.structure.purge_chain(self) 

923 # Now new indices are coherent and thus we can save them 

924 self._residue_indices = new_residue_indices 

925 residue_indices = property(get_residue_indices, set_residue_indices, None, "The residue indices according to parent structure residues for residues in this residue") 

926 

927 # The residues in this chain 

928 # If residues are set then make changes in all the structure to make this change coherent 

929 def get_residues (self) -> List['Residue']: 

930 # If there is not strucutre yet it means the chain is beeing set before the structure 

931 # In this case it is not possible to get related residues in the structure 

932 if not self.structure: 

933 return [] 

934 # Get residues in the structure according to residue indices 

935 residues = self.structure.residues 

936 return [ residues[residue_index] for residue_index in self.residue_indices ] 

937 

938 def set_residues (self, new_residues : List['Residue']): 

939 """Find indices for new residues and set their indices as the new residue indices. 

940 Note that residues must be set in the structure already.""" 

941 new_residue_indices = [] 

942 for new_residue in new_residues: 

943 new_residue_index = new_residue.index 

944 if new_residue_index == None: 

945 raise ValueError(f'Residue {new_residue} is not set in the structure') 

946 new_residue_indices.append(new_residue_index) 

947 self.set_residue_indices(new_residue_indices) 

948 residues = property(get_residues, set_residues, None, "The residues in this chain") 

949 

950 # Add a residue to the chain 

951 def add_residue (self, residue : 'Residue'): 

952 # Insert the new residue index in the list of residue indices keeping the order 

953 sorted_residue_index = bisect(self.residue_indices, residue.index) 

954 self.residue_indices.insert(sorted_residue_index, residue.index) 

955 # Update the residue internal chain index 

956 residue._chain_index = self.index 

957 

958 # Remove a residue from the chain 

959 # WARNING: Note that this function does not trigger the set_residue_indices 

960 def remove_residue (self, residue : 'Residue'): 

961 self.residue_indices.remove(residue.index) # This index MUST be in the list 

962 # If we removed the last residue then this chain must be removed from its structure 

963 if len(self.residue_indices) == 0 and self.structure: 

964 self.structure.purge_chain(self) 

965 # Update the residue internal chain index 

966 residue._chain_index = None 

967 

968 # Atom indices for all atoms in the chain (read only) 

969 # In order to change atom indices they must be changed in their corresponding residues 

970 def get_atom_indices (self) -> List[int]: 

971 return sum([ residue.atom_indices for residue in self.residues ], []) 

972 atom_indices = property(get_atom_indices, None, None, "Atom indices for all atoms in the chain (read only)") 

973 

974 # Atoms in the chain (read only) 

975 # In order to change atoms they must be changed in their corresponding residues 

976 def get_atoms (self) -> List[int]: 

977 return sum([ residue.atoms for residue in self.residues ], []) 

978 atoms = property(get_atoms, None, None, "Atoms in the chain (read only)") 

979 

980 # Get the number of atoms in the chain (read only) 

981 def get_atom_count (self) -> int: 

982 return len(self.atom_indices) 

983 atom_count = property(get_atom_count, None, None, "Number of atoms in the chain (read only)") 

984 

985 # Get the number of residues in the chain (read only) 

986 def get_residue_count (self) -> int: 

987 return len(self._residue_indices) 

988 residue_count = property(get_residue_count, None, None, "Number of residues in the chain (read only)") 

989 

990 # Get the chain classification 

991 def get_classification (self) -> str: 

992 if self._classification: 

993 return self._classification 

994 self._classification = self.structure.get_selection_classification(self.get_selection()) 

995 return self._classification 

996 

997 # Force the chain classification 

998 def set_classification (self, classification : str): 

999 self._classification = classification 

1000 

1001 # Chain classification 

1002 classification = property(get_classification, set_classification, None, "Classification of the chain (manual or automatic)") 

1003 

1004 # Get the residues sequence in one-letter code 

1005 def get_sequence (self) -> str: 

1006 return ''.join([ residue_name_to_letter(residue.name) for residue in self.residues ]) 

1007 

1008 # Generate a selection for this chain 

1009 def get_selection (self) -> 'Selection': 

1010 return Selection(self.atom_indices) 

1011 

1012 # Make a copy of the current chain 

1013 def copy (self) -> 'Chain': 

1014 chain_copy = Chain(self.name) 

1015 chain_copy._structure = self._structure 

1016 chain_copy._index = self._index 

1017 chain_copy.residue_indices = self.residue_indices 

1018 return chain_copy 

1019 

1020 # Ask if the current chain has at least one coarse grain atom/residue 

1021 def has_cg (self) -> bool: 

1022 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms) 

1023 

1024 # Find a residue by its number and insertion code 

1025 def find_residue (self, number : int, icode : str = '') -> Optional['Residue']: 

1026 # Iterate chain residues 

1027 for residue in self.residues: 

1028 if residue.number == number and residue.icode == icode: 

1029 return residue 

1030 return None 

1031 

1032# A structure is a group of atoms organized in chains and residues 

1033class Structure: 

1034 def __init__ (self, 

1035 atoms : List['Atom'] = [], 

1036 residues : List['Residue'] = [], 

1037 chains : List['Chain'] = [], 

1038 residue_numeration_base : int = 10, 

1039 ): 

1040 self.atoms = [] 

1041 self.residues = [] 

1042 self.chains = [] 

1043 # Set references between instances 

1044 for atom in atoms: 

1045 self.set_new_atom(atom) 

1046 for residue in residues: 

1047 self.set_new_residue(residue) 

1048 for chain in chains: 

1049 self.set_new_chain(chain) 

1050 # --- Set other internal variables --- 

1051 # Set bonds between atoms 

1052 self._bonds = None 

1053 # Set fragments of bonded atoms 

1054 self._fragments = None 

1055 # --- Set other auxiliar variables --- 

1056 # Trajectory atom sorter is a function used to sort coordinates in a trajectory file 

1057 # This function is generated when sorting indices in the structure 

1058 # Also the new atom order is saved 

1059 self.trajectory_atom_sorter = None 

1060 self.new_atom_order = None 

1061 # Track when atom elements have been fixed 

1062 self._fixed_atom_elements = False 

1063 # Save indices of supported ion atoms 

1064 self._ion_atom_indices = None 

1065 self._dummy_atom_indices = None 

1066 # Set the scale of the residue numbers 

1067 # It may be decimal (10), hexadecimal(16) or alphanumeric(36) 

1068 # Note that hybrid scales (1-9999 decimal and different further) are not explicitly supported 

1069 # However, the scale is guessed on read and conserved on write, so the original numeration would be conserved 

1070 # The default values is used only when the structure is not read from a PDB 

1071 self.residue_numeration_base = residue_numeration_base 

1072 

1073 def __repr__ (self): 

1074 return f'<Structure ({self.atom_count} atoms)>' 

1075 

1076 # The bonds between atoms 

1077 def get_bonds (self) -> List[ List[int] ]: 

1078 # Return the stored value, if exists 

1079 if self._bonds: 

1080 return self._bonds 

1081 # If not, we must calculate the bonds using vmd 

1082 self._bonds = self.get_covalent_bonds() 

1083 return self._bonds 

1084 # Force specific bonds 

1085 def set_bonds (self, bonds : List[ List[int] ]): 

1086 self._bonds = bonds 

1087 # Reset fragments 

1088 self._fragments = None 

1089 bonds = property(get_bonds, set_bonds, None, "The structure bonds") 

1090 

1091 # Get the groups of atoms which are covalently bonded 

1092 def get_fragments (self) -> List['Selection']: 

1093 # Return the stored value, if exists 

1094 if self._fragments != None: 

1095 return self._fragments 

1096 # Otherwise, find fragments in all structure atoms 

1097 self._fragments = list(self.find_fragments()) 

1098 return self._fragments 

1099 # Fragments of covalently bonded atoms (read only) 

1100 fragments = property(get_fragments, None, None, "The structure fragments (read only)") 

1101 

1102 # Find fragments* in a selection of atoms 

1103 # * A fragment is a selection of colvalently bonded atoms 

1104 # All atoms are searched if no selection is provided 

1105 # WARNING: Note that fragments generated from a specific selection may not match the structure fragments 

1106 # A selection including 2 separated regions of a structure fragment will yield 2 fragments 

1107 def find_fragments (self, selection : Optional['Selection'] = None) -> Generator['Selection', None, None]: 

1108 # If there is no selection we consider all atoms 

1109 if not selection: 

1110 selection = self.select_all() 

1111 # Get/Find covalent bonds between atoms in a new object avoid further corruption (deletion) of the original list 

1112 atom_indexed_covalent_bonds = { atom_index: [ *self.bonds[atom_index] ] for atom_index in selection.atom_indices } 

1113 # Group the connected atoms in "fragments" 

1114 while len(atom_indexed_covalent_bonds) > 0: 

1115 start_atom_index, bonds = next(iter(atom_indexed_covalent_bonds.items())) 

1116 del atom_indexed_covalent_bonds[start_atom_index] 

1117 fragment_atom_indices = [ start_atom_index ] 

1118 while len(bonds) > 0: 

1119 # Get the next bond atom and remove it from the bonds list 

1120 next_atom_index = bonds[0] 

1121 bonds.remove(next_atom_index) 

1122 next_bonds = atom_indexed_covalent_bonds.get(next_atom_index, None) 

1123 # If this atom is out of the selection then skip it 

1124 if next_bonds == None: 

1125 continue 

1126 next_new_bonds = [ bond for bond in next_bonds if bond not in fragment_atom_indices + bonds ] 

1127 bonds += next_new_bonds 

1128 fragment_atom_indices.append(next_atom_index) 

1129 del atom_indexed_covalent_bonds[next_atom_index] 

1130 yield Selection(fragment_atom_indices) 

1131 

1132 # Given a selection of atoms, find all whole structure fragments on them 

1133 def find_whole_fragments (self, selection : 'Selection') -> Generator['Selection', None, None]: 

1134 for fragment in self.fragments: 

1135 if selection & fragment: 

1136 yield fragment 

1137 

1138 def name_selection (self, selection : 'Selection') -> str: 

1139 """Name an atom selection depending on the chains it contains. This is used for debug purpouses.""" 

1140 atoms = [ self.atoms[index] for index in selection.atom_indices ] 

1141 # Count atoms per chain 

1142 atom_count_per_chain = { chain: 0 for chain in self.chains } 

1143 for atom in atoms: 

1144 atom_count_per_chain[atom.chain] += 1 

1145 # Set labels accoridng to the coverage of every chain 

1146 chain_labels = [] 

1147 for chain, atom_count in atom_count_per_chain.items(): 

1148 if atom_count == 0: continue 

1149 coverage = atom_count / chain.atom_count 

1150 label = f'chain {chain.name}' 

1151 if coverage < 1: 

1152 percent = round(coverage * 1000) / 10 

1153 label += f' ({percent}%)' 

1154 chain_labels.append(label) 

1155 return ', '.join(chain_labels) 

1156 

1157 # Set a new atom in the structure 

1158 def set_new_atom (self, atom : 'Atom'): 

1159 atom._structure = self 

1160 new_atom_index = self.atom_count 

1161 self.atoms.append(atom) 

1162 atom._index = new_atom_index 

1163 

1164 # Set a new residue in the structure 

1165 # WARNING: Atoms must be set already before setting residues 

1166 def set_new_residue (self, residue : 'Residue'): 

1167 residue._structure = self 

1168 new_residue_index = len(self.residues) 

1169 self.residues.append(residue) 

1170 residue._index = new_residue_index 

1171 # In case the residue has atom indices, set relational indices on each atom 

1172 for atom_index in residue.atom_indices: 

1173 atom = self.atoms[atom_index] 

1174 atom._residue_index = new_residue_index 

1175 

1176 # Purge residue from the structure and its chain 

1177 # This can be done only when the residue has no atoms left in the structure 

1178 # Renumerate all residue indices which have been offsetted as a result of the purge 

1179 def purge_residue (self, residue : 'Residue'): 

1180 # Check the residue can be purged 

1181 if residue not in self.residues: 

1182 raise ValueError(f'{residue} is not in the structure already') 

1183 if len(residue.atom_indices) > 0: 

1184 raise ValueError(f'{residue} is still having atoms and thus it cannot be purged') 

1185 # Remove the purged residue from its chain 

1186 if residue.chain: residue.chain.remove_residue(residue) 

1187 # Get the current index of the residue to be purged 

1188 purged_index = residue.index 

1189 # Residues and their atoms below this index are not to be modified 

1190 # Residues and their atoms over this index must be renumerated 

1191 for affected_residue in self.residues[purged_index+1:]: 

1192 # Chaging the index automatically changes all residues atoms '_residue_index' values 

1193 # Chaging the index automatically changes its corresponding index in residue chain '_residue_indices' 

1194 affected_residue.index -= 1 

1195 # Finally, remove the current residue from the list of residues in the structure 

1196 del self.residues[purged_index] 

1197 

1198 # Set a new chain in the structure 

1199 # WARNING: Residues and atoms must be set already before setting chains 

1200 def set_new_chain (self, chain : 'Chain'): 

1201 chain._structure = self 

1202 new_chain_index = len(self.chains) 

1203 self.chains.append(chain) 

1204 chain._index = new_chain_index 

1205 # In case the chain has residue indices, set relational indices on each residue 

1206 for residue_index in chain.residue_indices: 

1207 residue = self.residues[residue_index] 

1208 residue._chain_index = new_chain_index 

1209 

1210 # Purge chain from the structure 

1211 # This can be done only when the chain has no residues left in the structure 

1212 # Renumerate all chain indices which have been offsetted as a result of the purge 

1213 def purge_chain (self, chain : 'Chain'): 

1214 # Check the chain can be purged 

1215 if chain not in self.chains: 

1216 raise ValueError(f'Chain {chain.name} is not in the structure already') 

1217 if len(chain.residue_indices) > 0: 

1218 raise ValueError(f'Chain {chain.name} is still having residues and thus it cannot be purged') 

1219 # Get the current index of the chain to be purged 

1220 purged_index = chain.index 

1221 # Chains and their residues below this index are not to be modified 

1222 # Chains and their residues over this index must be renumerated 

1223 for affected_chain in self.chains[purged_index+1:]: 

1224 # Chaging the index automatically changes all chain residues '_chain_index' values 

1225 affected_chain.index -= 1 

1226 # Finally, remove the current chain from the list of chains in the structure 

1227 del self.chains[purged_index] 

1228 

1229 # Set the structure from a pdb file 

1230 # You may filter the PDB content for a specific model 

1231 # Some weird numeration systems are not supported and, when encountered, they are ignored 

1232 # In these cases we set our own numeration system 

1233 # Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead 

1234 @classmethod 

1235 def from_pdb (cls, pdb_content : str, model : Optional[int] = None, flexible_numeration : bool = True): 

1236 # Filter the PDB content in case a model was passed 

1237 filtered_pdb_content = filter_model(pdb_content, model) if model else pdb_content 

1238 # Split the PDB content in lines 

1239 pdb_lines = filtered_pdb_content.split('\n') 

1240 # Before we start, we must guess the numeration system 

1241 # To do so mine all residue numbers 

1242 all_residue_number_characters = set() 

1243 for line in pdb_lines: 

1244 # Parse atoms only 

1245 start = line[0:6] 

1246 is_atom = start == 'ATOM ' or start == 'HETATM' 

1247 if not is_atom: continue 

1248 # Mine all resiude numbers 

1249 residue_number = line[22:26] 

1250 for character in residue_number: 

1251 all_residue_number_characters.add(character) 

1252 # Remove white spaces 

1253 all_residue_number_characters.discard(' ') 

1254 # If we find a non-numerical and non-alphabetical character then we assume it has a weird numeration system 

1255 # Since we can not support every scenario, in this cases we set the numeration totally ignoring the one in the PDB 

1256 weird_character = next((character for character in all_residue_number_characters if not (character.isalnum() or character == '-')), None) 

1257 if weird_character: 

1258 if flexible_numeration == False: raise InputError(f'Not supported numeration system including "{weird_character}" characters') 

1259 warn(f'Weird residue numeration including "{weird_character}" characters found in the PDB file. Ignoring it.') 

1260 residue_numeration_base = None 

1261 # Search among all resiude numbers any letters (non-numerical characters) 

1262 elif next((letter for letter in alphanumerical_letters if letter in all_residue_number_characters), None): 

1263 residue_numeration_base = 36 

1264 elif next((letter for letter in hexadecimal_letters if letter in all_residue_number_characters), None): 

1265 residue_numeration_base = 16 

1266 else: 

1267 residue_numeration_base = 10 

1268 # Read the pdb content line by line and set the parsed atoms, residues and chains 

1269 parsed_atoms = [] 

1270 parsed_residues = [] 

1271 parsed_chains = [] 

1272 # Save chains and residues also in dictionaries only to find them faster 

1273 name_chains = {} 

1274 label_residues = {} 

1275 # Keep track of the last issued atom and residue indices 

1276 atom_index = -1 

1277 residue_index = -1 

1278 for line in pdb_lines: 

1279 # Parse atoms only 

1280 start = line[0:6] 

1281 is_atom = start == 'ATOM ' or start == 'HETATM' 

1282 if not is_atom: 

1283 continue 

1284 # Mine all atom data 

1285 atom_name = line[11:16].strip() 

1286 residue_name = line[17:21].strip() 

1287 chain_name = line[21:22] 

1288 residue_number = line[22:26] 

1289 icode = line[26:27] 

1290 if icode == ' ': 

1291 icode = '' 

1292 x_coord = float(line[30:38]) 

1293 y_coord = float(line[38:46]) 

1294 z_coord = float(line[46:54]) 

1295 element = line[76:78].strip() 

1296 # Set the parsed atom 

1297 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord)) 

1298 # Add the parsed atom to the list and update the current atom index 

1299 parsed_atoms.append(parsed_atom) 

1300 # Get the parsed chain 

1301 # DANI: If we always check for an already existing chain there will never be repeated chains 

1302 # DANI: However we may create chains with unconnected atoms silently (they were separated in the PDB) 

1303 # DANI: For this reason we must only consider the last processed chain 

1304 parsed_chain = name_chains.get(chain_name, None) 

1305 if parsed_chain and parsed_chain != parsed_chains[-1]: 

1306 parsed_chain = None 

1307 # If the parsed chain was not yet instantiated then do it now 

1308 if not parsed_chain: 

1309 parsed_chain = Chain(name=chain_name) 

1310 parsed_chains.append(parsed_chain) 

1311 name_chains[chain_name] = parsed_chain 

1312 # Get the parsed residue 

1313 residue_label = (chain_name, residue_number, icode) 

1314 # DANI: If we always check for an already existing residue there will never be repeated residues 

1315 # DANI: However we may create residues with unconnected atoms silently (they were separated in the PDB) 

1316 # DANI: For this reason we must only consider the last processed residue 

1317 parsed_residue = label_residues.get(residue_label, None) 

1318 if parsed_residue and parsed_residue != parsed_residues[-1]: 

1319 parsed_residue = None 

1320 # If the parsed residue was not yet instantiated then do it now 

1321 if not parsed_residue: 

1322 # Parse the residue number if it is to be parsed 

1323 if residue_numeration_base: 

1324 parsed_residue_number = int(residue_number, residue_numeration_base) 

1325 # If we decided to ignore the numeration system then we just issue a new residue number 

1326 # Use the last residue number from the current chain as reference 

1327 else: 

1328 chain_last_residue_index = parsed_chain.residue_indices[-1] if len(parsed_chain.residue_indices) > 0 else None 

1329 chain_last_residue_number = parsed_residues[chain_last_residue_index].number if chain_last_residue_index != None else 0 

1330 parsed_residue_number = chain_last_residue_number + 1 

1331 # Instantiate the new parsed residue 

1332 parsed_residue = Residue(name=residue_name, number=parsed_residue_number, icode=icode) 

1333 parsed_residues.append(parsed_residue) 

1334 label_residues[residue_label] = parsed_residue 

1335 # Add current residue to the parsed chain 

1336 residue_index += 1 

1337 parsed_chain.residue_indices.append(residue_index) 

1338 # Add current atom to the parsed residue 

1339 atom_index += 1 

1340 parsed_residue.atom_indices.append(atom_index) 

1341 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains, residue_numeration_base=residue_numeration_base) 

1342 

1343 # Set the structure from a pdb file 

1344 # You may filter the input PDB file for a specific model 

1345 # Some weird numeration systems are not supported and, when encountered, they are ignored 

1346 # In these cases we set our own numeration system 

1347 # Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead 

1348 @classmethod 

1349 def from_pdb_file (cls, pdb_filepath : str, model : Optional[int] = None, flexible_numeration : bool = True): 

1350 pdb_file = File(pdb_filepath) 

1351 if not pdb_file.exists: 

1352 raise InputError(f'File "{pdb_filepath}" not found') 

1353 if not pdb_file.format == 'pdb': 

1354 raise InputError(f'"{pdb_filepath}" is not a path for a pdb file') 

1355 # Read the pdb file 

1356 pdb_content = None 

1357 with open(pdb_filepath, 'r') as file: 

1358 pdb_content = file.read() 

1359 return cls.from_pdb(pdb_content, model, flexible_numeration) 

1360 

1361 # Set the structure from mmcif 

1362 # https://biopandas.github.io/biopandas/tutorials/Working_with_mmCIF_Structures_in_DataFrames/ 

1363 # You may filter the content for a specific model 

1364 # You may ask for the author notation instead of the standarized notation for legacy reasons 

1365 # This may have an effect in atom names, residue names, residue numbers and chain names 

1366 @classmethod 

1367 def from_mmcif (cls, mmcif_content : str, model : Optional[int] = None, author_notation : bool = False): 

1368 # Read the pdb content line by line and set the parsed atoms, residues and chains 

1369 parsed_atoms = [] 

1370 parsed_residues = [] 

1371 parsed_chains = [] 

1372 # Save chains and residues also in dictionaries only to find them faster 

1373 name_chains = {} 

1374 label_residues = {} 

1375 # Keep track of the last issued atom and residue indices 

1376 atom_index = -1 

1377 residue_index = -1 

1378 # Iterate the content line by line 

1379 lines = iter(mmcif_content.split('\n')) 

1380 # Now mine atoms data 

1381 atom_headers = { 'ATOM', 'HETATM', 'ANISOU' } 

1382 for line in lines: 

1383 # Values are separated by spaces 

1384 values = line.split() 

1385 # If this is not atom line then we are done 

1386 if len(values) == 0 or values[0] not in atom_headers: continue 

1387 # Get the atom model number if a model number was passed 

1388 # Note that then model is a number greater than 0 

1389 if model: 

1390 model_number = int(values[20]) 

1391 # If the model number odes not match then skip it 

1392 if model != model_number: continue 

1393 # Mine atom data 

1394 # Next value is just the atom number, we do not need it 

1395 # atom_number = values[1] 

1396 # Mine the atom element 

1397 element = values[2] 

1398 # Mine the atom name 

1399 atom_name = values[3].replace('"','') 

1400 # Next value is a place holder to indicate alternate conformation, according to the docs 

1401 # I don't know what this is but we do not need it 

1402 # idk = values[4] 

1403 # Mine the residue name 

1404 residue_name = values[5] 

1405 # Mine the chain name 

1406 chain_name = values[6] 

1407 # Next value is the chain number, we do not need it 

1408 # chain_number = values[7] 

1409 # Residue number is '.' for chains with only one residue 

1410 residue_number = 1 if values[8] == '.' else int(values[8]) 

1411 icode = '' if values[9] == '?' else values[9] 

1412 x_coord = float(values[10]) 

1413 y_coord = float(values[11]) 

1414 z_coord = float(values[12]) 

1415 # Next value is the occupancy, we do not need it 

1416 # occupancy = float(values[13]) 

1417 # Next value is the isotropic displacement, we do not need it 

1418 # isotropic = float(values[14]) 

1419 # Next value is the charge, we do not need it 

1420 # charge = None if values[15] == '?' else float(values[15]) 

1421 # The rest of values are alternative author values 

1422 if author_notation: 

1423 residue_number = int(values[16]) 

1424 residue_name = values[17] 

1425 chain_name = values[18] 

1426 atom_name = values[19].replace('"','') 

1427 # Set the parsed atom 

1428 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord)) 

1429 # Add the parsed atom to the list and update the current atom index 

1430 parsed_atoms.append(parsed_atom) 

1431 # Get the parsed chain 

1432 parsed_chain = name_chains.get(chain_name, None) 

1433 # If the parsed chain was not yet instantiated then do it now 

1434 if not parsed_chain: 

1435 parsed_chain = Chain(name=chain_name) 

1436 parsed_chains.append(parsed_chain) 

1437 name_chains[chain_name] = parsed_chain 

1438 # Get the parsed residue 

1439 residue_label = (chain_name, residue_number, icode) 

1440 parsed_residue = label_residues.get(residue_label, None) 

1441 # If the parsed residue was not yet instantiated then do it now 

1442 if not parsed_residue: 

1443 # Instantiate the new parsed residue 

1444 parsed_residue = Residue(name=residue_name, number=residue_number, icode=icode) 

1445 parsed_residues.append(parsed_residue) 

1446 label_residues[residue_label] = parsed_residue 

1447 # Add current residue to the parsed chain 

1448 residue_index += 1 

1449 parsed_chain.residue_indices.append(residue_index) 

1450 # Add current atom to the parsed residue 

1451 atom_index += 1 

1452 parsed_residue.atom_indices.append(atom_index) 

1453 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains) 

1454 

1455 # Set the structure from a mmcif file 

1456 @classmethod 

1457 def from_mmcif_file (cls, mmcif_filepath : str, model : Optional[int] = None, author_notation : bool = False): 

1458 mmcif_file = File(mmcif_filepath) 

1459 if not mmcif_file.exists: 

1460 raise InputError(f'File "{mmcif_filepath}" not found') 

1461 if not mmcif_file.format == 'cif': 

1462 raise InputError(f'"{mmcif_filepath}" is not a path for a mmcif file') 

1463 # Read the mmcif file 

1464 mmcif_content = None 

1465 with open(mmcif_filepath, 'r') as file: 

1466 mmcif_content = file.read() 

1467 return cls.from_mmcif(mmcif_content, model, author_notation) 

1468 

1469 # Set the structure from an MD analysis object 

1470 @classmethod 

1471 def from_mdanalysis (cls, mdanalysis_universe): 

1472 # Set the final list of atoms to be included in the structure 

1473 parsed_atoms = [] 

1474 parsed_residues = [] 

1475 parsed_chains = [] 

1476 # Setup atoms 

1477 for atom in mdanalysis_universe.atoms: 

1478 name = atom.name 

1479 # WARNING: MDAnalysis cannot handle reading an element of an atom with no element 

1480 element = atom.element if hasattr(atom, 'element') else '?' 

1481 #coords = atom.position # DANI: Esto da error si no hay coordenadas 

1482 parsed_atom = Atom(name=name, element=element) 

1483 parsed_atoms.append(parsed_atom) 

1484 # Setup residues 

1485 for residue in mdanalysis_universe.residues: 

1486 name = residue.resname 

1487 number = residue.resnum 

1488 icode = residue.icode if hasattr(residue, 'icode') else None # DANI: No se ha provado 

1489 parsed_residue = Residue(name=name, number=number, icode=icode) 

1490 atom_indices = [ atom.index for atom in residue.atoms ] 

1491 parsed_residue.atom_indices = atom_indices 

1492 parsed_residues.append(parsed_residue) 

1493 # Setup chains 

1494 for segment in mdanalysis_universe.segments: 

1495 name = segment.segid 

1496 parsed_chain = Chain(name=name) 

1497 residue_indices = [ residue.resindex for residue in segment.residues ] 

1498 parsed_chain.residue_indices = residue_indices 

1499 parsed_chains.append(parsed_chain) 

1500 # Setup the structure 

1501 structure = cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains) 

1502 # Add bonds and charges which are also available in a topology 

1503 atom_count = len(mdanalysis_universe.atoms) 

1504 atom_bonds = [ [] for i in range(atom_count) ] 

1505 for bond in mdanalysis_universe.bonds: 

1506 a,b = bond.indices 

1507 # Make sure atom indices are regular integers so they are JSON serializables 

1508 atom_bonds[a].append(int(b)) 

1509 atom_bonds[b].append(int(a)) 

1510 structure.bonds = atom_bonds 

1511 structure.charges = list(mdanalysis_universe._topology.charges.values) 

1512 return structure 

1513 

1514 # Set the structure from a prmtop file 

1515 @classmethod 

1516 def from_prmtop_file (cls, prmtop_filepath : str): 

1517 # Make sure the input file exists and has the right format 

1518 prmtop_file = File(prmtop_filepath) 

1519 if not prmtop_file.exists: 

1520 raise InputError(f'File "{prmtop_filepath}" not found') 

1521 if not prmtop_file.format == 'prmtop': 

1522 raise InputError(f'"{prmtop_filepath}" is not a name for a prmtop file') 

1523 # In we do not have mdanalysis in our environment then we cannot proceed 

1524 if not is_imported('MDAnalysis'): 

1525 raise InputError('Missing dependency error: MDAnalysis') 

1526 # Parse the topology using MDanalysis 

1527 parser = MDAnalysis.topology.TOPParser.TOPParser(prmtop_filepath) 

1528 mdanalysis_topology = parser.parse() 

1529 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology) 

1530 return cls.from_mdanalysis(mdanalysis_universe) 

1531 

1532 # Set the structure from a tpr file 

1533 @classmethod 

1534 def from_tpr_file (cls, tpr_filepath : str): 

1535 # Make sure the input file exists and has the right format 

1536 tpr_file = File(tpr_filepath) 

1537 if not tpr_file.exists: 

1538 raise InputError(f'File "{tpr_filepath}" not found') 

1539 if not tpr_file.format == 'tpr': 

1540 raise InputError(f'"{tpr_filepath}" is not a name for a tpr file') 

1541 # In we do not have mdanalysis in our environment then we cannot proceed 

1542 if not is_imported('MDAnalysis'): 

1543 raise InputError('Missing dependency error: MDAnalysis') 

1544 # Parse the topology using MDanalysis 

1545 parser = MDAnalysis.topology.TPRParser.TPRParser(tpr_filepath) 

1546 mdanalysis_topology = parser.parse() 

1547 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology) 

1548 return cls.from_mdanalysis(mdanalysis_universe) 

1549 

1550 # Set the structure from a file if the file format is supported 

1551 @classmethod 

1552 def from_file (cls, mysterious_filepath : str): 

1553 mysterious_file = File(mysterious_filepath) 

1554 if mysterious_file.format == 'pdb': 

1555 return cls.from_pdb_file(mysterious_file.path) 

1556 if mysterious_file.format == 'cif': 

1557 return cls.from_mmcif_file(mysterious_file.path) 

1558 if mysterious_file.format == 'prmtop': 

1559 return cls.from_prmtop_file(mysterious_file.path) 

1560 if mysterious_file.format == 'tpr': 

1561 return cls.from_tpr_file(mysterious_file.path) 

1562 raise InputError(f'Not supported format ({mysterious_file.format}) to setup a Structure') 

1563 

1564 # Get the number of atoms in the structure 

1565 def get_atom_count (self) -> int: 

1566 return len(self.atoms) 

1567 atom_count = property(get_atom_count, None, None, "The number of atoms in the structure (read only)") 

1568 

1569 # Get the number of residues in the structure (read only) 

1570 def get_residue_count (self) -> int: 

1571 return len(self.residues) 

1572 residue_count = property(get_residue_count, None, None, "Number of residues in the structure (read only)") 

1573 

1574 # Get the number of chains in the structure (read only) 

1575 def get_chain_count (self) -> int: 

1576 return len(self.chains) 

1577 chain_count = property(get_chain_count, None, None, "Number of chains in the structure (read only)") 

1578 

1579 # Fix atom elements by guessing them when missing 

1580 # Set all elements with the first letter upper and the second (if any) lower 

1581 # Also check if atom elements are coherent with atom names 

1582 # If 'trust' is set as False then we impose elements according to what we can guess from the atom name 

1583 # Return True if any element was modified or False if not 

1584 def fix_atom_elements (self, trust : bool = True) -> bool: 

1585 modified = False 

1586 added = False 

1587 # Save the wrong guesses for a final report 

1588 # This way we do not crowd the terminal with logs when a lot of atoms are affected 

1589 reports = {} 

1590 for atom in self.atoms: 

1591 # Make sure elements have the first letter cap and the second letter not cap 

1592 if atom.element: 

1593 new_element = first_cap_only(atom.element) 

1594 if atom.element != new_element: 

1595 atom.element = new_element 

1596 modified = True 

1597 # Check the element to match what we would guess from the atom name 

1598 # In case it does not just warn the user 

1599 guess = atom.guess_element() 

1600 if atom.element != guess: 

1601 report = (atom.name, atom.element, guess) 

1602 report_count = reports.get(report, 0) 

1603 reports[report] = report_count + 1 

1604 if not trust: 

1605 atom.element = guess 

1606 modified = True 

1607 # If elements are missing then guess them from atom names 

1608 else: 

1609 atom.element = atom.guess_element() 

1610 added = True 

1611 # Warn the user about anormalies 

1612 for report, count in reports.items(): 

1613 atom_name, atom_element, guess = report 

1614 warn(f"Suspicious element for atom {atom_name}: {atom_element} -> shoudn't it be {guess}? ({count} occurrences)") 

1615 # Warn the user that some elements were modified 

1616 if modified: warn('Atom elements have been modified') 

1617 if added: warn('Atom elements were missing and have been added') 

1618 # Set atom elements as fixed in order to avoid repeating this process 

1619 self._fixed_atom_elements = True 

1620 return modified or added 

1621 

1622 # Set new coordinates 

1623 def set_new_coordinates (self, new_coordinates : List[Coords]): 

1624 # Make sure the list of coordinates is as long as the number of atoms 

1625 if len(new_coordinates) != self.atom_count: 

1626 raise ValueError(f'The number of coordinates ({len(new_coordinates)}) does not match the number of atoms ({self.atom_count})') 

1627 # Overwrite current coordinates with the new coordinates 

1628 for i, atom in enumerate(self.atoms): 

1629 atom.coords = tuple(new_coordinates[i]) 

1630 

1631 # Get all supported ion atom indices together in a set 

1632 def get_ion_atom_indices (self) -> Set: 

1633 # If we already did this then return the stored value 

1634 if self._ion_atom_indices != None: 

1635 return self._ion_atom_indices 

1636 # Find ion atom indices 

1637 indices = set() 

1638 for atom in self.atoms: 

1639 if atom.element in SUPPORTED_ION_ELEMENTS: 

1640 indices.add(atom.index) 

1641 self._ion_atom_indices = indices 

1642 return self._ion_atom_indices 

1643 ion_atom_indices = property(get_ion_atom_indices, None, None, "Atom indices for what we consider supported ions") 

1644 

1645 # Get all dummy atom indices together in a set 

1646 def get_dummy_atom_indices (self) -> Set: 

1647 # If we already did this then return the stored value 

1648 if self._dummy_atom_indices != None: 

1649 return self._dummy_atom_indices 

1650 # Find ion atom indices 

1651 indices = set() 

1652 for atom in self.atoms: 

1653 if atom.element == DUMMY_ATOM_ELEMENT: 

1654 indices.add(atom.index) 

1655 self._dummy_atom_indices = indices 

1656 return self._dummy_atom_indices 

1657 dummy_atom_indices = property(get_dummy_atom_indices, None, None, "Atom indices for what we consider dummy atoms") 

1658 

1659 # Generate a pdb file content with the current structure 

1660 def generate_pdb (self): 

1661 content = 'REMARK workflow generated pdb file\n' 

1662 for a, atom in enumerate(self.atoms): 

1663 residue = atom.residue 

1664 index = str((a+1) % 100000).rjust(5) 

1665 name = ' ' + atom.name.ljust(3) if len(atom.name) < 4 else atom.name 

1666 residue_name = residue.name.ljust(4) if residue else 'XXX'.ljust(4) 

1667 chain = atom.chain 

1668 chain_name = chain.name if chain.name and len(chain.name) == 1 else 'X' 

1669 residue_number = str(residue.number).rjust(4) if residue else '0'.rjust(4) 

1670 # If residue number is longer than 4 characters then we must parse to hexadecimal 

1671 if len(residue_number) > 4: 

1672 residue_number = hex(residue.number)[2:].rjust(4) 

1673 icode = residue.icode if residue.icode and len(residue.icode) else ' ' 

1674 # Make sure we have atom coordinates 

1675 if atom.coords == None: 

1676 raise InputError('Trying to write a PDB file from a structure with atoms without coordinates') 

1677 x_coord, y_coord, z_coord = [ "{:.3f}".format(coord).rjust(8) for coord in atom.coords ] 

1678 occupancy = '1.00' # Just a placeholder 

1679 temp_factor = '0.00' # Just a placeholder 

1680 element = atom.element.rjust(2) 

1681 atom_line = ('ATOM ' + index + ' ' + name + ' ' + residue_name 

1682 + chain_name + residue_number + icode + ' ' + x_coord + y_coord + z_coord 

1683 + ' ' + occupancy + ' ' + temp_factor + ' ' + element).ljust(80) + '\n' 

1684 content += atom_line 

1685 return content 

1686 

1687 # Generate a pdb file with current structure 

1688 def generate_pdb_file (self, pdb_filepath : str): 

1689 pdb_content = self.generate_pdb() 

1690 with open(pdb_filepath, "w") as file: 

1691 file.write(pdb_content) 

1692 

1693 # Get the structure equivalent pytraj topology 

1694 def get_pytraj_topology (self): 

1695 # In we do not have pytraj in our environment then we cannot proceed 

1696 if not is_imported('pytraj'): 

1697 raise InputError('Missing dependency error: pytraj') 

1698 # Generate a pdb file from the current structure to feed pytraj 

1699 pdb_filepath = '.structure.pdb' 

1700 self.generate_pdb_file(pdb_filepath) 

1701 pytraj_topology = pytraj.load_topology(filename = pdb_filepath) 

1702 os.remove(pdb_filepath) 

1703 return pytraj_topology 

1704 

1705 # Select atoms from the structure thus generating an atom indices list 

1706 # Different tools may be used to make the selection: 

1707 # - vmd (default) 

1708 # - pytraj 

1709 SUPPORTED_SELECTION_SYNTAXES = { 'vmd', 'pytraj' } 

1710 def select (self, selection_string : str, syntax : str = 'vmd') -> Optional['Selection']: 

1711 if syntax == 'vmd': 

1712 # Generate a pdb for vmd to read it 

1713 pdb_filepath = '.structure.pdb' 

1714 self.generate_pdb_file(pdb_filepath) 

1715 # Use vmd to find atom indices 

1716 atom_indices = get_vmd_selection_atom_indices(pdb_filepath, selection_string) 

1717 os.remove(pdb_filepath) 

1718 if len(atom_indices) == 0: 

1719 return Selection() 

1720 return Selection(atom_indices) 

1721 if syntax == 'pytraj': 

1722 # In we do not have pytraj in our environment then we cannot proceed 

1723 if not is_imported('pytraj'): 

1724 raise InputError('Missing dependency error: pytraj') 

1725 pytraj_topology = self.get_pytraj_topology() 

1726 pytraj_selection = pytraj_topology[selection_string] 

1727 atom_indices = [ atom.index for atom in pytraj_selection.atoms ] 

1728 if len(atom_indices) == 0: 

1729 return Selection() 

1730 return Selection(atom_indices) 

1731 

1732 options = ', '.join(self.SUPPORTED_SELECTION_SYNTAXES) 

1733 raise InputError(f'Syntax "{syntax}" is not supported. Choose one of the following: {options}') 

1734 

1735 # Set a function to make selections using atom indices 

1736 def select_atom_indices (self, atom_indices : List[int]) -> 'Selection': 

1737 # Check atom indices to be in the structure 

1738 atom_count = self.atom_count 

1739 for atom_index in atom_indices: 

1740 if atom_index >= atom_count: 

1741 raise InputError(f'Atom index {atom_index} is out of range ({atom_count})') 

1742 return Selection(atom_indices) 

1743 

1744 # Set a function to make selections using residue indices 

1745 def select_residue_indices (self, residue_indices : List[int]) -> 'Selection': 

1746 # WARNING: The following line gets stucked sometimes, idk why 

1747 #atom_indices = sum([ self.residues[index].atom_indices for index in residue_indices ], []) 

1748 atom_indices = [] 

1749 for i, index in enumerate(residue_indices): 

1750 atom_indices += self.residues[index].atom_indices 

1751 return Selection(atom_indices) 

1752 

1753 # Get a selection with all atoms 

1754 def select_all (self) -> 'Selection': 

1755 return Selection(list(range(self.atom_count))) 

1756 

1757 # Select atoms according to the classification of its residue 

1758 def select_by_classification (self, classification : str) -> 'Selection': 

1759 atom_indices = [] 

1760 for residue in self.residues: 

1761 if residue.classification == classification: 

1762 atom_indices += residue.atom_indices 

1763 return Selection(atom_indices) 

1764 

1765 # Select water atoms 

1766 # WARNING: This logic is a bit guessy and it may fail for non-standard residue named structures 

1767 def select_water (self) -> 'Selection': 

1768 return self.select_by_classification('solvent') 

1769 

1770 # Select ions 

1771 def select_ions (self) -> 'Selection': 

1772 return self.select_by_classification('ion') 

1773 

1774 # Select counter ion atoms 

1775 # WARNING: This logic is a bit guessy and it may fail for non-standard atom named structures 

1776 def select_counter_ions (self, charge : Optional[str] = None) -> 'Selection': 

1777 counter_ion_indices = [] 

1778 # Set the accepted names accoridng to the charge 

1779 if charge == None: 

1780 accepted_names = STANDARD_COUNTER_ION_ATOM_NAMES 

1781 elif charge == '+': 

1782 accepted_names = STANDARD_COUNTER_CATION_ATOM_NAMES 

1783 elif charge == '-': 

1784 accepted_names = STANDARD_COUNTER_ANION_ATOM_NAMES 

1785 else: 

1786 raise ValueError('Not supported charge') 

1787 # Iterate atoms 

1788 for atom in self.atoms: 

1789 # If the residue has not one single atom then it is not an ion 

1790 if len(atom.residue.atoms) != 1: continue 

1791 # Get a simplified version of the atom name 

1792 # Set all letters upper and remove non-letter characters (e.g. '+' and '-' symbols) 

1793 simple_atom_name = ''.join(filter(str.isalpha, atom.name.upper())) 

1794 if simple_atom_name in accepted_names: 

1795 counter_ion_indices.append(atom.index) 

1796 return Selection(counter_ion_indices) 

1797 

1798 # Select both water and counter ions 

1799 def select_water_and_counter_ions (self) -> 'Selection': 

1800 return self.select_water() + self.select_counter_ions() 

1801 

1802 # Select heavy atoms 

1803 def select_heavy_atoms (self) -> 'Selection': 

1804 atom_indices = [] 

1805 for atom in self.atoms: 

1806 # If the atom is not an hydrogen then add it to the list 

1807 if atom.element != 'H': 

1808 atom_indices.append(atom.index) 

1809 return Selection(atom_indices) 

1810 

1811 # Select protein atoms 

1812 # WARNING: Note that there is a small difference between VMD protein and our protein 

1813 # This function is improved to consider terminal residues as proteins 

1814 # VMD considers protein any resiude including N, C, CA and O while terminals may have OC1 and OC2 instead of O 

1815 def select_protein (self) -> 'Selection': 

1816 return self.select_by_classification('protein') 

1817 

1818 # Select nucleic atoms 

1819 def select_nucleic (self) -> 'Selection': 

1820 return self.select_by_classification('dna') + self.select_by_classification('rna') 

1821 

1822 # Select lipids 

1823 def select_lipids (self) -> 'Selection': 

1824 return self.select_by_classification('fatty') + self.select_by_classification('steroid') 

1825 

1826 # Select carbohydrates 

1827 def select_carbohydrates (self) -> 'Selection': 

1828 return self.select_by_classification('carbohydrate') 

1829 

1830 # Return a selection of the typical PBC atoms: solvent, counter ions and lipids 

1831 # WARNING: This is just a guess 

1832 def select_pbc_guess (self) -> 'Selection': 

1833 return self.select_water() + self.select_counter_ions() + self.select_lipids() 

1834 

1835 def select_cg (self) -> 'Selection': 

1836 """Select coarse grain atoms.""" 

1837 return Selection([ atom.index for atom in self.atoms if atom.element == CG_ATOM_ELEMENT ]) 

1838 

1839 def select_cartoon (self, include_terminals : bool = False) -> 'Selection': 

1840 """Select cartoon representable regions for VMD. 

1841 

1842 Rules are: 

1843 1. Residues must be protein (i.e. must contain C, CA, N and O atoms) or nucleic (P, OP1, OP2, O3', C3', C4', C5', O5') 

1844 2. There must be at least 3 covalently bonded residues 

1845  

1846 It does not matter their chain, numeration or even index order as long as they are bonded. 

1847 * Note that we can represent cartoon while we display one residue alone, but it must be connected anyway. 

1848 Also, we have the option to include terminals in the cartoon selection although they are not representable. 

1849 This is helpful for the screenshot: terminals are better hidden than represented as ligands.""" 

1850 # Set fragments which are candidate to be cartoon representable 

1851 fragments = [] 

1852 # Get protein fragments according to VMD 

1853 protein_selection = self.select_protein() - self.select_cg() if include_terminals else self.select('protein', syntax='vmd') 

1854 if protein_selection: 

1855 fragments += list(self.find_fragments(protein_selection)) 

1856 # Get nucleic fragments according to VMD 

1857 nucleic_selection = self.select_nucleic() - self.select_cg() 

1858 if nucleic_selection: 

1859 fragments += list(self.find_fragments(nucleic_selection)) 

1860 # Set the final selection including all valid fragments 

1861 cartoon_selection = Selection() 

1862 # Iterate over every fragment 

1863 for fragment in fragments: 

1864 # Make sure the fragment has at least 3 residues before adding it to the cartoon selection 

1865 fragment_residue_indices = self.get_selection_residue_indices(fragment) 

1866 if len(fragment_residue_indices) >= 3: 

1867 cartoon_selection += fragment 

1868 return cartoon_selection 

1869 

1870 # Invert a selection 

1871 def invert_selection (self, selection : 'Selection') -> 'Selection': 

1872 atom_indices = list(range(self.atom_count)) 

1873 for atom_index in selection.atom_indices: 

1874 atom_indices[atom_index] = None 

1875 return Selection([ atom_index for atom_index in atom_indices if atom_index != None ]) 

1876 

1877 # Given a selection, get a list of residue indices for residues implicated 

1878 # Note that if a single atom from the residue is in the selection then the residue index is returned 

1879 def get_selection_residue_indices (self, selection : 'Selection') -> List[int]: 

1880 return list(set([ self.atoms[atom_index].residue_index for atom_index in selection.atom_indices ])) 

1881 

1882 # Given a selection, get a list of residues implicated 

1883 # Note that if a single atom from the residue is in the selection then the residue is returned 

1884 def get_selection_residues (self, selection : 'Selection') -> List['Residue']: 

1885 residue_indices = self.get_selection_residue_indices(selection) 

1886 return [ self.residues[index] for index in residue_indices ] 

1887 

1888 # Given a selection, get a list of chain indices for chains implicated 

1889 # Note that if a single atom from the chain is in the selection then the chain index is returned 

1890 def get_selection_chain_indices (self, selection : 'Selection') -> List[int]: 

1891 return list(set([ self.atoms[atom_index].chain_index for atom_index in selection.atom_indices ])) 

1892 

1893 # Given a selection, get a list of chains implicated 

1894 # Note that if a single atom from the chain is in the selection then the chain is returned 

1895 def get_selection_chains (self, selection : 'Selection') -> List['Chain']: 

1896 chain_indices = self.get_selection_chain_indices(selection) 

1897 return [ self.chains[index] for index in chain_indices ] 

1898 

1899 # Get type of the chain 

1900 def get_selection_classification (self, selection : 'Selection') -> str: 

1901 

1902 # Get selection residues 

1903 selection_residue_indices = self.get_selection_residue_indices(selection) 

1904 

1905 # Inicializar contadores para cada tipo de residuo 

1906 residue_counts = {} 

1907 

1908 # Count the residues of each type 

1909 for residue_index in selection_residue_indices: 

1910 residue = self.residues[residue_index] 

1911 res_class = residue.classification 

1912 if res_class in residue_counts: 

1913 residue_counts[res_class] += 1 

1914 else: 

1915 # If the classification is not recognized, count it as "other" 

1916 residue_counts[res_class] = 1 

1917 

1918 # Count the total number of residues in the selection 

1919 total_residues = sum(residue_counts.values()) 

1920 if total_residues == 0: raise ValueError('Should have residues at this point') 

1921 

1922 # Calculate the proportion of each type of residue 

1923 proportions = { k: v / total_residues for k, v in residue_counts.items() } 

1924 

1925 # If one type of residue dominates, return it 

1926 primary_type = max(proportions, key=proportions.get) 

1927 # We establish a threshold of 80% to consider a chain as a single type 

1928 if proportions[primary_type] >= 0.8: 

1929 return primary_type 

1930 

1931 # Special cases 

1932 relevant_threshold = 0.3 

1933 if proportions.get("dna", 0) > relevant_threshold and proportions.get("rna", 0) > relevant_threshold: 

1934 return "nucleic" 

1935 if proportions.get("carbohydrate", 0) > relevant_threshold and proportions.get("protein", 0) > relevant_threshold: 

1936 return "glycoprotein" 

1937 if proportions.get("fatty", 0) > relevant_threshold and proportions.get("steroid", 0) > relevant_threshold: 

1938 return "lipid" 

1939 

1940 # Any other combinations of different main proportions 

1941 main_proportions = { k: v for k, v in proportions.items() if v > relevant_threshold } 

1942 # Nothing is above the threshold 

1943 if len(main_proportions) == 0: 

1944 return "mix" 

1945 # There is only one thing above threshold 

1946 elif len(main_proportions) == 1: 

1947 return f"{primary_type}/mix" 

1948 # There are two things above threshold 

1949 elif len(main_proportions) == 2: 

1950 other_type = next(key for key in main_proportions.keys() if key != primary_type) 

1951 return f"{primary_type}/{other_type}" 

1952 # There are three things above the threshold (more is not possible) 

1953 else: 

1954 return "mix" 

1955 

1956 # Create a new structure from the current using a selection to filter atoms 

1957 def filter (self, selection : Union['Selection', str], selection_syntax : str = 'vmd') -> 'Structure': 

1958 if not selection: 

1959 raise InputError('No selection was passed') 

1960 # In case the selection is not an actual Selection, but a string, parse the string into a Selection 

1961 if type(selection) == str: 

1962 selection = self.select(selection, selection_syntax) 

1963 new_atoms = [] 

1964 new_residues = [] 

1965 new_chains = [] 

1966 # Get the selected atoms 

1967 # Generate dictionaries with new indexes as keys and previous indexes as values for atoms, residues and chains 

1968 # This is done with this structure for the residues and chains further to find the new indices fast 

1969 new_atom_indices = {} 

1970 # Collect also original indices to related atom residues and chains 

1971 original_atom_residue_indices = [] 

1972 for new_index, original_index in enumerate(selection.atom_indices): 

1973 # Make a copy of the selected atom in order to not modify the original one 

1974 original_atom = self.atoms[original_index] 

1975 new_atom = Atom( 

1976 name=original_atom.name, 

1977 element=original_atom.element, 

1978 coords=original_atom.coords, 

1979 ) 

1980 new_atoms.append(new_atom) 

1981 # Save old and new indices in order to reconfigure all indices later 

1982 new_atom_indices[original_index] = new_index 

1983 original_atom_residue_indices.append(original_atom.residue_index) 

1984 # Find the selected residues 

1985 selected_residue_indices = list(set(original_atom_residue_indices)) 

1986 # Repeat the original/new indices backup we did before 

1987 new_residue_indices = {} 

1988 original_residue_atom_indices = [] 

1989 original_residue_chain_indices = [] 

1990 for new_index, original_index in enumerate(selected_residue_indices): 

1991 # Make a copy of the selected residue in order to not modify the original one 

1992 original_residue = self.residues[original_index] 

1993 new_residue = Residue( 

1994 name=original_residue.name, 

1995 number=original_residue.number, 

1996 icode=original_residue.icode, 

1997 ) 

1998 new_residues.append(new_residue) 

1999 # Save old and new indices in order to reconfigure all indices later 

2000 new_residue_indices[original_index] = new_index 

2001 original_residue_atom_indices.append(original_residue.atom_indices) 

2002 original_residue_chain_indices.append(original_residue.chain_index) 

2003 # Find the selected chains 

2004 selected_chain_indices = list(set(original_residue_chain_indices)) 

2005 # Repeat the original/new indices backup we did before 

2006 new_chain_indices = {} 

2007 original_chain_residue_indices = [] 

2008 for new_index, original_index in enumerate(selected_chain_indices): 

2009 # Make a copy of the selected chain in order to not modify the original one 

2010 original_chain = self.chains[original_index] 

2011 new_chain = Chain( 

2012 name=original_chain.name, 

2013 ) 

2014 new_chains.append(new_chain) 

2015 # Save old and new indices in order to reconfigure all indices later 

2016 new_chain_indices[original_index] = new_index 

2017 original_chain_residue_indices.append(original_chain.residue_indices) 

2018 # Finally, reset indices in all instances 

2019 for a, atom in enumerate(new_atoms): 

2020 atom.residue_index = new_residue_indices[ original_atom_residue_indices[a] ] 

2021 for r, residue in enumerate(new_residues): 

2022 atom_indices = [] 

2023 for original_index in original_residue_atom_indices[r]: 

2024 new_index = new_atom_indices.get(original_index, None) 

2025 if new_index != None: 

2026 atom_indices.append(new_index) 

2027 residue.atom_indices = atom_indices 

2028 residue.chain_index = new_chain_indices[ original_residue_chain_indices[r] ] 

2029 for c, chain in enumerate(new_chains): 

2030 residue_indices = [] 

2031 for original_index in original_chain_residue_indices[c]: 

2032 new_index = new_residue_indices.get(original_index, None) 

2033 if new_index != None: 

2034 residue_indices.append(new_index) 

2035 chain.residue_indices = residue_indices 

2036 return Structure(atoms=new_atoms, residues=new_residues, chains=new_chains) 

2037 

2038 

2039 def chainer (self, selection : Optional['Selection'] = None, letter : Optional[str] = None, whole_fragments : bool = True): 

2040 """Set chains on demand. 

2041 If no selection is passed then the whole structure will be affected. 

2042 If no chain is passed then a "chain by fragment" logic will be applied.""" 

2043 # If there is no selection we consider all atoms 

2044 if selection == None: 

2045 selection = self.select_all() 

2046 # If the selection is empty then there is nothing to do here 

2047 if len(selection) == 0: return 

2048 # If a letter is specified then the logic is way simpler 

2049 if letter: 

2050 self.set_selection_chain_name(selection, letter) 

2051 return 

2052 # If a letter is not specified we run the "fragments" logic 

2053 fragment_getter = self.find_whole_fragments if whole_fragments else self.find_fragments 

2054 for fragment in fragment_getter(selection): 

2055 chain_name = self.get_available_chain_name() 

2056 self.set_selection_chain_name(fragment, chain_name) 

2057 

2058 # Smart function to set chains automatically 

2059 # Original chains will be overwritten 

2060 def auto_chainer (self): 

2061 # Reset chains 

2062 self.chainer(letter='X') 

2063 # Set solvent and ions as a unique chain 

2064 ion_selection = self.select_ions() 

2065 solvent_selection = self.select_water() 

2066 ion_and_indices_selection = ion_selection + solvent_selection 

2067 self.chainer(selection=ion_and_indices_selection, letter='S') 

2068 # Set fatty acids and steroids as a unique chain 

2069 # DANI: Se podrían incluir algunos carbohydrates (glycans) 

2070 # DANI: Se podrían descartarían residuos que no pertenezcan a la membrana por proximidad 

2071 membrane_selection = self.select_lipids() 

2072 self.chainer(selection=membrane_selection, letter='M') 

2073 # Set carbohydrates as a unique chain as well, just in case we have glycans 

2074 # Note that in case glycan atoms are mixed with protein atoms glycan chains will be overwritten 

2075 # However this is not a problem. It is indeed the best solution if we don't want ro resort atoms 

2076 carbohydrate_selection = self.select_carbohydrates() 

2077 self.chainer(selection=carbohydrate_selection, letter='H') 

2078 # Add a chain per fragment for both proteins and nucleic acids 

2079 protein_selection = self.select_protein() 

2080 nucleic_selection = self.select_nucleic() 

2081 protein_and_nucleic_selection = protein_selection + nucleic_selection 

2082 self.chainer(selection=protein_and_nucleic_selection) 

2083 # At this point we should have covered most of the molecules in the structure 

2084 # However, in case there are more molecules, we have already set them all as a single chain ('X') 

2085 # Here we do not apply the chain per fragment logic since it may be dangerous 

2086 # Note that we may have a lot of independent residues (and thus a los of small fragments) 

2087 # This would make us run out of letters in the alphabet and thus there would be no more chains 

2088 # As the last step, fix repeated chains 

2089 self.check_repeated_chains(fix_chains=True) 

2090 

2091 # This is an alternative system to find protein chains (anything else is chained as 'X') 

2092 # This system does not depend on VMD 

2093 # It totally overrides previous chains since it is expected to be used only when chains are missing 

2094 def raw_protein_chainer (self): 

2095 current_chain = self.get_available_chain_name() 

2096 previous_alpha_carbon = None 

2097 for residue in self.residues: 

2098 alpha_carbon = next((atom for atom in residue.atoms if atom.name == 'CA'), None) 

2099 if not alpha_carbon: 

2100 residue.set_chain('X') 

2101 continue 

2102 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms 

2103 residues_are_connected = previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4 

2104 if not residues_are_connected: 

2105 current_chain = self.get_available_chain_name() 

2106 residue.set_chain(current_chain) 

2107 previous_alpha_carbon = alpha_carbon 

2108 

2109 # Given an atom selection, set the chain for all these atoms 

2110 # Note that the chain is changed in every whole residue, no matter if only one atom was selected 

2111 def set_selection_chain_name (self, selection : 'Selection', letter : str): 

2112 # Find if the letter belongs to an already existing chain 

2113 chain = self.get_chain_by_name(letter) 

2114 # If the chain does not exist yet then create it 

2115 if not chain: 

2116 chain = Chain(name=letter) 

2117 self.set_new_chain(chain) 

2118 # Set the chain index of every residue to the new chain 

2119 for atom_index in selection.atom_indices: 

2120 # WARNING: Chain index has to be read every iteration since it may change. Do not save it 

2121 atom = self.atoms[atom_index] 

2122 atom.set_chain_index(chain.index) 

2123 

2124 # Get an available chain name 

2125 # Find alphabetically the first letter which is not yet used as a chain name 

2126 # If all letters in the alphabet are used already then raise an error 

2127 def get_available_chain_name (self) -> str: 

2128 current_chain_names = [ chain.name for chain in self.chains ] 

2129 next_available_chain_name = next((name for name in AVAILABLE_LETTERS if name not in current_chain_names), None) 

2130 if next_available_chain_name == None: 

2131 raise InputError(f'There are more chains than available chain letters ({len(AVAILABLE_LETTERS)})') 

2132 return next_available_chain_name 

2133 

2134 def get_next_available_chain_name (self, anterior : str) -> str: 

2135 """ 

2136 Get the next available chain name. 

2137 

2138 Args: 

2139 anterior (str): The last chain name used, which is expected to be a single letter 

2140  

2141 Raises: 

2142 ValueError: If the anterior is not a letter or if there are more chains than available 

2143 """ 

2144 current_chain_names = set([ chain.name for chain in self.chains ]) 

2145 # If anterior is cap then try caps first, if anterior is lower then try lowers first 

2146 if anterior.isupper(): 

2147 first_group, second_group = AVAILABLE_CAPS, AVAILABLE_LOWS 

2148 elif anterior.islower(): 

2149 first_group, second_group = AVAILABLE_LOWS, AVAILABLE_CAPS 

2150 else: raise ValueError(f'Is "{anterior}" even a letter?') 

2151 # Reorder letters in the first group, so the anterior is the last letter 

2152 anterior_position = first_group.index(anterior) 

2153 next_position = anterior_position + 1 

2154 reordered_group = first_group[next_position:] + first_group[0:next_position] 

2155 next_letter = next((letter for letter in reordered_group if letter not in current_chain_names), None) 

2156 if next_letter: return next_letter 

2157 # If there is not available letters is the first group then return the first available in the second 

2158 next_letter = next((letter for letter in second_group if letter not in current_chain_names), None) 

2159 if next_letter: return next_letter 

2160 raise ValueError(f'There are more chains than available chain letters ({len(AVAILABLE_LETTERS)})') 

2161 

2162 # Get a chain by its name 

2163 def get_chain_by_name (self, name : str) -> Optional['Chain']: 

2164 return next((c for c in self.chains if c.name == name), None) 

2165 

2166 # Find a residue by its chain, number and insertion code 

2167 def find_residue (self, chain_name : str, number : int, icode : str = '') -> Optional['Residue']: 

2168 # Get the corresponding chain 

2169 target_chain = self.get_chain_by_name(chain_name) 

2170 if not target_chain: return None 

2171 # Find the residue in the target chain 

2172 target_chain.find_residue(number, icode) 

2173 

2174 # Get a summary of the structure 

2175 def display_summary (self): 

2176 print(f'Atoms: {self.atom_count}') 

2177 print(f'Residues: {len(self.residues)}') 

2178 print(f'Chains: {len(self.chains)}') 

2179 for chain in self.chains: 

2180 print(f'Chain {chain.name} ({len(chain.residue_indices)} residues)') 

2181 print(' -> ' + chain.get_sequence()) 

2182 

2183 

2184 def check_repeated_chains (self, fix_chains : bool = False, display_summary : bool = False) -> bool: 

2185 """ 

2186 There may be chains which are equal in the structure (i.e. same chain name). 

2187 This means we have a duplicated/splitted chain. 

2188 Repeated chains are usual and they are usually supported but with some problems. 

2189 Also, repeated chains usually come with repeated residues, which means more problems (see explanation below). 

2190 

2191 In the context of this structure class we may have 2 different problems with a different solution each: 

2192 1. There is more than one chain with the same letter (repeated chain) -> rename the duplicated chains 

2193 2. There is a chain with atom indices which are not consecutive (splitted chain) -> create new chains 

2194 

2195 Rename repeated chains or create new chains if the fix_chains argument is True. 

2196 

2197 WARNING: These fixes are possible only if there are less chains than the number of letters in the alphabet. 

2198 Although there is no limitation in this code for chain names, setting long chain names is not compatible with pdb format. 

2199 

2200 Check splitted chains (a chains with non consecutive residues) and try to fix them if requested. 

2201 Check repeated chains (two chains with the same name) and return True if there were any repeats. 

2202 """ 

2203 # Order chains according to their names 

2204 # Save also those chains which have a previous duplicate 

2205 name_chains = {} 

2206 repeated_chains = [] 

2207 for chain in self.chains: 

2208 chain_name = chain.name 

2209 current_name_chains = name_chains.get(chain_name, None) 

2210 if not current_name_chains: 

2211 name_chains[chain_name] = [chain] 

2212 else: 

2213 name_chains[chain_name].append(chain) 

2214 repeated_chains.append(chain) 

2215 # Display the summary of repeated chains if requested 

2216 if display_summary: 

2217 if len(repeated_chains) > 0: 

2218 warn('There are repeated chains:') 

2219 for chain_name, chains in name_chains.items(): 

2220 chains_count = len(chains) 

2221 if chains_count > 1: 

2222 print(f'- Chain {chain_name} has {chains_count} repeats') 

2223 # Rename repeated chains if requested 

2224 if len(repeated_chains) > 0 and fix_chains: 

2225 n_chains = len(self.chains) 

2226 n_available_letters = len(AVAILABLE_LETTERS) 

2227 if n_chains > n_available_letters: 

2228 # for chain in self.chains: 

2229 # print(str(chain) + ': ' + str(chain.atom_indices[0]) + ' to ' + str(chain.atom_indices[-1])) 

2230 raise ValueError(f'There are more chains ({n_chains}) than available chain letters ({n_available_letters})') 

2231 current_letters = list(name_chains.keys()) 

2232 for repeated_chain in repeated_chains: 

2233 last_chain_letter = repeated_chain.name 

2234 while last_chain_letter in current_letters: 

2235 last_chain_letter = get_next_letter(last_chain_letter) 

2236 repeated_chain.name = last_chain_letter 

2237 current_letters.append(last_chain_letter) 

2238 # Check if there are splitted chains 

2239 for chain in self.chains: 

2240 residue_indices = sorted(chain.residue_indices) 

2241 # Check if residue indices are consecutive 

2242 if residue_indices[-1] - residue_indices[0] + 1 == len(residue_indices): 

2243 continue 

2244 warn(f'Splitted chain {chain.name}') 

2245 # If indices are not consecutive then we must find ranges of consecutive residues and create new chains for them 

2246 previous_residue_index = residue_indices[0] 

2247 consecutive_residues = [previous_residue_index] 

2248 overall_consecutive_residues = [] 

2249 for residue_index in residue_indices[1:]: 

2250 # If next residue is consecutive 

2251 if residue_index == previous_residue_index + 1: 

2252 consecutive_residues.append(residue_index) 

2253 # If next residue is NOT consecutive 

2254 else: 

2255 overall_consecutive_residues.append(consecutive_residues) 

2256 consecutive_residues = [residue_index] 

2257 # Update the previous index 

2258 previous_residue_index = residue_index 

2259 # Add the last split 

2260 overall_consecutive_residues.append(consecutive_residues) 

2261 # Now create new chains and reasign residues 

2262 # Skip the first set of consecutive residues since they will stay in the original chain 

2263 for residues_indices in overall_consecutive_residues[1:]: 

2264 chain_name = self.get_available_chain_name() 

2265 residues_selection = self.select_residue_indices(residues_indices) 

2266 self.set_selection_chain_name(residues_selection, chain_name) 

2267 

2268 # Fix repeated chains if requested 

2269 return len(repeated_chains) > 0 

2270 

2271 

2272 def check_repeated_residues (self, fix_residues : bool = False, display_summary : bool = False) -> bool: 

2273 """ 

2274 There may be residues which are equal in the structure (i.e. same chain, name, number and icode). 

2275 In case 2 residues in the structure are equal we must check distance between their atoms. 

2276 If atoms are far it means they are different residues with the same notation (duplicated residues). 

2277 If atoms are close it means they are indeed the same residue (splitted residue). 

2278 

2279 Splitted residues are found in some pdbs and they are supported by some tools. 

2280 These tools consider all atoms with the same 'record' as the same residue. 

2281 However, there are other tools which would consider the splitted residue as two different residues. 

2282 This causes inconsistency along different tools besides a long list of problems. 

2283 The only possible is fix is changing the order of atoms in the topology. 

2284 Note that this is a breaking change for associated trajectories, which must change the order of coordinates. 

2285 However here we provide tools to fix associates trajectories as well. 

2286 

2287 Duplicated residues are usual and they are usually supported but with some problems. 

2288 For example, pytraj analysis outputs use to sort results by residues and each residue is tagged. 

2289 If there are duplicated residues with the same tag it may be not possible to know which result belongs to each residue. 

2290 Another example are NGL selections once in the web client. 

2291 If you select residue ':A and 1' and there are multiple residues 1 in chain A all of them will be displayed. 

2292 

2293 Check residues to search for duplicated and splitted residues. 

2294 Renumerate repeated residues if the fix_residues argument is True. 

2295 Return True if there were any repeats. 

2296 """ 

2297 # Track if residues have to be changed or not 

2298 modified = False 

2299 # Group all residues in the structure according to their chain, number and icode 

2300 grouped_residues = {} 

2301 # Check repeated residues which are one after the other 

2302 # Note that these residues MUST have a different name 

2303 # Otherwise they would have not been considered different residues 

2304 # For these rare cases we use icodes to solve the problem 

2305 non_icoded_residues = [] 

2306 last_residue = None 

2307 for residue in self.residues: 

2308 # Check residue to be equal than the previous residue 

2309 if residue == last_residue: 

2310 non_icoded_residues.append(residue) 

2311 last_residue = residue 

2312 continue 

2313 last_residue = residue 

2314 # Add residue to the groups of repeated residues 

2315 current_residue_repeats = grouped_residues.get(residue, None) 

2316 if not current_residue_repeats: 

2317 grouped_residues[residue] = [ residue ] 

2318 else: 

2319 current_residue_repeats.append(residue) 

2320 # In case we have non icoded residues 

2321 if len(non_icoded_residues) > 0: 

2322 if display_summary: 

2323 print(f'There are non-icoded residues ({len(non_icoded_residues)})') 

2324 # Set new icodes for non icoded residues 

2325 if fix_residues: 

2326 print(' Non icoded residues will recieve an icode') 

2327 for residue in non_icoded_residues: 

2328 repeated_residues_group = grouped_residues[residue] 

2329 current_icodes = [ residue.icode for residue in repeated_residues_group if residue.icode ] 

2330 next_icode = next((cap for cap in AVAILABLE_CAPS if cap not in current_icodes), None) 

2331 if not next_icode: 

2332 raise ValueError('There are no more icodes available') 

2333 # print('Setting icode ' + next_icode + ' to residue ' + str(residue)) 

2334 residue.icode = next_icode 

2335 modified = True 

2336 # Grouped residues with more than 1 result are considered as repeated 

2337 repeated_residues = [ residues for residues in grouped_residues.values() if len(residues) > 1 ] 

2338 if len(repeated_residues) == 0: 

2339 return modified 

2340 # In case we have repeated residues... 

2341 if display_summary: 

2342 warn(f'There are {len(repeated_residues)} different groups of repeated residues') 

2343 print(f' e.g. {repeated_residues[0][0]}') 

2344 if len(repeated_residues) == 9999 or len(repeated_residues) == 10000: 

2345 print(' Probably you have more residues than the PDB numeration limit (1-9999)') 

2346 # Now for each repeated residue, find out which are splitted and which are duplicated 

2347 covalent_bonds = self.bonds 

2348 overall_splitted_residues = [] 

2349 overall_duplicated_residues = [] 

2350 for residues in repeated_residues: 

2351 # Iterate over repeated residues and check if residues are covalently bonded 

2352 # If any pair of residues are bonded add them both to the splitted residues list 

2353 # At the end, all non-splitted residues will be considered duplicated residues 

2354 # WARNING: Using a set here is not possible since repeated residues have the same hash 

2355 # WARNING: Also comparing residues themselves is not advisable, so we use indices at this point 

2356 splitted_residue_indices = set() 

2357 for residue, other_residues in otherwise(residues): 

2358 if residue.index in splitted_residue_indices: 

2359 continue 

2360 # Get atom indices for all atoms connected to the current residue 

2361 residue_bonds = sum([ covalent_bonds[index] for index in residue.atom_indices ], []) 

2362 for other_residue in other_residues: 

2363 # Get all atom indices for each other residue and collate with the current residue bonds 

2364 if any( index in residue_bonds for index in other_residue.atom_indices ): 

2365 splitted_residue_indices.add(residue.index) 

2366 splitted_residue_indices.add(other_residue.index) 

2367 # Finally obtain the splitted residues from its indices 

2368 splitted_residues = [ self.residues[index] for index in splitted_residue_indices ] 

2369 # Repeated residues which are not splitted are thus duplicated 

2370 duplicated_residues = [ residue for residue in residues if residue.index not in splitted_residue_indices ] 

2371 # Update the overall lists 

2372 if len(splitted_residues) > 0: 

2373 overall_splitted_residues.append(splitted_residues) 

2374 if len(duplicated_residues) > 0: 

2375 overall_duplicated_residues.append(duplicated_residues) 

2376 # In case we have splitted residues 

2377 if len(overall_splitted_residues) > 0: 

2378 if display_summary: 

2379 print(f' There are splitted residues ({len(overall_splitted_residues)})') 

2380 # Fix splitted residues 

2381 if fix_residues: 

2382 print(' Splitted residues will be merged') 

2383 # Set a function to sort atoms and residues by index 

2384 def by_index (v): 

2385 return v._index 

2386 # Merge splitted residues 

2387 # WARNING: Merging residues without sorting atoms is possible, but this would be lost after exporting to pdb 

2388 for splitted_residues in overall_splitted_residues: 

2389 # The first residue (i.e. the residue with the lower index) will be the one which remains 

2390 # It will absorb other residue atom indices 

2391 splitted_residues.sort(key=by_index) # Residues are not sorted by default, this is mandatory 

2392 first_residue = splitted_residues[0] 

2393 other_residues = splitted_residues[1:] 

2394 for residue in other_residues: 

2395 for atom in residue.atoms: 

2396 atom.residue = first_residue 

2397 # Then these other residues are eliminated 

2398 self.purge_residue(residue) 

2399 print(' Atoms will be sorted to be together by residues') 

2400 print(' NEVER FORGET: This will break any associated trajectory if coordinates are not sorted as well') 

2401 # Sort atoms to group residue atoms together 

2402 # Note that each atom index must be updated 

2403 new_atom_indices = sum([ residue.atom_indices for residue in self.residues ], []) 

2404 for i, new_atom_index in enumerate(new_atom_indices): 

2405 atom = self.atoms[new_atom_index] 

2406 atom._index = i 

2407 self.atoms.sort(key=by_index) 

2408 # Also residue 'atom_indices' must be updated 

2409 for residue in self.residues: 

2410 residue._atom_indices = [ new_atom_indices.index(atom_index) for atom_index in residue._atom_indices ] 

2411 # Bonds must be reset since atom indices have changes 

2412 self._bonds = None 

2413 # Prepare the trajectory atom sorter which must be returned 

2414 # Include atom indices already so the user has to provide only the structure and trajectory filepaths 

2415 def trajectory_atom_sorter ( 

2416 input_structure_file : 'File', 

2417 input_trajectory_file : 'File', 

2418 output_trajectory_file : 'File' 

2419 ): 

2420 sort_trajectory_atoms( 

2421 input_structure_file, 

2422 input_trajectory_file, 

2423 output_trajectory_file, 

2424 new_atom_indices 

2425 ) 

2426 self.trajectory_atom_sorter = trajectory_atom_sorter 

2427 self.new_atom_order = new_atom_indices 

2428 modified = True 

2429 # In case we have duplicated residues 

2430 duplicated_residues_count = len(overall_duplicated_residues) 

2431 if duplicated_residues_count > 0: 

2432 if display_summary: 

2433 warn(f'There are {duplicated_residues_count} different groups of duplicated residues') 

2434 # Renumerate duplicated residues if requested 

2435 if fix_residues: 

2436 # First of all, from each group of repeated residues, discard the first residue 

2437 # The first residue will remain as it is and the rest will be renumerated 

2438 # Join all residues to be renumerated in a single list 

2439 residue_to_renumerate = [] 

2440 for residues in overall_duplicated_residues: 

2441 residue_to_renumerate += residues[1:] 

2442 # Now group residues per chain since the renumeration is done by chains 

2443 chain_residues = {} 

2444 for residue in residue_to_renumerate: 

2445 chain = residue.chain 

2446 current_chain_residues = chain_residues.get(chain, None) 

2447 if current_chain_residues: current_chain_residues.append(residue) 

2448 else: chain_residues[chain] = [ residue ] 

2449 # Iterate residue chain groups 

2450 for chain, residues in chain_residues.items(): 

2451 # Find the last residue number in the current chain 

2452 maximum_chain_number = max([ residue.number for residue in chain.residues ]) 

2453 # Sort residues by index 

2454 residues.sort(key=lambda x: x.index, reverse=True) 

2455 for residue in residues: 

2456 # Set the number of the residue as the next available number 

2457 residue.number = maximum_chain_number + 1 

2458 # Update the maximum number 

2459 maximum_chain_number = residue.number 

2460 modified = True 

2461 return modified 

2462 

2463 # Atoms with identical chain, residue and name are considered repeated atoms 

2464 # DANI: No recuerdo los problemas que daba tener átomos repetidos 

2465 

2466 # Check atoms to search for repeated atoms 

2467 # Rename repeated atoms if the fix_atoms argument is True 

2468 # Return True if there were any repeats 

2469 def check_repeated_atoms (self, fix_atoms : bool = False, display_summary : bool = False) -> bool: 

2470 # Set two trackers for display 

2471 repeated_atoms_count = 0 

2472 example = None 

2473 for residue in self.residues: 

2474 # Iterate over the residue atoms counting how many repeated names we have 

2475 current_names = [] 

2476 for atom in residue.atoms: 

2477 # We check if the atom name already exists. If not, go to the next atom 

2478 name = atom.name 

2479 if name not in current_names: 

2480 current_names.append(name) 

2481 continue 

2482 # When atom is repeated 

2483 repeated_atoms_count += 1 

2484 # If the fix was not requested we stop here 

2485 if not fix_atoms: 

2486 continue 

2487 # We set the name of the atom as the element letter + the count of this element 

2488 # If element is missing take the first character of the atom name 

2489 initial = atom.element 

2490 if not initial or initial == ' ': 

2491 initial = name[0] 

2492 number = 1 

2493 new_name = initial + str(number) 

2494 while new_name in current_names: 

2495 number += 1 

2496 new_name = initial + str(number) 

2497 # Save an example for the logs if there is none yet 

2498 if not example: 

2499 example = f'{atom.name} renamed as {new_name} in residue {residue}' 

2500 atom.name = new_name 

2501 current_names.append(new_name) 

2502 # Display the summary of repeated atoms if requested 

2503 if display_summary: 

2504 if repeated_atoms_count > 0: 

2505 warn(f'There are repeated atoms ({repeated_atoms_count})') 

2506 print(f' e.g. {example}') 

2507 return repeated_atoms_count > 0 

2508 

2509 # Check bonds to be incoherent 

2510 # i.e. check atoms not to have more or less bonds than expected according to their element 

2511 def check_incoherent_bonds (self) -> bool: 

2512 # Find out if there are hydrogens in the structure 

2513 # It may happen that we encounter an structure without hydrogens 

2514 has_hydrogen = next(( True for atom in self.atoms if atom.element == 'H' ), False) 

2515 coherent_bonds = coherent_bonds_with_hydrogen if has_hydrogen else coherent_bonds_without_hydrogen 

2516 # Check coherent bonds atom by atom 

2517 for atom in self.atoms: 

2518 # Do not check this atom bonds if this may be an ion 

2519 # Most authors "force" dummy bonds in these atoms to make them stable 

2520 if atom.element in SUPPORTED_ION_ELEMENTS: continue 

2521 # Ignore dummy atoms as well 

2522 if atom.element == DUMMY_ATOM_ELEMENT: continue 

2523 # Ignore coarse grain atoms as well 

2524 if atom.element == CG_ATOM_ELEMENT: continue 

2525 # Get actual number of bonds in the current atom both with and without ion bonds 

2526 # LORE: This was a problem since some ions are force-bonded but bonds are actually not real 

2527 # LORE: When an ion is forced we may end up with hyrogens with 2 bonds or carbons with 5 bonds 

2528 # LORE: When an ions is really bonded we can not discard it or we may end up with orphan carbons (e.g. ligands) 

2529 min_nbonds = len(atom.get_bonds(skip_ions = True, skip_dummies = True)) 

2530 max_nbonds = len(atom.get_bonds(skip_ions = False, skip_dummies = True)) 

2531 # Get the accepted range of number of bonds for the current atom according to its element 

2532 element = atom.element 

2533 element_coherent_bonds = coherent_bonds.get(element, None) 

2534 # If there are no specficiations for the current atom element then skip it 

2535 if not element_coherent_bonds: 

2536 continue 

2537 # Get the range of accepted number of bonds 

2538 min_allowed_bonds = element_coherent_bonds['min'] 

2539 max_allowed_bonds = element_coherent_bonds['max'] 

2540 # Check the actual number of bonds is insdie the accepted range 

2541 if max_nbonds < min_allowed_bonds or min_nbonds > max_allowed_bonds: 

2542 if min_nbonds == max_nbonds: 

2543 print(f' An atom with element {element} has {min_nbonds} bonds') 

2544 else: 

2545 print(f' An atom with element {element} has between {min_nbonds} bonds (withou ions) and {max_nbonds} bonds (with ions)') 

2546 if min_allowed_bonds == max_allowed_bonds: 

2547 plural_sufix = '' if min_allowed_bonds == 1 else 's' 

2548 print(f' It should have {min_allowed_bonds} bond{plural_sufix}') 

2549 else: 

2550 print(f' It should have between {min_allowed_bonds} and {max_allowed_bonds} bonds') 

2551 print(f' -> Atom {atom.label}') 

2552 bond_label = ', '.join([ self.atoms[atom].label for atom in atom.get_bonds(skip_ions = False) ]) 

2553 print(f' -> Bonds {bond_label}') 

2554 return True 

2555 return False 

2556 

2557 # Get all atomic covalent (strong) bonds 

2558 # Bonds are defined as a list of atom indices for each atom in the structure 

2559 # Rely on VMD logic to do so 

2560 def get_covalent_bonds (self, selection : Optional['Selection'] = None) -> List[ List[int] ]: 

2561 # It is important to fix elements before trying to fix bonds, since elements have an impact on bonds 

2562 # VMD logic to find bonds relies in the atom element to set the covalent bond distance cutoff 

2563 self.fix_atom_elements() 

2564 # Generate a pdb strucutre to feed vmd 

2565 auxiliar_pdb_filepath = '.structure.pdb' 

2566 self.generate_pdb_file(auxiliar_pdb_filepath) 

2567 # Get covalent bonds between both residue atoms 

2568 covalent_bonds = get_covalent_bonds(auxiliar_pdb_filepath, selection) 

2569 # For every atom in CG, replace its bonds with a class which will raise and error when read 

2570 # Thus we make sure using these wrong bonds anywhere further will result in failure 

2571 for atom_index in self.select_cg().atom_indices: 

2572 covalent_bonds[atom_index] = MISSING_BONDS 

2573 # Remove the auxiliar pdb file 

2574 os.remove(auxiliar_pdb_filepath) 

2575 return covalent_bonds 

2576 

2577 # Make a copy of the bonds list 

2578 def copy_bonds (self) -> List[List[int]]: 

2579 new_bonds = [] 

2580 for atom_bonds in self.bonds: 

2581 # Missing bonds coming from CG atoms are forwarded 

2582 if atom_bonds == MISSING_BONDS: 

2583 new_bonds.append(MISSING_BONDS) 

2584 continue 

2585 # Copy also the inner lists to avoid further mutation of the original structure 

2586 new_bonds.append([ atom_index for atom_index in atom_bonds ]) 

2587 return new_bonds 

2588 

2589 # Make a copy of the current structure 

2590 def copy (self) -> 'Structure': 

2591 atom_copies = [ atom.copy() for atom in self.atoms ] 

2592 residue_copies = [ residue.copy() for residue in self.residues ] 

2593 chain_copies = [ chain.copy() for chain in self.chains ] 

2594 structure_copy = Structure(atom_copies, residue_copies, chain_copies) 

2595 structure_copy.bonds = self.copy_bonds() 

2596 return structure_copy 

2597 

2598 # DANI: No lo he testeado en profundidad 

2599 def merge (self, other : 'Structure') -> 'Structure': 

2600 """Merge current structure with another structure.""" 

2601 # Copy self atoms, residues and chains 

2602 self_atom_copies = [ atom.copy() for atom in self.atoms ] 

2603 self_residue_copies = [ residue.copy() for residue in self.residues ] 

2604 self_chain_copies = [ chain.copy() for chain in self.chains ] 

2605 # Copy other atoms, residues and chains 

2606 other_atom_copies = [ atom.copy() for atom in other.atoms ] 

2607 other_residue_copies = [ residue.copy() for residue in other.residues ] 

2608 other_chain_copies = [ chain.copy() for chain in other.chains ] 

2609 # Adapt indices in other atoms, residues and chains 

2610 atom_index_offset = len(self_atom_copies) 

2611 residue_index_offset = len(self_residue_copies) 

2612 chain_index_offset = len(self_chain_copies) 

2613 for atom in other_atom_copies: 

2614 atom._index += atom_index_offset 

2615 atom._residue_index += residue_index_offset 

2616 for residue in other_residue_copies: 

2617 residue._index += residue_index_offset 

2618 residue._atom_indices = [ i + atom_index_offset for i in residue._atom_indices ] 

2619 residue._chain_index += chain_index_offset 

2620 for chain in other_chain_copies: 

2621 chain._index += chain_index_offset 

2622 chain._residue_indices = [ i + residue_index_offset for i in chain._residue_indices ] 

2623 # Merge self with other atoms, residues and chains and build the new structure 

2624 merged_atoms = self_atom_copies + other_atom_copies 

2625 merged_residues = self_residue_copies + other_residue_copies 

2626 merged_chains = self_chain_copies + other_chain_copies 

2627 return Structure(merged_atoms, merged_residues, merged_chains) 

2628 

2629 # Find rings in the structure and yield them as they are found 

2630 # LIMITATION: Wrong rings may be found including smaller rings 

2631 # LIMITATION: Use this function to see if a specific ring exists, but not to count rings 

2632 def find_rings (self, selection : Optional[Selection] = None) -> Generator[ List[Atom], None, None ]: 

2633 selected_atom_indices = selection.atom_indices if selection else None 

2634 # We will use a logic which finds long paths of bonded heavy atoms 

2635 # These paths have no 'branches' and thus, when there are 2 possible paths then both paths are explored independently 

2636 # Save already searched atoms and the number of times it has been explored already 

2637 # Note that atoms may be visited as many times as their number of heavy atom bonds -1 

2638 already_searched_atoms = {} 

2639 def find_rings_recursive (atom_path : List[Atom]) -> Generator[ List[Atom], None, None ]: 

2640 # Get the current atom to continue the followup: the last atom 

2641 current_atom = atom_path[-1] # Note that the list MUST always have at least 1 atom 

2642 # Get bonded atoms to continue the path 

2643 followup_atoms = current_atom.get_bonded_atoms() 

2644 # Hydrogens are discarded since they have only 1 bond and thus they can only lead to a dead end 

2645 followup_atoms = [ atom for atom in followup_atoms if atom.element != 'H' ] 

2646 # In case there is a selection, check followup atoms not to be in the selection 

2647 if selected_atom_indices: 

2648 followup_atoms = [ atom for atom in followup_atoms if atom.index in selected_atom_indices ] 

2649 # Check if this atom has been visited already too many times and, if so, stop here 

2650 # Allowed search times is the posible number of rings where this atom can belong 

2651 allowed_search_times = comb(len(followup_atoms), 2) 

2652 already_search_times = already_searched_atoms.get(current_atom, 0) 

2653 if already_search_times == allowed_search_times: 

2654 return 

2655 already_searched_atoms[current_atom] = already_search_times + 1 

2656 # Remove the previous atom to avoid going back 

2657 previous_atom = atom_path[-2] if len(atom_path) > 1 else None 

2658 if previous_atom: 

2659 followup_atoms.remove(previous_atom) 

2660 # Iterate over the following bonded atoms 

2661 for followup_atom in followup_atoms: 

2662 # Check if the followup atom is in the path 

2663 path_index = next(( index for index, atom in enumerate(atom_path) if atom == followup_atom ), None) 

2664 # If so, we have found a ring 

2665 if path_index != None: 

2666 ring = atom_path[path_index:] 

2667 yield ring 

2668 continue 

2669 # Otherwise, keep searching 

2670 new_path = atom_path + [followup_atom] 

2671 for ring in find_rings_recursive(atom_path=new_path): 

2672 yield ring 

2673 # Find a starting atom and run the recursive logic 

2674 candidate_atom_indices = selected_atom_indices if selected_atom_indices else range(self.atom_count) 

2675 for candidate_atom_index in candidate_atom_indices: 

2676 candidate_atom = self.atoms[candidate_atom_index] 

2677 # It must be heavy atom 

2678 if candidate_atom.element == 'H': 

2679 continue 

2680 # It must not be a dead end already 

2681 bonded_atoms = candidate_atom.get_bonded_atoms() 

2682 bonded_atoms = [ atom for atom in bonded_atoms if atom.element != 'H' ] 

2683 # In case there is a selection, check followup atoms not to be in the selection 

2684 if selected_atom_indices: 

2685 bonded_atoms = [ atom for atom in bonded_atoms if atom.index in selected_atom_indices ] 

2686 # Check this atom is not a dead end already 

2687 allowed_search_times = comb(len(bonded_atoms), 2) 

2688 if allowed_search_times < 1: 

2689 continue 

2690 for ring in find_rings_recursive(atom_path=[candidate_atom]): 

2691 yield ring 

2692 

2693 # Given an atom selection, get all bonds between these atoms and any other atom in the structure 

2694 # Note that inner bonds between atoms in the selection are discarded 

2695 def get_selection_outer_bonds (self, selection : Selection) -> List[int]: 

2696 # Get bonds from all atoms in the selection 

2697 bonds = set() 

2698 for atom_index in selection.atom_indices: 

2699 atom_bonds = set(self.bonds[atom_index]) 

2700 bonds = bonds.union(atom_bonds) 

2701 # Discard selection atoms from the bonds list to discard inner bonds 

2702 bonds -= set(selection.atom_indices) 

2703 return list(bonds) 

2704 

2705 # Set the type of PTM according to the classification of the bonded residue 

2706 ptm_options = { 

2707 'protein': ValueError('A PTM residue must never be protein'), 

2708 'dna': 'DNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama 

2709 'rna': 'RNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama 

2710 'carbohydrate': 'Glycosilation', 

2711 'fatty': 'Lipidation', 

2712 'steroid': 'Steroid linkage', # DANI: Esto es posible aunque muy raro y no se como se llama 

2713 'ion': Warning('Ion is covalently bonded to protein'), # DANI: esto no es "correcto" pero si habitual 

2714 'solvent': Warning('Solvent is covalently bonded to protein'), # DANI: probablemente sea un error 

2715 'acetyl': 'Acetylation', # Typical N-capping terminals 

2716 'amide': 'Amidation', # Typical C-capping terminals 

2717 'other': Warning('Unknow type of PTM'), # Something not supported 

2718 } 

2719 

2720 # Find Post Translational Modifications (PTM) in the structure 

2721 def find_ptms (self) -> Generator[dict, None, None]: 

2722 # Find bonds between protein and non-protein atoms 

2723 # First get all protein atoms 

2724 protein_selection = self.select_protein() 

2725 if not protein_selection: 

2726 return 

2727 protein_atom_indices = set(protein_selection.atom_indices) # This is used later 

2728 protein_outer_bonds = set(self.get_selection_outer_bonds(protein_selection)) 

2729 non_protein_selection = self.invert_selection(protein_selection) 

2730 if not non_protein_selection: 

2731 return 

2732 non_protein_atom_indices = set(non_protein_selection.atom_indices) 

2733 non_protein_atom_indices_bonded_to_protein = protein_outer_bonds.intersection(non_protein_atom_indices) 

2734 # Get each residue bonded to the protein and based on its 'classification' set the name of the PTM 

2735 for atom_index in non_protein_atom_indices_bonded_to_protein: 

2736 atom = self.atoms[atom_index] 

2737 residue = atom.residue 

2738 residue_classification = residue.get_classification() 

2739 ptm_classification = self.ptm_options[residue_classification] 

2740 # If we found something impossible then raise the error 

2741 if type(ptm_classification) == ValueError: 

2742 print(f'Problematic residue: {residue}') 

2743 raise ptm_classification 

2744 # If we do not know what it is then do tag it as a PTM but print a warning 

2745 if type(ptm_classification) == Warning: 

2746 warn(f'{ptm_classification} -> {residue}') 

2747 continue 

2748 # At this point we have found a valid PTM 

2749 # Find the protein residue linked to this PTM 

2750 atom_bonds = self.bonds[atom_index] 

2751 protein_bond = next((bond for bond in atom_bonds if bond in protein_atom_indices), None) 

2752 if protein_bond == None: 

2753 raise ValueError('There must be at least one protein bond to the current atom') 

2754 protein_residue_index = self.atoms[protein_bond].residue_index 

2755 # Set the PTM 

2756 yield { 'name': ptm_classification, 'residue_index': protein_residue_index } 

2757 

2758 # Ask if the structure has at least one coarse grain atom/residue 

2759 def has_cg (self) -> bool: 

2760 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms) 

2761 

2762### Related functions ### 

2763 

2764# Calculate the distance between two atoms 

2765def calculate_distance (atom_1 : Atom, atom_2 : Atom) -> float: 

2766 squared_distances_sum = 0 

2767 for i in { 'x': 0, 'y': 1, 'z': 2 }.values(): 

2768 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i])**2 

2769 return math.sqrt(squared_distances_sum) 

2770 

2771# Calculate the angle between 3 atoms 

2772# https://stackoverflow.com/questions/35176451/python-code-to-calculate-angle-between-three-point-using-their-3d-coordinates 

2773def calculate_angle (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom) -> float: 

2774 # Get coordinates in numpy arrays, which allows to easily calculate the difference 

2775 a = np.array(atom_1.coords) 

2776 b = np.array(atom_2.coords) 

2777 c = np.array(atom_3.coords) 

2778 # Set the two vectors which make the angle 

2779 ba = a - b 

2780 bc = c - b 

2781 # Calculate the angle between these 2 vectors 

2782 cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc)) 

2783 angle = np.arccos(cosine_angle) 

2784 return np.degrees(angle) 

2785 

2786# Calculate the torsion between 4 atoms 

2787# https://stackoverflow.com/questions/20305272/dihedral-torsion-angle-from-four-points-in-cartesian-coordinates-in-python 

2788def calculate_torsion (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom, atom_4 : Atom) -> float: 

2789 p0 = np.array(atom_1.coords) 

2790 p1 = np.array(atom_2.coords) 

2791 p2 = np.array(atom_3.coords) 

2792 p3 = np.array(atom_4.coords) 

2793 # Ge some vectors 

2794 b0 = -1.0*(p1 - p0) 

2795 b1 = p2 - p1 

2796 b2 = p3 - p2 

2797 # normalize b1 so that it does not influence magnitude of vector 

2798 # rejections that come next 

2799 b1 /= np.linalg.norm(b1) 

2800 # vector rejections 

2801 # v = projection of b0 onto plane perpendicular to b1 

2802 # = b0 minus component that aligns with b1 

2803 # w = projection of b2 onto plane perpendicular to b1 

2804 # = b2 minus component that aligns with b1 

2805 v = b0 - np.dot(b0, b1)*b1 

2806 w = b2 - np.dot(b2, b1)*b1 

2807 # angle between v and w in a plane is the torsion angle 

2808 # v and w may not be normalized but that's fine since tan is y/x 

2809 x = np.dot(v, w) 

2810 y = np.dot(np.cross(b1, v), w) 

2811 return float(np.degrees(np.arctan2(y, x))) 

2812 

2813### Auxiliar functions ### 

2814 

2815# Set a function to get the next letter from an input letter in alphabetic order 

2816def get_next_letter (letter : str) -> str: 

2817 if not letter: 

2818 return 'A' 

2819 if letter == 'z' or letter == 'Z': 

2820 raise InputError("Limit of chain letters has been reached") 

2821 next_letter = chr(ord(letter) + 1) 

2822 return next_letter 

2823 

2824# Given a string with 1 or 2 characters, return a new string with the first letter cap and the second letter not cap (if any) 

2825def first_cap_only (name : str) -> str: 

2826 if len(name) == 1: 

2827 return name.upper() 

2828 first_character = name[0].upper() 

2829 second_character = name[1].lower() 

2830 return first_character + second_character 

2831 

2832# Convert numbers to lower text characters (chr 8020-8029) 

2833lower_numbers = { 

2834 '0': '₀', 

2835 '1': '₁', 

2836 '2': '₂', 

2837 '3': '₃', 

2838 '4': '₄', 

2839 '5': '₅', 

2840 '6': '₆', 

2841 '7': '₇', 

2842 '8': '₈', 

2843 '9': '₉', 

2844} 

2845def get_lower_numbers (numbers_text : str) -> str: 

2846 return ''.join([ lower_numbers[c] for c in numbers_text ]) 

2847 

2848# Set a function to filter lines in PDB content for a specific model 

2849def filter_model (pdb_content : str, model : int) -> str: 

2850 # Make sure the PDB content has multiple models 

2851 # If not, then teturn the whole PDB content ignoring the flag 

2852 generic_model_header_regex = r'\nMODEL\s*[0-9]*\s*\n' 

2853 if not re.search(generic_model_header_regex, pdb_content): 

2854 print(f'PDB content has no models at all so it will be loaded as is (ignored model "{model}").') 

2855 return pdb_content 

2856 # If a model was passed then it means we must filter the PDB content 

2857 filtered_pdb_content = '' 

2858 # Search PDB lines until we find our model's header 

2859 model_header_regex = rf'^MODEL\s*{model}' 

2860 pdb_lines = iter(pdb_content.split('\n')) 

2861 for line in pdb_lines: 

2862 if not re.match(model_header_regex, line): continue 

2863 # If we just found the header 

2864 filtered_pdb_content = line 

2865 break 

2866 # If we did not find the header then stop here 

2867 if not filtered_pdb_content: raise RuntimeError(f'Could not find model "{model}" header') 

2868 # Add every line to the filtered content until we find the tail 

2869 model_footer_regex = r'^ENDMDL' 

2870 for line in pdb_lines: 

2871 filtered_pdb_content += '\n' + line 

2872 if re.match(model_footer_regex, line): return filtered_pdb_content 

2873 # If we did not find the footer then stop here 

2874 raise RuntimeError(f'Could not find model "{model}" footer')