Coverage for model_workflow/utils/structures.py: 87%

1# Main handler of the toolbelt

2import os

3import re

4import math

5import pytraj

6import MDAnalysis

7import numpy as np

8from scipy.special import comb # DANI: Substituye al math.comb porque fué añadido en python 3.8 y nosotros seguimos en 3.7

9from bisect import bisect

11from model_workflow.utils.file import File

12from model_workflow.utils.selections import Selection

13from model_workflow.utils.vmd_spells import get_vmd_selection_atom_indices, get_covalent_bonds

14from model_workflow.utils.mdt_spells import sort_trajectory_atoms

15from model_workflow.utils.auxiliar import InputError, MISSING_BONDS

16from model_workflow.utils.auxiliar import is_imported, residue_name_to_letter, otherwise, warn

17from model_workflow.utils.constants import SUPPORTED_ION_ELEMENTS, SUPPORTED_ELEMENTS

18from model_workflow.utils.constants import STANDARD_COUNTER_CATION_ATOM_NAMES, STANDARD_COUNTER_ANION_ATOM_NAMES

19from model_workflow.utils.constants import STANDARD_SOLVENT_RESIDUE_NAMES, STANDARD_COUNTER_ION_ATOM_NAMES

20from model_workflow.utils.constants import STANDARD_DUMMY_ATOM_NAMES, DUMMY_ATOM_ELEMENT, CG_ATOM_ELEMENT

21from model_workflow.utils.constants import PROTEIN_RESIDUE_NAME_LETTERS, NUCLEIC_RESIDUE_NAME_LETTERS

22from model_workflow.utils.constants import DNA_RESIDUE_NAME_LETTERS, RNA_RESIDUE_NAME_LETTERS

23from model_workflow.utils.constants import FATTY_RESIDUE_NAMES, STEROID_RESIDUE_NAMES

24from model_workflow.utils.type_hints import *

25Coords = Tuple[float, float, float]

28# Set all available chains according to pdb standards

29AVAILABLE_CAPS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',

30 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

31AVAILABLE_LOWS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',

32 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

33AVAILABLE_LETTERS = AVAILABLE_CAPS + AVAILABLE_LOWS

35# Set letters to be found in alphanumerical bases

36hexadecimal_letters = set(AVAILABLE_CAPS[0:6] + AVAILABLE_LOWS[0:6])

37alphanumerical_letters = set(AVAILABLE_CAPS[6:] + AVAILABLE_LOWS[6:])

39# Set the expected number of bonds for each atom according to its element

40coherent_bonds_with_hydrogen = {

41 'H': { 'min': 1, 'max': 1 },

42 'O': { 'min': 1, 'max': 2 },

43 'N': { 'min': 1, 'max': 4 },

44 'C': { 'min': 2, 'max': 4 },

45 'S': { 'min': 1, 'max': 6 },

46 'P': { 'min': 2, 'max': 6 },

47}

48# WARNING: Not deeply tested

49coherent_bonds_without_hydrogen = {

50 'O': { 'min': 0, 'max': 2 },

51 'N': { 'min': 1, 'max': 3 },

52 'C': { 'min': 1, 'max': 3 },

53 'S': { 'min': 1, 'max': 4 },

54 'P': { 'min': 2, 'max': 4 },

55}

57# Set the limit of residue numbers according to PDB format (4 characters, starts count at 1)

58# This means the last number is 9999 and it is equivalent to index 9998

59pdb_last_decimal_residue_index = 9998

61class Atom:

62 """An atom class."""

64 def __init__ (self,

65 name : Optional[str] = None,

66 element : Optional[str] = None,

67 coords : Optional[Coords] = None,

68 ):

69 self.name = name

70 self.element = element

71 self.coords = tuple(coords) if coords else None

72 # Set variables to store references to other related instances

73 # These variables will be set further by the structure

74 self._structure = None

75 self._index = None

76 self._residue_index = None

78 def __repr__ (self):

79 return '<Atom ' + self.name + '>'

81 def __eq__ (self, other):

82 if type(self) != type(other):

83 return False

84 return self._residue_index == other._residue_index and self.name == other.name

86 def __hash__ (self):

87 return hash((self.index))

89 # The parent structure (read only)

90 # This value is set by the structure itself

91 def get_structure (self) -> Optional['Structure']:

92 return self._structure

93 structure = property(get_structure, None, None, "The parent structure (read only)")

95 # The residue index according to parent structure residues (read only)

96 # This value is set by the structure itself

97 def get_index (self) -> Optional[int]:

98 return self._index

99 index = property(get_index, None, None, "The residue index according to parent structure residues (read only)")

100

101 # The atom residue index according to parent structure residues

102 # If residue index is set then make changes in all the structure to make this change coherent

103 def get_residue_index (self) -> int:

104 return self._residue_index

105 def set_residue_index (self, new_residue_index : int):

106 # If the new residue index is the current residue index then do nothing

107 # WARNING: It is important to stop this here or it could delete a residue which is not to be deleted

108 if new_residue_index == self.residue_index:

109 return

110 # If there is not strucutre yet it means the residue is beeing set before the structure

111 # We just save the residue index and wait for the structure to be set

112 if not self.structure:

113 self._residue_index = new_residue_index

114 return

115 # Relational indices are updated through a top-down hierarchy

116 # Affected residues are the ones to update this atom internal residue index

117 current_residue = self.residue

118 current_residue.remove_atom(self)

119 new_residue = self.structure.residues[new_residue_index]

120 new_residue.add_atom(self)

121 residue_index = property(get_residue_index, set_residue_index, None, "The atom residue index according to parent structure residues")

122

123 # The atom residue

124 # If residue is set then make changes in all the structure to make this change coherent

125 def get_residue (self) -> Optional['Residue']:

126 # If there is not strucutre yet it means the atom is beeing set before the structure

127 # In this case it is not possible to get related residues in the structure

128 # It also may happend that the residue is missing because the atom has been set rawly

129 if not self.structure or self.residue_index == None:

130 return None

131 # Get the residue in the structure according to the residue index

132 return self.structure.residues[self.residue_index]

133 def set_residue (self, new_residue : 'Residue'):

134 # Find the new residue index and set it as the atom residue index

135 # Note that the residue must be set in the structure already

136 new_residue_index = new_residue.index

137 if new_residue_index == None:

138 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure')

139 self.set_residue_index(new_residue_index)

140 residue = property(get_residue, set_residue, None, "The atom residue")

141

142 def get_chain_index (self) -> Optional[int]:

143 """Get the atom chain index according to parent structure chains."""

144 # The residue may be missing if the atom has been set rawly

145 if not self.residue:

146 return None

147 return self.residue.chain_index

148

149 def set_chain_index (self, new_chain_index : int):

150 """Set a new chain index.

151 WARNING: You may create a new residue in this process."""

152 # If the new chain index is the current chain index do nothing

153 # WARNING: It is important to stop this here or it could delete a chain which is not to be deleted

154 if new_chain_index == self.chain_index:

155 return

156 # If there is not strucutre yet it means the chain is beeing set before the structure

157 # We just save the chain index and wait for the structure to be set

158 if not self.structure:

159 self._chain_index = new_chain_index

160 return

161 # Relational indices are updated through a top-down hierarchy

162 # Affected chains are the ones to update this residue internal chain index

163 # WARNING: It is critical to find the new chain before removing/adding residues

164 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged

165 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed

166 new_chain = self.structure.chains[new_chain_index]

167 # set the new residue where this atom will be appended

168 new_residue = new_chain.find_residue(self.residue.number, self.residue.icode)

169 # If it does not exist yet then create it

170 # It will be a copy of the current residue (name, number and icode) but in the new chain

171 if not new_residue:

172 new_residue = Residue(self.residue.name, self.residue.number, self.residue.icode)

173 self.structure.set_new_residue(new_residue)

174 new_chain.add_residue(new_residue)

175 # Now remove this atom from its current residue and move it to the new chain residue

176 self.residue.remove_atom(self)

177 new_residue.add_atom(self)

178 chain_index = property(get_chain_index, set_chain_index, None, "The atom chain index according to parent structure chains (read only)")

179

180 # The atom chain (read only)

181 # In order to change the chain it must be changed in the atom residue

182 def get_chain (self) -> Optional['Chain']:

183 # If there is not strucutre yet it means the atom is beeing set before the structure

184 # In this case it is not possible to get related residues in the structure

185 # It also may happend that the chain is missing because the atom has been set rawly

186 if not self.structure or self.chain_index == None:

187 return None

188 # Get the chain in the structure according to the chain index

189 return self.structure.chains[self.chain_index]

190 chain = property(get_chain, None, None, "The atom chain (read only)")

191

192 def get_bonds (self, skip_ions : bool = False, skip_dummies : bool = False) -> Optional[ List[int] ]:

193 """Get indices of other atoms in the structure which are covalently bonded to this atom."""

194 if not self.structure:

195 raise ValueError('The atom has not a structure defined')

196 if self.index == None:

197 raise ValueError('The atom has not an index defined')

198 bonds = self.structure.bonds[self.index]

199 # If the skip ions argument is passed then remove atom indices of supported ion atoms

200 if skip_ions:

201 bonds = list(set(bonds) - self.structure.ion_atom_indices)

202 if skip_dummies:

203 bonds = list(set(bonds) - self.structure.dummy_atom_indices)

204 return bonds

205

206 # Atoms indices of atoms in the structure which are covalently bonded to this atom

207 bonds = property(get_bonds, None, None, 'Atoms indices of atoms in the structure which are covalently bonded to this atom')

208

209 def get_bonded_atoms (self) -> List['Atom']:

210 """Get bonded atoms."""

211 return [ self.structure.atoms[atom_index] for atom_index in self.bonds ]

212

213 def get_selection (self) -> 'Selection':

214 """Generate a selection for this atom."""

215 return Selection([self.index])

216

217 def copy (self) -> 'Atom':

218 """Make a copy of the current atom."""

219 atom_copy = Atom(self.name, self.element, self.coords)

220 atom_copy._structure = self._structure

221 atom_copy._index = self._index

222 atom_copy._residue_index = self._residue_index

223 return atom_copy

224

225 def is_fatty_candidate (self) -> bool:

226 """

227 Check if this atom meets specific criteria:

228 1. It is a carbon

229 2. It is connected only to other carbons and hydrogens

230 3. It is connected to 1 or 2 carbons

231 """

232 # Ignore non carbon atoms

233 if self.element != 'C':

234 return False

235 # Get bonded atom elements

236 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ]

237 # Check only carbon and hydrogen atoms to be bonded

238 if any((element != 'C' and element != 'H') for element in bonded_atoms_elements):

239 return False

240 # Check it is connected to 1 or 2 carbons

241 connected_carbons_count = bonded_atoms_elements.count('C')

242 if connected_carbons_count != 1 and connected_carbons_count != 2:

243 return False

244 return True

245

246 def is_carbohydrate_candidate (self) -> bool:

247 """

248 Check if this atom meets specific criteria:

249 1. It is a carbon

250 2. It is connected only to other carbons, hydrogens or oxygens

251 3. It is connected to 1 or 2 carbons

252 4. It is connected to 1 oxygen

253 """

254 # Ignore non carbon atoms

255 if self.element != 'C':

256 return False

257 # Get bonded atom elements

258 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ]

259 # Check only carbon and hydrogen atoms to be bonded

260 if any((element != 'C' and element != 'H' and element != 'O') for element in bonded_atoms_elements):

261 return False

262 # Check it is connected to 1 or 2 carbons

263 connected_carbons_count = bonded_atoms_elements.count('C')

264 if connected_carbons_count != 1 and connected_carbons_count != 2:

265 return False

266 # Check it is connected to 1 oxygen

267 connected_oxygens_count = bonded_atoms_elements.count('O')

268 if connected_oxygens_count != 1:

269 return False

270 return True

271

272 def is_ion (self) -> bool:

273 """Check if it is an ion by checking if it has no bonds with other atoms."""

274 return len(self.bonds) == 0

275

276 def guess_element (self) -> str:

277 """Guess an atom element from its name and number of bonds."""

278 # If the atom name is among the known dummy atoms then return a standard element for dummy atoms

279 if self.name.upper() in STANDARD_DUMMY_ATOM_NAMES:

280 return DUMMY_ATOM_ELEMENT

281 # If the element is already the "coarse grained" flag then return this very same flag

282 # WARNING: This has to be preset before guessing or the guess may fail

283 if self.is_cg():

284 return CG_ATOM_ELEMENT

285 # If the name is SOD and it is a lonely atom then it is clearly sodium, not sulfur

286 if self.name.upper() == 'SOD' and self.residue.atom_count == 1:

287 return 'Na'

288 # Find a obvios element name in the atom name

289 element = self.get_name_suggested_element()

290 # CA may refer to calcium or alpha carbon, so the number of atoms in the residue is decisive

291 if element == 'Ca' and self.residue.atom_count >= 2:

292 return 'C'

293 # NA may refer to sodium or some ligand nitrogen, so the number of atoms in the residue is decisive

294 if element == 'Na' and self.residue.atom_count >= 2:

295 return 'N'

296 return element

297

298 def get_name_suggested_element (self) -> str:

299 """Guess an atom element from its name only."""

300 # Get the atom name and its characters length

301 name = self.name

302 length = len(name)

303 next_character = None

304 for i, character in enumerate(name):

305 # Get the next character, since element may be formed by 2 letters

306 if i < length - 1:

307 next_character = name[i+1]

308 # If next character is not a string then ignore it

309 if not next_character.isalpha():

310 next_character = None

311 # Try to get all possible matches between the characters and the supported atoms

312 # First letter is always caps

313 character = character.upper()

314 # First try to match both letters together

315 if next_character:

316 # Start with the second letter in caps

317 next_character = next_character.upper()

318 both = character + next_character

319 if both in SUPPORTED_ELEMENTS:

320 return both

321 # Continue with the second letter in lowers

322 next_character = next_character.lower()

323 both = character + next_character

324 if both in SUPPORTED_ELEMENTS:

325 return both

326 # Finally, try with the first character alone

327 if character in SUPPORTED_ELEMENTS:

328 return character

329 raise InputError(f"Cannot guess atom element from atom name '{name}'")

330

331 # Get a standard label

332 def get_label (self) -> str:

333 return f'{self.residue.label}.{self.name} (index {self.index})'

334 # The atom standard label (read only)

335 label = property(get_label, None, None)

336

337 # Ask if the current atom is in coarse grain

338 def is_cg (self) -> bool:

339 return self.element == CG_ATOM_ELEMENT

340

341# A residue

342class Residue:

343 def __init__ (self,

344 name : Optional[str] = None,

345 number : Optional[int] = None,

346 icode : Optional[str] = None,

347 ):

348 self.name = name

349 self.number = number

350 self.icode = icode

351 # Set variables to store references to other related instaces

352 # These variables will be set further by the structure

353 self._structure = None

354 self._index = None

355 self._atom_indices = []

356 self._chain_index = None

357 self._classification = None

358

359 def __repr__ (self):

360 return '<Residue ' + self.name + str(self.number) + (self.icode if self.icode else '') + '>'

361

362 def __eq__ (self, other):

363 if type(self) != type(other):

364 return False

365 return (

366 self._chain_index == other._chain_index and

367 #self.name == other.name and

368 self.number == other.number and

369 self.icode == other.icode

370 )

371

372 def __hash__ (self):

373 # WARNING: This is susceptible to duplicated residues

374 return hash((self._chain_index, self.number, self.icode))

375 # WARNING: This is not susceptible to duplicated residues

376 #return hash(tuple(self._atom_indices))

377

378 # The parent structure (read only)

379 # This value is set by the structure itself

380 def get_structure (self) -> Optional['Structure']:

381 return self._structure

382 structure = property(get_structure, None, None, "The parent structure (read only)")

383

384 # The residue index according to parent structure residues (read only)

385 # This value is set by the structure itself

386 def get_index (self) -> Optional[int]:

387 return self._index

388 def set_index (self, index):

389 # Update residue atoms

390 for atom in self.atoms:

391 atom._residue_index = index

392 # Update residue chain

393 chain_residue_index = self.chain._residue_indices.index(self._index)

394 self.chain._residue_indices[chain_residue_index] = index

395 # Finally update self index

396 self._index = index

397 index = property(get_index, set_index, None, "The residue index according to parent structure residues (read only)")

398

399 # The atom indices according to parent structure atoms for atoms in this residue

400 # If atom indices are set then make changes in all the structure to make this change coherent

401 def get_atom_indices (self) -> List[int]:

402 return self._atom_indices

403 def set_atom_indices (self, new_atom_indices : List[int]):

404 # If there is not strucutre yet it means the residue is beeing set before the structure

405 # We just save atom indices and wait for the structure to be set

406 if not self.structure:

407 self._atom_indices = new_atom_indices

408 return

409 # Update the current atoms

410 for atom in self.atoms:

411 atom._residue_index = None

412 # Update the new atoms

413 for index in new_atom_indices:

414 atom = self.structure.atoms[index]

415 atom._residue_index = self.index

416 # Now new indices are coherent and thus we can save them

417 self._atom_indices = new_atom_indices

418 atom_indices = property(get_atom_indices, set_atom_indices, None, "The atom indices according to parent structure atoms for atoms in this residue")

419

420 # The atoms in this residue

421 # If atoms are set then make changes in all the structure to make this change coherent

422 def get_atoms (self) -> List['Atom']:

423 # If there is not strucutre yet it means the chain is beeing set before the structure

424 # In this case it is not possible to get related atoms in the structure

425 if not self.structure:

426 return []

427 # Get atoms in the structure according to atom indices

428 atoms = self.structure.atoms

429 return [ atoms[atom_index] for atom_index in self.atom_indices ]

430 def set_atoms (self, new_atoms : List['Atom']):

431 # Find indices for new atoms and set their indices as the new atom indices

432 # Note that atoms must be set in the structure already

433 new_atom_indices = []

434 for new_atom in new_atoms:

435 new_atom_index = new_atom.index

436 if new_atom_index == None:

437 raise ValueError('Atom ' + str(new_atom) + ' is not set in the structure')

438 new_atom_indices.append(new_atom_index)

439 self.set_atom_indices(new_atom_indices)

440 atoms = property(get_atoms, set_atoms, None, "The atoms in this residue")

441

442 # Get the number of atoms in the residue

443 def get_atom_count (self) -> int:

444 return len(self.atom_indices)

445 atom_count = property(get_atom_count, None, None, "The number of atoms in the residue (read only)")

446

447 # Add an atom to the residue

448 def add_atom (self, new_atom : 'Atom'):

449 # Insert the new atom index in the list of atom indices keeping the order

450 new_atom_index = new_atom.index

451 sorted_atom_index = bisect(self.atom_indices, new_atom_index)

452 self.atom_indices.insert(sorted_atom_index, new_atom_index)

453 # Update the atom internal index

454 new_atom._residue_index = self.index

455

456 # Remove an atom from the residue

457 def remove_atom (self, current_atom : 'Atom'):

458 # Remove the current atom index from the atom indices list

459 self.atom_indices.remove(current_atom.index) # This index MUST be in the list

460 # If we removed the last atom then this residue must be removed from its structure

461 if len(self.atom_indices) == 0 and self.structure:

462 self.structure.purge_residue(self)

463 # Update the atom internal index

464 current_atom._residue_index = None

465

466 # The residue chain index according to parent structure chains

467 # If chain index is set then make changes in all the structure to make this change coherent

468 def get_chain_index (self) -> int:

469 return self._chain_index

470 def set_chain_index (self, new_chain_index : int):

471 # If the new chain index is the current chain index do nothing

472 # WARNING: It is important to stop this here or it could delete a chain which is not to be deleted

473 if new_chain_index == self.chain_index:

474 return

475 # If there is not strucutre yet it means the chain is beeing set before the structure

476 # We just save the chain index and wait for the structure to be set

477 if not self.structure:

478 self._chain_index = new_chain_index

479 return

480 # Relational indices are updated through a top-down hierarchy

481 # Affected chains are the ones to update this residue internal chain index

482 # WARNING: It is critical to find the new chain before removing/adding residues

483 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged

484 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed

485 new_chain = self.structure.chains[new_chain_index]

486 if self.chain:

487 self.chain.remove_residue(self)

488 new_chain.add_residue(self)

489 chain_index = property(get_chain_index, set_chain_index, None, "The residue chain index according to parent structure chains")

490

491 # The residue chain

492 # If chain is set then make changes in all the structure to make this change coherent

493 def get_chain (self) -> Optional['Chain']:

494 # If there is not strucutre yet then it means the residue is set before the structure

495 # In this case it is not possible to get related chain in the structure

496 if not self.structure or self.chain_index == None:

497 return None

498 # Get the chain in the structure according to the chain index

499 return self.structure.chains[self.chain_index]

500 def set_chain (self, new_chain : Union['Chain', str]):

501 # In case the chain is just a string we must find/create the corresponding chain

502 if type(new_chain) == str:

503 letter = new_chain

504 # Get the residue structure

505 structure = self.structure

506 if not structure:

507 raise ValueError(f'Cannot find the corresponding {new_chain} chain without the structure')

508 # Find if the letter belongs to an already existing chain

509 new_chain = structure.get_chain_by_name(letter)

510 # If the chain does not exist yet then create it

511 if not new_chain:

512 new_chain = Chain(name=letter)

513 structure.set_new_chain(new_chain)

514 # Find the new chain index and set it as the residue chain index

515 # Note that the chain must be set in the structure already

516 new_chain_index = new_chain.index

517 if new_chain_index == None:

518 raise ValueError(f'Chain {new_chain} is not set in the structure')

519 self.set_chain_index(new_chain_index)

520 chain = property(get_chain, set_chain, None, "The residue chain")

521

522 # Get atom indices from atoms bonded to this residue

523 def get_bonded_atom_indices (self) -> List[int]:

524 # Get all bonds among residue atoms

525 all_bonds = []

526 for atom in self.atoms:

527 all_bonds += atom.bonds

528 # Remove self atom indices from the list

529 external_bonds = set(all_bonds) - set(self.atom_indices)

530 return list(external_bonds)

531

532 def get_bonded_atoms (self) -> List['Atom']:

533 """Get atoms bonded to this residue."""

534 return [ self.structure.atoms[atom_index] for atom_index in self.get_bonded_atom_indices() ]

535

536 def get_bonded_residue_indices (self) -> List[int]:

537 """Get residue indices from residues bonded to this residue."""

538 return list(set([ atom.residue_index for atom in self.get_bonded_atoms() ]))

539

540 def get_bonded_residues (self) -> List['Residue']:

541 """Get residues bonded to this residue."""

542 return [ self.structure.residues[residue_index] for residue_index in self.get_bonded_residue_indices() ]

543

544 def is_bonded_with_residue (self, other : 'Residue') -> bool:

545 """Given another residue, check if it is bonded with this residue."""

546 bonded_atom_indices = set(self.get_bonded_atom_indices())

547 if next((index for index in other.atom_indices if index in bonded_atom_indices), None) != None: return True

548 return False

549

550 def get_classification (self) -> str:

551 """

552 Get the residue biochemistry classification.

553

554 WARNING: Note that this logic will not work in a structure without hydrogens.

555

556 Available classifications:

557 - protein

558 - dna

559 - rna

560 - carbohydrate

561 - fatty

562 - steroid

563 - ion

564 - solvent

565 - acetyl

566 - amide

567 - other

568 """

569 # Return the internal value, if any

570 if self._classification:

571 return self._classification

572 # If this is a CG residue then we can not classify it

573 if self.is_cg(): return self.get_classification_by_name()

574 # If we dont have a value then we must classify the residue

575 # -------------------------------------------------------------------------------------------------------

576 # Ions are single atom residues

577 if self.atom_count == 1:

578 self._classification = 'ion'

579 return self._classification

580 # -------------------------------------------------------------------------------------------------------

581 # At this point we need atoms to have elements fixed

582 # WARNING: missing elements would result in silent failure to recognize some classifications

583 if self.structure._fixed_atom_elements == False:

584 self.structure.fix_atom_elements()

585 # Solvent is water molecules

586 # First rely on the residue name

587 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES:

588 self._classification = 'solvent'

589 return self._classification

590 # It may be water with a not known name

591 # Literally check if its a molecule with 3 atoms: 2 hydrogens and 1 oxygen

592 if self.atom_count == 3:

593 atom_elements = [ atom.element for atom in self.atoms ]

594 if atom_elements.count('H') == 2 and atom_elements.count('O') == 1:

595 self._classification = 'solvent'

596 return self._classification

597 # -------------------------------------------------------------------------------------------------------

598 # Protein definition according to vmd:

599 # a residue with atoms named C, N, CA, and O

600 # In our case we accept OC1 and OC2 or OT1 and OT2 instead of O for terminal residues

601 atom_names = set([ atom.name for atom in self.atoms ])

602 if ( all((name in atom_names) for name in ['C', 'N', 'CA']) and (

603 'O' in atom_names or { 'OC1', 'OC2' } <= atom_names or { 'OT1', 'OT2' } <= atom_names

604 )):

605 self._classification = 'protein'

606 return self._classification

607 # -------------------------------------------------------------------------------------------------------

608 # Nucleic acids definition according to vmd:

609 # a residue with atoms named P, O1P, O2P and either O3’, C3’, C4’, C5’, O5’ or O3*, C3*, C4*, C5*, O5*

610 # Apparently it has been fixed so now a residue does not need to be phosphorylated to be considered nucleic

611 # LORE: This included the condition "all((name in atom_names) for name in ['P', 'OP1', 'OP2'])"

612 if (( all((name in atom_names) for name in ["O3'", "C3'", "C4'", "C5'", "O5'"]) or

613 all((name in atom_names) for name in ["O3*", "C3*", "C4*", "C5*", "O5*"]) )):

614 # At this point we know it is nucleic

615 # We must tell the difference between DNA and RNA

616 if "O2'" in atom_names or "O2*" in atom_names:

617 self._classification = 'rna'

618 else:

619 self._classification = 'dna'

620 return self._classification

621 # -------------------------------------------------------------------------------------------------------

622 # To define carbohydrates search for rings made of 1 oxygen and 'n' carbons

623 # WARNING: This logic may fail for some very specific molecules such as furine

624 # LIMITATION: This logic only aims for cyclical carbohydrates. The linear form of carbohydrates is not yet consider

625 rings = list(self.find_rings())

626 for ring in rings:

627 ring_elements = [ atom.element for atom in ring ]

628 # Check the ring has 1 oxygen

629 oxygen_count = ring_elements.count('O')

630 if oxygen_count != 1:

631 continue

632 # Check the rest are only carbon

633 carbon_count = ring_elements.count('C')

634 if carbon_count != len(ring) - 1:

635 continue

636 self._classification = 'carbohydrate'

637 return self._classification

638 # -------------------------------------------------------------------------------------------------------

639 # To define steroids search for 3 6-carbon rings and 1 5-carbon ring (at least)

640 # WARNING: According to this logic some exotical non-steroid molecules may result in false positives

641 num_6_carbon_rings = 0

642 num_5_carbon_rings = 0

643 for ring in rings:

644 ring_elements = [ atom.element for atom in ring ]

645 if any(element != 'C' for element in ring_elements):

646 continue

647 ring_lenght = len(ring)

648 if ring_lenght == 5:

649 num_5_carbon_rings += 1

650 if ring_lenght == 6:

651 num_6_carbon_rings += 1

652 if num_6_carbon_rings >= 3 and num_5_carbon_rings >= 1:

653 self._classification = 'steroid'

654 return self._classification

655 # -------------------------------------------------------------------------------------------------------

656 # To define fatty acids search for a series of carbon atoms connected together (3 at least)

657 # These carbon atoms must be connected only to hydrogen in addition to 1-2 carbon

658 # These carbons must not be bonded in a cycle so check atoms not be repeated in the series

659 bonded_fatty_atoms = []

660 def get_bonded_fatty_atoms_recursive (current_atom : Atom, previous_atom : Optional[Atom] = None) -> bool:

661 # Iterate over the input atom bonded atoms

662 for bonded_atom in current_atom.get_bonded_atoms():

663 # Discard the previous atom to avoid an infinite loop

664 if bonded_atom == previous_atom:

665 continue

666 # If we find an atom which is already in the fatty atoms list then it means we are cycling

667 if bonded_atom in bonded_fatty_atoms:

668 return False

669 # If this is not a fatty atom then discard it

670 if not bonded_atom.is_fatty_candidate():

671 continue

672 # Add the current bonded fatty atom to the list

673 bonded_fatty_atoms.append(bonded_atom)

674 # Find more bonded fatty atoms and add them to list as well

675 if get_bonded_fatty_atoms_recursive(current_atom=bonded_atom, previous_atom=current_atom) == False:

676 return False

677 return True

678 # Now check all atoms and try to set the series until we find one which works

679 already_checked_atoms = []

680 for atom in self.atoms:

681 # If we already checked this atom trough the recursive logic then skip it

682 if atom in already_checked_atoms:

683 continue

684 # If the atom is not a fatty candidate then skip it

685 if not atom.is_fatty_candidate():

686 continue

687 # Now that we have found a suitable candidate atom we start the series

688 bonded_fatty_atoms = [atom]

689 if get_bonded_fatty_atoms_recursive(atom) and len(bonded_fatty_atoms) >= 3:

690 self._classification = 'fatty'

691 return self._classification

692 already_checked_atoms += bonded_fatty_atoms

693 # -------------------------------------------------------------------------------------------------------

694 # Check if it is an acetylation capping terminal

695 if self.name == 'ACE':

696 self._classification = 'acetyl'

697 return self._classification

698 # -------------------------------------------------------------------------------------------------------

699 # Check if it is an N-methyl amide capping terminal

700 if self.name == 'NME':

701 self._classification = 'amide'

702 return self._classification

703 # -------------------------------------------------------------------------------------------------------

704 self._classification = 'other'

705 return self._classification

706

707 # Set an alternative function to "try" to classify the residues according only to its name

708 # This is useful for corase grain residues whose atoms may not reflect the real atoms

709 # WARNING: This logic is very limited and will return "unknown" most of the times

710 # WARNING: This logic relies mostly in atom names, which may be not standard

711 def get_classification_by_name (self) -> str:

712 if self.name in PROTEIN_RESIDUE_NAME_LETTERS:

713 return 'protein'

714 if self.name in DNA_RESIDUE_NAME_LETTERS:

715 return 'dna'

716 if self.name in RNA_RESIDUE_NAME_LETTERS:

717 return 'rna'

718 if self.name in NUCLEIC_RESIDUE_NAME_LETTERS:

719 return 'nucleic'

720 if self.name in FATTY_RESIDUE_NAMES:

721 return 'fatty'

722 if self.name in STEROID_RESIDUE_NAMES:

723 return 'steroid'

724 if self.name in STANDARD_COUNTER_ION_ATOM_NAMES:

725 return 'ion'

726 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES:

727 return 'solvent'

728 # If we do not know what it is

729 return 'unknown'

730

731 # The residue biochemistry classification (read only)

732 classification = property(get_classification, None, None)

733

734 # Generate a selection for this residue

735 def get_selection (self) -> 'Selection':

736 return Selection(self.atom_indices)

737

738 # Make a copy of the current residue

739 def copy (self) -> 'Residue':

740 residue_copy = Residue(self.name, self.number, self.icode)

741 residue_copy._structure = self._structure

742 residue_copy._index = self._index

743 residue_copy._atom_indices = self._atom_indices

744 residue_copy._chain_index = self._chain_index

745 return residue_copy

746

747 # Get the residue equivalent single letter code

748 # Note that this makes sense for aminoacids and nucleotides only

749 # Non recognized residue names return 'X'

750 def get_letter (self) -> str:

751 return residue_name_to_letter(self.name)

752

753 # Get the formula of the residue

754 def get_formula (self) -> str:

755 elements = [ atom.element for atom in self.atoms ]

756 unique_elements = list(set(elements))

757 formula = ''

758 for unique_element in unique_elements:

759 count = elements.count(unique_element)

760 number = get_lower_numbers(str(count)) if count > 1 else ''

761 formula += unique_element + number

762 return formula

763

764 # Find rings in the residue

765 def find_rings (self) -> Generator[ List[Atom], None, None ]:

766 return self.structure.find_rings(selection=self.get_selection())

767

768

769 def split (self,

770 first_residue_atom_indices : List[int],

771 second_residue_atom_indices : List[int],

772 first_residue_name : Optional[str] = None,

773 second_residue_name : Optional[str] = None,

774 first_residue_number : Optional[int] = None,

775 second_residue_number : Optional[int] = None,

776 first_residue_icode : Optional[str] = None,

777 second_residue_icode : Optional[str] = None,

778 ) -> Tuple['Residue', 'Residue']:

779 """

780 Split this residue in 2 residues and return them in a tuple.

781 Keep things coherent in the structure (renumerate all residues below this one).

782 Note that all residue atoms must be covered by the splits.

783 """

784 # This function is expected to be called in a residue with an already set structure

785 if not self.structure:

786 raise InputError('The split function should be called when the residue has an already defined structure')

787 # Make sure all atoms in the residue are covered between both the first and the second residue

788 if set(self.atom_indices) != set(first_residue_atom_indices + second_residue_atom_indices):

789 print('Residue atoms: ' + ', '.join([ str(v) for v in set(self.atom_indices) ]))

790 print('Covered atoms: ' + ', '.join([ str(v) for v in set(first_residue_atom_indices + second_residue_atom_indices) ]))

791 raise InputError('All atom indices must be covered between both the first and the second residue')

792 # Reuse this first residue instead of creating a new one

793 if first_residue_name:

794 self.name = first_residue_name

795 if first_residue_number:

796 self.number = first_residue_number

797 if first_residue_icode:

798 self.icode = first_residue_icode

799 # Set the new second residue

800 _second_residue_name = second_residue_name if second_residue_name else self.name

801 _second_residue_number = second_residue_number if second_residue_number else self.number

802 _second_residue_icode = second_residue_icode if second_residue_icode else get_next_letter(self.icode)

803 second_residue = Residue(_second_residue_name, _second_residue_number, _second_residue_icode)

804 second_residue._structure = self.structure

805 new_residue_index = self.index + 1

806 second_residue._index = new_residue_index

807 # Insert the second residue in the structure residues list right after this residue

808 self.structure.residues.insert(new_residue_index, second_residue)

809 # Set the second residue index

810 # Update the index of all residues which have been shifted after the insertion

811 for residue_index in range(new_residue_index + 1, len(self.structure.residues)):

812 residue = self.structure.residues[residue_index]

813 residue.index = residue_index

814 # Now transfer atoms from residue 1 to residue 2 as it is specified

815 # Note that this will automatically update every atom

816 second_residue.atom_indices = second_residue_atom_indices

817 # Now add the new residue to the chain

818 self.chain.add_residue(second_residue)

819

820 # Parse atom names to atom indices and then call the split function

821 def split_by_atom_names (self,

822 first_residue_atom_names : List[str],

823 second_residue_atom_names : List[str],

824 first_residue_name : Optional[str] = None,

825 second_residue_name : Optional[str] = None,

826 first_residue_number : Optional[int] = None,

827 second_residue_number : Optional[int] = None,

828 first_residue_icode : Optional[str] = None,

829 second_residue_icode : Optional[str] = None,

830 ) -> Tuple['Residue', 'Residue']:

831 # Check all atom names to exist in the residue

832 input_atom_names = set(first_residue_atom_names + second_residue_atom_names)

833 residue_atom_names = set([ atom.name for atom in self.atoms ])

834 if input_atom_names != residue_atom_names:

835 print(self)

836 print(self.atoms)

837 print('Input atom names: ' + ', '.join(input_atom_names))

838 print('Residue atom names: ' + ', '.join(residue_atom_names))

839 raise InputError('All residue atoms must be covered between both the first and the second residue atom names')

840 # Convert atom names to atom indices

841 first_residue_atom_indices = [ self.get_atom_by_name(name).index for name in first_residue_atom_names ]

842 second_residue_atom_indices = [ self.get_atom_by_name(name).index for name in second_residue_atom_names ]

843 # Call the actual split logic

844 return self.split(first_residue_atom_indices, second_residue_atom_indices, first_residue_name,

845 second_residue_name, first_residue_number, second_residue_number, first_residue_icode, second_residue_icode)

846

847 # Get a residue atom given its name

848 def get_atom_by_name (self, atom_name : str) -> 'Atom':

849 return next(( atom for atom in self.atoms if atom.name == atom_name ), None)

850

851 # Get a standard label

852 def get_label (self) -> str:

853 chainname = self.chain.name if self.chain.name.strip() else ''

854 return f'{chainname}{self.number}{self.icode}({self.name})'

855 # The residue standard label (read only)

856 label = property(get_label, None, None)

857

858 # Ask if the current residue is in coarse grain

859 # Note that we asome there may be not hybrid aa/cg residues

860 def is_cg (self) -> bool:

861 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)

862

863# A chain

864class Chain:

865 def __init__ (self,

866 name : Optional[str] = None,

867 classification : Optional[str] = None,

868 ):

869 self.name = name

870 self._classification = classification

871 # Set variables to store references to other related instaces

872 # These variables will be set further by the structure

873 self._structure = None

874 self._index = None

875 self._residue_indices = []

876

877 def __repr__ (self):

878 return '<Chain ' + self.name + '>'

879

880 def __eq__ (self, other):

881 return self.name == other.name

882

883 def __hash__ (self):

884 return hash(self.name)

885

886 # The parent structure (read only)

887 # This value is set by the structure itself

888 def get_structure (self) -> Optional['Structure']:

889 return self._structure

890 structure = property(get_structure, None, None, "The parent structure (read only)")

891

892 # The residue index according to parent structure residues (read only)

893 # This value is set by the structure itself

894 def get_index (self) -> Optional[int]:

895 return self._index

896 # When the index is set all residues are updated with the next chain index

897 def set_index (self, index : int):

898 for residue in self.residues:

899 residue._chain_index = index

900 self._index = index

901 index = property(get_index, set_index, None, "The residue index according to parent structure residues (read only)")

902

903 # The residue indices according to parent structure residues for residues in this chain

904 # If residue indices are set then make changes in all the structure to make this change coherent

905 def get_residue_indices (self) -> List[int]:

906 return self._residue_indices

907 def set_residue_indices (self, new_residue_indices : List[int]):

908 # If there is not strucutre yet it means the chain is beeing set before the structure

909 # We just save residue indices and wait for the structure to be set

910 if not self.structure:

911 self._residue_indices = new_residue_indices

912 return

913 # Update the current residues

914 for residue in self.residues:

915 residue._chain_index = None

916 # Update the new residues

917 for index in new_residue_indices:

918 residue = self.structure.residues[index]

919 residue._chain_index = self.index

920 # In case the new residue indices list is empty this chain must be removed from its structure

921 if len(new_residue_indices) == 0:

922 self.structure.purge_chain(self)

923 # Now new indices are coherent and thus we can save them

924 self._residue_indices = new_residue_indices

925 residue_indices = property(get_residue_indices, set_residue_indices, None, "The residue indices according to parent structure residues for residues in this residue")

926

927 # The residues in this chain

928 # If residues are set then make changes in all the structure to make this change coherent

929 def get_residues (self) -> List['Residue']:

930 # If there is not strucutre yet it means the chain is beeing set before the structure

931 # In this case it is not possible to get related residues in the structure

932 if not self.structure:

933 return []

934 # Get residues in the structure according to residue indices

935 residues = self.structure.residues

936 return [ residues[residue_index] for residue_index in self.residue_indices ]

937

938 def set_residues (self, new_residues : List['Residue']):

939 """Find indices for new residues and set their indices as the new residue indices.

940 Note that residues must be set in the structure already."""

941 new_residue_indices = []

942 for new_residue in new_residues:

943 new_residue_index = new_residue.index

944 if new_residue_index == None:

945 raise ValueError(f'Residue {new_residue} is not set in the structure')

946 new_residue_indices.append(new_residue_index)

947 self.set_residue_indices(new_residue_indices)

948 residues = property(get_residues, set_residues, None, "The residues in this chain")

949

950 # Add a residue to the chain

951 def add_residue (self, residue : 'Residue'):

952 # Insert the new residue index in the list of residue indices keeping the order

953 sorted_residue_index = bisect(self.residue_indices, residue.index)

954 self.residue_indices.insert(sorted_residue_index, residue.index)

955 # Update the residue internal chain index

956 residue._chain_index = self.index

957

958 # Remove a residue from the chain

959 # WARNING: Note that this function does not trigger the set_residue_indices

960 def remove_residue (self, residue : 'Residue'):

961 self.residue_indices.remove(residue.index) # This index MUST be in the list

962 # If we removed the last residue then this chain must be removed from its structure

963 if len(self.residue_indices) == 0 and self.structure:

964 self.structure.purge_chain(self)

965 # Update the residue internal chain index

966 residue._chain_index = None

967

968 # Atom indices for all atoms in the chain (read only)

969 # In order to change atom indices they must be changed in their corresponding residues

970 def get_atom_indices (self) -> List[int]:

971 return sum([ residue.atom_indices for residue in self.residues ], [])

972 atom_indices = property(get_atom_indices, None, None, "Atom indices for all atoms in the chain (read only)")

973

974 # Atoms in the chain (read only)

975 # In order to change atoms they must be changed in their corresponding residues

976 def get_atoms (self) -> List[int]:

977 return sum([ residue.atoms for residue in self.residues ], [])

978 atoms = property(get_atoms, None, None, "Atoms in the chain (read only)")

979

980 # Get the number of atoms in the chain (read only)

981 def get_atom_count (self) -> int:

982 return len(self.atom_indices)

983 atom_count = property(get_atom_count, None, None, "Number of atoms in the chain (read only)")

984

985 # Get the number of residues in the chain (read only)

986 def get_residue_count (self) -> int:

987 return len(self._residue_indices)

988 residue_count = property(get_residue_count, None, None, "Number of residues in the chain (read only)")

989

990 # Get the chain classification

991 def get_classification (self) -> str:

992 if self._classification:

993 return self._classification

994 self._classification = self.structure.get_selection_classification(self.get_selection())

995 return self._classification

996

997 # Force the chain classification

998 def set_classification (self, classification : str):

999 self._classification = classification

1000

1001 # Chain classification

1002 classification = property(get_classification, set_classification, None, "Classification of the chain (manual or automatic)")

1003

1004 # Get the residues sequence in one-letter code

1005 def get_sequence (self) -> str:

1006 return ''.join([ residue_name_to_letter(residue.name) for residue in self.residues ])

1007

1008 # Generate a selection for this chain

1009 def get_selection (self) -> 'Selection':

1010 return Selection(self.atom_indices)

1011

1012 # Make a copy of the current chain

1013 def copy (self) -> 'Chain':

1014 chain_copy = Chain(self.name)

1015 chain_copy._structure = self._structure

1016 chain_copy._index = self._index

1017 chain_copy.residue_indices = self.residue_indices

1018 return chain_copy

1019

1020 # Ask if the current chain has at least one coarse grain atom/residue

1021 def has_cg (self) -> bool:

1022 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)

1023

1024 # Find a residue by its number and insertion code

1025 def find_residue (self, number : int, icode : str = '') -> Optional['Residue']:

1026 # Iterate chain residues

1027 for residue in self.residues:

1028 if residue.number == number and residue.icode == icode:

1029 return residue

1030 return None

1031

1032# A structure is a group of atoms organized in chains and residues

1033class Structure:

1034 def __init__ (self,

1035 atoms : List['Atom'] = [],

1036 residues : List['Residue'] = [],

1037 chains : List['Chain'] = [],

1038 residue_numeration_base : int = 10,

1039 ):

1040 self.atoms = []

1041 self.residues = []

1042 self.chains = []

1043 # Set references between instances

1044 for atom in atoms:

1045 self.set_new_atom(atom)

1046 for residue in residues:

1047 self.set_new_residue(residue)

1048 for chain in chains:

1049 self.set_new_chain(chain)

1050 # --- Set other internal variables ---

1051 # Set bonds between atoms

1052 self._bonds = None

1053 # Set fragments of bonded atoms

1054 self._fragments = None

1055 # --- Set other auxiliar variables ---

1056 # Trajectory atom sorter is a function used to sort coordinates in a trajectory file

1057 # This function is generated when sorting indices in the structure

1058 # Also the new atom order is saved

1059 self.trajectory_atom_sorter = None

1060 self.new_atom_order = None

1061 # Track when atom elements have been fixed

1062 self._fixed_atom_elements = False

1063 # Save indices of supported ion atoms

1064 self._ion_atom_indices = None

1065 self._dummy_atom_indices = None

1066 # Set the scale of the residue numbers

1067 # It may be decimal (10), hexadecimal(16) or alphanumeric(36)

1068 # Note that hybrid scales (1-9999 decimal and different further) are not explicitly supported

1069 # However, the scale is guessed on read and conserved on write, so the original numeration would be conserved

1070 # The default values is used only when the structure is not read from a PDB

1071 self.residue_numeration_base = residue_numeration_base

1072

1073 def __repr__ (self):

1074 return f'<Structure ({self.atom_count} atoms)>'

1075

1076 # The bonds between atoms

1077 def get_bonds (self) -> List[ List[int] ]:

1078 # Return the stored value, if exists

1079 if self._bonds:

1080 return self._bonds

1081 # If not, we must calculate the bonds using vmd

1082 self._bonds = self.get_covalent_bonds()

1083 return self._bonds

1084 # Force specific bonds

1085 def set_bonds (self, bonds : List[ List[int] ]):

1086 self._bonds = bonds

1087 # Reset fragments

1088 self._fragments = None

1089 bonds = property(get_bonds, set_bonds, None, "The structure bonds")

1090

1091 # Get the groups of atoms which are covalently bonded

1092 def get_fragments (self) -> List['Selection']:

1093 # Return the stored value, if exists

1094 if self._fragments != None:

1095 return self._fragments

1096 # Otherwise, find fragments in all structure atoms

1097 self._fragments = list(self.find_fragments())

1098 return self._fragments

1099 # Fragments of covalently bonded atoms (read only)

1100 fragments = property(get_fragments, None, None, "The structure fragments (read only)")

1101

1102 # Find fragments* in a selection of atoms

1103 # * A fragment is a selection of colvalently bonded atoms

1104 # All atoms are searched if no selection is provided

1105 # WARNING: Note that fragments generated from a specific selection may not match the structure fragments

1106 # A selection including 2 separated regions of a structure fragment will yield 2 fragments

1107 def find_fragments (self, selection : Optional['Selection'] = None) -> Generator['Selection', None, None]:

1108 # If there is no selection we consider all atoms

1109 if not selection:

1110 selection = self.select_all()

1111 # Get/Find covalent bonds between atoms in a new object avoid further corruption (deletion) of the original list

1112 atom_indexed_covalent_bonds = { atom_index: [ *self.bonds[atom_index] ] for atom_index in selection.atom_indices }

1113 # Group the connected atoms in "fragments"

1114 while len(atom_indexed_covalent_bonds) > 0:

1115 start_atom_index, bonds = next(iter(atom_indexed_covalent_bonds.items()))

1116 del atom_indexed_covalent_bonds[start_atom_index]

1117 fragment_atom_indices = [ start_atom_index ]

1118 while len(bonds) > 0:

1119 # Get the next bond atom and remove it from the bonds list

1120 next_atom_index = bonds[0]

1121 bonds.remove(next_atom_index)

1122 next_bonds = atom_indexed_covalent_bonds.get(next_atom_index, None)

1123 # If this atom is out of the selection then skip it

1124 if next_bonds == None:

1125 continue

1126 next_new_bonds = [ bond for bond in next_bonds if bond not in fragment_atom_indices + bonds ]

1127 bonds += next_new_bonds

1128 fragment_atom_indices.append(next_atom_index)

1129 del atom_indexed_covalent_bonds[next_atom_index]

1130 yield Selection(fragment_atom_indices)

1131

1132 # Given a selection of atoms, find all whole structure fragments on them

1133 def find_whole_fragments (self, selection : 'Selection') -> Generator['Selection', None, None]:

1134 for fragment in self.fragments:

1135 if selection & fragment:

1136 yield fragment

1137

1138 def name_selection (self, selection : 'Selection') -> str:

1139 """Name an atom selection depending on the chains it contains. This is used for debug purpouses."""

1140 atoms = [ self.atoms[index] for index in selection.atom_indices ]

1141 # Count atoms per chain

1142 atom_count_per_chain = { chain: 0 for chain in self.chains }

1143 for atom in atoms:

1144 atom_count_per_chain[atom.chain] += 1

1145 # Set labels accoridng to the coverage of every chain

1146 chain_labels = []

1147 for chain, atom_count in atom_count_per_chain.items():

1148 if atom_count == 0: continue

1149 coverage = atom_count / chain.atom_count

1150 label = f'chain {chain.name}'

1151 if coverage < 1:

1152 percent = round(coverage * 1000) / 10

1153 label += f' ({percent}%)'

1154 chain_labels.append(label)

1155 return ', '.join(chain_labels)

1156

1157 # Set a new atom in the structure

1158 def set_new_atom (self, atom : 'Atom'):

1159 atom._structure = self

1160 new_atom_index = self.atom_count

1161 self.atoms.append(atom)

1162 atom._index = new_atom_index

1163

1164 # Set a new residue in the structure

1165 # WARNING: Atoms must be set already before setting residues

1166 def set_new_residue (self, residue : 'Residue'):

1167 residue._structure = self

1168 new_residue_index = len(self.residues)

1169 self.residues.append(residue)

1170 residue._index = new_residue_index

1171 # In case the residue has atom indices, set relational indices on each atom

1172 for atom_index in residue.atom_indices:

1173 atom = self.atoms[atom_index]

1174 atom._residue_index = new_residue_index

1175

1176 # Purge residue from the structure and its chain

1177 # This can be done only when the residue has no atoms left in the structure

1178 # Renumerate all residue indices which have been offsetted as a result of the purge

1179 def purge_residue (self, residue : 'Residue'):

1180 # Check the residue can be purged

1181 if residue not in self.residues:

1182 raise ValueError(f'{residue} is not in the structure already')

1183 if len(residue.atom_indices) > 0:

1184 raise ValueError(f'{residue} is still having atoms and thus it cannot be purged')

1185 # Remove the purged residue from its chain

1186 if residue.chain: residue.chain.remove_residue(residue)

1187 # Get the current index of the residue to be purged

1188 purged_index = residue.index

1189 # Residues and their atoms below this index are not to be modified

1190 # Residues and their atoms over this index must be renumerated

1191 for affected_residue in self.residues[purged_index+1:]:

1192 # Chaging the index automatically changes all residues atoms '_residue_index' values

1193 # Chaging the index automatically changes its corresponding index in residue chain '_residue_indices'

1194 affected_residue.index -= 1

1195 # Finally, remove the current residue from the list of residues in the structure

1196 del self.residues[purged_index]

1197

1198 # Set a new chain in the structure

1199 # WARNING: Residues and atoms must be set already before setting chains

1200 def set_new_chain (self, chain : 'Chain'):

1201 chain._structure = self

1202 new_chain_index = len(self.chains)

1203 self.chains.append(chain)

1204 chain._index = new_chain_index

1205 # In case the chain has residue indices, set relational indices on each residue

1206 for residue_index in chain.residue_indices:

1207 residue = self.residues[residue_index]

1208 residue._chain_index = new_chain_index

1209

1210 # Purge chain from the structure

1211 # This can be done only when the chain has no residues left in the structure

1212 # Renumerate all chain indices which have been offsetted as a result of the purge

1213 def purge_chain (self, chain : 'Chain'):

1214 # Check the chain can be purged

1215 if chain not in self.chains:

1216 raise ValueError(f'Chain {chain.name} is not in the structure already')

1217 if len(chain.residue_indices) > 0:

1218 raise ValueError(f'Chain {chain.name} is still having residues and thus it cannot be purged')

1219 # Get the current index of the chain to be purged

1220 purged_index = chain.index

1221 # Chains and their residues below this index are not to be modified

1222 # Chains and their residues over this index must be renumerated

1223 for affected_chain in self.chains[purged_index+1:]:

1224 # Chaging the index automatically changes all chain residues '_chain_index' values

1225 affected_chain.index -= 1

1226 # Finally, remove the current chain from the list of chains in the structure

1227 del self.chains[purged_index]

1228

1229 # Set the structure from a pdb file

1230 # You may filter the PDB content for a specific model

1231 # Some weird numeration systems are not supported and, when encountered, they are ignored

1232 # In these cases we set our own numeration system

1233 # Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead

1234 @classmethod

1235 def from_pdb (cls, pdb_content : str, model : Optional[int] = None, flexible_numeration : bool = True):

1236 # Filter the PDB content in case a model was passed

1237 filtered_pdb_content = filter_model(pdb_content, model) if model else pdb_content

1238 # Split the PDB content in lines

1239 pdb_lines = filtered_pdb_content.split('\n')

1240 # Before we start, we must guess the numeration system

1241 # To do so mine all residue numbers

1242 all_residue_number_characters = set()

1243 for line in pdb_lines:

1244 # Parse atoms only

1245 start = line[0:6]

1246 is_atom = start == 'ATOM ' or start == 'HETATM'

1247 if not is_atom: continue

1248 # Mine all resiude numbers

1249 residue_number = line[22:26]

1250 for character in residue_number:

1251 all_residue_number_characters.add(character)

1252 # Remove white spaces

1253 all_residue_number_characters.discard(' ')

1254 # If we find a non-numerical and non-alphabetical character then we assume it has a weird numeration system

1255 # Since we can not support every scenario, in this cases we set the numeration totally ignoring the one in the PDB

1256 weird_character = next((character for character in all_residue_number_characters if not (character.isalnum() or character == '-')), None)

1257 if weird_character:

1258 if flexible_numeration == False: raise InputError(f'Not supported numeration system including "{weird_character}" characters')

1259 warn(f'Weird residue numeration including "{weird_character}" characters found in the PDB file. Ignoring it.')

1260 residue_numeration_base = None

1261 # Search among all resiude numbers any letters (non-numerical characters)

1262 elif next((letter for letter in alphanumerical_letters if letter in all_residue_number_characters), None):

1263 residue_numeration_base = 36

1264 elif next((letter for letter in hexadecimal_letters if letter in all_residue_number_characters), None):

1265 residue_numeration_base = 16

1266 else:

1267 residue_numeration_base = 10

1268 # Read the pdb content line by line and set the parsed atoms, residues and chains

1269 parsed_atoms = []

1270 parsed_residues = []

1271 parsed_chains = []

1272 # Save chains and residues also in dictionaries only to find them faster

1273 name_chains = {}

1274 label_residues = {}

1275 # Keep track of the last issued atom and residue indices

1276 atom_index = -1

1277 residue_index = -1

1278 for line in pdb_lines:

1279 # Parse atoms only

1280 start = line[0:6]

1281 is_atom = start == 'ATOM ' or start == 'HETATM'

1282 if not is_atom:

1283 continue

1284 # Mine all atom data

1285 atom_name = line[11:16].strip()

1286 residue_name = line[17:21].strip()

1287 chain_name = line[21:22]

1288 residue_number = line[22:26]

1289 icode = line[26:27]

1290 if icode == ' ':

1291 icode = ''

1292 x_coord = float(line[30:38])

1293 y_coord = float(line[38:46])

1294 z_coord = float(line[46:54])

1295 element = line[76:78].strip()

1296 # Set the parsed atom

1297 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord))

1298 # Add the parsed atom to the list and update the current atom index

1299 parsed_atoms.append(parsed_atom)

1300 # Get the parsed chain

1301 # DANI: If we always check for an already existing chain there will never be repeated chains

1302 # DANI: However we may create chains with unconnected atoms silently (they were separated in the PDB)

1303 # DANI: For this reason we must only consider the last processed chain

1304 parsed_chain = name_chains.get(chain_name, None)

1305 if parsed_chain and parsed_chain != parsed_chains[-1]:

1306 parsed_chain = None

1307 # If the parsed chain was not yet instantiated then do it now

1308 if not parsed_chain:

1309 parsed_chain = Chain(name=chain_name)

1310 parsed_chains.append(parsed_chain)

1311 name_chains[chain_name] = parsed_chain

1312 # Get the parsed residue

1313 residue_label = (chain_name, residue_number, icode)

1314 # DANI: If we always check for an already existing residue there will never be repeated residues

1315 # DANI: However we may create residues with unconnected atoms silently (they were separated in the PDB)

1316 # DANI: For this reason we must only consider the last processed residue

1317 parsed_residue = label_residues.get(residue_label, None)

1318 if parsed_residue and parsed_residue != parsed_residues[-1]:

1319 parsed_residue = None

1320 # If the parsed residue was not yet instantiated then do it now

1321 if not parsed_residue:

1322 # Parse the residue number if it is to be parsed

1323 if residue_numeration_base:

1324 parsed_residue_number = int(residue_number, residue_numeration_base)

1325 # If we decided to ignore the numeration system then we just issue a new residue number

1326 # Use the last residue number from the current chain as reference

1327 else:

1328 chain_last_residue_index = parsed_chain.residue_indices[-1] if len(parsed_chain.residue_indices) > 0 else None

1329 chain_last_residue_number = parsed_residues[chain_last_residue_index].number if chain_last_residue_index != None else 0

1330 parsed_residue_number = chain_last_residue_number + 1

1331 # Instantiate the new parsed residue

1332 parsed_residue = Residue(name=residue_name, number=parsed_residue_number, icode=icode)

1333 parsed_residues.append(parsed_residue)

1334 label_residues[residue_label] = parsed_residue

1335 # Add current residue to the parsed chain

1336 residue_index += 1

1337 parsed_chain.residue_indices.append(residue_index)

1338 # Add current atom to the parsed residue

1339 atom_index += 1

1340 parsed_residue.atom_indices.append(atom_index)

1341 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains, residue_numeration_base=residue_numeration_base)

1342

1343 # Set the structure from a pdb file

1344 # You may filter the input PDB file for a specific model

1345 # Some weird numeration systems are not supported and, when encountered, they are ignored

1346 # In these cases we set our own numeration system

1347 # Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead

1348 @classmethod

1349 def from_pdb_file (cls, pdb_filepath : str, model : Optional[int] = None, flexible_numeration : bool = True):

1350 pdb_file = File(pdb_filepath)

1351 if not pdb_file.exists:

1352 raise InputError(f'File "{pdb_filepath}" not found')

1353 if not pdb_file.format == 'pdb':

1354 raise InputError(f'"{pdb_filepath}" is not a path for a pdb file')

1355 # Read the pdb file

1356 pdb_content = None

1357 with open(pdb_filepath, 'r') as file:

1358 pdb_content = file.read()

1359 return cls.from_pdb(pdb_content, model, flexible_numeration)

1360

1361 # Set the structure from mmcif

1362 # https://biopandas.github.io/biopandas/tutorials/Working_with_mmCIF_Structures_in_DataFrames/

1363 # You may filter the content for a specific model

1364 # You may ask for the author notation instead of the standarized notation for legacy reasons

1365 # This may have an effect in atom names, residue names, residue numbers and chain names

1366 @classmethod

1367 def from_mmcif (cls, mmcif_content : str, model : Optional[int] = None, author_notation : bool = False):

1368 # Read the pdb content line by line and set the parsed atoms, residues and chains

1369 parsed_atoms = []

1370 parsed_residues = []

1371 parsed_chains = []

1372 # Save chains and residues also in dictionaries only to find them faster

1373 name_chains = {}

1374 label_residues = {}

1375 # Keep track of the last issued atom and residue indices

1376 atom_index = -1

1377 residue_index = -1

1378 # Iterate the content line by line

1379 lines = iter(mmcif_content.split('\n'))

1380 # Now mine atoms data

1381 atom_headers = { 'ATOM', 'HETATM', 'ANISOU' }

1382 for line in lines:

1383 # Values are separated by spaces

1384 values = line.split()

1385 # If this is not atom line then we are done

1386 if len(values) == 0 or values[0] not in atom_headers: continue

1387 # Get the atom model number if a model number was passed

1388 # Note that then model is a number greater than 0

1389 if model:

1390 model_number = int(values[20])

1391 # If the model number odes not match then skip it

1392 if model != model_number: continue

1393 # Mine atom data

1394 # Next value is just the atom number, we do not need it

1395 # atom_number = values[1]

1396 # Mine the atom element

1397 element = values[2]

1398 # Mine the atom name

1399 atom_name = values[3].replace('"','')

1400 # Next value is a place holder to indicate alternate conformation, according to the docs

1401 # I don't know what this is but we do not need it

1402 # idk = values[4]

1403 # Mine the residue name

1404 residue_name = values[5]

1405 # Mine the chain name

1406 chain_name = values[6]

1407 # Next value is the chain number, we do not need it

1408 # chain_number = values[7]

1409 # Residue number is '.' for chains with only one residue

1410 residue_number = 1 if values[8] == '.' else int(values[8])

1411 icode = '' if values[9] == '?' else values[9]

1412 x_coord = float(values[10])

1413 y_coord = float(values[11])

1414 z_coord = float(values[12])

1415 # Next value is the occupancy, we do not need it

1416 # occupancy = float(values[13])

1417 # Next value is the isotropic displacement, we do not need it

1418 # isotropic = float(values[14])

1419 # Next value is the charge, we do not need it

1420 # charge = None if values[15] == '?' else float(values[15])

1421 # The rest of values are alternative author values

1422 if author_notation:

1423 residue_number = int(values[16])

1424 residue_name = values[17]

1425 chain_name = values[18]

1426 atom_name = values[19].replace('"','')

1427 # Set the parsed atom

1428 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord))

1429 # Add the parsed atom to the list and update the current atom index

1430 parsed_atoms.append(parsed_atom)

1431 # Get the parsed chain

1432 parsed_chain = name_chains.get(chain_name, None)

1433 # If the parsed chain was not yet instantiated then do it now

1434 if not parsed_chain:

1435 parsed_chain = Chain(name=chain_name)

1436 parsed_chains.append(parsed_chain)

1437 name_chains[chain_name] = parsed_chain

1438 # Get the parsed residue

1439 residue_label = (chain_name, residue_number, icode)

1440 parsed_residue = label_residues.get(residue_label, None)

1441 # If the parsed residue was not yet instantiated then do it now

1442 if not parsed_residue:

1443 # Instantiate the new parsed residue

1444 parsed_residue = Residue(name=residue_name, number=residue_number, icode=icode)

1445 parsed_residues.append(parsed_residue)

1446 label_residues[residue_label] = parsed_residue

1447 # Add current residue to the parsed chain

1448 residue_index += 1

1449 parsed_chain.residue_indices.append(residue_index)

1450 # Add current atom to the parsed residue

1451 atom_index += 1

1452 parsed_residue.atom_indices.append(atom_index)

1453 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)

1454

1455 # Set the structure from a mmcif file

1456 @classmethod

1457 def from_mmcif_file (cls, mmcif_filepath : str, model : Optional[int] = None, author_notation : bool = False):

1458 mmcif_file = File(mmcif_filepath)

1459 if not mmcif_file.exists:

1460 raise InputError(f'File "{mmcif_filepath}" not found')

1461 if not mmcif_file.format == 'cif':

1462 raise InputError(f'"{mmcif_filepath}" is not a path for a mmcif file')

1463 # Read the mmcif file

1464 mmcif_content = None

1465 with open(mmcif_filepath, 'r') as file:

1466 mmcif_content = file.read()

1467 return cls.from_mmcif(mmcif_content, model, author_notation)

1468

1469 # Set the structure from an MD analysis object

1470 @classmethod

1471 def from_mdanalysis (cls, mdanalysis_universe):

1472 # Set the final list of atoms to be included in the structure

1473 parsed_atoms = []

1474 parsed_residues = []

1475 parsed_chains = []

1476 # Setup atoms

1477 for atom in mdanalysis_universe.atoms:

1478 name = atom.name

1479 # WARNING: MDAnalysis cannot handle reading an element of an atom with no element

1480 element = atom.element if hasattr(atom, 'element') else '?'

1481 #coords = atom.position # DANI: Esto da error si no hay coordenadas

1482 parsed_atom = Atom(name=name, element=element)

1483 parsed_atoms.append(parsed_atom)

1484 # Setup residues

1485 for residue in mdanalysis_universe.residues:

1486 name = residue.resname

1487 number = residue.resnum

1488 icode = residue.icode if hasattr(residue, 'icode') else None # DANI: No se ha provado

1489 parsed_residue = Residue(name=name, number=number, icode=icode)

1490 atom_indices = [ atom.index for atom in residue.atoms ]

1491 parsed_residue.atom_indices = atom_indices

1492 parsed_residues.append(parsed_residue)

1493 # Setup chains

1494 for segment in mdanalysis_universe.segments:

1495 name = segment.segid

1496 parsed_chain = Chain(name=name)

1497 residue_indices = [ residue.resindex for residue in segment.residues ]

1498 parsed_chain.residue_indices = residue_indices

1499 parsed_chains.append(parsed_chain)

1500 # Setup the structure

1501 structure = cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)

1502 # Add bonds and charges which are also available in a topology

1503 atom_count = len(mdanalysis_universe.atoms)

1504 atom_bonds = [ [] for i in range(atom_count) ]

1505 for bond in mdanalysis_universe.bonds:

1506 a,b = bond.indices

1507 # Make sure atom indices are regular integers so they are JSON serializables

1508 atom_bonds[a].append(int(b))

1509 atom_bonds[b].append(int(a))

1510 structure.bonds = atom_bonds

1511 structure.charges = list(mdanalysis_universe._topology.charges.values)

1512 return structure

1513

1514 # Set the structure from a prmtop file

1515 @classmethod

1516 def from_prmtop_file (cls, prmtop_filepath : str):

1517 # Make sure the input file exists and has the right format

1518 prmtop_file = File(prmtop_filepath)

1519 if not prmtop_file.exists:

1520 raise InputError(f'File "{prmtop_filepath}" not found')

1521 if not prmtop_file.format == 'prmtop':

1522 raise InputError(f'"{prmtop_filepath}" is not a name for a prmtop file')

1523 # In we do not have mdanalysis in our environment then we cannot proceed

1524 if not is_imported('MDAnalysis'):

1525 raise InputError('Missing dependency error: MDAnalysis')

1526 # Parse the topology using MDanalysis

1527 parser = MDAnalysis.topology.TOPParser.TOPParser(prmtop_filepath)

1528 mdanalysis_topology = parser.parse()

1529 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology)

1530 return cls.from_mdanalysis(mdanalysis_universe)

1531

1532 # Set the structure from a tpr file

1533 @classmethod

1534 def from_tpr_file (cls, tpr_filepath : str):

1535 # Make sure the input file exists and has the right format

1536 tpr_file = File(tpr_filepath)

1537 if not tpr_file.exists:

1538 raise InputError(f'File "{tpr_filepath}" not found')

1539 if not tpr_file.format == 'tpr':

1540 raise InputError(f'"{tpr_filepath}" is not a name for a tpr file')

1541 # In we do not have mdanalysis in our environment then we cannot proceed

1542 if not is_imported('MDAnalysis'):

1543 raise InputError('Missing dependency error: MDAnalysis')

1544 # Parse the topology using MDanalysis

1545 parser = MDAnalysis.topology.TPRParser.TPRParser(tpr_filepath)

1546 mdanalysis_topology = parser.parse()

1547 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology)

1548 return cls.from_mdanalysis(mdanalysis_universe)

1549

1550 # Set the structure from a file if the file format is supported

1551 @classmethod

1552 def from_file (cls, mysterious_filepath : str):

1553 mysterious_file = File(mysterious_filepath)

1554 if mysterious_file.format == 'pdb':

1555 return cls.from_pdb_file(mysterious_file.path)

1556 if mysterious_file.format == 'cif':

1557 return cls.from_mmcif_file(mysterious_file.path)

1558 if mysterious_file.format == 'prmtop':

1559 return cls.from_prmtop_file(mysterious_file.path)

1560 if mysterious_file.format == 'tpr':

1561 return cls.from_tpr_file(mysterious_file.path)

1562 raise InputError(f'Not supported format ({mysterious_file.format}) to setup a Structure')

1563

1564 # Get the number of atoms in the structure

1565 def get_atom_count (self) -> int:

1566 return len(self.atoms)

1567 atom_count = property(get_atom_count, None, None, "The number of atoms in the structure (read only)")

1568

1569 # Get the number of residues in the structure (read only)

1570 def get_residue_count (self) -> int:

1571 return len(self.residues)

1572 residue_count = property(get_residue_count, None, None, "Number of residues in the structure (read only)")

1573

1574 # Get the number of chains in the structure (read only)

1575 def get_chain_count (self) -> int:

1576 return len(self.chains)

1577 chain_count = property(get_chain_count, None, None, "Number of chains in the structure (read only)")

1578

1579 # Fix atom elements by guessing them when missing

1580 # Set all elements with the first letter upper and the second (if any) lower

1581 # Also check if atom elements are coherent with atom names

1582 # If 'trust' is set as False then we impose elements according to what we can guess from the atom name

1583 # Return True if any element was modified or False if not

1584 def fix_atom_elements (self, trust : bool = True) -> bool:

1585 modified = False

1586 added = False

1587 # Save the wrong guesses for a final report

1588 # This way we do not crowd the terminal with logs when a lot of atoms are affected

1589 reports = {}

1590 for atom in self.atoms:

1591 # Make sure elements have the first letter cap and the second letter not cap

1592 if atom.element:

1593 new_element = first_cap_only(atom.element)

1594 if atom.element != new_element:

1595 atom.element = new_element

1596 modified = True

1597 # Check the element to match what we would guess from the atom name

1598 # In case it does not just warn the user

1599 guess = atom.guess_element()

1600 if atom.element != guess:

1601 report = (atom.name, atom.element, guess)

1602 report_count = reports.get(report, 0)

1603 reports[report] = report_count + 1

1604 if not trust:

1605 atom.element = guess

1606 modified = True

1607 # If elements are missing then guess them from atom names

1608 else:

1609 atom.element = atom.guess_element()

1610 added = True

1611 # Warn the user about anormalies

1612 for report, count in reports.items():

1613 atom_name, atom_element, guess = report

1614 warn(f"Suspicious element for atom {atom_name}: {atom_element} -> shoudn't it be {guess}? ({count} occurrences)")

1615 # Warn the user that some elements were modified

1616 if modified: warn('Atom elements have been modified')

1617 if added: warn('Atom elements were missing and have been added')

1618 # Set atom elements as fixed in order to avoid repeating this process

1619 self._fixed_atom_elements = True

1620 return modified or added

1621

1622 # Set new coordinates

1623 def set_new_coordinates (self, new_coordinates : List[Coords]):

1624 # Make sure the list of coordinates is as long as the number of atoms

1625 if len(new_coordinates) != self.atom_count:

1626 raise ValueError(f'The number of coordinates ({len(new_coordinates)}) does not match the number of atoms ({self.atom_count})')

1627 # Overwrite current coordinates with the new coordinates

1628 for i, atom in enumerate(self.atoms):

1629 atom.coords = tuple(new_coordinates[i])

1630

1631 # Get all supported ion atom indices together in a set

1632 def get_ion_atom_indices (self) -> Set:

1633 # If we already did this then return the stored value

1634 if self._ion_atom_indices != None:

1635 return self._ion_atom_indices

1636 # Find ion atom indices

1637 indices = set()

1638 for atom in self.atoms:

1639 if atom.element in SUPPORTED_ION_ELEMENTS:

1640 indices.add(atom.index)

1641 self._ion_atom_indices = indices

1642 return self._ion_atom_indices

1643 ion_atom_indices = property(get_ion_atom_indices, None, None, "Atom indices for what we consider supported ions")

1644

1645 # Get all dummy atom indices together in a set

1646 def get_dummy_atom_indices (self) -> Set:

1647 # If we already did this then return the stored value

1648 if self._dummy_atom_indices != None:

1649 return self._dummy_atom_indices

1650 # Find ion atom indices

1651 indices = set()

1652 for atom in self.atoms:

1653 if atom.element == DUMMY_ATOM_ELEMENT:

1654 indices.add(atom.index)

1655 self._dummy_atom_indices = indices

1656 return self._dummy_atom_indices

1657 dummy_atom_indices = property(get_dummy_atom_indices, None, None, "Atom indices for what we consider dummy atoms")

1658

1659 # Generate a pdb file content with the current structure

1660 def generate_pdb (self):

1661 content = 'REMARK workflow generated pdb file\n'

1662 for a, atom in enumerate(self.atoms):

1663 residue = atom.residue

1664 index = str((a+1) % 100000).rjust(5)

1665 name = ' ' + atom.name.ljust(3) if len(atom.name) < 4 else atom.name

1666 residue_name = residue.name.ljust(4) if residue else 'XXX'.ljust(4)

1667 chain = atom.chain

1668 chain_name = chain.name if chain.name and len(chain.name) == 1 else 'X'

1669 residue_number = str(residue.number).rjust(4) if residue else '0'.rjust(4)

1670 # If residue number is longer than 4 characters then we must parse to hexadecimal

1671 if len(residue_number) > 4:

1672 residue_number = hex(residue.number)[2:].rjust(4)

1673 icode = residue.icode if residue.icode and len(residue.icode) else ' '

1674 # Make sure we have atom coordinates

1675 if atom.coords == None:

1676 raise InputError('Trying to write a PDB file from a structure with atoms without coordinates')

1677 x_coord, y_coord, z_coord = [ "{:.3f}".format(coord).rjust(8) for coord in atom.coords ]

1678 occupancy = '1.00' # Just a placeholder

1679 temp_factor = '0.00' # Just a placeholder

1680 element = atom.element.rjust(2)

1681 atom_line = ('ATOM ' + index + ' ' + name + ' ' + residue_name

1682 + chain_name + residue_number + icode + ' ' + x_coord + y_coord + z_coord

1683 + ' ' + occupancy + ' ' + temp_factor + ' ' + element).ljust(80) + '\n'

1684 content += atom_line

1685 return content

1686

1687 # Generate a pdb file with current structure

1688 def generate_pdb_file (self, pdb_filepath : str):

1689 pdb_content = self.generate_pdb()

1690 with open(pdb_filepath, "w") as file:

1691 file.write(pdb_content)

1692

1693 # Get the structure equivalent pytraj topology

1694 def get_pytraj_topology (self):

1695 # In we do not have pytraj in our environment then we cannot proceed

1696 if not is_imported('pytraj'):

1697 raise InputError('Missing dependency error: pytraj')

1698 # Generate a pdb file from the current structure to feed pytraj

1699 pdb_filepath = '.structure.pdb'

1700 self.generate_pdb_file(pdb_filepath)

1701 pytraj_topology = pytraj.load_topology(filename = pdb_filepath)

1702 os.remove(pdb_filepath)

1703 return pytraj_topology

1704

1705 # Select atoms from the structure thus generating an atom indices list

1706 # Different tools may be used to make the selection:

1707 # - vmd (default)

1708 # - pytraj

1709 SUPPORTED_SELECTION_SYNTAXES = { 'vmd', 'pytraj' }

1710 def select (self, selection_string : str, syntax : str = 'vmd') -> Optional['Selection']:

1711 if syntax == 'vmd':

1712 # Generate a pdb for vmd to read it

1713 pdb_filepath = '.structure.pdb'

1714 self.generate_pdb_file(pdb_filepath)

1715 # Use vmd to find atom indices

1716 atom_indices = get_vmd_selection_atom_indices(pdb_filepath, selection_string)

1717 os.remove(pdb_filepath)

1718 if len(atom_indices) == 0:

1719 return Selection()

1720 return Selection(atom_indices)

1721 if syntax == 'pytraj':

1722 # In we do not have pytraj in our environment then we cannot proceed

1723 if not is_imported('pytraj'):

1724 raise InputError('Missing dependency error: pytraj')

1725 pytraj_topology = self.get_pytraj_topology()

1726 pytraj_selection = pytraj_topology[selection_string]

1727 atom_indices = [ atom.index for atom in pytraj_selection.atoms ]

1728 if len(atom_indices) == 0:

1729 return Selection()

1730 return Selection(atom_indices)

1731

1732 options = ', '.join(self.SUPPORTED_SELECTION_SYNTAXES)

1733 raise InputError(f'Syntax "{syntax}" is not supported. Choose one of the following: {options}')

1734

1735 # Set a function to make selections using atom indices

1736 def select_atom_indices (self, atom_indices : List[int]) -> 'Selection':

1737 # Check atom indices to be in the structure

1738 atom_count = self.atom_count

1739 for atom_index in atom_indices:

1740 if atom_index >= atom_count:

1741 raise InputError(f'Atom index {atom_index} is out of range ({atom_count})')

1742 return Selection(atom_indices)

1743

1744 # Set a function to make selections using residue indices

1745 def select_residue_indices (self, residue_indices : List[int]) -> 'Selection':

1746 # WARNING: The following line gets stucked sometimes, idk why

1747 #atom_indices = sum([ self.residues[index].atom_indices for index in residue_indices ], [])

1748 atom_indices = []

1749 for i, index in enumerate(residue_indices):

1750 atom_indices += self.residues[index].atom_indices

1751 return Selection(atom_indices)

1752

1753 # Get a selection with all atoms

1754 def select_all (self) -> 'Selection':

1755 return Selection(list(range(self.atom_count)))

1756

1757 # Select atoms according to the classification of its residue

1758 def select_by_classification (self, classification : str) -> 'Selection':

1759 atom_indices = []

1760 for residue in self.residues:

1761 if residue.classification == classification:

1762 atom_indices += residue.atom_indices

1763 return Selection(atom_indices)

1764

1765 # Select water atoms

1766 # WARNING: This logic is a bit guessy and it may fail for non-standard residue named structures

1767 def select_water (self) -> 'Selection':

1768 return self.select_by_classification('solvent')

1769

1770 # Select ions

1771 def select_ions (self) -> 'Selection':

1772 return self.select_by_classification('ion')

1773

1774 # Select counter ion atoms

1775 # WARNING: This logic is a bit guessy and it may fail for non-standard atom named structures

1776 def select_counter_ions (self, charge : Optional[str] = None) -> 'Selection':

1777 counter_ion_indices = []

1778 # Set the accepted names accoridng to the charge

1779 if charge == None:

1780 accepted_names = STANDARD_COUNTER_ION_ATOM_NAMES

1781 elif charge == '+':

1782 accepted_names = STANDARD_COUNTER_CATION_ATOM_NAMES

1783 elif charge == '-':

1784 accepted_names = STANDARD_COUNTER_ANION_ATOM_NAMES

1785 else:

1786 raise ValueError('Not supported charge')

1787 # Iterate atoms

1788 for atom in self.atoms:

1789 # If the residue has not one single atom then it is not an ion

1790 if len(atom.residue.atoms) != 1: continue

1791 # Get a simplified version of the atom name

1792 # Set all letters upper and remove non-letter characters (e.g. '+' and '-' symbols)

1793 simple_atom_name = ''.join(filter(str.isalpha, atom.name.upper()))

1794 if simple_atom_name in accepted_names:

1795 counter_ion_indices.append(atom.index)

1796 return Selection(counter_ion_indices)

1797

1798 # Select both water and counter ions

1799 def select_water_and_counter_ions (self) -> 'Selection':

1800 return self.select_water() + self.select_counter_ions()

1801

1802 # Select heavy atoms

1803 def select_heavy_atoms (self) -> 'Selection':

1804 atom_indices = []

1805 for atom in self.atoms:

1806 # If the atom is not an hydrogen then add it to the list

1807 if atom.element != 'H':

1808 atom_indices.append(atom.index)

1809 return Selection(atom_indices)

1810

1811 # Select protein atoms

1812 # WARNING: Note that there is a small difference between VMD protein and our protein

1813 # This function is improved to consider terminal residues as proteins

1814 # VMD considers protein any resiude including N, C, CA and O while terminals may have OC1 and OC2 instead of O

1815 def select_protein (self) -> 'Selection':

1816 return self.select_by_classification('protein')

1817

1818 # Select nucleic atoms

1819 def select_nucleic (self) -> 'Selection':

1820 return self.select_by_classification('dna') + self.select_by_classification('rna')

1821

1822 # Select lipids

1823 def select_lipids (self) -> 'Selection':

1824 return self.select_by_classification('fatty') + self.select_by_classification('steroid')

1825

1826 # Select carbohydrates

1827 def select_carbohydrates (self) -> 'Selection':

1828 return self.select_by_classification('carbohydrate')

1829

1830 # Return a selection of the typical PBC atoms: solvent, counter ions and lipids

1831 # WARNING: This is just a guess

1832 def select_pbc_guess (self) -> 'Selection':

1833 return self.select_water() + self.select_counter_ions() + self.select_lipids()

1834

1835 def select_cg (self) -> 'Selection':

1836 """Select coarse grain atoms."""

1837 return Selection([ atom.index for atom in self.atoms if atom.element == CG_ATOM_ELEMENT ])

1838

1839 def select_cartoon (self, include_terminals : bool = False) -> 'Selection':

1840 """Select cartoon representable regions for VMD.

1841

1842 Rules are:

1843 1. Residues must be protein (i.e. must contain C, CA, N and O atoms) or nucleic (P, OP1, OP2, O3', C3', C4', C5', O5')

1844 2. There must be at least 3 covalently bonded residues

1845

1846 It does not matter their chain, numeration or even index order as long as they are bonded.

1847 * Note that we can represent cartoon while we display one residue alone, but it must be connected anyway.

1848 Also, we have the option to include terminals in the cartoon selection although they are not representable.

1849 This is helpful for the screenshot: terminals are better hidden than represented as ligands."""

1850 # Set fragments which are candidate to be cartoon representable

1851 fragments = []

1852 # Get protein fragments according to VMD

1853 protein_selection = self.select_protein() - self.select_cg() if include_terminals else self.select('protein', syntax='vmd')

1854 if protein_selection:

1855 fragments += list(self.find_fragments(protein_selection))

1856 # Get nucleic fragments according to VMD

1857 nucleic_selection = self.select_nucleic() - self.select_cg()

1858 if nucleic_selection:

1859 fragments += list(self.find_fragments(nucleic_selection))

1860 # Set the final selection including all valid fragments

1861 cartoon_selection = Selection()

1862 # Iterate over every fragment

1863 for fragment in fragments:

1864 # Make sure the fragment has at least 3 residues before adding it to the cartoon selection

1865 fragment_residue_indices = self.get_selection_residue_indices(fragment)

1866 if len(fragment_residue_indices) >= 3:

1867 cartoon_selection += fragment

1868 return cartoon_selection

1869

1870 # Invert a selection

1871 def invert_selection (self, selection : 'Selection') -> 'Selection':

1872 atom_indices = list(range(self.atom_count))

1873 for atom_index in selection.atom_indices:

1874 atom_indices[atom_index] = None

1875 return Selection([ atom_index for atom_index in atom_indices if atom_index != None ])

1876

1877 # Given a selection, get a list of residue indices for residues implicated

1878 # Note that if a single atom from the residue is in the selection then the residue index is returned

1879 def get_selection_residue_indices (self, selection : 'Selection') -> List[int]:

1880 return list(set([ self.atoms[atom_index].residue_index for atom_index in selection.atom_indices ]))

1881

1882 # Given a selection, get a list of residues implicated

1883 # Note that if a single atom from the residue is in the selection then the residue is returned

1884 def get_selection_residues (self, selection : 'Selection') -> List['Residue']:

1885 residue_indices = self.get_selection_residue_indices(selection)

1886 return [ self.residues[index] for index in residue_indices ]

1887

1888 # Given a selection, get a list of chain indices for chains implicated

1889 # Note that if a single atom from the chain is in the selection then the chain index is returned

1890 def get_selection_chain_indices (self, selection : 'Selection') -> List[int]:

1891 return list(set([ self.atoms[atom_index].chain_index for atom_index in selection.atom_indices ]))

1892

1893 # Given a selection, get a list of chains implicated

1894 # Note that if a single atom from the chain is in the selection then the chain is returned

1895 def get_selection_chains (self, selection : 'Selection') -> List['Chain']:

1896 chain_indices = self.get_selection_chain_indices(selection)

1897 return [ self.chains[index] for index in chain_indices ]

1898

1899 # Get type of the chain

1900 def get_selection_classification (self, selection : 'Selection') -> str:

1901

1902 # Get selection residues

1903 selection_residue_indices = self.get_selection_residue_indices(selection)

1904

1905 # Inicializar contadores para cada tipo de residuo

1906 residue_counts = {}

1907

1908 # Count the residues of each type

1909 for residue_index in selection_residue_indices:

1910 residue = self.residues[residue_index]

1911 res_class = residue.classification

1912 if res_class in residue_counts:

1913 residue_counts[res_class] += 1

1914 else:

1915 # If the classification is not recognized, count it as "other"

1916 residue_counts[res_class] = 1

1917

1918 # Count the total number of residues in the selection

1919 total_residues = sum(residue_counts.values())

1920 if total_residues == 0: raise ValueError('Should have residues at this point')

1921

1922 # Calculate the proportion of each type of residue

1923 proportions = { k: v / total_residues for k, v in residue_counts.items() }

1924

1925 # If one type of residue dominates, return it

1926 primary_type = max(proportions, key=proportions.get)

1927 # We establish a threshold of 80% to consider a chain as a single type

1928 if proportions[primary_type] >= 0.8:

1929 return primary_type

1930

1931 # Special cases

1932 relevant_threshold = 0.3

1933 if proportions.get("dna", 0) > relevant_threshold and proportions.get("rna", 0) > relevant_threshold:

1934 return "nucleic"

1935 if proportions.get("carbohydrate", 0) > relevant_threshold and proportions.get("protein", 0) > relevant_threshold:

1936 return "glycoprotein"

1937 if proportions.get("fatty", 0) > relevant_threshold and proportions.get("steroid", 0) > relevant_threshold:

1938 return "lipid"

1939

1940 # Any other combinations of different main proportions

1941 main_proportions = { k: v for k, v in proportions.items() if v > relevant_threshold }

1942 # Nothing is above the threshold

1943 if len(main_proportions) == 0:

1944 return "mix"

1945 # There is only one thing above threshold

1946 elif len(main_proportions) == 1:

1947 return f"{primary_type}/mix"

1948 # There are two things above threshold

1949 elif len(main_proportions) == 2:

1950 other_type = next(key for key in main_proportions.keys() if key != primary_type)

1951 return f"{primary_type}/{other_type}"

1952 # There are three things above the threshold (more is not possible)

1953 else:

1954 return "mix"

1955

1956 # Create a new structure from the current using a selection to filter atoms

1957 def filter (self, selection : Union['Selection', str], selection_syntax : str = 'vmd') -> 'Structure':

1958 if not selection:

1959 raise InputError('No selection was passed')

1960 # In case the selection is not an actual Selection, but a string, parse the string into a Selection

1961 if type(selection) == str:

1962 selection = self.select(selection, selection_syntax)

1963 new_atoms = []

1964 new_residues = []

1965 new_chains = []

1966 # Get the selected atoms

1967 # Generate dictionaries with new indexes as keys and previous indexes as values for atoms, residues and chains

1968 # This is done with this structure for the residues and chains further to find the new indices fast

1969 new_atom_indices = {}

1970 # Collect also original indices to related atom residues and chains

1971 original_atom_residue_indices = []

1972 for new_index, original_index in enumerate(selection.atom_indices):

1973 # Make a copy of the selected atom in order to not modify the original one

1974 original_atom = self.atoms[original_index]

1975 new_atom = Atom(

1976 name=original_atom.name,

1977 element=original_atom.element,

1978 coords=original_atom.coords,

1979 )

1980 new_atoms.append(new_atom)

1981 # Save old and new indices in order to reconfigure all indices later

1982 new_atom_indices[original_index] = new_index

1983 original_atom_residue_indices.append(original_atom.residue_index)

1984 # Find the selected residues

1985 selected_residue_indices = list(set(original_atom_residue_indices))

1986 # Repeat the original/new indices backup we did before

1987 new_residue_indices = {}

1988 original_residue_atom_indices = []

1989 original_residue_chain_indices = []

1990 for new_index, original_index in enumerate(selected_residue_indices):

1991 # Make a copy of the selected residue in order to not modify the original one

1992 original_residue = self.residues[original_index]

1993 new_residue = Residue(

1994 name=original_residue.name,

1995 number=original_residue.number,

1996 icode=original_residue.icode,

1997 )

1998 new_residues.append(new_residue)

1999 # Save old and new indices in order to reconfigure all indices later

2000 new_residue_indices[original_index] = new_index

2001 original_residue_atom_indices.append(original_residue.atom_indices)

2002 original_residue_chain_indices.append(original_residue.chain_index)

2003 # Find the selected chains

2004 selected_chain_indices = list(set(original_residue_chain_indices))

2005 # Repeat the original/new indices backup we did before

2006 new_chain_indices = {}

2007 original_chain_residue_indices = []

2008 for new_index, original_index in enumerate(selected_chain_indices):

2009 # Make a copy of the selected chain in order to not modify the original one

2010 original_chain = self.chains[original_index]

2011 new_chain = Chain(

2012 name=original_chain.name,

2013 )

2014 new_chains.append(new_chain)

2015 # Save old and new indices in order to reconfigure all indices later

2016 new_chain_indices[original_index] = new_index

2017 original_chain_residue_indices.append(original_chain.residue_indices)

2018 # Finally, reset indices in all instances

2019 for a, atom in enumerate(new_atoms):

2020 atom.residue_index = new_residue_indices[ original_atom_residue_indices[a] ]

2021 for r, residue in enumerate(new_residues):

2022 atom_indices = []

2023 for original_index in original_residue_atom_indices[r]:

2024 new_index = new_atom_indices.get(original_index, None)

2025 if new_index != None:

2026 atom_indices.append(new_index)

2027 residue.atom_indices = atom_indices

2028 residue.chain_index = new_chain_indices[ original_residue_chain_indices[r] ]

2029 for c, chain in enumerate(new_chains):

2030 residue_indices = []

2031 for original_index in original_chain_residue_indices[c]:

2032 new_index = new_residue_indices.get(original_index, None)

2033 if new_index != None:

2034 residue_indices.append(new_index)

2035 chain.residue_indices = residue_indices

2036 return Structure(atoms=new_atoms, residues=new_residues, chains=new_chains)

2037

2038

2039 def chainer (self, selection : Optional['Selection'] = None, letter : Optional[str] = None, whole_fragments : bool = True):

2040 """Set chains on demand.

2041 If no selection is passed then the whole structure will be affected.

2042 If no chain is passed then a "chain by fragment" logic will be applied."""

2043 # If there is no selection we consider all atoms

2044 if selection == None:

2045 selection = self.select_all()

2046 # If the selection is empty then there is nothing to do here

2047 if len(selection) == 0: return

2048 # If a letter is specified then the logic is way simpler

2049 if letter:

2050 self.set_selection_chain_name(selection, letter)

2051 return

2052 # If a letter is not specified we run the "fragments" logic

2053 fragment_getter = self.find_whole_fragments if whole_fragments else self.find_fragments

2054 for fragment in fragment_getter(selection):

2055 chain_name = self.get_available_chain_name()

2056 self.set_selection_chain_name(fragment, chain_name)

2057

2058 # Smart function to set chains automatically

2059 # Original chains will be overwritten

2060 def auto_chainer (self):

2061 # Reset chains

2062 self.chainer(letter='X')

2063 # Set solvent and ions as a unique chain

2064 ion_selection = self.select_ions()

2065 solvent_selection = self.select_water()

2066 ion_and_indices_selection = ion_selection + solvent_selection

2067 self.chainer(selection=ion_and_indices_selection, letter='S')

2068 # Set fatty acids and steroids as a unique chain

2069 # DANI: Se podrían incluir algunos carbohydrates (glycans)

2070 # DANI: Se podrían descartarían residuos que no pertenezcan a la membrana por proximidad

2071 membrane_selection = self.select_lipids()

2072 self.chainer(selection=membrane_selection, letter='M')

2073 # Set carbohydrates as a unique chain as well, just in case we have glycans

2074 # Note that in case glycan atoms are mixed with protein atoms glycan chains will be overwritten

2075 # However this is not a problem. It is indeed the best solution if we don't want ro resort atoms

2076 carbohydrate_selection = self.select_carbohydrates()

2077 self.chainer(selection=carbohydrate_selection, letter='H')

2078 # Add a chain per fragment for both proteins and nucleic acids

2079 protein_selection = self.select_protein()

2080 nucleic_selection = self.select_nucleic()

2081 protein_and_nucleic_selection = protein_selection + nucleic_selection

2082 self.chainer(selection=protein_and_nucleic_selection)

2083 # At this point we should have covered most of the molecules in the structure

2084 # However, in case there are more molecules, we have already set them all as a single chain ('X')

2085 # Here we do not apply the chain per fragment logic since it may be dangerous

2086 # Note that we may have a lot of independent residues (and thus a los of small fragments)

2087 # This would make us run out of letters in the alphabet and thus there would be no more chains

2088 # As the last step, fix repeated chains

2089 self.check_repeated_chains(fix_chains=True)

2090

2091 # This is an alternative system to find protein chains (anything else is chained as 'X')

2092 # This system does not depend on VMD

2093 # It totally overrides previous chains since it is expected to be used only when chains are missing

2094 def raw_protein_chainer (self):

2095 current_chain = self.get_available_chain_name()

2096 previous_alpha_carbon = None

2097 for residue in self.residues:

2098 alpha_carbon = next((atom for atom in residue.atoms if atom.name == 'CA'), None)

2099 if not alpha_carbon:

2100 residue.set_chain('X')

2101 continue

2102 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms

2103 residues_are_connected = previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4

2104 if not residues_are_connected:

2105 current_chain = self.get_available_chain_name()

2106 residue.set_chain(current_chain)

2107 previous_alpha_carbon = alpha_carbon

2108

2109 # Given an atom selection, set the chain for all these atoms

2110 # Note that the chain is changed in every whole residue, no matter if only one atom was selected

2111 def set_selection_chain_name (self, selection : 'Selection', letter : str):

2112 # Find if the letter belongs to an already existing chain

2113 chain = self.get_chain_by_name(letter)

2114 # If the chain does not exist yet then create it

2115 if not chain:

2116 chain = Chain(name=letter)

2117 self.set_new_chain(chain)

2118 # Set the chain index of every residue to the new chain

2119 for atom_index in selection.atom_indices:

2120 # WARNING: Chain index has to be read every iteration since it may change. Do not save it

2121 atom = self.atoms[atom_index]

2122 atom.set_chain_index(chain.index)

2123

2124 # Get an available chain name

2125 # Find alphabetically the first letter which is not yet used as a chain name

2126 # If all letters in the alphabet are used already then raise an error

2127 def get_available_chain_name (self) -> str:

2128 current_chain_names = [ chain.name for chain in self.chains ]

2129 next_available_chain_name = next((name for name in AVAILABLE_LETTERS if name not in current_chain_names), None)

2130 if next_available_chain_name == None:

2131 raise InputError(f'There are more chains than available chain letters ({len(AVAILABLE_LETTERS)})')

2132 return next_available_chain_name

2133

2134 def get_next_available_chain_name (self, anterior : str) -> str:

2135 """

2136 Get the next available chain name.

2137

2138 Args:

2139 anterior (str): The last chain name used, which is expected to be a single letter

2140

2141 Raises:

2142 ValueError: If the anterior is not a letter or if there are more chains than available

2143 """

2144 current_chain_names = set([ chain.name for chain in self.chains ])

2145 # If anterior is cap then try caps first, if anterior is lower then try lowers first

2146 if anterior.isupper():

2147 first_group, second_group = AVAILABLE_CAPS, AVAILABLE_LOWS

2148 elif anterior.islower():

2149 first_group, second_group = AVAILABLE_LOWS, AVAILABLE_CAPS

2150 else: raise ValueError(f'Is "{anterior}" even a letter?')

2151 # Reorder letters in the first group, so the anterior is the last letter

2152 anterior_position = first_group.index(anterior)

2153 next_position = anterior_position + 1

2154 reordered_group = first_group[next_position:] + first_group[0:next_position]

2155 next_letter = next((letter for letter in reordered_group if letter not in current_chain_names), None)

2156 if next_letter: return next_letter

2157 # If there is not available letters is the first group then return the first available in the second

2158 next_letter = next((letter for letter in second_group if letter not in current_chain_names), None)

2159 if next_letter: return next_letter

2160 raise ValueError(f'There are more chains than available chain letters ({len(AVAILABLE_LETTERS)})')

2161

2162 # Get a chain by its name

2163 def get_chain_by_name (self, name : str) -> Optional['Chain']:

2164 return next((c for c in self.chains if c.name == name), None)

2165

2166 # Find a residue by its chain, number and insertion code

2167 def find_residue (self, chain_name : str, number : int, icode : str = '') -> Optional['Residue']:

2168 # Get the corresponding chain

2169 target_chain = self.get_chain_by_name(chain_name)

2170 if not target_chain: return None

2171 # Find the residue in the target chain

2172 target_chain.find_residue(number, icode)

2173

2174 # Get a summary of the structure

2175 def display_summary (self):

2176 print(f'Atoms: {self.atom_count}')

2177 print(f'Residues: {len(self.residues)}')

2178 print(f'Chains: {len(self.chains)}')

2179 for chain in self.chains:

2180 print(f'Chain {chain.name} ({len(chain.residue_indices)} residues)')

2181 print(' -> ' + chain.get_sequence())

2182

2183

2184 def check_repeated_chains (self, fix_chains : bool = False, display_summary : bool = False) -> bool:

2185 """

2186 There may be chains which are equal in the structure (i.e. same chain name).

2187 This means we have a duplicated/splitted chain.

2188 Repeated chains are usual and they are usually supported but with some problems.

2189 Also, repeated chains usually come with repeated residues, which means more problems (see explanation below).

2190

2191 In the context of this structure class we may have 2 different problems with a different solution each:

2192 1. There is more than one chain with the same letter (repeated chain) -> rename the duplicated chains

2193 2. There is a chain with atom indices which are not consecutive (splitted chain) -> create new chains

2194

2195 Rename repeated chains or create new chains if the fix_chains argument is True.

2196

2197 WARNING: These fixes are possible only if there are less chains than the number of letters in the alphabet.

2198 Although there is no limitation in this code for chain names, setting long chain names is not compatible with pdb format.

2199

2200 Check splitted chains (a chains with non consecutive residues) and try to fix them if requested.

2201 Check repeated chains (two chains with the same name) and return True if there were any repeats.

2202 """

2203 # Order chains according to their names

2204 # Save also those chains which have a previous duplicate

2205 name_chains = {}

2206 repeated_chains = []

2207 for chain in self.chains:

2208 chain_name = chain.name

2209 current_name_chains = name_chains.get(chain_name, None)

2210 if not current_name_chains:

2211 name_chains[chain_name] = [chain]

2212 else:

2213 name_chains[chain_name].append(chain)

2214 repeated_chains.append(chain)

2215 # Display the summary of repeated chains if requested

2216 if display_summary:

2217 if len(repeated_chains) > 0:

2218 warn('There are repeated chains:')

2219 for chain_name, chains in name_chains.items():

2220 chains_count = len(chains)

2221 if chains_count > 1:

2222 print(f'- Chain {chain_name} has {chains_count} repeats')

2223 # Rename repeated chains if requested

2224 if len(repeated_chains) > 0 and fix_chains:

2225 n_chains = len(self.chains)

2226 n_available_letters = len(AVAILABLE_LETTERS)

2227 if n_chains > n_available_letters:

2228 # for chain in self.chains:

2229 # print(str(chain) + ': ' + str(chain.atom_indices[0]) + ' to ' + str(chain.atom_indices[-1]))

2230 raise ValueError(f'There are more chains ({n_chains}) than available chain letters ({n_available_letters})')

2231 current_letters = list(name_chains.keys())

2232 for repeated_chain in repeated_chains:

2233 last_chain_letter = repeated_chain.name

2234 while last_chain_letter in current_letters:

2235 last_chain_letter = get_next_letter(last_chain_letter)

2236 repeated_chain.name = last_chain_letter

2237 current_letters.append(last_chain_letter)

2238 # Check if there are splitted chains

2239 for chain in self.chains:

2240 residue_indices = sorted(chain.residue_indices)

2241 # Check if residue indices are consecutive

2242 if residue_indices[-1] - residue_indices[0] + 1 == len(residue_indices):

2243 continue

2244 warn(f'Splitted chain {chain.name}')

2245 # If indices are not consecutive then we must find ranges of consecutive residues and create new chains for them

2246 previous_residue_index = residue_indices[0]

2247 consecutive_residues = [previous_residue_index]

2248 overall_consecutive_residues = []

2249 for residue_index in residue_indices[1:]:

2250 # If next residue is consecutive

2251 if residue_index == previous_residue_index + 1:

2252 consecutive_residues.append(residue_index)

2253 # If next residue is NOT consecutive

2254 else:

2255 overall_consecutive_residues.append(consecutive_residues)

2256 consecutive_residues = [residue_index]

2257 # Update the previous index

2258 previous_residue_index = residue_index

2259 # Add the last split

2260 overall_consecutive_residues.append(consecutive_residues)

2261 # Now create new chains and reasign residues

2262 # Skip the first set of consecutive residues since they will stay in the original chain

2263 for residues_indices in overall_consecutive_residues[1:]:

2264 chain_name = self.get_available_chain_name()

2265 residues_selection = self.select_residue_indices(residues_indices)

2266 self.set_selection_chain_name(residues_selection, chain_name)

2267

2268 # Fix repeated chains if requested

2269 return len(repeated_chains) > 0

2270

2271

2272 def check_repeated_residues (self, fix_residues : bool = False, display_summary : bool = False) -> bool:

2273 """

2274 There may be residues which are equal in the structure (i.e. same chain, name, number and icode).

2275 In case 2 residues in the structure are equal we must check distance between their atoms.

2276 If atoms are far it means they are different residues with the same notation (duplicated residues).

2277 If atoms are close it means they are indeed the same residue (splitted residue).

2278

2279 Splitted residues are found in some pdbs and they are supported by some tools.

2280 These tools consider all atoms with the same 'record' as the same residue.

2281 However, there are other tools which would consider the splitted residue as two different residues.

2282 This causes inconsistency along different tools besides a long list of problems.

2283 The only possible is fix is changing the order of atoms in the topology.

2284 Note that this is a breaking change for associated trajectories, which must change the order of coordinates.

2285 However here we provide tools to fix associates trajectories as well.

2286

2287 Duplicated residues are usual and they are usually supported but with some problems.

2288 For example, pytraj analysis outputs use to sort results by residues and each residue is tagged.

2289 If there are duplicated residues with the same tag it may be not possible to know which result belongs to each residue.

2290 Another example are NGL selections once in the web client.

2291 If you select residue ':A and 1' and there are multiple residues 1 in chain A all of them will be displayed.

2292

2293 Check residues to search for duplicated and splitted residues.

2294 Renumerate repeated residues if the fix_residues argument is True.

2295 Return True if there were any repeats.

2296 """

2297 # Track if residues have to be changed or not

2298 modified = False

2299 # Group all residues in the structure according to their chain, number and icode

2300 grouped_residues = {}

2301 # Check repeated residues which are one after the other

2302 # Note that these residues MUST have a different name

2303 # Otherwise they would have not been considered different residues

2304 # For these rare cases we use icodes to solve the problem

2305 non_icoded_residues = []

2306 last_residue = None

2307 for residue in self.residues:

2308 # Check residue to be equal than the previous residue

2309 if residue == last_residue:

2310 non_icoded_residues.append(residue)

2311 last_residue = residue

2312 continue

2313 last_residue = residue

2314 # Add residue to the groups of repeated residues

2315 current_residue_repeats = grouped_residues.get(residue, None)

2316 if not current_residue_repeats:

2317 grouped_residues[residue] = [ residue ]

2318 else:

2319 current_residue_repeats.append(residue)

2320 # In case we have non icoded residues

2321 if len(non_icoded_residues) > 0:

2322 if display_summary:

2323 print(f'There are non-icoded residues ({len(non_icoded_residues)})')

2324 # Set new icodes for non icoded residues

2325 if fix_residues:

2326 print(' Non icoded residues will recieve an icode')

2327 for residue in non_icoded_residues:

2328 repeated_residues_group = grouped_residues[residue]

2329 current_icodes = [ residue.icode for residue in repeated_residues_group if residue.icode ]

2330 next_icode = next((cap for cap in AVAILABLE_CAPS if cap not in current_icodes), None)

2331 if not next_icode:

2332 raise ValueError('There are no more icodes available')

2333 # print('Setting icode ' + next_icode + ' to residue ' + str(residue))

2334 residue.icode = next_icode

2335 modified = True

2336 # Grouped residues with more than 1 result are considered as repeated

2337 repeated_residues = [ residues for residues in grouped_residues.values() if len(residues) > 1 ]

2338 if len(repeated_residues) == 0:

2339 return modified

2340 # In case we have repeated residues...

2341 if display_summary:

2342 warn(f'There are {len(repeated_residues)} different groups of repeated residues')

2343 print(f' e.g. {repeated_residues[0][0]}')

2344 if len(repeated_residues) == 9999 or len(repeated_residues) == 10000:

2345 print(' Probably you have more residues than the PDB numeration limit (1-9999)')

2346 # Now for each repeated residue, find out which are splitted and which are duplicated

2347 covalent_bonds = self.bonds

2348 overall_splitted_residues = []

2349 overall_duplicated_residues = []

2350 for residues in repeated_residues:

2351 # Iterate over repeated residues and check if residues are covalently bonded

2352 # If any pair of residues are bonded add them both to the splitted residues list

2353 # At the end, all non-splitted residues will be considered duplicated residues

2354 # WARNING: Using a set here is not possible since repeated residues have the same hash

2355 # WARNING: Also comparing residues themselves is not advisable, so we use indices at this point

2356 splitted_residue_indices = set()

2357 for residue, other_residues in otherwise(residues):

2358 if residue.index in splitted_residue_indices:

2359 continue

2360 # Get atom indices for all atoms connected to the current residue

2361 residue_bonds = sum([ covalent_bonds[index] for index in residue.atom_indices ], [])

2362 for other_residue in other_residues:

2363 # Get all atom indices for each other residue and collate with the current residue bonds

2364 if any( index in residue_bonds for index in other_residue.atom_indices ):

2365 splitted_residue_indices.add(residue.index)

2366 splitted_residue_indices.add(other_residue.index)

2367 # Finally obtain the splitted residues from its indices

2368 splitted_residues = [ self.residues[index] for index in splitted_residue_indices ]

2369 # Repeated residues which are not splitted are thus duplicated

2370 duplicated_residues = [ residue for residue in residues if residue.index not in splitted_residue_indices ]

2371 # Update the overall lists

2372 if len(splitted_residues) > 0:

2373 overall_splitted_residues.append(splitted_residues)

2374 if len(duplicated_residues) > 0:

2375 overall_duplicated_residues.append(duplicated_residues)

2376 # In case we have splitted residues

2377 if len(overall_splitted_residues) > 0:

2378 if display_summary:

2379 print(f' There are splitted residues ({len(overall_splitted_residues)})')

2380 # Fix splitted residues

2381 if fix_residues:

2382 print(' Splitted residues will be merged')

2383 # Set a function to sort atoms and residues by index

2384 def by_index (v):

2385 return v._index

2386 # Merge splitted residues

2387 # WARNING: Merging residues without sorting atoms is possible, but this would be lost after exporting to pdb

2388 for splitted_residues in overall_splitted_residues:

2389 # The first residue (i.e. the residue with the lower index) will be the one which remains

2390 # It will absorb other residue atom indices

2391 splitted_residues.sort(key=by_index) # Residues are not sorted by default, this is mandatory

2392 first_residue = splitted_residues[0]

2393 other_residues = splitted_residues[1:]

2394 for residue in other_residues:

2395 for atom in residue.atoms:

2396 atom.residue = first_residue

2397 # Then these other residues are eliminated

2398 self.purge_residue(residue)

2399 print(' Atoms will be sorted to be together by residues')

2400 print(' NEVER FORGET: This will break any associated trajectory if coordinates are not sorted as well')

2401 # Sort atoms to group residue atoms together

2402 # Note that each atom index must be updated

2403 new_atom_indices = sum([ residue.atom_indices for residue in self.residues ], [])

2404 for i, new_atom_index in enumerate(new_atom_indices):

2405 atom = self.atoms[new_atom_index]

2406 atom._index = i

2407 self.atoms.sort(key=by_index)

2408 # Also residue 'atom_indices' must be updated

2409 for residue in self.residues:

2410 residue._atom_indices = [ new_atom_indices.index(atom_index) for atom_index in residue._atom_indices ]

2411 # Bonds must be reset since atom indices have changes

2412 self._bonds = None

2413 # Prepare the trajectory atom sorter which must be returned

2414 # Include atom indices already so the user has to provide only the structure and trajectory filepaths

2415 def trajectory_atom_sorter (

2416 input_structure_file : 'File',

2417 input_trajectory_file : 'File',

2418 output_trajectory_file : 'File'

2419 ):

2420 sort_trajectory_atoms(

2421 input_structure_file,

2422 input_trajectory_file,

2423 output_trajectory_file,

2424 new_atom_indices

2425 )

2426 self.trajectory_atom_sorter = trajectory_atom_sorter

2427 self.new_atom_order = new_atom_indices

2428 modified = True

2429 # In case we have duplicated residues

2430 duplicated_residues_count = len(overall_duplicated_residues)

2431 if duplicated_residues_count > 0:

2432 if display_summary:

2433 warn(f'There are {duplicated_residues_count} different groups of duplicated residues')

2434 # Renumerate duplicated residues if requested

2435 if fix_residues:

2436 # First of all, from each group of repeated residues, discard the first residue

2437 # The first residue will remain as it is and the rest will be renumerated

2438 # Join all residues to be renumerated in a single list

2439 residue_to_renumerate = []

2440 for residues in overall_duplicated_residues:

2441 residue_to_renumerate += residues[1:]

2442 # Now group residues per chain since the renumeration is done by chains

2443 chain_residues = {}

2444 for residue in residue_to_renumerate:

2445 chain = residue.chain

2446 current_chain_residues = chain_residues.get(chain, None)

2447 if current_chain_residues: current_chain_residues.append(residue)

2448 else: chain_residues[chain] = [ residue ]

2449 # Iterate residue chain groups

2450 for chain, residues in chain_residues.items():

2451 # Find the last residue number in the current chain

2452 maximum_chain_number = max([ residue.number for residue in chain.residues ])

2453 # Sort residues by index

2454 residues.sort(key=lambda x: x.index, reverse=True)

2455 for residue in residues:

2456 # Set the number of the residue as the next available number

2457 residue.number = maximum_chain_number + 1

2458 # Update the maximum number

2459 maximum_chain_number = residue.number

2460 modified = True

2461 return modified

2462

2463 # Atoms with identical chain, residue and name are considered repeated atoms

2464 # DANI: No recuerdo los problemas que daba tener átomos repetidos

2465

2466 # Check atoms to search for repeated atoms

2467 # Rename repeated atoms if the fix_atoms argument is True

2468 # Return True if there were any repeats

2469 def check_repeated_atoms (self, fix_atoms : bool = False, display_summary : bool = False) -> bool:

2470 # Set two trackers for display

2471 repeated_atoms_count = 0

2472 example = None

2473 for residue in self.residues:

2474 # Iterate over the residue atoms counting how many repeated names we have

2475 current_names = []

2476 for atom in residue.atoms:

2477 # We check if the atom name already exists. If not, go to the next atom

2478 name = atom.name

2479 if name not in current_names:

2480 current_names.append(name)

2481 continue

2482 # When atom is repeated

2483 repeated_atoms_count += 1

2484 # If the fix was not requested we stop here

2485 if not fix_atoms:

2486 continue

2487 # We set the name of the atom as the element letter + the count of this element

2488 # If element is missing take the first character of the atom name

2489 initial = atom.element

2490 if not initial or initial == ' ':

2491 initial = name[0]

2492 number = 1

2493 new_name = initial + str(number)

2494 while new_name in current_names:

2495 number += 1

2496 new_name = initial + str(number)

2497 # Save an example for the logs if there is none yet

2498 if not example:

2499 example = f'{atom.name} renamed as {new_name} in residue {residue}'

2500 atom.name = new_name

2501 current_names.append(new_name)

2502 # Display the summary of repeated atoms if requested

2503 if display_summary:

2504 if repeated_atoms_count > 0:

2505 warn(f'There are repeated atoms ({repeated_atoms_count})')

2506 print(f' e.g. {example}')

2507 return repeated_atoms_count > 0

2508

2509 # Check bonds to be incoherent

2510 # i.e. check atoms not to have more or less bonds than expected according to their element

2511 def check_incoherent_bonds (self) -> bool:

2512 # Find out if there are hydrogens in the structure

2513 # It may happen that we encounter an structure without hydrogens

2514 has_hydrogen = next(( True for atom in self.atoms if atom.element == 'H' ), False)

2515 coherent_bonds = coherent_bonds_with_hydrogen if has_hydrogen else coherent_bonds_without_hydrogen

2516 # Check coherent bonds atom by atom

2517 for atom in self.atoms:

2518 # Do not check this atom bonds if this may be an ion

2519 # Most authors "force" dummy bonds in these atoms to make them stable

2520 if atom.element in SUPPORTED_ION_ELEMENTS: continue

2521 # Ignore dummy atoms as well

2522 if atom.element == DUMMY_ATOM_ELEMENT: continue

2523 # Ignore coarse grain atoms as well

2524 if atom.element == CG_ATOM_ELEMENT: continue

2525 # Get actual number of bonds in the current atom both with and without ion bonds

2526 # LORE: This was a problem since some ions are force-bonded but bonds are actually not real

2527 # LORE: When an ion is forced we may end up with hyrogens with 2 bonds or carbons with 5 bonds

2528 # LORE: When an ions is really bonded we can not discard it or we may end up with orphan carbons (e.g. ligands)

2529 min_nbonds = len(atom.get_bonds(skip_ions = True, skip_dummies = True))

2530 max_nbonds = len(atom.get_bonds(skip_ions = False, skip_dummies = True))

2531 # Get the accepted range of number of bonds for the current atom according to its element

2532 element = atom.element

2533 element_coherent_bonds = coherent_bonds.get(element, None)

2534 # If there are no specficiations for the current atom element then skip it

2535 if not element_coherent_bonds:

2536 continue

2537 # Get the range of accepted number of bonds

2538 min_allowed_bonds = element_coherent_bonds['min']

2539 max_allowed_bonds = element_coherent_bonds['max']

2540 # Check the actual number of bonds is insdie the accepted range

2541 if max_nbonds < min_allowed_bonds or min_nbonds > max_allowed_bonds:

2542 if min_nbonds == max_nbonds:

2543 print(f' An atom with element {element} has {min_nbonds} bonds')

2544 else:

2545 print(f' An atom with element {element} has between {min_nbonds} bonds (withou ions) and {max_nbonds} bonds (with ions)')

2546 if min_allowed_bonds == max_allowed_bonds:

2547 plural_sufix = '' if min_allowed_bonds == 1 else 's'

2548 print(f' It should have {min_allowed_bonds} bond{plural_sufix}')

2549 else:

2550 print(f' It should have between {min_allowed_bonds} and {max_allowed_bonds} bonds')

2551 print(f' -> Atom {atom.label}')

2552 bond_label = ', '.join([ self.atoms[atom].label for atom in atom.get_bonds(skip_ions = False) ])

2553 print(f' -> Bonds {bond_label}')

2554 return True

2555 return False

2556

2557 # Get all atomic covalent (strong) bonds

2558 # Bonds are defined as a list of atom indices for each atom in the structure

2559 # Rely on VMD logic to do so

2560 def get_covalent_bonds (self, selection : Optional['Selection'] = None) -> List[ List[int] ]:

2561 # It is important to fix elements before trying to fix bonds, since elements have an impact on bonds

2562 # VMD logic to find bonds relies in the atom element to set the covalent bond distance cutoff

2563 self.fix_atom_elements()

2564 # Generate a pdb strucutre to feed vmd

2565 auxiliar_pdb_filepath = '.structure.pdb'

2566 self.generate_pdb_file(auxiliar_pdb_filepath)

2567 # Get covalent bonds between both residue atoms

2568 covalent_bonds = get_covalent_bonds(auxiliar_pdb_filepath, selection)

2569 # For every atom in CG, replace its bonds with a class which will raise and error when read

2570 # Thus we make sure using these wrong bonds anywhere further will result in failure

2571 for atom_index in self.select_cg().atom_indices:

2572 covalent_bonds[atom_index] = MISSING_BONDS

2573 # Remove the auxiliar pdb file

2574 os.remove(auxiliar_pdb_filepath)

2575 return covalent_bonds

2576

2577 # Make a copy of the bonds list

2578 def copy_bonds (self) -> List[List[int]]:

2579 new_bonds = []

2580 for atom_bonds in self.bonds:

2581 # Missing bonds coming from CG atoms are forwarded

2582 if atom_bonds == MISSING_BONDS:

2583 new_bonds.append(MISSING_BONDS)

2584 continue

2585 # Copy also the inner lists to avoid further mutation of the original structure

2586 new_bonds.append([ atom_index for atom_index in atom_bonds ])

2587 return new_bonds

2588

2589 # Make a copy of the current structure

2590 def copy (self) -> 'Structure':

2591 atom_copies = [ atom.copy() for atom in self.atoms ]

2592 residue_copies = [ residue.copy() for residue in self.residues ]

2593 chain_copies = [ chain.copy() for chain in self.chains ]

2594 structure_copy = Structure(atom_copies, residue_copies, chain_copies)

2595 structure_copy.bonds = self.copy_bonds()

2596 return structure_copy

2597

2598 # DANI: No lo he testeado en profundidad

2599 def merge (self, other : 'Structure') -> 'Structure':

2600 """Merge current structure with another structure."""

2601 # Copy self atoms, residues and chains

2602 self_atom_copies = [ atom.copy() for atom in self.atoms ]

2603 self_residue_copies = [ residue.copy() for residue in self.residues ]

2604 self_chain_copies = [ chain.copy() for chain in self.chains ]

2605 # Copy other atoms, residues and chains

2606 other_atom_copies = [ atom.copy() for atom in other.atoms ]

2607 other_residue_copies = [ residue.copy() for residue in other.residues ]

2608 other_chain_copies = [ chain.copy() for chain in other.chains ]

2609 # Adapt indices in other atoms, residues and chains

2610 atom_index_offset = len(self_atom_copies)

2611 residue_index_offset = len(self_residue_copies)

2612 chain_index_offset = len(self_chain_copies)

2613 for atom in other_atom_copies:

2614 atom._index += atom_index_offset

2615 atom._residue_index += residue_index_offset

2616 for residue in other_residue_copies:

2617 residue._index += residue_index_offset

2618 residue._atom_indices = [ i + atom_index_offset for i in residue._atom_indices ]

2619 residue._chain_index += chain_index_offset

2620 for chain in other_chain_copies:

2621 chain._index += chain_index_offset

2622 chain._residue_indices = [ i + residue_index_offset for i in chain._residue_indices ]

2623 # Merge self with other atoms, residues and chains and build the new structure

2624 merged_atoms = self_atom_copies + other_atom_copies

2625 merged_residues = self_residue_copies + other_residue_copies

2626 merged_chains = self_chain_copies + other_chain_copies

2627 return Structure(merged_atoms, merged_residues, merged_chains)

2628

2629 # Find rings in the structure and yield them as they are found

2630 # LIMITATION: Wrong rings may be found including smaller rings

2631 # LIMITATION: Use this function to see if a specific ring exists, but not to count rings

2632 def find_rings (self, selection : Optional[Selection] = None) -> Generator[ List[Atom], None, None ]:

2633 selected_atom_indices = selection.atom_indices if selection else None

2634 # We will use a logic which finds long paths of bonded heavy atoms

2635 # These paths have no 'branches' and thus, when there are 2 possible paths then both paths are explored independently

2636 # Save already searched atoms and the number of times it has been explored already

2637 # Note that atoms may be visited as many times as their number of heavy atom bonds -1

2638 already_searched_atoms = {}

2639 def find_rings_recursive (atom_path : List[Atom]) -> Generator[ List[Atom], None, None ]:

2640 # Get the current atom to continue the followup: the last atom

2641 current_atom = atom_path[-1] # Note that the list MUST always have at least 1 atom

2642 # Get bonded atoms to continue the path

2643 followup_atoms = current_atom.get_bonded_atoms()

2644 # Hydrogens are discarded since they have only 1 bond and thus they can only lead to a dead end

2645 followup_atoms = [ atom for atom in followup_atoms if atom.element != 'H' ]

2646 # In case there is a selection, check followup atoms not to be in the selection

2647 if selected_atom_indices:

2648 followup_atoms = [ atom for atom in followup_atoms if atom.index in selected_atom_indices ]

2649 # Check if this atom has been visited already too many times and, if so, stop here

2650 # Allowed search times is the posible number of rings where this atom can belong

2651 allowed_search_times = comb(len(followup_atoms), 2)

2652 already_search_times = already_searched_atoms.get(current_atom, 0)

2653 if already_search_times == allowed_search_times:

2654 return

2655 already_searched_atoms[current_atom] = already_search_times + 1

2656 # Remove the previous atom to avoid going back

2657 previous_atom = atom_path[-2] if len(atom_path) > 1 else None

2658 if previous_atom:

2659 followup_atoms.remove(previous_atom)

2660 # Iterate over the following bonded atoms

2661 for followup_atom in followup_atoms:

2662 # Check if the followup atom is in the path

2663 path_index = next(( index for index, atom in enumerate(atom_path) if atom == followup_atom ), None)

2664 # If so, we have found a ring

2665 if path_index != None:

2666 ring = atom_path[path_index:]

2667 yield ring

2668 continue

2669 # Otherwise, keep searching

2670 new_path = atom_path + [followup_atom]

2671 for ring in find_rings_recursive(atom_path=new_path):

2672 yield ring

2673 # Find a starting atom and run the recursive logic

2674 candidate_atom_indices = selected_atom_indices if selected_atom_indices else range(self.atom_count)

2675 for candidate_atom_index in candidate_atom_indices:

2676 candidate_atom = self.atoms[candidate_atom_index]

2677 # It must be heavy atom

2678 if candidate_atom.element == 'H':

2679 continue

2680 # It must not be a dead end already

2681 bonded_atoms = candidate_atom.get_bonded_atoms()

2682 bonded_atoms = [ atom for atom in bonded_atoms if atom.element != 'H' ]

2683 # In case there is a selection, check followup atoms not to be in the selection

2684 if selected_atom_indices:

2685 bonded_atoms = [ atom for atom in bonded_atoms if atom.index in selected_atom_indices ]

2686 # Check this atom is not a dead end already

2687 allowed_search_times = comb(len(bonded_atoms), 2)

2688 if allowed_search_times < 1:

2689 continue

2690 for ring in find_rings_recursive(atom_path=[candidate_atom]):

2691 yield ring

2692

2693 # Given an atom selection, get all bonds between these atoms and any other atom in the structure

2694 # Note that inner bonds between atoms in the selection are discarded

2695 def get_selection_outer_bonds (self, selection : Selection) -> List[int]:

2696 # Get bonds from all atoms in the selection

2697 bonds = set()

2698 for atom_index in selection.atom_indices:

2699 atom_bonds = set(self.bonds[atom_index])

2700 bonds = bonds.union(atom_bonds)

2701 # Discard selection atoms from the bonds list to discard inner bonds

2702 bonds -= set(selection.atom_indices)

2703 return list(bonds)

2704

2705 # Set the type of PTM according to the classification of the bonded residue

2706 ptm_options = {

2707 'protein': ValueError('A PTM residue must never be protein'),

2708 'dna': 'DNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama

2709 'rna': 'RNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama

2710 'carbohydrate': 'Glycosilation',

2711 'fatty': 'Lipidation',

2712 'steroid': 'Steroid linkage', # DANI: Esto es posible aunque muy raro y no se como se llama

2713 'ion': Warning('Ion is covalently bonded to protein'), # DANI: esto no es "correcto" pero si habitual

2714 'solvent': Warning('Solvent is covalently bonded to protein'), # DANI: probablemente sea un error

2715 'acetyl': 'Acetylation', # Typical N-capping terminals

2716 'amide': 'Amidation', # Typical C-capping terminals

2717 'other': Warning('Unknow type of PTM'), # Something not supported

2718 }

2719

2720 # Find Post Translational Modifications (PTM) in the structure

2721 def find_ptms (self) -> Generator[dict, None, None]:

2722 # Find bonds between protein and non-protein atoms

2723 # First get all protein atoms

2724 protein_selection = self.select_protein()

2725 if not protein_selection:

2726 return

2727 protein_atom_indices = set(protein_selection.atom_indices) # This is used later

2728 protein_outer_bonds = set(self.get_selection_outer_bonds(protein_selection))

2729 non_protein_selection = self.invert_selection(protein_selection)

2730 if not non_protein_selection:

2731 return

2732 non_protein_atom_indices = set(non_protein_selection.atom_indices)

2733 non_protein_atom_indices_bonded_to_protein = protein_outer_bonds.intersection(non_protein_atom_indices)

2734 # Get each residue bonded to the protein and based on its 'classification' set the name of the PTM

2735 for atom_index in non_protein_atom_indices_bonded_to_protein:

2736 atom = self.atoms[atom_index]

2737 residue = atom.residue

2738 residue_classification = residue.get_classification()

2739 ptm_classification = self.ptm_options[residue_classification]

2740 # If we found something impossible then raise the error

2741 if type(ptm_classification) == ValueError:

2742 print(f'Problematic residue: {residue}')

2743 raise ptm_classification

2744 # If we do not know what it is then do tag it as a PTM but print a warning

2745 if type(ptm_classification) == Warning:

2746 warn(f'{ptm_classification} -> {residue}')

2747 continue

2748 # At this point we have found a valid PTM

2749 # Find the protein residue linked to this PTM

2750 atom_bonds = self.bonds[atom_index]

2751 protein_bond = next((bond for bond in atom_bonds if bond in protein_atom_indices), None)

2752 if protein_bond == None:

2753 raise ValueError('There must be at least one protein bond to the current atom')

2754 protein_residue_index = self.atoms[protein_bond].residue_index

2755 # Set the PTM

2756 yield { 'name': ptm_classification, 'residue_index': protein_residue_index }

2757

2758 # Ask if the structure has at least one coarse grain atom/residue

2759 def has_cg (self) -> bool:

2760 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)

2761

2762### Related functions ###

2763

2764# Calculate the distance between two atoms

2765def calculate_distance (atom_1 : Atom, atom_2 : Atom) -> float:

2766 squared_distances_sum = 0

2767 for i in { 'x': 0, 'y': 1, 'z': 2 }.values():

2768 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i])**2

2769 return math.sqrt(squared_distances_sum)

2770

2771# Calculate the angle between 3 atoms

2772# https://stackoverflow.com/questions/35176451/python-code-to-calculate-angle-between-three-point-using-their-3d-coordinates

2773def calculate_angle (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom) -> float:

2774 # Get coordinates in numpy arrays, which allows to easily calculate the difference

2775 a = np.array(atom_1.coords)

2776 b = np.array(atom_2.coords)

2777 c = np.array(atom_3.coords)

2778 # Set the two vectors which make the angle

2779 ba = a - b

2780 bc = c - b

2781 # Calculate the angle between these 2 vectors

2782 cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))

2783 angle = np.arccos(cosine_angle)

2784 return np.degrees(angle)

2785

2786# Calculate the torsion between 4 atoms

2787# https://stackoverflow.com/questions/20305272/dihedral-torsion-angle-from-four-points-in-cartesian-coordinates-in-python

2788def calculate_torsion (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom, atom_4 : Atom) -> float:

2789 p0 = np.array(atom_1.coords)

2790 p1 = np.array(atom_2.coords)

2791 p2 = np.array(atom_3.coords)

2792 p3 = np.array(atom_4.coords)

2793 # Ge some vectors

2794 b0 = -1.0*(p1 - p0)

2795 b1 = p2 - p1

2796 b2 = p3 - p2

2797 # normalize b1 so that it does not influence magnitude of vector

2798 # rejections that come next

2799 b1 /= np.linalg.norm(b1)

2800 # vector rejections

2801 # v = projection of b0 onto plane perpendicular to b1

2802 # = b0 minus component that aligns with b1

2803 # w = projection of b2 onto plane perpendicular to b1

2804 # = b2 minus component that aligns with b1

2805 v = b0 - np.dot(b0, b1)*b1

2806 w = b2 - np.dot(b2, b1)*b1

2807 # angle between v and w in a plane is the torsion angle

2808 # v and w may not be normalized but that's fine since tan is y/x

2809 x = np.dot(v, w)

2810 y = np.dot(np.cross(b1, v), w)

2811 return float(np.degrees(np.arctan2(y, x)))

2812

2813### Auxiliar functions ###

2814

2815# Set a function to get the next letter from an input letter in alphabetic order

2816def get_next_letter (letter : str) -> str:

2817 if not letter:

2818 return 'A'

2819 if letter == 'z' or letter == 'Z':

2820 raise InputError("Limit of chain letters has been reached")

2821 next_letter = chr(ord(letter) + 1)

2822 return next_letter

2823

2824# Given a string with 1 or 2 characters, return a new string with the first letter cap and the second letter not cap (if any)

2825def first_cap_only (name : str) -> str:

2826 if len(name) == 1:

2827 return name.upper()

2828 first_character = name[0].upper()

2829 second_character = name[1].lower()

2830 return first_character + second_character

2831

2832# Convert numbers to lower text characters (chr 8020-8029)

2833lower_numbers = {

2834 '0': '₀',

2835 '1': '₁',

2836 '2': '₂',

2837 '3': '₃',

2838 '4': '₄',

2839 '5': '₅',

2840 '6': '₆',

2841 '7': '₇',

2842 '8': '₈',

2843 '9': '₉',

2844}

2845def get_lower_numbers (numbers_text : str) -> str:

2846 return ''.join([ lower_numbers[c] for c in numbers_text ])

2847

2848# Set a function to filter lines in PDB content for a specific model

2849def filter_model (pdb_content : str, model : int) -> str:

2850 # Make sure the PDB content has multiple models

2851 # If not, then teturn the whole PDB content ignoring the flag

2852 generic_model_header_regex = r'\nMODEL\s*[0-9]*\s*\n'

2853 if not re.search(generic_model_header_regex, pdb_content):

2854 print(f'PDB content has no models at all so it will be loaded as is (ignored model "{model}").')

2855 return pdb_content

2856 # If a model was passed then it means we must filter the PDB content

2857 filtered_pdb_content = ''

2858 # Search PDB lines until we find our model's header

2859 model_header_regex = rf'^MODEL\s*{model}'

2860 pdb_lines = iter(pdb_content.split('\n'))

2861 for line in pdb_lines:

2862 if not re.match(model_header_regex, line): continue

2863 # If we just found the header

2864 filtered_pdb_content = line

2865 break

2866 # If we did not find the header then stop here

2867 if not filtered_pdb_content: raise RuntimeError(f'Could not find model "{model}" header')

2868 # Add every line to the filtered content until we find the tail

2869 model_footer_regex = r'^ENDMDL'

2870 for line in pdb_lines:

2871 filtered_pdb_content += '\n' + line

2872 if re.match(model_footer_regex, line): return filtered_pdb_content

2873 # If we did not find the footer then stop here

2874 raise RuntimeError(f'Could not find model "{model}" footer')