Coverage for mddb_workflow/utils/structures.py: 84%

1# Main handler of the toolbelt

2import os

3import re

4import math

5import pytraj

6import MDAnalysis

7import numpy as np

8from bisect import bisect

10from mddb_workflow.utils.file import File

11from mddb_workflow.utils.selections import Selection

12from mddb_workflow.utils.vmd_spells import get_vmd_selection_atom_indices, get_covalent_bonds

13from mddb_workflow.utils.mdt_spells import sort_trajectory_atoms

14from mddb_workflow.utils.gmx_spells import make_index, read_ndx

15from mddb_workflow.utils.auxiliar import InputError, MISSING_BONDS

16from mddb_workflow.utils.auxiliar import is_imported, residue_name_to_letter, otherwise, warn

17from mddb_workflow.utils.constants import SUPPORTED_ION_ELEMENTS, SUPPORTED_ELEMENTS

18from mddb_workflow.utils.constants import STANDARD_COUNTER_CATION_ATOM_NAMES, STANDARD_COUNTER_ANION_ATOM_NAMES

19from mddb_workflow.utils.constants import STANDARD_SOLVENT_RESIDUE_NAMES, STANDARD_COUNTER_ION_ATOM_NAMES

20from mddb_workflow.utils.constants import STANDARD_DUMMY_ATOM_NAMES, DUMMY_ATOM_ELEMENT, CG_ATOM_ELEMENT

21from mddb_workflow.utils.constants import PROTEIN_RESIDUE_NAME_LETTERS, NUCLEIC_RESIDUE_NAME_LETTERS

22from mddb_workflow.utils.constants import DNA_RESIDUE_NAME_LETTERS, RNA_RESIDUE_NAME_LETTERS

23from mddb_workflow.utils.constants import FATTY_RESIDUE_NAMES, STEROID_RESIDUE_NAMES

24from mddb_workflow.utils.type_hints import *

27# Set all available chains according to pdb standards

28AVAILABLE_CAPS = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

29AVAILABLE_LOWS = list('abcdefghijklmnopqrstuvwxyz')

30AVAILABLE_LETTERS = AVAILABLE_CAPS + AVAILABLE_LOWS

32# Set letters to be found in alphanumerical bases

33hexadecimal_letters = set(AVAILABLE_CAPS[:6] + AVAILABLE_LOWS[:6])

34alphanumerical_letters = set(AVAILABLE_CAPS[6:] + AVAILABLE_LOWS[6:])

36# Set the expected number of bonds for each atom according to its element

37coherent_bonds_with_hydrogen = {

38 'H': { 'min': 1, 'max': 1 },

39 'O': { 'min': 1, 'max': 2 },

40 'N': { 'min': 1, 'max': 4 },

41 'C': { 'min': 2, 'max': 4 },

42 'S': { 'min': 1, 'max': 6 },

43 'P': { 'min': 2, 'max': 6 },

44}

45# WARNING: Not deeply tested

46coherent_bonds_without_hydrogen = {

47 'O': { 'min': 0, 'max': 2 },

48 'N': { 'min': 1, 'max': 3 },

49 'C': { 'min': 1, 'max': 3 },

50 'S': { 'min': 1, 'max': 4 },

51 'P': { 'min': 2, 'max': 4 },

52}

54# Set the limit of residue numbers according to PDB format (4 characters, starts count at 1)

55# This means the last number is 9999 and it is equivalent to index 9998

56pdb_last_decimal_residue_index = 9998

58class Atom:

59 """An atom class."""

61 def __init__ (self,

62 name : Optional[str] = None,

63 element : Optional[str] = None,

64 coords : Optional[Coords] = None,

65 ):

66 self.name = name

67 self.element = element

68 self.coords = tuple(coords) if coords else None

69 # Set variables to store references to other related instances

70 # These variables will be set further by the structure

71 self._structure = None

72 self._index = None

73 self._residue_index = None

75 def __repr__ (self):

76 return '<Atom ' + self.name + '>'

78 def __eq__ (self, other):

79 if type(self) != type(other):

80 return False

81 return self._residue_index == other._residue_index and self.name == other.name

83 def __hash__ (self):

84 return hash((self.index))

86 def get_structure (self) -> Optional['Structure']:

87 """The parent structure (read only).

88 This value is set by the structure itself."""

89 return self._structure

90 structure: 'Structure' = property(get_structure, None, None, "The parent structure (read only)")

92 def get_index (self) -> Optional[int]:

93 """The residue index according to parent structure residues (read only).

94 This value is set by the structure itself."""

95 return self._index

96 index = property(get_index, None, None, "The residue index according to parent structure residues (read only)")

98 def get_residue_index (self) -> int:

99 """The atom residue index according to parent structure residues.

100 If residue index is set then make changes in all the structure to make this change coherent."""

101 return self._residue_index

102 def set_residue_index (self, new_residue_index : int):

103 # If the new residue index is the current residue index then do nothing

104 # WARNING: It is important to stop this here or it could delete a residue which is not to be deleted

105 if new_residue_index == self.residue_index:

106 return

107 # If there is not strucutre yet it means the residue is beeing set before the structure

108 # We just save the residue index and wait for the structure to be set

109 if not self.structure:

110 self._residue_index = new_residue_index

111 return

112 # Relational indices are updated through a top-down hierarchy

113 # Affected residues are the ones to update this atom internal residue index

114 new_residue = self.structure.residues[new_residue_index]

115 new_residue.add_atom(self)

116 residue_index = property(get_residue_index, set_residue_index, None,

117 "The atom residue index according to parent structure residues")

118

119 def get_residue (self) -> Optional['Residue']:

120 """The atom residue.

121 If residue is set then make changes in all the structure to make this change coherent."""

122 # If there is not strucutre yet it means the atom is beeing set before the structure

123 # In this case it is not possible to get related residues in the structure

124 # It also may happend that the residue is missing because the atom has been set rawly

125 if not self.structure or self.residue_index == None:

126 return None

127 # Get the residue in the structure according to the residue index

128 return self.structure.residues[self.residue_index]

129 def set_residue (self, new_residue : 'Residue'):

130 """Find the new residue index and set it as the atom residue index.

131 Note that the residue must be set in the structure already."""

132 new_residue_index = new_residue.index

133 if new_residue_index == None:

134 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure')

135 self.set_residue_index(new_residue_index)

136 residue = property(get_residue, set_residue, None, "The atom residue")

137

138 def get_chain_index (self) -> Optional[int]:

139 """Get the atom chain index according to parent structure chains."""

140 # The residue may be missing if the atom has been set rawly

141 if not self.residue:

142 return None

143 return self.residue.chain_index

144 chain_index = property(get_chain_index, None, None,

145 "The atom chain index according to parent structure chains (read only)")

146

147 def get_chain (self) -> Optional['Chain']:

148 """The atom chain (read only).

149 In order to change the chain it must be changed in the atom residue."""

150 # If there is not strucutre yet it means the atom is beeing set before the structure

151 # In this case it is not possible to get related residues in the structure

152 # It also may happend that the chain is missing because the atom has been set rawly

153 if not self.structure or self.chain_index == None:

154 return None

155 # Get the chain in the structure according to the chain index

156 return self.structure.chains[self.chain_index]

157 chain: 'Chain' = property(get_chain, None, None, "The atom chain (read only)")

158

159 def get_bonds (self, skip_ions : bool = False, skip_dummies : bool = False) -> Optional[ list[int] ]:

160 """Get indices of other atoms in the structure which are covalently bonded to this atom."""

161 if not self.structure:

162 raise ValueError('The atom has not a structure defined')

163 if self.index == None:

164 raise ValueError('The atom has not an index defined')

165 bonds = self.structure.bonds[self.index]

166 # If the skip ions argument is passed then remove atom indices of supported ion atoms

167 if skip_ions:

168 bonds = list(set(bonds) - self.structure.ion_atom_indices)

169 if skip_dummies:

170 bonds = list(set(bonds) - self.structure.dummy_atom_indices)

171 return bonds

172

173 # Atoms indices of atoms in the structure which are covalently bonded to this atom

174 bonds = property(get_bonds, None, None,

175 'Atoms indices of atoms in the structure which are covalently bonded to this atom')

176

177 def get_bonded_atoms (self) -> list['Atom']:

178 """Get bonded atoms."""

179 return [ self.structure.atoms[atom_index] for atom_index in self.bonds ]

180

181 def get_selection (self) -> 'Selection':

182 """Generate a selection for this atom."""

183 return Selection([self.index])

184

185 def copy (self) -> 'Atom':

186 """Make a copy of the current atom."""

187 atom_copy = Atom(self.name, self.element, self.coords)

188 atom_copy._structure = self._structure

189 atom_copy._index = self._index

190 atom_copy._residue_index = self._residue_index

191 return atom_copy

192

193 def is_fatty_candidate (self) -> bool:

194 """

195 Check if this atom meets specific criteria:

196 1. It is a carbon

197 2. It is connected only to other carbons and hydrogens

198 3. It is connected to 1 or 2 carbons

199 """

200 # Ignore non carbon atoms

201 if self.element != 'C':

202 return False

203 # Get bonded atom elements

204 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ]

205 # Check only carbon and hydrogen atoms to be bonded

206 if any((element != 'C' and element != 'H') for element in bonded_atoms_elements):

207 return False

208 # Check it is connected to 1 or 2 carbons

209 connected_carbons_count = bonded_atoms_elements.count('C')

210 if connected_carbons_count != 1 and connected_carbons_count != 2:

211 return False

212 return True

213

214 def is_carbohydrate_candidate (self) -> bool:

215 """

216 Check if this atom meets specific criteria:

217 1. It is a carbon

218 2. It is connected only to other carbons, hydrogens or oxygens

219 3. It is connected to 1 or 2 carbons

220 4. It is connected to 1 oxygen

221 """

222 # Ignore non carbon atoms

223 if self.element != 'C':

224 return False

225 # Get bonded atom elements

226 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ]

227 # Check only carbon and hydrogen atoms to be bonded

228 if any((element != 'C' and element != 'H' and element != 'O') for element in bonded_atoms_elements):

229 return False

230 # Check it is connected to 1 or 2 carbons

231 connected_carbons_count = bonded_atoms_elements.count('C')

232 if connected_carbons_count != 1 and connected_carbons_count != 2:

233 return False

234 # Check it is connected to 1 oxygen

235 connected_oxygens_count = bonded_atoms_elements.count('O')

236 if connected_oxygens_count != 1:

237 return False

238 return True

239

240 def is_ion (self) -> bool:

241 """Check if it is an ion by checking if it has no bonds with other atoms."""

242 return len(self.bonds) == 0

243

244 def guess_element (self) -> str:

245 """Guess an atom element from its name and number of bonds."""

246 # If the atom name is among the known dummy atoms then return a standard element for dummy atoms

247 if self.name.upper() in STANDARD_DUMMY_ATOM_NAMES:

248 return DUMMY_ATOM_ELEMENT

249 # If the element is already the "coarse grained" flag then return this very same flag

250 # WARNING: This has to be preset before guessing or the guess may fail

251 if self.is_cg():

252 return CG_ATOM_ELEMENT

253 # If the name is SOD and it is a lonely atom then it is clearly sodium, not sulfur

254 if self.name.upper() == 'SOD' and self.residue.atom_count == 1:

255 return 'Na'

256 # If the name is POT and it is a lonely atom then it is clearly potassium, not phosphor

257 if self.name.upper() == 'POT' and self.residue.atom_count == 1:

258 return 'K'

259 # Find a obvios element name in the atom name

260 element = self.get_name_suggested_element()

261 # CA may refer to calcium or alpha carbon, so the number of atoms in the residue is decisive

262 if element == 'Ca' and self.residue.atom_count >= 2:

263 return 'C'

264 # NA may refer to sodium or some ligand nitrogen, so the number of atoms in the residue is decisive

265 if element == 'Na' and self.residue.atom_count >= 2:

266 return 'N'

267 return element

268

269 def get_name_suggested_element (self) -> str:

270 """Guess an atom element from its name only."""

271 # Get the atom name and its characters length

272 name = self.name

273 length = len(name)

274 next_character = None

275 for i, character in enumerate(name):

276 # Get the next character, since element may be formed by 2 letters

277 if i < length - 1:

278 next_character = name[i+1]

279 # If next character is not a string then ignore it

280 if not next_character.isalpha():

281 next_character = None

282 # Try to get all possible matches between the characters and the supported atoms

283 # First letter is always caps

284 character = character.upper()

285 # First try to match both letters together

286 if next_character:

287 # Start with the second letter in caps

288 next_character = next_character.upper()

289 both = character + next_character

290 if both in SUPPORTED_ELEMENTS:

291 return both

292 # Continue with the second letter in lowers

293 next_character = next_character.lower()

294 both = character + next_character

295 if both in SUPPORTED_ELEMENTS:

296 return both

297 # Finally, try with the first character alone

298 if character in SUPPORTED_ELEMENTS:

299 return character

300 raise InputError(f"Cannot guess atom element from atom name '{name}'")

301

302 def get_label (self) -> str:

303 """Get a standard label."""

304 return f'{self.residue.label}.{self.name} (index {self.index})'

305 # The atom standard label (read only)

306 label = property(get_label, None, None)

307

308 # Ask if the current atom is in coarse grain

309 def is_cg (self) -> bool:

310 return self.element == CG_ATOM_ELEMENT

311

312class Residue:

313 """A residue class."""

314 def __init__ (self,

315 name : Optional[str] = None,

316 number : Optional[int] = None,

317 icode : Optional[str] = None,

318 ):

319 self.name = name

320 self.number = number

321 self.icode = icode

322 # Set variables to store references to other related instaces

323 # These variables will be set further by the structure

324 self._structure = None

325 self._index = None

326 self._atom_indices = []

327 self._chain_index = None

328 self._classification = None

329

330 def __repr__ (self):

331 return f'<Residue {self.label}>'

332

333 def __eq__ (self, other):

334 if type(self) != type(other):

335 return False

336 return (

337 self._chain_index == other._chain_index and

338 #self.name == other.name and

339 self.number == other.number and

340 self.icode == other.icode

341 )

342

343 def __hash__ (self):

344 # WARNING: This is susceptible to duplicated residues

345 return hash((self._chain_index, self.number, self.icode))

346 # WARNING: This is not susceptible to duplicated residues

347 #return hash(tuple(self._atom_indices))

348

349 def get_structure (self) -> Optional['Structure']:

350 """Get the parent structure (read only).

351 This value is set by the structure itself."""

352 return self._structure

353 structure: 'Structure' = property(get_structure, None, None,

354 "The parent structure (read only)")

355

356 def get_index (self) -> Optional[int]:

357 """Get the residue index according to parent structure residues (read only).

358 This value is set by the structure itself."""

359 return self._index

360 def set_index (self, index):

361 # Update residue atoms

362 for atom in self.atoms:

363 atom._residue_index = index

364 # Update residue chain

365 chain_residue_index = self.chain._residue_indices.index(self._index)

366 self.chain._residue_indices[chain_residue_index] = index

367 # Finally update self index

368 self._index = index

369 index = property(get_index, set_index, None,

370 "The residue index according to parent structure residues (read only)")

371

372 def get_atom_indices (self) -> list[int]:

373 """Get the atom indices according to parent structure atoms for atoms in this residue.

374 If atom indices are set then make changes in all the structure to make this change coherent."""

375 return self._atom_indices

376 def set_atom_indices (self, new_atom_indices : list[int]):

377 # If there is not strucutre yet it means the residue is beeing set before the structure

378 # We just save atom indices and wait for the structure to be set

379 if not self.structure:

380 self._atom_indices = new_atom_indices

381 return

382 # Update the current atoms

383 for atom in self.atoms:

384 atom._residue_index = None

385 # Update the new atoms

386 for index in new_atom_indices:

387 atom = self.structure.atoms[index]

388 atom._residue_index = self.index

389 # Now new indices are coherent and thus we can save them

390 self._atom_indices = new_atom_indices

391 atom_indices = property(get_atom_indices, set_atom_indices, None,

392 "The atom indices according to parent structure atoms for atoms in this residue")

393

394 def get_atoms (self) -> list['Atom']:

395 """Get the atoms in this residue.

396 If atoms are set then make changes in all the structure to make this change coherent."""

397 # If there is not strucutre yet it means the chain is beeing set before the structure

398 # In this case it is not possible to get related atoms in the structure

399 if not self.structure:

400 return []

401 # Get atoms in the structure according to atom indices

402 atoms = self.structure.atoms

403 return [ atoms[atom_index] for atom_index in self.atom_indices ]

404 def set_atoms (self, new_atoms : list['Atom']):

405 # Find indices for new atoms and set their indices as the new atom indices

406 # Note that atoms must be set in the structure already

407 new_atom_indices = []

408 for new_atom in new_atoms:

409 new_atom_index = new_atom.index

410 if new_atom_index == None:

411 raise ValueError('Atom ' + str(new_atom) + ' is not set in the structure')

412 new_atom_indices.append(new_atom_index)

413 self.set_atom_indices(new_atom_indices)

414 atoms: list['Atom'] = property(get_atoms, set_atoms, None, "The atoms in this residue")

415

416 # Get the number of atoms in the residue

417 def get_atom_count (self) -> int:

418 return len(self.atom_indices)

419 atom_count = property(get_atom_count, None, None, "The number of atoms in the residue (read only)")

420

421 def add_atom (self, new_atom : 'Atom'):

422 """Add an atom to the residue."""

423 # Remove the atom from its previous residue owner

424 if new_atom.residue_index:

425 new_atom.residue.remove_atom(new_atom)

426 # Insert the new atom index in the list of atom indices keeping the order

427 new_atom_index = new_atom.index

428 sorted_atom_index = bisect(self.atom_indices, new_atom_index)

429 self.atom_indices.insert(sorted_atom_index, new_atom_index)

430 # Update the atom internal index

431 new_atom._residue_index = self.index

432

433 def remove_atom (self, current_atom : 'Atom'):

434 """Remove an atom from the residue."""

435 # Remove the current atom index from the atom indices list

436 self.atom_indices.remove(current_atom.index) # This index MUST be in the list

437 # If we removed the last atom then this residue must be removed from its structure

438 if len(self.atom_indices) == 0 and self.structure:

439 self.structure.purge_residue(self)

440 # Update the atom internal index

441 current_atom._residue_index = None

442

443 # The residue chain index according to parent structure chains

444 # If chain index is set then make changes in all the structure to make this change coherent

445 def get_chain_index (self) -> int:

446 return self._chain_index

447 def set_chain_index (self, new_chain_index : int):

448 # If the new chain index is the current chain index do nothing

449 # WARNING: It is important to stop this here or it could delete a chain which is not to be deleted

450 if new_chain_index == self.chain_index:

451 return

452 # If there is not strucutre yet it means the chain is beeing set before the structure

453 # We just save the chain index and wait for the structure to be set

454 if not self.structure:

455 self._chain_index = new_chain_index

456 return

457 # Relational indices are updated through a top-down hierarchy

458 # Affected chains are the ones to update this residue internal chain index

459 # WARNING: It is critical to find the new chain before removing/adding residues

460 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged

461 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed

462 new_chain = self.structure.chains[new_chain_index]

463 if self.chain:

464 self.chain.remove_residue(self)

465 new_chain.add_residue(self)

466 chain_index = property(get_chain_index, set_chain_index, None,

467 "The residue chain index according to parent structure chains")

468

469 # The residue chain

470 # If chain is set then make changes in all the structure to make this change coherent

471 def get_chain (self) -> Optional['Chain']:

472 # If there is not strucutre yet then it means the residue is set before the structure

473 # In this case it is not possible to get related chain in the structure

474 if not self.structure or self.chain_index == None:

475 return None

476 # Get the chain in the structure according to the chain index

477 return self.structure.chains[self.chain_index]

478 def set_chain (self, new_chain : Union['Chain', str]):

479 # In case the chain is just a string we must find/create the corresponding chain

480 if type(new_chain) == str:

481 letter = new_chain

482 # Get the residue structure

483 structure = self.structure

484 if not structure:

485 raise ValueError(f'Cannot find the corresponding {new_chain} chain without the structure')

486 # Find if the letter belongs to an already existing chain

487 new_chain = structure.get_chain_by_name(letter)

488 # If the chain does not exist yet then create it

489 if not new_chain:

490 new_chain = Chain(name=letter)

491 structure.set_new_chain(new_chain)

492 # Find the new chain index and set it as the residue chain index

493 # Note that the chain must be set in the structure already

494 new_chain_index = new_chain.index

495 if new_chain_index == None:

496 raise ValueError(f'Chain {new_chain} is not set in the structure')

497 self.set_chain_index(new_chain_index)

498 chain: 'Chain' = property(get_chain, set_chain, None, "The residue chain")

499

500 def get_bonded_atom_indices (self) -> list[int]:

501 """Get atom indices from atoms bonded to this residue."""

502 # Get all bonds among residue atoms

503 all_bonds = []

504 for atom in self.atoms:

505 all_bonds += atom.bonds

506 # Remove self atom indices from the list

507 external_bonds = set(all_bonds) - set(self.atom_indices)

508 return list(external_bonds)

509

510 def get_bonded_atoms (self) -> list['Atom']:

511 """Get atoms bonded to this residue."""

512 return [ self.structure.atoms[atom_index] for atom_index in self.get_bonded_atom_indices() ]

513

514 def get_bonded_residue_indices (self) -> list[int]:

515 """Get residue indices from residues bonded to this residue."""

516 return list(set([ atom.residue_index for atom in self.get_bonded_atoms() ]))

517

518 def get_bonded_residues (self) -> list['Residue']:

519 """Get residues bonded to this residue."""

520 return [ self.structure.residues[residue_index] for residue_index in self.get_bonded_residue_indices() ]

521

522 def is_bonded_with_residue (self, other : 'Residue') -> bool:

523 """Given another residue, check if it is bonded with this residue."""

524 bonded_atom_indices = set(self.get_bonded_atom_indices())

525 if next((index for index in other.atom_indices if index in bonded_atom_indices), None) != None: return True

526 return False

527

528 def is_missing_any_bonds (self) -> bool:

529 return any(atom.bonds == MISSING_BONDS for atom in self.atoms)

530

531 def is_coherent (self) -> bool:

532 """Make sure atoms within the residue are all bonded."""

533 # If bonds are missing then just say everything is

534 if self.is_missing_any_bonds():

535 raise RuntimeError('Trying to check if residue with missing bonds is coherent')

536 residue_selection = self.get_selection()

537 residue_fragments = self.structure.find_fragments(residue_selection)

538 first_residue_fragment = next(residue_fragments)

539 return len(first_residue_fragment) == self.atom_count

540

541 def get_classification (self) -> str:

542 """

543 Get the residue biochemistry classification.

544

545 WARNING: Note that this logic will not work in a structure without hydrogens.

546

547 Available classifications:

548 - protein

549 - dna

550 - rna

551 - carbohydrate

552 - fatty

553 - steroid

554 - ion

555 - solvent

556 - acetyl

557 - amide

558 - other

559 """

560 # Return the internal value, if any

561 if self._classification:

562 return self._classification

563 # If this is a CG residue then we can not classify it

564 if self.is_cg(): return self.get_classification_by_name()

565 # If we dont have a value then we must classify the residue

566 # -------------------------------------------------------------------------------------------------------

567 # Ions are single atom residues

568 if self.atom_count == 1:

569 self._classification = 'ion'

570 return self._classification

571 # -------------------------------------------------------------------------------------------------------

572 # At this point we need atoms to have elements fixed

573 # WARNING: missing elements would result in silent failure to recognize some classifications

574 if self.structure._fixed_atom_elements == False:

575 self.structure.fix_atom_elements()

576 # Solvent is water molecules

577 # First rely on the residue name

578 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES:

579 self._classification = 'solvent'

580 return self._classification

581 # It may be water with a not known name

582 # Literally check if its a molecule with 3 atoms: 2 hydrogens and 1 oxygen

583 if self.atom_count == 3:

584 atom_elements = [ atom.element for atom in self.atoms ]

585 if atom_elements.count('H') == 2 and atom_elements.count('O') == 1:

586 self._classification = 'solvent'

587 return self._classification

588 # -------------------------------------------------------------------------------------------------------

589 # Protein definition according to vmd:

590 # a residue with atoms named C, N, CA, and O

591 # In our case we accept OC1 and OC2 or OT1 and OT2 instead of O for terminal residues

592 atom_names = set([ atom.name for atom in self.atoms ])

593 if ( all((name in atom_names) for name in ['C', 'N', 'CA']) and (

594 'O' in atom_names or { 'OC1', 'OC2' } <= atom_names or { 'OT1', 'OT2' } <= atom_names

595 )):

596 self._classification = 'protein'

597 return self._classification

598 # -------------------------------------------------------------------------------------------------------

599 # Nucleic acids definition according to vmd:

600 # a residue with atoms named P, O1P, O2P and either O3’, C3’, C4’, C5’, O5’ or O3*, C3*, C4*, C5*, O5*

601 # Apparently it has been fixed so now a residue does not need to be phosphorylated to be considered nucleic

602 # LORE: This included the condition "all((name in atom_names) for name in ['P', 'OP1', 'OP2'])"

603 if (( all((name in atom_names) for name in ["O3'", "C3'", "C4'", "C5'", "O5'"]) or

604 all((name in atom_names) for name in ["O3*", "C3*", "C4*", "C5*", "O5*"]) )):

605 # At this point we know it is nucleic

606 # We must tell the difference between DNA and RNA

607 if "O2'" in atom_names or "O2*" in atom_names:

608 self._classification = 'rna'

609 else:

610 self._classification = 'dna'

611 return self._classification

612 # -------------------------------------------------------------------------------------------------------

613 # To define carbohydrates search for rings made of 1 oxygen and 'n' carbons

614 # WARNING: This logic may fail for some very specific molecules such as furine

615 # LIMITATION: This logic only aims for cyclical carbohydrates. The linear form of carbohydrates is not yet consider

616 rings = self.find_rings(6)

617 for ring in rings:

618 ring_elements = [ atom.element for atom in ring ]

619 # Check the ring has 1 oxygen

620 oxygen_count = ring_elements.count('O')

621 if oxygen_count != 1:

622 continue

623 # Check the rest are only carbon

624 carbon_count = ring_elements.count('C')

625 if carbon_count != len(ring) - 1:

626 continue

627 self._classification = 'carbohydrate'

628 return self._classification

629 # -------------------------------------------------------------------------------------------------------

630 # To define steroids search for 3 6-carbon rings and 1 5-carbon ring (at least)

631 # WARNING: According to this logic some exotical non-steroid molecules may result in false positives

632 num_6_carbon_rings = 0

633 num_5_carbon_rings = 0

634 for ring in rings:

635 ring_elements = [ atom.element for atom in ring ]

636 if any(element != 'C' for element in ring_elements):

637 continue

638 ring_lenght = len(ring)

639 if ring_lenght == 5:

640 num_5_carbon_rings += 1

641 if ring_lenght == 6:

642 num_6_carbon_rings += 1

643 if num_6_carbon_rings >= 3 and num_5_carbon_rings >= 1:

644 self._classification = 'steroid'

645 return self._classification

646 # -------------------------------------------------------------------------------------------------------

647 # To define fatty acids search for a series of carbon atoms connected together (3 at least)

648 # These carbon atoms must be connected only to hydrogen in addition to 1-2 carbon

649 # These carbons must not be bonded in a cycle so check atoms not be repeated in the series

650 bonded_fatty_atoms = []

651 def get_bonded_fatty_atoms_recursive (current_atom : Atom, previous_atom : Optional[Atom] = None) -> bool:

652 # Iterate over the input atom bonded atoms

653 for bonded_atom in current_atom.get_bonded_atoms():

654 # Discard the previous atom to avoid an infinite loop

655 if bonded_atom == previous_atom:

656 continue

657 # If we find an atom which is already in the fatty atoms list then it means we are cycling

658 if bonded_atom in bonded_fatty_atoms:

659 return False

660 # If this is not a fatty atom then discard it

661 if not bonded_atom.is_fatty_candidate():

662 continue

663 # Add the current bonded fatty atom to the list

664 bonded_fatty_atoms.append(bonded_atom)

665 # Find more bonded fatty atoms and add them to list as well

666 if get_bonded_fatty_atoms_recursive(current_atom=bonded_atom, previous_atom=current_atom) == False:

667 return False

668 return True

669 # Now check all atoms and try to set the series until we find one which works

670 already_checked_atoms = []

671 for atom in self.atoms:

672 # If we already checked this atom trough the recursive logic then skip it

673 if atom in already_checked_atoms:

674 continue

675 # If the atom is not a fatty candidate then skip it

676 if not atom.is_fatty_candidate():

677 continue

678 # Now that we have found a suitable candidate atom we start the series

679 bonded_fatty_atoms = [atom]

680 if get_bonded_fatty_atoms_recursive(atom) and len(bonded_fatty_atoms) >= 3:

681 self._classification = 'fatty'

682 return self._classification

683 already_checked_atoms += bonded_fatty_atoms

684 # -------------------------------------------------------------------------------------------------------

685 # Check if it is an acetylation capping terminal

686 if self.name == 'ACE':

687 self._classification = 'acetyl'

688 return self._classification

689 # -------------------------------------------------------------------------------------------------------

690 # Check if it is an N-methyl amide capping terminal

691 if self.name == 'NME':

692 self._classification = 'amide'

693 return self._classification

694 # -------------------------------------------------------------------------------------------------------

695 self._classification = 'other'

696 return self._classification

697

698 def get_classification_by_name (self) -> str:

699 """

700 Set an alternative function to "try" to classify the residues according only to its name.

701 This is useful for corase grain residues whose atoms may not reflect the real atoms.

702 WARNING: This logic is very limited and will return "unknown" most of the times.

703 WARNING: This logic relies mostly in atom names, which may be not standard.

704 """

705 if self.name in PROTEIN_RESIDUE_NAME_LETTERS:

706 return 'protein'

707 if self.name in DNA_RESIDUE_NAME_LETTERS:

708 return 'dna'

709 if self.name in RNA_RESIDUE_NAME_LETTERS:

710 return 'rna'

711 if self.name in NUCLEIC_RESIDUE_NAME_LETTERS:

712 return 'nucleic'

713 if self.name in FATTY_RESIDUE_NAMES:

714 return 'fatty'

715 if self.name in STEROID_RESIDUE_NAMES:

716 return 'steroid'

717 if self.name in STANDARD_COUNTER_ION_ATOM_NAMES:

718 return 'ion'

719 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES:

720 return 'solvent'

721 # If we do not know what it is

722 return 'unknown'

723

724 # The residue biochemistry classification (read only)

725 classification = property(get_classification, None, None)

726

727 def get_selection (self) -> 'Selection':

728 """Generate a selection for this residue."""

729 return Selection(self.atom_indices)

730

731 def copy (self) -> 'Residue':

732 """Make a copy of the current residue."""

733 residue_copy = Residue(self.name, self.number, self.icode)

734 residue_copy._structure = self._structure

735 residue_copy._index = self._index

736 residue_copy._atom_indices = self._atom_indices

737 residue_copy._chain_index = self._chain_index

738 return residue_copy

739

740 def get_letter (self) -> str:

741 """Get the residue equivalent single letter code.

742 Note that this makes sense for aminoacids and nucleotides only.

743 Non recognized residue names return 'X'."""

744 return residue_name_to_letter(self.name)

745

746 def get_formula (self) -> str:

747 """Get the formula of the residue."""

748 elements = [ atom.element for atom in self.atoms ]

749 unique_elements = list(set(elements))

750 formula = ''

751 for unique_element in unique_elements:

752 count = elements.count(unique_element)

753 number = get_lower_numbers(str(count)) if count > 1 else ''

754 formula += unique_element + number

755 return formula

756

757 def find_rings (self, max_ring_size : int) -> list[ list[Atom] ]:

758 """Find rings in the residue."""

759 return self.structure.find_rings(max_ring_size, selection=self.get_selection())

760

761 def split (self,

762 first_residue_atom_indices : list[int],

763 second_residue_atom_indices : list[int],

764 first_residue_name : Optional[str] = None,

765 second_residue_name : Optional[str] = None,

766 first_residue_number : Optional[int] = None,

767 second_residue_number : Optional[int] = None,

768 first_residue_icode : Optional[str] = None,

769 second_residue_icode : Optional[str] = None,

770 ) -> tuple['Residue', 'Residue']:

771 """Split this residue in 2 residues and return them in a tuple.

772 Keep things coherent in the structure (renumerate all residues below this one).

773 Note that all residue atoms must be covered by the splits."""

774 # This function is expected to be called in a residue with an already set structure

775 if not self.structure:

776 raise InputError('The split function should be called when the residue has an already defined structure')

777 # Make sure all atoms in the residue are covered between both the first and the second residue

778 if set(self.atom_indices) != set(first_residue_atom_indices + second_residue_atom_indices):

779 print('Residue atoms: ' + ', '.join([ str(v) for v in set(self.atom_indices) ]))

780 print('Covered atoms: ' + ', '.join([ str(v) for v in set(first_residue_atom_indices + second_residue_atom_indices) ]))

781 raise InputError('All atom indices must be covered between both the first and the second residue')

782 # Reuse this first residue instead of creating a new one

783 if first_residue_name:

784 self.name = first_residue_name

785 if first_residue_number:

786 self.number = first_residue_number

787 if first_residue_icode:

788 self.icode = first_residue_icode

789 # Set the new second residue

790 _second_residue_name = second_residue_name if second_residue_name else self.name

791 _second_residue_number = second_residue_number if second_residue_number else self.number

792 _second_residue_icode = second_residue_icode if second_residue_icode else get_next_letter(self.icode)

793 second_residue = Residue(_second_residue_name, _second_residue_number, _second_residue_icode)

794 second_residue._structure = self.structure

795 new_residue_index = self.index + 1

796 second_residue._index = new_residue_index

797 # Insert the second residue in the structure residues list right after this residue

798 self.structure.residues.insert(new_residue_index, second_residue)

799 # Set the second residue index

800 # Update the index of all residues which have been shifted after the insertion

801 for residue_index in range(new_residue_index + 1, len(self.structure.residues)):

802 residue = self.structure.residues[residue_index]

803 residue.index = residue_index

804 # Now transfer atoms from residue 1 to residue 2 as it is specified

805 # Note that this will automatically update every atom

806 second_residue.atom_indices = second_residue_atom_indices

807 # Now add the new residue to the chain

808 self.chain.add_residue(second_residue)

809

810 def split_by_atom_names (self,

811 first_residue_atom_names : list[str],

812 second_residue_atom_names : list[str],

813 first_residue_name : Optional[str] = None,

814 second_residue_name : Optional[str] = None,

815 first_residue_number : Optional[int] = None,

816 second_residue_number : Optional[int] = None,

817 first_residue_icode : Optional[str] = None,

818 second_residue_icode : Optional[str] = None,

819 ) -> tuple['Residue', 'Residue']:

820 """Parse atom names to atom indices and then call the split function."""

821 # Check all atom names to exist in the residue

822 input_atom_names = set(first_residue_atom_names + second_residue_atom_names)

823 residue_atom_names = set([ atom.name for atom in self.atoms ])

824 if input_atom_names != residue_atom_names:

825 print(self)

826 print(self.atoms)

827 print('Input atom names: ' + ', '.join(input_atom_names))

828 print('Residue atom names: ' + ', '.join(residue_atom_names))

829 raise InputError('All residue atoms must be covered between both the first and the second residue atom names')

830 # Convert atom names to atom indices

831 first_residue_atom_indices = [ self.get_atom_by_name(name).index for name in first_residue_atom_names ]

832 second_residue_atom_indices = [ self.get_atom_by_name(name).index for name in second_residue_atom_names ]

833 # Call the actual split logic

834 return self.split(first_residue_atom_indices, second_residue_atom_indices, first_residue_name,

835 second_residue_name, first_residue_number, second_residue_number, first_residue_icode, second_residue_icode)

836

837 def get_atom_by_name (self, atom_name : str) -> 'Atom':

838 """Get a residue atom given its name."""

839 return next(( atom for atom in self.atoms if atom.name == atom_name ), None)

840

841 def get_label (self) -> str:

842 """Get a standard label."""

843 chainname = self.chain.name if self.chain.name.strip() else ''

844 return f'{chainname}{self.number}{self.icode}({self.name})'

845 label = property(get_label, None, None,

846 "The residue standard label (read only)")

847

848 def is_cg (self) -> bool:

849 """Ask if the current residue is in coarse grain.

850 Note that we assume there may be not hybrid aa/cg residues."""

851 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)

852

853class Chain:

854 """A chain of residues."""

855 def __init__ (self,

856 name : Optional[str] = None,

857 classification : Optional[str] = None,

858 ):

859 self.name = name

860 self._classification = classification

861 # Set variables to store references to other related instaces

862 # These variables will be set further by the structure

863 self._structure = None

864 self._index = None

865 self._residue_indices = []

866

867 def __repr__ (self):

868 return '<Chain ' + self.name + '>'

869

870 def __eq__ (self, other):

871 return self.name == other.name

872

873 def __hash__ (self):

874 return hash(self.name)

875

876 # The parent structure (read only)

877 # This value is set by the structure itself

878 def get_structure (self) -> Optional['Structure']:

879 return self._structure

880 structure: 'Structure' = property(get_structure, None, None, "The parent structure (read only)")

881

882 # The residue index according to parent structure residues (read only)

883 # This value is set by the structure itself

884 def get_index (self) -> Optional[int]:

885 return self._index

886 # When the index is set all residues are updated with the next chain index

887 def set_index (self, index : int):

888 for residue in self.residues:

889 residue._chain_index = index

890 self._index = index

891 index = property(get_index, set_index, None, "The residue index according to parent structure residues (read only)")

892

893 # The residue indices according to parent structure residues for residues in this chain

894 # If residue indices are set then make changes in all the structure to make this change coherent

895 def get_residue_indices (self) -> list[int]:

896 return self._residue_indices

897 def set_residue_indices (self, new_residue_indices : list[int]):

898 # If there is not strucutre yet it means the chain is beeing set before the structure

899 # We just save residue indices and wait for the structure to be set

900 if not self.structure:

901 self._residue_indices = new_residue_indices

902 return

903 # Update the current residues

904 for residue in self.residues:

905 residue._chain_index = None

906 # Update the new residues

907 for index in new_residue_indices:

908 residue = self.structure.residues[index]

909 residue._chain_index = self.index

910 # In case the new residue indices list is empty this chain must be removed from its structure

911 if len(new_residue_indices) == 0:

912 self.structure.purge_chain(self)

913 # Now new indices are coherent and thus we can save them

914 self._residue_indices = new_residue_indices

915 residue_indices = property(get_residue_indices, set_residue_indices, None, "The residue indices according to parent structure residues for residues in this residue")

916

917 def get_residues (self) -> list['Residue']:

918 """The residues in this chain.

919 If residues are set then make changes in all the structure to make this change coherent."""

920 # If there is not strucutre yet it means the chain is beeing set before the structure

921 # In this case it is not possible to get related residues in the structure

922 if not self.structure:

923 return []

924 # Get residues in the structure according to residue indices

925 residues = self.structure.residues

926 return [ residues[residue_index] for residue_index in self.residue_indices ]

927

928 def set_residues (self, new_residues : list['Residue']):

929 """Find indices for new residues and set their indices as the new residue indices.

930 Note that residues must be set in the structure already."""

931 new_residue_indices = []

932 for new_residue in new_residues:

933 new_residue_index = new_residue.index

934 if new_residue_index == None:

935 raise ValueError(f'Residue {new_residue} is not set in the structure')

936 new_residue_indices.append(new_residue_index)

937 self.set_residue_indices(new_residue_indices)

938 residues: list['Residue'] = property(get_residues, set_residues, None, "The residues in this chain")

939

940 def add_residue (self, residue : 'Residue'):

941 """Add a residue to the chain."""

942 # Insert the new residue index in the list of residue indices keeping the order

943 sorted_residue_index = bisect(self.residue_indices, residue.index)

944 self.residue_indices.insert(sorted_residue_index, residue.index)

945 # Update the residue internal chain index

946 residue._chain_index = self.index

947

948 def remove_residue (self, residue : 'Residue'):

949 """Remove a residue from the chain.

950 WARNING: Note that this function does not trigger the set_residue_indices."""

951 self.residue_indices.remove(residue.index) # This index MUST be in the list

952 # If we removed the last residue then this chain must be removed from its structure

953 if len(self.residue_indices) == 0 and self.structure:

954 self.structure.purge_chain(self)

955 # Update the residue internal chain index

956 residue._chain_index = None

957

958 def get_atom_indices (self) -> list[int]:

959 """Get atom indices for all atoms in the chain (read only).

960 In order to change atom indices they must be changed in their corresponding residues."""

961 return sum([ residue.atom_indices for residue in self.residues ], [])

962 atom_indices = property(get_atom_indices, None, None, "Atom indices for all atoms in the chain (read only)")

963

964 def get_atoms (self) -> list[int]:

965 """Get the atoms in the chain (read only).

966 In order to change atoms they must be changed in their corresponding residues."""

967 return sum([ residue.atoms for residue in self.residues ], [])

968 atoms: list['Atom'] = property(get_atoms, None, None, "Atoms in the chain (read only)")

969

970 def get_atom_count (self) -> int:

971 """Get the number of atoms in the chain (read only)."""

972 return len(self.atom_indices)

973 atom_count = property(get_atom_count, None, None, "Number of atoms in the chain (read only)")

974

975 def get_residue_count (self) -> int:

976 """Get the number of residues in the chain (read only)."""

977 return len(self._residue_indices)

978 residue_count = property(get_residue_count, None, None, "Number of residues in the chain (read only)")

979

980 def get_classification (self) -> str:

981 """Get the chain classification."""

982 if self._classification:

983 return self._classification

984 self._classification = self.structure.get_selection_classification(self.get_selection())

985 return self._classification

986

987 def set_classification (self, classification : str):

988 """Force the chain classification."""

989 self._classification = classification

990

991 classification = property(get_classification, set_classification, None,

992 "Classification of the chain (manual or automatic)")

993

994 def get_sequence (self) -> str:

995 """Get the residues sequence in one-letter code."""

996 return ''.join([ residue_name_to_letter(residue.name) for residue in self.residues ])

997

998 def get_selection (self) -> 'Selection':

999 """Generate a selection for this chain."""

1000 return Selection(self.atom_indices)

1001

1002 def copy (self) -> 'Chain':

1003 """Make a copy of the current chain."""

1004 chain_copy = Chain(self.name)

1005 chain_copy._structure = self._structure

1006 chain_copy._index = self._index

1007 chain_copy.residue_indices = self.residue_indices

1008 return chain_copy

1009

1010 def has_cg (self) -> bool:

1011 """Ask if the current chain has at least one coarse grain atom/residue."""

1012 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)

1013

1014 def is_missing_any_bonds (self) -> bool:

1015 return any(atom.bonds == MISSING_BONDS for atom in self.atoms)

1016

1017 def find_residue (self, number : int, icode : str = '', index = None) -> Optional['Residue']:

1018 """Find a residue by its number and insertion code."""

1019 # Iterate chain residues

1020 for residue in self.residues:

1021 if residue.number == number and residue.icode == icode and (index is None or residue.index == index):

1022 return residue

1023 return None

1024

1025class Structure:

1026 """A structure is a group of atoms organized in chains and residues."""

1027 def __init__ (self,

1028 atoms : list['Atom'] = [],

1029 residues : list['Residue'] = [],

1030 chains : list['Chain'] = [],

1031 residue_numeration_base : int = 10,

1032 ):

1033 self.atoms: list['Atom'] = []

1034 self.residues: list['Residue'] = []

1035 self.chains: list['Chain'] = []

1036 # Set references between instances

1037 for atom in atoms:

1038 self.set_new_atom(atom)

1039 for residue in residues:

1040 self.set_new_residue(residue)

1041 for chain in chains:

1042 self.set_new_chain(chain)

1043 # --- Set other internal variables ---

1044 # Set bonds between atoms

1045 self._bonds = None

1046 # Set fragments of bonded atoms

1047 self._fragments = None

1048 # --- Set other auxiliar variables ---

1049 # Trajectory atom sorter is a function used to sort coordinates in a trajectory file

1050 # This function is generated when sorting indices in the structure

1051 # Also the new atom order is saved

1052 self.trajectory_atom_sorter = None

1053 self.new_atom_order = None

1054 # Track when atom elements have been fixed

1055 self._fixed_atom_elements = False

1056 # Save indices of supported ion atoms

1057 self._ion_atom_indices = None

1058 self._dummy_atom_indices = None

1059 # Set the scale of the residue numbers

1060 # It may be decimal (10), hexadecimal(16) or alphanumeric(36)

1061 # Note that hybrid scales (1-9999 decimal and different further) are not explicitly supported

1062 # However, the scale is guessed on read and conserved on write, so the original numeration would be conserved

1063 # The default values is used only when the structure is not read from a PDB

1064 self.residue_numeration_base = residue_numeration_base

1065

1066 def __repr__ (self):

1067 return f'<Structure ({self.atom_count} atoms)>'

1068

1069 def get_bonds (self) -> list[ list[int] ]:

1070 """Get the bonds between atoms."""

1071 # Return the stored value, if exists

1072 if self._bonds:

1073 return self._bonds

1074 # If not, we must calculate the bonds using vmd

1075 self._bonds = self.get_covalent_bonds()

1076 return self._bonds

1077 def set_bonds (self, bonds : list[ list[int] ]):

1078 """Force specific bonds."""

1079 self._bonds = bonds

1080 # Reset fragments

1081 self._fragments = None

1082 bonds = property(get_bonds, set_bonds, None, "The structure bonds")

1083

1084 def get_fragments (self) -> list['Selection']:

1085 """Get the groups of atoms which are covalently bonded."""

1086 # Return the stored value, if exists

1087 if self._fragments != None:

1088 return self._fragments

1089 # Otherwise, find fragments in all structure atoms

1090 self._fragments = list(self.find_fragments())

1091 return self._fragments

1092 # Fragments of covalently bonded atoms (read only)

1093 fragments: list['Selection'] = property(get_fragments, None, None, "The structure fragments (read only)")

1094

1095 def find_fragments (self,

1096 selection : Optional['Selection'] = None,

1097 exclude_disulfide : bool = True,

1098 atom_bonds : Optional[list[list[int]]] = None,

1099 ) -> Generator['Selection', None, None]:

1100 """Find fragments in a selection of atoms. A fragment is a selection of

1101 covalently bonded atoms. All atoms are searched if no selection is provided.

1102

1103 WARNING: Note that fragments generated from a specific selection may not

1104 match the structure fragments. A selection including 2 separated regions of a

1105 structure fragment will yield 2 fragments.

1106

1107 For convenience, protein disulfide bonds are not considered in this logic by default.

1108 """

1109 # If there is no selection we consider all atoms

1110 if not selection: selection = self.select_all()

1111 # Get/Find covalent bonds between atoms in a new object avoid further corruption (deletion) of the original list

1112 safe_bonds = atom_bonds if atom_bonds else self.bonds

1113 atom_indexed_covalent_bonds = { atom_index: [ *safe_bonds[atom_index] ] for atom_index in selection.atom_indices }

1114 # Exclude disulfide bonds from

1115 if exclude_disulfide:

1116 # First search for protein disulfide bonds

1117 excluded = set()

1118 # Iterate atom bonds

1119 for atom_index, bonds in atom_indexed_covalent_bonds.items():

1120 # Get the actual atom

1121 atom = self.atoms[atom_index]

1122 # If it is not sulfur then continue

1123 if atom.element != 'S': continue

1124 # Iterate bonded atoms

1125 for bonded_atom_index in bonds:

1126 # Bonds with atoms out of the selection are not considered

1127 if bonded_atom_index not in atom_indexed_covalent_bonds: continue

1128 # Get the bonded atom

1129 bonded_atom = self.atoms[bonded_atom_index]

1130 if bonded_atom.element != 'S': continue

1131 # We found a disulfide bond here

1132 # Make sure this is protein

1133 if atom.residue.classification != 'protein': continue

1134 if bonded_atom.residue.classification != 'protein': continue

1135 # We found a protein disulfide bond, we must exclude it

1136 bond = tuple(sorted([atom_index, bonded_atom_index]))

1137 excluded.add(bond)

1138 # Now remove the corresponding bonds

1139 for bond in excluded:

1140 first_atom_index, second_atom_index = bond

1141 atom_indexed_covalent_bonds[first_atom_index].remove(second_atom_index)

1142 atom_indexed_covalent_bonds[second_atom_index].remove(first_atom_index)

1143 # Group the connected atoms in "fragments"

1144 while len(atom_indexed_covalent_bonds) > 0:

1145 start_atom_index, bonds = next(iter(atom_indexed_covalent_bonds.items()))

1146 del atom_indexed_covalent_bonds[start_atom_index]

1147 fragment_atom_indices = [ start_atom_index ]

1148 while len(bonds) > 0:

1149 # Get the next bond atom and remove it from the bonds list

1150 next_atom_index = bonds[0]

1151 bonds.remove(next_atom_index)

1152 next_bonds = atom_indexed_covalent_bonds.get(next_atom_index, None)

1153 # If this atom is out of the selection then skip it

1154 if next_bonds == None:

1155 continue

1156 next_new_bonds = [ bond for bond in next_bonds if bond not in fragment_atom_indices + bonds ]

1157 bonds += next_new_bonds

1158 fragment_atom_indices.append(next_atom_index)

1159 del atom_indexed_covalent_bonds[next_atom_index]

1160 yield Selection(fragment_atom_indices)

1161

1162 def find_whole_fragments (self, selection : 'Selection') -> Generator['Selection', None, None]:

1163 """Given a selection of atoms, find all whole structure fragments on them."""

1164 for fragment in self.fragments:

1165 if selection & fragment:

1166 yield fragment

1167

1168 def name_selection (self, selection : 'Selection') -> str:

1169 """Name an atom selection depending on the chains it contains. This is used for debug purpouses."""

1170 atoms = [ self.atoms[index] for index in selection.atom_indices ]

1171 # Count atoms per chain

1172 atom_count_per_chain = { chain: 0 for chain in self.chains }

1173 for atom in atoms:

1174 atom_count_per_chain[atom.chain] += 1

1175 # Set labels accoridng to the coverage of every chain

1176 chain_labels = []

1177 for chain, atom_count in atom_count_per_chain.items():

1178 if atom_count == 0: continue

1179 coverage = atom_count / chain.atom_count

1180 label = f'chain {chain.name}'

1181 if coverage < 1:

1182 percent = round(coverage * 1000) / 10

1183 label += f' ({percent}%)'

1184 chain_labels.append(label)

1185 return ', '.join(chain_labels)

1186

1187 def set_new_atom (self, atom : 'Atom'):

1188 """Set a new atom in the structure."""

1189 atom._structure = self

1190 new_atom_index = self.atom_count

1191 self.atoms.append(atom)

1192 atom._index = new_atom_index

1193

1194 def set_new_residue (self, residue : 'Residue'):

1195 """

1196 Set a new residue in the structure.

1197 WARNING: Atoms must be set already before setting residues.

1198 """

1199 residue._structure = self

1200 new_residue_index = len(self.residues)

1201 self.residues.append(residue)

1202 residue._index = new_residue_index

1203 # In case the residue has atom indices, set relational indices on each atom

1204 for atom_index in residue.atom_indices:

1205 atom = self.atoms[atom_index]

1206 atom._residue_index = new_residue_index

1207

1208 def purge_residue (self, residue : 'Residue'):

1209 """

1210 Purge residue from the structure and its chain.

1211 This can be done only when the residue has no atoms left in the structure.

1212 Renumerate all residue indices which have been offsetted as a result of the purge.

1213 """

1214 # Check the residue can be purged

1215 if residue not in self.residues:

1216 raise ValueError(f'{residue} is not in the structure already')

1217 if len(residue.atom_indices) > 0:

1218 raise ValueError(f'{residue} is still having atoms and thus it cannot be purged')

1219 # Remove the purged residue from its chain

1220 if residue.chain: residue.chain.remove_residue(residue)

1221 # Get the current index of the residue to be purged

1222 purged_index = residue.index

1223 # Residues and their atoms below this index are not to be modified

1224 # Residues and their atoms over this index must be renumerated

1225 for affected_residue in self.residues[purged_index+1:]:

1226 # Chaging the index automatically changes all residues atoms '_residue_index' values

1227 # Chaging the index automatically changes its corresponding index in residue chain '_residue_indices'

1228 affected_residue.index -= 1

1229 # Finally, remove the current residue from the list of residues in the structure

1230 del self.residues[purged_index]

1231

1232 def set_new_chain (self, chain : 'Chain'):

1233 """Set a new chain in the structure.

1234 WARNING: Residues and atoms must be set already before setting chains."""

1235 chain._structure = self

1236 new_chain_index = len(self.chains)

1237 self.chains.append(chain)

1238 chain._index = new_chain_index

1239 # In case the chain has residue indices, set relational indices on each residue

1240 for residue_index in chain.residue_indices:

1241 residue = self.residues[residue_index]

1242 residue._chain_index = new_chain_index

1243

1244 def purge_chain (self, chain : 'Chain'):

1245 """Purge chain from the structure.

1246 This can be done only when the chain has no residues left in the structure.

1247 Renumerate all chain indices which have been offsetted as a result of the purge."""

1248 # Check the chain can be purged

1249 if chain not in self.chains:

1250 raise ValueError(f'Chain {chain.name} is not in the structure already')

1251 if len(chain.residue_indices) > 0:

1252 raise ValueError(f'Chain {chain.name} is still having residues and thus it cannot be purged')

1253 # Get the current index of the chain to be purged

1254 purged_index = chain.index

1255 # Chains and their residues below this index are not to be modified

1256 # Chains and their residues over this index must be renumerated

1257 for affected_chain in self.chains[purged_index+1:]:

1258 # Chaging the index automatically changes all chain residues '_chain_index' values

1259 affected_chain.index -= 1

1260 # Finally, remove the current chain from the list of chains in the structure

1261 del self.chains[purged_index]

1262

1263 @classmethod

1264 def from_pdb (cls, pdb_content : str, model : Optional[int] = None, flexible_numeration : bool = True):

1265 """Set the structure from a pdb file.

1266 You may filter the PDB content for a specific model.

1267 Some weird numeration systems are not supported and, when encountered, they are ignored.

1268 In these cases we set our own numeration system.

1269 Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead."""

1270 # Filter the PDB content in case a model was passed

1271 filtered_pdb_content = filter_model(pdb_content, model) if model else pdb_content

1272 # Split the PDB content in lines

1273 pdb_lines = filtered_pdb_content.split('\n')

1274 # Before we start, we must guess the numeration system

1275 # To do so mine all residue numbers

1276 all_residue_number_characters = set()

1277 for line in pdb_lines:

1278 # Parse atoms only

1279 start = line[0:6]

1280 is_atom = start == 'ATOM ' or start == 'HETATM'

1281 if not is_atom: continue

1282 # Mine all residue numbers

1283 residue_number = line[22:26]

1284 for character in residue_number:

1285 all_residue_number_characters.add(character)

1286 # Remove white spaces

1287 all_residue_number_characters.discard(' ')

1288 # If we find a non-numerical and non-alphabetical character then we assume it has a weird numeration system

1289 # Since we can not support every scenario, in this cases we set the numeration totally ignoring the one in the PDB

1290 weird_character = next((character for character in all_residue_number_characters if not (character.isalnum() or character == '-')), None)

1291 if weird_character:

1292 if flexible_numeration == False: raise InputError(f'Not supported numeration system including "{weird_character}" characters')

1293 warn(f'Weird residue numeration including "{weird_character}" characters found in the PDB file. Ignoring it.')

1294 residue_numeration_base = None

1295 # Search among all resiude numbers any letters (non-numerical characters)

1296 elif next((letter for letter in alphanumerical_letters if letter in all_residue_number_characters), None):

1297 residue_numeration_base = 36

1298 elif next((letter for letter in hexadecimal_letters if letter in all_residue_number_characters), None):

1299 residue_numeration_base = 16

1300 else:

1301 residue_numeration_base = 10

1302 # Read the pdb content line by line and set the parsed atoms, residues and chains

1303 parsed_atoms = []

1304 parsed_residues = []

1305 parsed_chains = []

1306 # Save chains and residues also in dictionaries only to find them faster

1307 name_chains = {}

1308 label_residues = {}

1309 # Keep track of the last issued atom and residue indices

1310 atom_index = -1

1311 residue_index = -1

1312 for line in pdb_lines:

1313 # Parse atoms only

1314 start = line[0:6]

1315 is_atom = start == 'ATOM ' or start == 'HETATM'

1316 if not is_atom:

1317 continue

1318 # Mine all atom data

1319 atom_name = line[11:16].strip()

1320 residue_name = line[17:21].strip()

1321 chain_name = line[21:22]

1322 residue_number = line[22:26]

1323 icode = line[26:27]

1324 if icode == ' ':

1325 icode = ''

1326 x_coord = float(line[30:38])

1327 y_coord = float(line[38:46])

1328 z_coord = float(line[46:54])

1329 element = line[76:78].strip()

1330 # Set the parsed atom

1331 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord))

1332 # Add the parsed atom to the list and update the current atom index

1333 parsed_atoms.append(parsed_atom)

1334 # Get the parsed chain

1335 # DANI: If we always check for an already existing chain there will never be repeated chains

1336 # DANI: However we may create chains with unconnected atoms silently (they were separated in the PDB)

1337 # DANI: For this reason we must only consider the last processed chain

1338 parsed_chain = name_chains.get(chain_name, None)

1339 if parsed_chain and parsed_chain != parsed_chains[-1]:

1340 parsed_chain = None

1341 # If the parsed chain was not yet instantiated then do it now

1342 if not parsed_chain:

1343 parsed_chain = Chain(name=chain_name)

1344 parsed_chains.append(parsed_chain)

1345 name_chains[chain_name] = parsed_chain

1346 # Get the parsed residue

1347 residue_label = (chain_name, residue_number, icode)

1348 # DANI: If we always check for an already existing residue there will never be repeated residues

1349 # DANI: However we may create residues with unconnected atoms silently (they were separated in the PDB)

1350 # DANI: For this reason we must only consider the last processed residue

1351 parsed_residue = label_residues.get(residue_label, None)

1352 if parsed_residue and parsed_residue != parsed_residues[-1]:

1353 parsed_residue = None

1354 # If the parsed residue was not yet instantiated then do it now

1355 if not parsed_residue:

1356 # Parse the residue number if it is to be parsed

1357 if residue_numeration_base:

1358 parsed_residue_number = int(residue_number, residue_numeration_base)

1359 # If we decided to ignore the numeration system then we just issue a new residue number

1360 # Use the last residue number from the current chain as reference

1361 else:

1362 chain_last_residue_index = parsed_chain.residue_indices[-1] if len(parsed_chain.residue_indices) > 0 else None

1363 chain_last_residue_number = parsed_residues[chain_last_residue_index].number if chain_last_residue_index != None else 0

1364 parsed_residue_number = chain_last_residue_number + 1

1365 # Instantiate the new parsed residue

1366 parsed_residue = Residue(name=residue_name, number=parsed_residue_number, icode=icode)

1367 parsed_residues.append(parsed_residue)

1368 label_residues[residue_label] = parsed_residue

1369 # Add current residue to the parsed chain

1370 residue_index += 1

1371 parsed_chain.residue_indices.append(residue_index)

1372 # Add current atom to the parsed residue

1373 atom_index += 1

1374 parsed_residue.atom_indices.append(atom_index)

1375 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains, residue_numeration_base=residue_numeration_base)

1376

1377 @classmethod

1378 def from_pdb_file (cls, pdb_filepath : str, model : Optional[int] = None, flexible_numeration : bool = True):

1379 """Set the structure from a pdb file.

1380 You may filter the input PDB file for a specific model.

1381 Some weird numeration systems are not supported and, when encountered, they are ignored.

1382 In these cases we set our own numeration system.

1383 Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead."""

1384 pdb_file = File(pdb_filepath)

1385 if not pdb_file.exists:

1386 raise InputError(f'File "{pdb_filepath}" not found')

1387 if not pdb_file.format == 'pdb':

1388 raise InputError(f'"{pdb_filepath}" is not a path for a pdb file')

1389 # Read the pdb file

1390 pdb_content = None

1391 with open(pdb_filepath, 'r') as file:

1392 pdb_content = file.read()

1393 return cls.from_pdb(pdb_content, model, flexible_numeration)

1394

1395 # https://biopandas.github.io/biopandas/tutorials/Working_with_mmCIF_Structures_in_DataFrames/

1396 @classmethod

1397 def from_mmcif (cls, mmcif_content : str, model : Optional[int] = None, author_notation : bool = False):

1398 """Set the structure from mmcif.

1399 You may filter the content for a specific model.

1400 You may ask for the author notation instead of the standarized notation for legacy reasons.

1401 This may have an effect in atom names, residue names, residue numbers and chain names.

1402 Read the pdb content line by line and set the parsed atoms, residues and chains.

1403 """

1404 parsed_atoms = []

1405 parsed_residues = []

1406 parsed_chains = []

1407 # Save chains and residues also in dictionaries only to find them faster

1408 name_chains = {}

1409 label_residues = {}

1410 # Keep track of the last issued atom and residue indices

1411 atom_index = -1

1412 residue_index = -1

1413 # Iterate the content line by line

1414 lines = iter(mmcif_content.split('\n'))

1415 # Now mine atoms data

1416 atom_headers = { 'ATOM', 'HETATM', 'ANISOU' }

1417 for line in lines:

1418 # Values are separated by spaces

1419 values = line.split()

1420 # If this is not atom line then we are done

1421 if len(values) == 0 or values[0] not in atom_headers: continue

1422 # Get the atom model number if a model number was passed

1423 # Note that then model is a number greater than 0

1424 if model:

1425 model_number = int(values[20])

1426 # If the model number odes not match then skip it

1427 if model != model_number: continue

1428 # Mine atom data

1429 # Next value is just the atom number, we do not need it

1430 # atom_number = values[1]

1431 # Mine the atom element

1432 element = values[2]

1433 # Mine the atom name

1434 atom_name = values[3].replace('"','')

1435 # Next value is a place holder to indicate alternate conformation, according to the docs

1436 # I don't know what this is but we do not need it

1437 # idk = values[4]

1438 # Mine the residue name

1439 residue_name = values[5]

1440 # Mine the chain name

1441 chain_name = values[6]

1442 # Next value is the chain number, we do not need it

1443 # chain_number = values[7]

1444 # Residue number is '.' for chains with only one residue

1445 residue_number = 1 if values[8] == '.' else int(values[8])

1446 icode = '' if values[9] == '?' else values[9]

1447 x_coord = float(values[10])

1448 y_coord = float(values[11])

1449 z_coord = float(values[12])

1450 # Next value is the occupancy, we do not need it

1451 # occupancy = float(values[13])

1452 # Next value is the isotropic displacement, we do not need it

1453 # isotropic = float(values[14])

1454 # Next value is the charge, we do not need it

1455 # charge = None if values[15] == '?' else float(values[15])

1456 # The rest of values are alternative author values

1457 if author_notation:

1458 residue_number = int(values[16])

1459 residue_name = values[17]

1460 chain_name = values[18]

1461 atom_name = values[19].replace('"','')

1462 # Set the parsed atom

1463 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord))

1464 # Add the parsed atom to the list and update the current atom index

1465 parsed_atoms.append(parsed_atom)

1466 # Get the parsed chain

1467 parsed_chain = name_chains.get(chain_name, None)

1468 # If the parsed chain was not yet instantiated then do it now

1469 if not parsed_chain:

1470 parsed_chain = Chain(name=chain_name)

1471 parsed_chains.append(parsed_chain)

1472 name_chains[chain_name] = parsed_chain

1473 # Get the parsed residue

1474 residue_label = (chain_name, residue_number, icode)

1475 parsed_residue = label_residues.get(residue_label, None)

1476 # If the parsed residue was not yet instantiated then do it now

1477 if not parsed_residue:

1478 # Instantiate the new parsed residue

1479 parsed_residue = Residue(name=residue_name, number=residue_number, icode=icode)

1480 parsed_residues.append(parsed_residue)

1481 label_residues[residue_label] = parsed_residue

1482 # Add current residue to the parsed chain

1483 residue_index += 1

1484 parsed_chain.residue_indices.append(residue_index)

1485 # Add current atom to the parsed residue

1486 atom_index += 1

1487 parsed_residue.atom_indices.append(atom_index)

1488 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)

1489

1490 @classmethod

1491 def from_mmcif_file (cls, mmcif_filepath : str, model : Optional[int] = None, author_notation : bool = False):

1492 """Set the structure from a mmcif file."""

1493 mmcif_file = File(mmcif_filepath)

1494 if not mmcif_file.exists:

1495 raise InputError(f'File "{mmcif_filepath}" not found')

1496 if not mmcif_file.format == 'cif':

1497 raise InputError(f'"{mmcif_filepath}" is not a path for a mmcif file')

1498 # Read the mmcif file

1499 mmcif_content = None

1500 with open(mmcif_filepath, 'r') as file:

1501 mmcif_content = file.read()

1502 return cls.from_mmcif(mmcif_content, model, author_notation)

1503

1504 @classmethod

1505 def from_mdanalysis (cls, mdanalysis_universe):

1506 """Set the structure from an MD analysis object."""

1507 # Set the final list of atoms to be included in the structure

1508 parsed_atoms = []

1509 parsed_residues = []

1510 parsed_chains = []

1511 # Setup atoms

1512 for atom in mdanalysis_universe.atoms:

1513 name = atom.name

1514 # WARNING: MDAnalysis cannot handle reading an element of an atom with no element

1515 element = atom.element if hasattr(atom, 'element') else '?'

1516 #coords = atom.position # DANI: Esto da error si no hay coordenadas

1517 parsed_atom = Atom(name=name, element=element)

1518 parsed_atoms.append(parsed_atom)

1519 # Setup residues

1520 for residue in mdanalysis_universe.residues:

1521 name = residue.resname

1522 number = residue.resnum

1523 icode = residue.icode if hasattr(residue, 'icode') else None # DANI: No se ha provado

1524 parsed_residue = Residue(name=name, number=number, icode=icode)

1525 atom_indices = [ atom.index for atom in residue.atoms ]

1526 parsed_residue.atom_indices = atom_indices

1527 parsed_residues.append(parsed_residue)

1528 # Setup chains

1529 for segment in mdanalysis_universe.segments:

1530 name = segment.segid

1531 parsed_chain = Chain(name=name)

1532 residue_indices = [ residue.resindex for residue in segment.residues ]

1533 parsed_chain.residue_indices = residue_indices

1534 parsed_chains.append(parsed_chain)

1535 # Setup the structure

1536 structure = cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)

1537 # Add bonds and charges which are also available in a topology

1538 atom_count = len(mdanalysis_universe.atoms)

1539 atom_bonds = [ [] for i in range(atom_count) ]

1540 for bond in mdanalysis_universe.bonds:

1541 a,b = bond.indices

1542 # Make sure atom indices are regular integers so they are JSON serializables

1543 atom_bonds[a].append(int(b))

1544 atom_bonds[b].append(int(a))

1545 structure.bonds = atom_bonds

1546 structure.charges = list(mdanalysis_universe._topology.charges.values)

1547 return structure

1548

1549 @classmethod

1550 def from_prmtop_file (cls, prmtop_filepath : str):

1551 """Set the structure from a prmtop file."""

1552 # Make sure the input file exists and has the right format

1553 prmtop_file = File(prmtop_filepath)

1554 if not prmtop_file.exists:

1555 raise InputError(f'File "{prmtop_filepath}" not found')

1556 if not prmtop_file.format == 'prmtop':

1557 raise InputError(f'"{prmtop_filepath}" is not a name for a prmtop file')

1558 # In we do not have mdanalysis in our environment then we cannot proceed

1559 if not is_imported('MDAnalysis'):

1560 raise InputError('Missing dependency error: MDAnalysis')

1561 # Parse the topology using MDanalysis

1562 parser = MDAnalysis.topology.TOPParser.TOPParser(prmtop_filepath)

1563 mdanalysis_topology = parser.parse()

1564 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology)

1565 return cls.from_mdanalysis(mdanalysis_universe)

1566

1567 @classmethod

1568 def from_tpr_file (cls, tpr_filepath : str):

1569 """Set the structure from a tpr file."""

1570 # Make sure the input file exists and has the right format

1571 tpr_file = File(tpr_filepath)

1572 if not tpr_file.exists:

1573 raise InputError(f'File "{tpr_filepath}" not found')

1574 if not tpr_file.format == 'tpr':

1575 raise InputError(f'"{tpr_filepath}" is not a name for a tpr file')

1576 # In we do not have mdanalysis in our environment then we cannot proceed

1577 if not is_imported('MDAnalysis'):

1578 raise InputError('Missing dependency error: MDAnalysis')

1579 # Parse the topology using MDanalysis

1580 parser = MDAnalysis.topology.TPRParser.TPRParser(tpr_filepath)

1581 mdanalysis_topology = parser.parse()

1582 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology)

1583 return cls.from_mdanalysis(mdanalysis_universe)

1584

1585 @classmethod

1586 def from_file (cls, mysterious_filepath : str):

1587 """Set the structure from a file if the file format is supported."""

1588 mysterious_file = File(mysterious_filepath)

1589 if mysterious_file.format == 'pdb':

1590 return cls.from_pdb_file(mysterious_file.path)

1591 if mysterious_file.format == 'cif':

1592 return cls.from_mmcif_file(mysterious_file.path)

1593 if mysterious_file.format == 'prmtop':

1594 return cls.from_prmtop_file(mysterious_file.path)

1595 if mysterious_file.format == 'tpr':

1596 return cls.from_tpr_file(mysterious_file.path)

1597 raise InputError(f'Not supported format ({mysterious_file.format}) to setup a Structure')

1598

1599 def get_atom_count (self) -> int:

1600 """Get the number of atoms in the structure."""

1601 return len(self.atoms)

1602 atom_count = property(get_atom_count, None, None, "The number of atoms in the structure (read only)")

1603

1604 def get_residue_count (self) -> int:

1605 """Get the number of residues in the structure (read only)."""

1606 return len(self.residues)

1607 residue_count = property(get_residue_count, None, None, "Number of residues in the structure (read only)")

1608

1609 def get_chain_count (self) -> int:

1610 """Get the number of chains in the structure."""

1611 return len(self.chains)

1612 chain_count = property(get_chain_count, None, None, "Number of chains in the structure (read only)")

1613

1614 def fix_atom_elements (self, trust : bool = True) -> bool:

1615 """Fix atom elements by guessing them when missing.

1616 Set all elements with the first letter upper and the second (if any) lower.

1617 Also check if atom elements are coherent with atom names.

1618

1619 Args:

1620 trust (bool): If 'trust' is set as False then we impose elements according to what we can guess from the atom name.

1621

1622 Returns:

1623 bool:

1624 Return True if any element was modified or False if not.

1625 """

1626 modified = False

1627 added = False

1628 # Save the wrong guesses for a final report

1629 # This way we do not crowd the terminal with logs when a lot of atoms are affected

1630 reports = {}

1631 for atom in self.atoms:

1632 # Make sure elements have the first letter cap and the second letter not cap

1633 if atom.element:

1634 new_element = first_cap_only(atom.element)

1635 if atom.element != new_element:

1636 atom.element = new_element

1637 modified = True

1638 # Check the element to match what we would guess from the atom name

1639 # In case it does not just warn the user

1640 guess = atom.guess_element()

1641 if atom.element != guess:

1642 report = (atom.name, atom.element, guess)

1643 report_count = reports.get(report, 0)

1644 reports[report] = report_count + 1

1645 if not trust:

1646 atom.element = guess

1647 modified = True

1648 # If elements are missing then guess them from atom names

1649 else:

1650 atom.element = atom.guess_element()

1651 added = True

1652 # Warn the user about anormalies

1653 for report, count in reports.items():

1654 atom_name, atom_element, guess = report

1655 warn(f"Suspicious element for atom {atom_name}: {atom_element} -> shoudn't it be {guess}? ({count} occurrences)")

1656 # Warn the user that some elements were modified

1657 if modified: warn('Atom elements have been modified')

1658 if added: warn('Atom elements were missing and have been added')

1659 # Set atom elements as fixed in order to avoid repeating this process

1660 self._fixed_atom_elements = True

1661 return modified or added

1662

1663 def set_new_coordinates (self, new_coordinates : list[Coords]):

1664 """Set new coordinates."""

1665 # Make sure the list of coordinates is as long as the number of atoms

1666 if len(new_coordinates) != self.atom_count:

1667 raise ValueError(f'The number of coordinates ({len(new_coordinates)}) does not match the number of atoms ({self.atom_count})')

1668 # Overwrite current coordinates with the new coordinates

1669 for i, atom in enumerate(self.atoms):

1670 atom.coords = tuple(new_coordinates[i])

1671

1672 def get_ion_atom_indices (self) -> set:

1673 """Get all supported ion atom indices together in a set."""

1674 # If we already did this then return the stored value

1675 if self._ion_atom_indices != None:

1676 return self._ion_atom_indices

1677 # Find ion atom indices

1678 indices = set()

1679 for atom in self.atoms:

1680 if atom.element in SUPPORTED_ION_ELEMENTS:

1681 indices.add(atom.index)

1682 self._ion_atom_indices = indices

1683 return self._ion_atom_indices

1684 ion_atom_indices = property(get_ion_atom_indices, None, None, "Atom indices for what we consider supported ions")

1685

1686 def get_dummy_atom_indices (self) -> set:

1687 """Get all dummy atom indices together in a set."""

1688 # If we already did this then return the stored value

1689 if self._dummy_atom_indices != None:

1690 return self._dummy_atom_indices

1691 # Find ion atom indices

1692 indices = set()

1693 for atom in self.atoms:

1694 if atom.element == DUMMY_ATOM_ELEMENT:

1695 indices.add(atom.index)

1696 self._dummy_atom_indices = indices

1697 return self._dummy_atom_indices

1698 dummy_atom_indices = property(get_dummy_atom_indices, None, None, "Atom indices for what we consider dummy atoms")

1699

1700 def generate_pdb (self):

1701 """Generate a pdb file content with the current structure."""

1702 content = 'REMARK workflow generated pdb file\n'

1703 for a, atom in enumerate(self.atoms):

1704 residue = atom.residue

1705 index = str((a+1) % 100000).rjust(5)

1706 name = ' ' + atom.name.ljust(3) if len(atom.name) < 4 else atom.name

1707 residue_name = residue.name.ljust(4) if residue else 'XXX'.ljust(4)

1708 chain = atom.chain

1709 chain_name = chain.name if chain.name and len(chain.name) == 1 else 'X'

1710 residue_number = str(residue.number).rjust(4) if residue else '0'.rjust(4)

1711 # If residue number is longer than 4 characters then we must parse to hexadecimal

1712 if len(residue_number) > 4:

1713 residue_number = hex(residue.number)[2:].rjust(4)

1714 icode = residue.icode if residue.icode and len(residue.icode) else ' '

1715 # Make sure we have atom coordinates

1716 if atom.coords == None:

1717 raise InputError('Trying to write a PDB file from a structure with atoms without coordinates')

1718 x_coord, y_coord, z_coord = [ "{:.3f}".format(coord).rjust(8) for coord in atom.coords ]

1719 occupancy = '1.00' # Just a placeholder

1720 temp_factor = '0.00' # Just a placeholder

1721 element = atom.element.rjust(2)

1722 atom_line = ('ATOM ' + index + ' ' + name + ' ' + residue_name

1723 + chain_name + residue_number + icode + ' ' + x_coord + y_coord + z_coord

1724 + ' ' + occupancy + ' ' + temp_factor + ' ' + element).ljust(80) + '\n'

1725 content += atom_line

1726 return content

1727

1728 def generate_pdb_file (self, pdb_filepath : str):

1729 """Generate a pdb file with current structure."""

1730 pdb_content = self.generate_pdb()

1731 with open(pdb_filepath, "w") as file:

1732 file.write(pdb_content)

1733

1734 def get_pytraj_topology (self):

1735 """Get the structure equivalent pytraj topology."""

1736 # In we do not have pytraj in our environment then we cannot proceed

1737 if not is_imported('pytraj'):

1738 raise InputError('Missing dependency error: pytraj')

1739 # Generate a pdb file from the current structure to feed pytraj

1740 pdb_filepath = '.structure.pdb'

1741 self.generate_pdb_file(pdb_filepath)

1742 pytraj_topology = pytraj.load_topology(filename = pdb_filepath)

1743 os.remove(pdb_filepath)

1744 return pytraj_topology

1745

1746 SUPPORTED_SELECTION_SYNTAXES = { 'vmd', 'pytraj', 'gmx' }

1747 def select (self, selection_string : str, syntax : str = 'vmd') -> Optional['Selection']:

1748 """Select atoms from the structure thus generating an atom indices list.

1749 Different tools may be used to make the selection:

1750 - vmd (default)

1751 - pytraj"""

1752 if syntax == 'vmd':

1753 # Generate a pdb for vmd to read it

1754 auxiliar_pdb_filepath = '.structure.pdb'

1755 self.generate_pdb_file(auxiliar_pdb_filepath)

1756 # Use vmd to find atom indices

1757 atom_indices = get_vmd_selection_atom_indices(auxiliar_pdb_filepath, selection_string)

1758 os.remove(auxiliar_pdb_filepath)

1759 if len(atom_indices) == 0:

1760 return Selection()

1761 return Selection(atom_indices)

1762 if syntax == 'pytraj':

1763 # In we do not have pytraj in our environment then we cannot proceed

1764 if not is_imported('pytraj'):

1765 raise InputError('Missing dependency error: pytraj')

1766 pytraj_topology = self.get_pytraj_topology()

1767 pytraj_selection = pytraj_topology[selection_string]

1768 atom_indices = [ atom.index for atom in pytraj_selection.atoms ]

1769 if len(atom_indices) == 0:

1770 return Selection()

1771 return Selection(atom_indices)

1772 if syntax == 'gmx':

1773 # Generate a pdb strucutre to feed gmx

1774 auxiliar_pdb_filepath = '.structure.pdb'

1775 self.generate_pdb_file(auxiliar_pdb_filepath)

1776 auxiliar_pdb_file = File(auxiliar_pdb_filepath)

1777 # Create the index file with the current atom selection

1778 index_filepath = f'.structure.ndx'

1779 index_file = File(index_filepath)

1780 selection_name, selection_exists = make_index(auxiliar_pdb_file, index_file, selection_string)

1781 if not selection_exists:

1782 available_selections = ', '.join(list(index_groups.keys()))

1783 raise InputError(f'Something was wrong with the selection. Available gromacs selections: {available_selections}')

1784 # Read the index file to capture the selection of atoms

1785 index_groups = read_ndx(index_file)

1786 indices = index_groups.get(selection_name, None)

1787 if indices == None: raise RuntimeError('Atom group must exist at this point')

1788 auxiliar_pdb_file.remove()

1789 index_file.remove()

1790 # Return the parsed selection

1791 return Selection(indices)

1792

1793 options = ', '.join(self.SUPPORTED_SELECTION_SYNTAXES)

1794 raise InputError(f'Syntax "{syntax}" is not supported. Choose one of the following: {options}')

1795

1796 def select_atom_indices (self, atom_indices : list[int]) -> 'Selection':

1797 """Set a function to make selections using atom indices."""

1798 # Check atom indices to be in the structure

1799 atom_count = self.atom_count

1800 for atom_index in atom_indices:

1801 if atom_index >= atom_count:

1802 raise InputError(f'Atom index {atom_index} is out of range ({atom_count})')

1803 return Selection(atom_indices)

1804

1805 def select_residue_indices (self, residue_indices : list[int]) -> 'Selection':

1806 """Set a function to make selections using residue indices."""

1807 # WARNING: The following line gets stucked sometimes, idk why

1808 #atom_indices = sum([ self.residues[index].atom_indices for index in residue_indices ], [])

1809 atom_indices = []

1810 for i, index in enumerate(residue_indices):

1811 atom_indices += self.residues[index].atom_indices

1812 return Selection(atom_indices)

1813

1814 def select_all (self) -> 'Selection':

1815 """Get a selection with all atoms."""

1816 return Selection(list(range(self.atom_count)))

1817

1818 def select_by_classification (self, classification : str) -> 'Selection':

1819 """Select atoms according to the classification of its residue."""

1820 atom_indices = []

1821 for residue in self.residues:

1822 if residue.classification == classification:

1823 atom_indices += residue.atom_indices

1824 return Selection(atom_indices)

1825

1826 def select_water (self) -> 'Selection':

1827 """Select water atoms.

1828 WARNING: This logic is a bit guessy and it may fail for non-standard residue named structures

1829 """

1830 return self.select_by_classification('solvent')

1831

1832 def select_ions (self) -> 'Selection':

1833 """Select ions."""

1834 return self.select_by_classification('ion')

1835

1836 def select_counter_ions (self, charge : Optional[str] = None) -> 'Selection':

1837 """Select counter ion atoms.

1838 WARNING: This logic is a bit guessy and it may fail for non-standard atom named structures

1839 """

1840 counter_ion_indices = []

1841 # Set the accepted names accoridng to the charge

1842 if charge == None:

1843 accepted_names = STANDARD_COUNTER_ION_ATOM_NAMES

1844 elif charge == '+':

1845 accepted_names = STANDARD_COUNTER_CATION_ATOM_NAMES

1846 elif charge == '-':

1847 accepted_names = STANDARD_COUNTER_ANION_ATOM_NAMES

1848 else:

1849 raise ValueError('Not supported charge')

1850 # Iterate atoms

1851 for atom in self.atoms:

1852 # If the residue has not one single atom then it is not an ion

1853 if len(atom.residue.atoms) != 1: continue

1854 # Get a simplified version of the atom name

1855 # Set all letters upper and remove non-letter characters (e.g. '+' and '-' symbols)

1856 simple_atom_name = ''.join(filter(str.isalpha, atom.name.upper()))

1857 if simple_atom_name in accepted_names:

1858 counter_ion_indices.append(atom.index)

1859 return Selection(counter_ion_indices)

1860

1861 def select_water_and_counter_ions (self) -> 'Selection':

1862 """Select both water and counter ions."""

1863 return self.select_water() + self.select_counter_ions()

1864

1865 def select_heavy_atoms (self) -> 'Selection':

1866 """Select heavy atoms."""

1867 atom_indices = []

1868 for atom in self.atoms:

1869 # If the atom is not an hydrogen then add it to the list

1870 if atom.element != 'H':

1871 atom_indices.append(atom.index)

1872 return Selection(atom_indices)

1873

1874 def select_protein (self) -> 'Selection':

1875 """Select protein atoms.

1876 WARNING: Note that there is a small difference between VMD protein and our protein.

1877 This function is improved to consider terminal residues as proteins.

1878 VMD considers protein any residue including N, C, CA and O while terminals may have OC1 and OC2 instead of O.

1879 """

1880 return self.select_by_classification('protein')

1881

1882 def select_nucleic (self) -> 'Selection':

1883 """Select nucleic atoms."""

1884 return self.select_by_classification('dna') + self.select_by_classification('rna')

1885

1886 def select_lipids (self) -> 'Selection':

1887 """Select lipids."""

1888 return self.select_by_classification('fatty') + self.select_by_classification('steroid')

1889

1890 def select_carbohydrates (self) -> 'Selection':

1891 """Select carbohydrates."""

1892 return self.select_by_classification('carbohydrate')

1893

1894 def select_pbc_guess (self) -> 'Selection':

1895 """Return a selection of the typical PBC atoms: solvent, counter ions and lipids.

1896 WARNING: This is just a guess."""

1897 return self.select_water() + self.select_counter_ions() + self.select_lipids()

1898

1899 def select_cg (self) -> 'Selection':

1900 """Select coarse grain atoms."""

1901 return Selection([ atom.index for atom in self.atoms if atom.element == CG_ATOM_ELEMENT ])

1902

1903 def select_cartoon (self, include_terminals : bool = False) -> 'Selection':

1904 """Select cartoon representable regions for VMD.

1905

1906 Rules are:

1907 1. Residues must be protein (i.e. must contain C, CA, N and O atoms) or nucleic (P, OP1, OP2, O3', C3', C4', C5', O5')

1908 2. There must be at least 3 covalently bonded residues

1909

1910 It does not matter their chain, numeration or even index order as long as they are bonded.

1911 * Note that we can represent cartoon while we display one residue alone, but it must be connected anyway.

1912 Also, we have the option to include terminals in the cartoon selection although they are not representable.

1913 This is helpful for the screenshot: terminals are better hidden than represented as ligands."""

1914 # Set fragments which are candidate to be cartoon representable

1915 fragments = []

1916 # Get protein fragments according to VMD

1917 protein_selection = self.select_protein() - self.select_cg() if include_terminals else self.select('protein', syntax='vmd')

1918 if protein_selection:

1919 fragments += list(self.find_fragments(protein_selection))

1920 # Get nucleic fragments according to VMD

1921 nucleic_selection = self.select_nucleic() - self.select_cg()

1922 if nucleic_selection:

1923 fragments += list(self.find_fragments(nucleic_selection))

1924 # Set the final selection including all valid fragments

1925 cartoon_selection = Selection()

1926 # Iterate over every fragment

1927 for fragment in fragments:

1928 # Make sure the fragment has at least 3 residues before adding it to the cartoon selection

1929 fragment_residue_indices = self.get_selection_residue_indices(fragment)

1930 if len(fragment_residue_indices) >= 3:

1931 cartoon_selection += fragment

1932 return cartoon_selection

1933

1934 def invert_selection (self, selection : 'Selection') -> 'Selection':

1935 """Invert a selection."""

1936 atom_indices = list(range(self.atom_count))

1937 for atom_index in selection.atom_indices:

1938 atom_indices[atom_index] = None

1939 return Selection([ atom_index for atom_index in atom_indices if atom_index != None ])

1940

1941 def get_selection_residue_indices (self, selection : 'Selection') -> list[int]:

1942 """Given a selection, get a list of residue indices for residues implicated.

1943 Note that if a single atom from the residue is in the selection then the residue index is returned."""

1944 return list(set([ self.atoms[atom_index].residue_index for atom_index in selection.atom_indices ]))

1945

1946 def get_selection_residues (self, selection : 'Selection') -> list['Residue']:

1947 """Given a selection, get a list of residues implicated.

1948 Note that if a single atom from the residue is in the selection then the residue is returned."""

1949 residue_indices = self.get_selection_residue_indices(selection)

1950 return [ self.residues[index] for index in residue_indices ]

1951

1952 def get_selection_chain_indices (self, selection : 'Selection') -> list[int]:

1953 """Given a selection, get a list of chain indices for chains implicated.

1954 Note that if a single atom from the chain is in the selection then the chain index is returned."""

1955 return list(set([ self.atoms[atom_index].chain_index for atom_index in selection.atom_indices ]))

1956

1957 def get_selection_chains (self, selection : 'Selection') -> list['Chain']:

1958 """Given a selection, get a list of chains implicated.

1959 Note that if a single atom from the chain is in the selection then the chain is returned."""

1960 chain_indices = self.get_selection_chain_indices(selection)

1961 return [ self.chains[index] for index in chain_indices ]

1962

1963 def get_selection_classification (self, selection : 'Selection') -> str:

1964 """Get type of the chain."""

1965 # Get selection residues

1966 selection_residue_indices = self.get_selection_residue_indices(selection)

1967

1968 # Inicializar contadores para cada tipo de residuo

1969 residue_counts = {}

1970

1971 # Count the residues of each type

1972 for residue_index in selection_residue_indices:

1973 residue = self.residues[residue_index]

1974 res_class = residue.classification

1975 if res_class in residue_counts:

1976 residue_counts[res_class] += 1

1977 else:

1978 # If the classification is not recognized, count it as "other"

1979 residue_counts[res_class] = 1

1980

1981 # Count the total number of residues in the selection

1982 total_residues = sum(residue_counts.values())

1983 if total_residues == 0: raise ValueError('Should have residues at this point')

1984

1985 # Calculate the proportion of each type of residue

1986 proportions = { k: v / total_residues for k, v in residue_counts.items() }

1987

1988 # If one type of residue dominates, return it

1989 primary_type = max(proportions, key=proportions.get)

1990 # We establish a threshold of 80% to consider a chain as a single type

1991 if proportions[primary_type] >= 0.8:

1992 return primary_type

1993

1994 # Special cases

1995 relevant_threshold = 0.3

1996 if proportions.get("dna", 0) > relevant_threshold and proportions.get("rna", 0) > relevant_threshold:

1997 return "nucleic"

1998 if proportions.get("carbohydrate", 0) > relevant_threshold and proportions.get("protein", 0) > relevant_threshold:

1999 return "glycoprotein"

2000 if proportions.get("fatty", 0) > relevant_threshold and proportions.get("steroid", 0) > relevant_threshold:

2001 return "lipid"

2002

2003 # Any other combinations of different main proportions

2004 main_proportions = { k: v for k, v in proportions.items() if v > relevant_threshold }

2005 # Nothing is above the threshold

2006 if len(main_proportions) == 0:

2007 return "mix"

2008 # There is only one thing above threshold

2009 elif len(main_proportions) == 1:

2010 return f"{primary_type}/mix"

2011 # There are two things above threshold

2012 elif len(main_proportions) == 2:

2013 other_type = next(key for key in main_proportions.keys() if key != primary_type)

2014 return f"{primary_type}/{other_type}"

2015 # There are three things above the threshold (more is not possible)

2016 else:

2017 return "mix"

2018

2019 def filter (self, selection : Union['Selection', str], selection_syntax : str = 'vmd') -> 'Structure':

2020 """Create a new structure from the current using a selection to filter atoms."""

2021 if not selection: raise RuntimeError('No selection was passed')

2022 # In case the selection is not an actual Selection, but a string, parse the string into a Selection

2023 if type(selection) == str:

2024 selection = self.select(selection, selection_syntax)

2025 new_atoms = []

2026 new_residues = []

2027 new_chains = []

2028 # Get the selected atoms

2029 # Generate dictionaries with new indexes as keys and previous indexes as values for atoms, residues and chains

2030 # This is done with this structure for the residues and chains further to find the new indices fast

2031 new_atom_indices = {}

2032 # Collect also original indices to related atom residues and chains

2033 original_atom_residue_indices = []

2034 for new_index, original_index in enumerate(selection.atom_indices):

2035 # Make a copy of the selected atom in order to not modify the original one

2036 original_atom = self.atoms[original_index]

2037 new_atom = Atom(

2038 name=original_atom.name,

2039 element=original_atom.element,

2040 coords=original_atom.coords,

2041 )

2042 new_atoms.append(new_atom)

2043 # Save old and new indices in order to reconfigure all indices later

2044 new_atom_indices[original_index] = new_index

2045 original_atom_residue_indices.append(original_atom.residue_index)

2046 # Find the selected residues

2047 selected_residue_indices = list(set(original_atom_residue_indices))

2048 # Repeat the original/new indices backup we did before

2049 new_residue_indices = {}

2050 original_residue_atom_indices = []

2051 original_residue_chain_indices = []

2052 for new_index, original_index in enumerate(selected_residue_indices):

2053 # Make a copy of the selected residue in order to not modify the original one

2054 original_residue = self.residues[original_index]

2055 new_residue = Residue(

2056 name=original_residue.name,

2057 number=original_residue.number,

2058 icode=original_residue.icode,

2059 )

2060 new_residues.append(new_residue)

2061 # Save old and new indices in order to reconfigure all indices later

2062 new_residue_indices[original_index] = new_index

2063 original_residue_atom_indices.append(original_residue.atom_indices)

2064 original_residue_chain_indices.append(original_residue.chain_index)

2065 # Find the selected chains

2066 selected_chain_indices = list(set(original_residue_chain_indices))

2067 # Repeat the original/new indices backup we did before

2068 new_chain_indices = {}

2069 original_chain_residue_indices = []

2070 for new_index, original_index in enumerate(selected_chain_indices):

2071 # Make a copy of the selected chain in order to not modify the original one

2072 original_chain = self.chains[original_index]

2073 new_chain = Chain(

2074 name=original_chain.name,

2075 )

2076 new_chains.append(new_chain)

2077 # Save old and new indices in order to reconfigure all indices later

2078 new_chain_indices[original_index] = new_index

2079 original_chain_residue_indices.append(original_chain.residue_indices)

2080 # Finally, reset indices in all instances

2081 for a, atom in enumerate(new_atoms):

2082 atom.residue_index = new_residue_indices[ original_atom_residue_indices[a] ]

2083 for r, residue in enumerate(new_residues):

2084 atom_indices = []

2085 for original_index in original_residue_atom_indices[r]:

2086 new_index = new_atom_indices.get(original_index, None)

2087 if new_index != None:

2088 atom_indices.append(new_index)

2089 residue.atom_indices = atom_indices

2090 residue.chain_index = new_chain_indices[ original_residue_chain_indices[r] ]

2091 for c, chain in enumerate(new_chains):

2092 residue_indices = []

2093 for original_index in original_chain_residue_indices[c]:

2094 new_index = new_residue_indices.get(original_index, None)

2095 if new_index != None:

2096 residue_indices.append(new_index)

2097 chain.residue_indices = residue_indices

2098 return Structure(atoms=new_atoms, residues=new_residues, chains=new_chains)

2099

2100 def chainer (self,

2101 selection : Optional['Selection'] = None,

2102 letter : Optional[str] = None,

2103 whole_fragments : bool = False):

2104 """Set chains on demand.

2105 If no selection is passed then the whole structure will be affected.

2106 If no chain is passed then a "chain by fragment" logic will be applied."""

2107 # If there is no selection we consider all atoms

2108 if selection == None:

2109 selection = self.select_all()

2110 # If the selection is empty then there is nothing to do here

2111 if len(selection) == 0: return

2112 # If a letter is specified then the logic is way simpler

2113 if letter and not whole_fragments:

2114 self.set_selection_chain_name(selection, letter)

2115 return

2116 # If a letter is not specified we run the "fragments" logic

2117 fragment_getter = self.find_whole_fragments if whole_fragments else self.find_fragments

2118 for fragment in fragment_getter(selection):

2119 chain_name = letter if letter else self.get_available_chain_name()

2120 self.set_selection_chain_name(fragment, chain_name)

2121

2122 def auto_chainer (self, verbose : bool = False):

2123 """Smart function to set chains automatically.

2124 Original chains will be overwritten."""

2125 if verbose: print('Running auto-chainer')

2126 # Set all chains to X

2127 self.chainer(letter='X')

2128 # Set solvent and ions as a unique chain

2129 ion_selection = self.select_ions()

2130 if verbose: print(f' Ion atoms: {len(ion_selection)}')

2131 solvent_selection = self.select_water()

2132 if verbose: print(f' Solvent atoms: {len(solvent_selection)}')

2133 ion_and_indices_selection = ion_selection + solvent_selection

2134 self.chainer(selection=ion_and_indices_selection, letter='S')

2135 # Set fatty acids and steroids as a unique chain

2136 # RUBEN: con whole_fragments algunos carbohidratos se sobreescriben como glucolípidos

2137 # DANI: Se podrían descartarían residuos que no pertenezcan a la membrana por proximidad

2138 membrane_selection = self.select_lipids()

2139 if verbose: print(f' Membrane atoms: {len(membrane_selection)}')

2140 self.chainer(selection=membrane_selection, letter='M', whole_fragments=True)

2141 # Set carbohydrates as a unique chain as well, just in case we have glycans

2142 # Note that in case glycan atoms are mixed with protein atoms glycan chains will be overwritten

2143 # However this is not a problem. It is indeed the best solution if we don't want ro resort atoms

2144 carbohydrate_selection = self.select_carbohydrates()

2145 if verbose: print(f' Carbohydrate atoms: {len(carbohydrate_selection)}')

2146 self.chainer(selection=carbohydrate_selection, letter='H')

2147 # Add a chain per fragment for both proteins and nucleic acids

2148 protein_selection = self.select_protein()

2149 if verbose: print(f' Protein atoms: {len(protein_selection)}')

2150 nucleic_selection = self.select_nucleic()

2151 if verbose: print(f' Nucleic acid atoms: {len(nucleic_selection)}')

2152 protein_and_nucleic_selection = protein_selection + nucleic_selection

2153 self.chainer(selection=protein_and_nucleic_selection)

2154 # At this point we should have covered most of the molecules in the structure

2155 # However, in case there are more molecules, we have already set them all as a single chain ('X')

2156 # Here we do not apply the chain per fragment logic since it may be dangerous

2157 # Note that we may have a lot of independent residues (and thus a los of small fragments)

2158 # This would make us run out of letters in the alphabet and thus there would be no more chains

2159 # As the last step, fix repeated chains

2160 # RUBEN: sacar esta parte ya que se hace luego en el structure_corrector?

2161 self.check_repeated_chains(fix_chains=True)

2162

2163 def raw_protein_chainer (self):

2164 """This is an alternative system to find protein chains (anything else is chained as 'X').

2165 This system does not depend on VMD.

2166 It totally overrides previous chains since it is expected to be used only when chains are missing."""

2167 current_chain = self.get_available_chain_name()

2168 previous_alpha_carbon = None

2169 for residue in self.residues:

2170 alpha_carbon = next((atom for atom in residue.atoms if atom.name == 'CA'), None)

2171 if not alpha_carbon:

2172 residue.set_chain('X')

2173 continue

2174 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms

2175 residues_are_connected = previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4

2176 if not residues_are_connected:

2177 current_chain = self.get_available_chain_name()

2178 residue.set_chain(current_chain)

2179 previous_alpha_carbon = alpha_carbon

2180

2181 def set_selection_chain_name (self, selection : 'Selection', letter : str):

2182 """

2183 Given an atom selection, set the chain for all these atoms.

2184 Note that the chain is changed in every whole residue, no

2185 matter if only one atom was selected.

2186 """

2187 # Find if the letter belongs to an already existing chain

2188 chain = self.get_chain_by_name(letter)

2189 if not chain:

2190 chain = Chain(name=letter)

2191 self.set_new_chain(chain)

2192 # Get the selection residue indices

2193 selection_residue_indices = self.get_selection_residue_indices(selection)

2194 # Set the chain index of every residue to the new chain

2195 # WARNING:

2196 # This may have to be done atom by atom but we forgot why so we change

2197 # it so see what was the problem. Doing it atom by atom is not efficient

2198 # and was causing a bug where residues with same number could change their name

2199 # to that of the first residue found with that number.

2200 for residue_index in selection_residue_indices:

2201 # WARNING: Chain index has to be read every iteration since it may change. Do not save it

2202 residue = self.residues[residue_index]

2203 residue.set_chain_index(chain.index)

2204

2205 def check_available_chains(self):

2206 """Check if there are more chains than available letters."""

2207 n_chains = len(self.chains)

2208 n_available_letters = len(AVAILABLE_LETTERS)

2209 if n_chains <= n_available_letters: return

2210 raise InputError(f'There are {n_chains} chains in the structure so far. If this is not expected then there may be a problem.\n' +

2211 f' If this is expected then unfortunatelly there is a limit of {n_available_letters} available chain letters.\n' +

2212 ' Please manually set the chains from scratch or merge together some chains to reduce the number.\n' +

2213 ' Customize a PDB file using the command "mwf chainer". Then use the customized PDB as input structure (-stru).')

2214

2215 def get_available_chain_name (self) -> str:

2216 """Get an available chain name.

2217 Find alphabetically the first letter which is not yet used as a chain name.

2218 If all letters in the alphabet are used already then raise an error."""

2219 self.check_available_chains()

2220 current_chain_names = [ chain.name for chain in self.chains ]

2221 next_available_chain_name = next((name for name in AVAILABLE_LETTERS if name not in current_chain_names), None)

2222 return next_available_chain_name

2223

2224 def get_next_available_chain_name (self, anterior : str) -> str:

2225 """

2226 Get the next available chain name.

2227

2228 Args:

2229 anterior (str): The last chain name used, which is expected to be a single letter

2230

2231 Raises:

2232 ValueError: If the anterior is not a letter or if there are more chains than available

2233 """

2234 self.check_available_chains()

2235 current_chain_names = set([ chain.name for chain in self.chains ])

2236 # If anterior is cap then try caps first, if anterior is lower then try lowers first

2237 if anterior.isupper():

2238 first_group, second_group = AVAILABLE_CAPS, AVAILABLE_LOWS

2239 elif anterior.islower():

2240 first_group, second_group = AVAILABLE_LOWS, AVAILABLE_CAPS

2241 else: raise ValueError(f'Is "{anterior}" even a letter?')

2242 # Reorder letters in the first group, so the anterior is the last letter

2243 anterior_position = first_group.index(anterior)

2244 next_position = anterior_position + 1

2245 reordered_group = first_group[next_position:] + first_group[0:next_position]

2246 next_letter = next((letter for letter in reordered_group if letter not in current_chain_names), None)

2247 if next_letter: return next_letter

2248 # If there is not available letters is the first group then return the first available in the second

2249 next_letter = next((letter for letter in second_group if letter not in current_chain_names), None)

2250 return next_letter

2251

2252 def get_chain_by_name (self, name : str) -> Optional['Chain']:

2253 """Get a chain by its name."""

2254 return next((c for c in self.chains if c.name == name), None)

2255

2256 def find_residue (self, chain_name : str, number : int, icode : str = '' ) -> Optional['Residue']:

2257 """Find a residue by its chain, number and insertion code."""

2258 # Get the corresponding chain

2259 target_chain = self.get_chain_by_name(chain_name)

2260 if not target_chain: return None

2261 # Find the residue in the target chain

2262 target_chain.find_residue(number, icode)

2263

2264 def display_summary (self):

2265 """Get a summary of the structure."""

2266 print(f'Atoms: {self.atom_count}')

2267 print(f'Residues: {len(self.residues)}')

2268 print(f'Chains: {len(self.chains)}')

2269 for chain in self.chains:

2270 print(f'Chain {chain.name} ({len(chain.residue_indices)} residues)')

2271 print(' -> ' + chain.get_sequence())

2272

2273 def check_repeated_chains (self, fix_chains : bool = False, display_summary : bool = False) -> bool:

2274 """

2275 There may be chains which are equal in the structure (i.e. same chain name).

2276 This means we have a duplicated/splitted chain.

2277 Repeated chains are usual and they are usually supported but with some problems.

2278 Also, repeated chains usually come with repeated residues, which means more problems (see explanation below).

2279

2280 In the context of this structure class we may have 2 different problems with a different solution each:

2281 1. There is more than one chain with the same letter (repeated chain) -> rename the duplicated chains

2282 2. There is a chain with atom indices which are not consecutive (splitted chain) -> create new chains

2283

2284 Rename repeated chains or create new chains if the fix_chains argument is True.

2285

2286 WARNING: These fixes are possible only if there are less chains than the number of letters in the alphabet.

2287 Although there is no limitation in this code for chain names, setting long chain names is not compatible with pdb format.

2288

2289 Check splitted chains (a chains with non consecutive residues) and try to fix them if requested.

2290 Check repeated chains (two chains with the same name) and return True if there were any repeats.

2291 """

2292 # Order chains according to their names

2293 # Save also those chains which have a previous duplicate

2294 name_chains = {}

2295 repeated_chains = []

2296 for chain in self.chains:

2297 chain_name = chain.name

2298 current_name_chains = name_chains.get(chain_name, None)

2299 if not current_name_chains:

2300 name_chains[chain_name] = [chain]

2301 else:

2302 name_chains[chain_name].append(chain)

2303 repeated_chains.append(chain)

2304 # Display the summary of repeated chains if requested

2305 if display_summary:

2306 if len(repeated_chains) > 0:

2307 warn('There are repeated chains:')

2308 for chain_name, chains in name_chains.items():

2309 chains_count = len(chains)

2310 if chains_count > 1:

2311 print(f'- Chain {chain_name} has {chains_count} repeats')

2312 # Rename repeated chains if requested

2313 if len(repeated_chains) > 0 and fix_chains:

2314 self.check_available_chains()

2315 current_letters = list(name_chains.keys())

2316 for repeated_chain in repeated_chains:

2317 last_chain_letter = repeated_chain.name

2318 while last_chain_letter in current_letters:

2319 last_chain_letter = get_next_letter(last_chain_letter)

2320 repeated_chain.name = last_chain_letter

2321 current_letters.append(last_chain_letter)

2322 # Check if there are splitted chains

2323 # RUBEN: sacar esta parte ya que esta self.check_splitted_chains

2324 for chain in self.chains:

2325 residue_indices = sorted(chain.residue_indices)

2326 # Check if residue indices are consecutive

2327 if residue_indices[-1] - residue_indices[0] + 1 == len(residue_indices):

2328 continue

2329 warn(f'Splitted chain {chain.name}')

2330 # If indices are not consecutive then we must find ranges of consecutive residues and create new chains for them

2331 previous_residue_index = residue_indices[0]

2332 consecutive_residues = [previous_residue_index]

2333 overall_consecutive_residues = []

2334 for residue_index in residue_indices[1:]:

2335 # If next residue is consecutive

2336 if residue_index == previous_residue_index + 1:

2337 consecutive_residues.append(residue_index)

2338 # If next residue is NOT consecutive

2339 else:

2340 overall_consecutive_residues.append(consecutive_residues)

2341 consecutive_residues = [residue_index]

2342 # Update the previous index

2343 previous_residue_index = residue_index

2344 # Add the last split

2345 overall_consecutive_residues.append(consecutive_residues)

2346 # Now create new chains and reasign residues

2347 # Skip the first set of consecutive residues since they will stay in the original chain

2348 for residues_indices in overall_consecutive_residues[1:]:

2349 chain_name = self.get_available_chain_name()

2350 residues_selection = self.select_residue_indices(residues_indices)

2351 self.set_selection_chain_name(residues_selection, chain_name)

2352

2353 # Fix repeated chains if requested

2354 return len(repeated_chains) > 0

2355

2356 def check_splitted_chains (self, fix_chains : bool = False, display_summary : bool = False) -> bool:

2357 """Check if non-consecutive atoms belong to the same chain.

2358 If so, separate pieces of non-consecuite atoms in different chains.

2359 Note that the new chains will be duplicated, so you will need to run check_repeated_chains after.

2360

2361 Args:

2362 fix_chains (bool): If True then the splitted chains will be fixed.

2363 display_summary (bool): If True then a summary of the splitted chains will be displayed.

2364

2365 Returns:

2366 bool:

2367 True if we encountered splitted chains and false otherwise.

2368 """

2369 splitted_fragments: list[tuple[str, list[int]]] = []

2370 # Keep track of already checked chains

2371 checked_chains = set()

2372 last_chain = None

2373 last_fragment_start = None

2374 for atom_index, atom in enumerate(self.atoms):

2375 # Get the chain name

2376 chain_name = atom.chain.name

2377 # Skip the atom if it belong to the previous chain

2378 if chain_name == last_chain: continue

2379 # If we were in a splitted fragment then end it here

2380 if last_fragment_start != None:

2381 new_fragment_indices = list(range(last_fragment_start, atom_index))

2382 new_fragment = (last_chain, new_fragment_indices)

2383 splitted_fragments.append(new_fragment)

2384 last_fragment_start = None

2385 # Check if the chain was already found

2386 if chain_name in checked_chains:

2387 # Start a new fragment

2388 last_fragment_start = atom_index

2389 # Update last chain

2390 last_chain = chain_name

2391 # Add the new chain to the set of already checked chains

2392 checked_chains.add(chain_name)

2393

2394 # Make a summary of the splitted chains if requested

2395 if display_summary and len(splitted_fragments) > 0:

2396 warn(f'Found {len(splitted_fragments)} splitted fragments')

2397 affected_chains = sorted(list(set([ fragment[0] for fragment in splitted_fragments ])))

2398 print(f' We are having splits in chains {", ".join(affected_chains)}')

2399

2400 # Fix chains if requested

2401 if fix_chains:

2402 for chain_name, fragment_atom_indices in splitted_fragments:

2403 # Create a new chain

2404 new_chain = Chain(name=chain_name)

2405 self.set_new_chain(new_chain)

2406 # Ge the selection residue indices for the fragment

2407 fragment_selection = self.select_atom_indices(fragment_atom_indices)

2408 fragment_residue_indices = self.get_selection_residue_indices(fragment_selection)

2409 # Move atoms in the fragment to the new chain

2410 for residue_index in fragment_residue_indices:

2411 residue = self.residues[residue_index]

2412 residue.set_chain_index(new_chain.index)

2413 return len(splitted_fragments) > 0

2414

2415 def sort_residues (self):

2416 """Coherently sort residues according to the indices of the atoms they hold."""

2417 # Set a function to sort atoms and residues by index

2418 def by_first_atom_index (residue):

2419 return min(residue.atom_indices)

2420 # Sort residues according to their first atom index

2421 sorted_residues = sorted(self.residues, key = by_first_atom_index)

2422 # Iterate sorted residues letting them know their new index

2423 for r, residue in enumerate(sorted_residues):

2424 residue.index = r

2425 # Finally update the structure's residues list

2426 self.residues = sorted_residues

2427

2428 def check_merged_residues (self, fix_residues : bool = False, display_summary : bool = False) -> bool:

2429 """

2430 There may be residues which contain unconnected (unbonded) atoms. They are not allowed.

2431 They may come from a wrong parsing and be indeed duplicated residues.

2432

2433 Search for merged residues.

2434 Create new residues for every group of connected atoms if the fix_residues argument is True.

2435 Note that the new residues will be repeated, so you will need to run check_repeated_residues after.

2436 Return True if there were any merged residues.

2437 """

2438 # Get the list of merged residues we encounter

2439 merged_residues = []

2440 # Iterate residues

2441 for residue in self.residues:

2442 residue_selection = residue.get_selection()

2443 residue_fragments = list(self.find_fragments(residue_selection))

2444 if len(residue_fragments) <= 1: continue

2445 # If current residue has more than 1 fragment then it is a merged residue

2446 merged_residues.append(residue)

2447 if not fix_residues: continue

2448 # If the residue is to be fixed then let the first fragment as the current residue

2449 # Then create a new residue for every other fragment

2450 for extra_fragment in residue_fragments[1:]:

2451 # Set a new residue identical to the current one

2452 new_residue = Residue(residue.name, residue.number, residue.icode)

2453 self.set_new_residue(new_residue)

2454 # Add it to the same chain

2455 residue.chain.add_residue(new_residue)

2456 # Add atoms to it

2457 for atom_index in extra_fragment.atom_indices:

2458 atom = self.atoms[atom_index]

2459 new_residue.add_atom(atom)

2460 # Now that we have new resiudes, sort all residues to keep them coherent

2461 self.sort_residues()

2462 # Count how many merged residues we encountered

2463 merged_residues_count = len(merged_residues)

2464 # Log some details if the summary is requested

2465 if display_summary and merged_residues_count > 0:

2466 print(f'Found {merged_residues_count} merged residues')

2467 print(f' e.g. {merged_residues[0]}')

2468 # Return if we found merged residues

2469 return merged_residues_count > 0

2470

2471 def check_repeated_residues (self, fix_residues : bool = False, display_summary : bool = False) -> bool:

2472 """

2473 There may be residues which are equal in the structure (i.e. same chain, number and icode).

2474 In case 2 residues in the structure are equal we must check distance between their atoms.

2475 If atoms are far it means they are different residues with the same notation (duplicated residues).

2476 If atoms are close it means they are indeed the same residue (splitted residue).

2477

2478 Splitted residues are found in some pdbs and they are supported by some tools.

2479 These tools consider all atoms with the same 'record' as the same residue.

2480 However, there are other tools which would consider the splitted residue as two different residues.

2481 This causes inconsistency along different tools besides a long list of problems.

2482 The only possible is fix is changing the order of atoms in the topology.

2483 Note that this is a breaking change for associated trajectories, which must change the order of coordinates.

2484 However here we provide tools to fix associates trajectories as well.

2485

2486 Duplicated residues are usual and they are usually supported but with some problems.

2487 For example, pytraj analysis outputs use to sort results by residues and each residue is tagged.

2488 If there are duplicated residues with the same tag it may be not possible to know which result belongs to each residue.

2489 Another example are NGL selections once in the web client.

2490 If you select residue ':A and 1' and there are multiple residues 1 in chain A all of them will be displayed.

2491

2492 Check residues to search for duplicated and splitted residues.

2493 Renumerate repeated residues if the fix_residues argument is True.

2494 Return True if there were any repeats.

2495 """

2496 # Track if residues have to be changed or not

2497 modified = False

2498 # Group all residues in the structure according to their chain, number and icode

2499 grouped_residues = {}

2500 # Check repeated residues which are one after the other

2501 # Note that these residues MUST have a different name

2502 # Otherwise they would have not been considered different residues

2503 # For these rare cases we use icodes to solve the problem

2504 non_icoded_residues = []

2505 last_residue = None

2506 for residue in self.residues:

2507 # Check residue to be equal than the previous residue

2508 if residue == last_residue:

2509 non_icoded_residues.append(residue)

2510 last_residue = residue

2511 continue

2512 last_residue = residue

2513 # Add residue to the groups of repeated residues

2514 current_residue_repeats = grouped_residues.get(residue, None)

2515 if not current_residue_repeats:

2516 grouped_residues[residue] = [ residue ]

2517 else:

2518 current_residue_repeats.append(residue)

2519 # In case we have non icoded residues

2520 if len(non_icoded_residues) > 0:

2521 if display_summary:

2522 print(f'There are non-icoded residues ({len(non_icoded_residues)})')

2523 # Set new icodes for non icoded residues

2524 if fix_residues:

2525 print(' Non icoded residues will recieve an icode')

2526 for residue in non_icoded_residues:

2527 repeated_residues_group = grouped_residues[residue]

2528 current_icodes = [ residue.icode for residue in repeated_residues_group if residue.icode ]

2529 next_icode = next((cap for cap in AVAILABLE_CAPS if cap not in current_icodes), None)

2530 if not next_icode:

2531 raise ValueError('There are no more icodes available')

2532 # print('Setting icode ' + next_icode + ' to residue ' + str(residue))

2533 residue.icode = next_icode

2534 modified = True

2535 # Grouped residues with more than 1 result are considered as repeated

2536 repeated_residues = [ residues for residues in grouped_residues.values() if len(residues) > 1 ]

2537 if len(repeated_residues) == 0:

2538 return modified

2539 # In case we have repeated residues...

2540 if display_summary:

2541 warn(f'There are {len(repeated_residues)} different groups of repeated residues')

2542 print(f' e.g. {repeated_residues[0][0]}')

2543 if len(repeated_residues) == 9999 or len(repeated_residues) == 10000:

2544 print(' Probably you have more residues than the PDB numeration limit (1-9999)')

2545 # Now for each repeated residue, find out which are splitted and which are duplicated

2546 covalent_bonds = self.bonds

2547 overall_splitted_residues = []

2548 overall_duplicated_residues = []

2549 for residues in repeated_residues:

2550 # Iterate over repeated residues and check if residues are covalently bonded

2551 # If any pair of residues are bonded add them both to the splitted residues list

2552 # At the end, all non-splitted residues will be considered duplicated residues

2553 # WARNING: Using a set here is not possible since repeated residues have the same hash

2554 # WARNING: Also comparing residues themselves is not advisable, so we use indices at this point

2555 splitted_residue_indices = set()

2556 for residue, other_residues in otherwise(residues):

2557 if residue.index in splitted_residue_indices:

2558 continue

2559 # Get atom indices for all atoms connected to the current residue

2560 residue_bonds = sum([ covalent_bonds[index] for index in residue.atom_indices ], [])

2561 for other_residue in other_residues:

2562 # Get all atom indices for each other residue and collate with the current residue bonds

2563 if any( index in residue_bonds for index in other_residue.atom_indices ):

2564 splitted_residue_indices.add(residue.index)

2565 splitted_residue_indices.add(other_residue.index)

2566 # Finally obtain the splitted residues from its indices

2567 splitted_residues = [ self.residues[index] for index in splitted_residue_indices ]

2568 # Repeated residues which are not splitted are thus duplicated

2569 duplicated_residues = [ residue for residue in residues if residue.index not in splitted_residue_indices ]

2570 # Update the overall lists

2571 if len(splitted_residues) > 0:

2572 overall_splitted_residues.append(splitted_residues)

2573 if len(duplicated_residues) > 0:

2574 overall_duplicated_residues.append(duplicated_residues)

2575 # In case we have splitted residues

2576 if len(overall_splitted_residues) > 0:

2577 if display_summary:

2578 print(f' There are splitted residues ({len(overall_splitted_residues)})')

2579 # Fix splitted residues

2580 if fix_residues:

2581 print(' Splitted residues will be merged')

2582 # Set a function to sort atoms and residues by index

2583 by_index = lambda v: v._index

2584 # Merge splitted residues

2585 # WARNING: Merging residues without sorting atoms is possible, but this would be lost after exporting to pdb

2586 for splitted_residues in overall_splitted_residues:

2587 # The first residue (i.e. the residue with the lower index) will be the one which remains

2588 # It will absorb other residue atom indices

2589 splitted_residues.sort(key=by_index) # Residues are not sorted by default, this is mandatory

2590 first_residue = splitted_residues[0]

2591 other_residues = splitted_residues[1:]

2592 for residue in other_residues:

2593 for atom in residue.atoms:

2594 atom.residue = first_residue

2595 print(' Atoms will be sorted to be together by residues')

2596 print(' NEVER FORGET: This will break any associated trajectory if coordinates are not sorted as well')

2597 # Sort atoms to group residue atoms together

2598 # Note that each atom index must be updated

2599 new_atom_indices = sum([ residue.atom_indices for residue in self.residues ], [])

2600 for i, new_atom_index in enumerate(new_atom_indices):

2601 atom = self.atoms[new_atom_index]

2602 atom._index = i

2603 self.atoms.sort(key=by_index)

2604 # Also residue 'atom_indices' must be updated

2605 for residue in self.residues:

2606 residue._atom_indices = [ new_atom_indices.index(atom_index) for atom_index in residue._atom_indices ]

2607 # Bonds must be reset since atom indices have changes

2608 self._bonds = None

2609 # Prepare the trajectory atom sorter which must be returned

2610 # Include atom indices already so the user has to provide only the structure and trajectory filepaths

2611 def trajectory_atom_sorter (

2612 input_structure_file : 'File',

2613 input_trajectory_file : 'File',

2614 output_trajectory_file : 'File'

2615 ):

2616 sort_trajectory_atoms(

2617 input_structure_file,

2618 input_trajectory_file,

2619 output_trajectory_file,

2620 new_atom_indices

2621 )

2622 self.trajectory_atom_sorter = trajectory_atom_sorter

2623 self.new_atom_order = new_atom_indices

2624 modified = True

2625 # In case we have duplicated residues

2626 duplicated_residues_count = len(overall_duplicated_residues)

2627 if duplicated_residues_count > 0:

2628 if display_summary:

2629 warn(f'There are {duplicated_residues_count} different groups of duplicated residues')

2630 print(f' e.g. {overall_duplicated_residues[0][0]}')

2631 # Renumerate duplicated residues if requested

2632 if fix_residues:

2633 # First of all, from each group of repeated residues, discard the first residue

2634 # The first residue will remain as it is and the rest will be renumerated

2635 # Join all residues to be renumerated in a single list

2636 residue_to_renumerate = []

2637 for residues in overall_duplicated_residues:

2638 residue_to_renumerate += residues[1:]

2639 # Now group residues per chain since the renumeration is done by chains

2640 chain_residues = {}

2641 for residue in residue_to_renumerate:

2642 chain = residue.chain

2643 current_chain_residues = chain_residues.get(chain, None)

2644 if current_chain_residues: current_chain_residues.append(residue)

2645 else: chain_residues[chain] = [ residue ]

2646 # Iterate residue chain groups

2647 for chain, residues in chain_residues.items():

2648 # Find the last residue number in the current chain

2649 maximum_chain_number = max([ residue.number for residue in chain.residues ])

2650 # Sort residues by index

2651 residues.sort(key=lambda x: x.index, reverse=True)

2652 for residue in residues:

2653 # Set the number of the residue as the next available number

2654 residue.number = maximum_chain_number + 1

2655 # Update the maximum number

2656 maximum_chain_number = residue.number

2657 modified = True

2658 return modified

2659

2660 # DANI: No recuerdo los problemas que daba tener átomos repetidos

2661 def check_repeated_atoms (self, fix_atoms : bool = False, display_summary : bool = False) -> bool:

2662 """Check atoms to search for repeated atoms.

2663 Atoms with identical chain, residue and name are considered repeated atoms.

2664

2665 Args:

2666 fix_atoms (bool): If True, rename repeated atoms.

2667 display_summary (bool): If True, display a summary of repeated atoms.

2668

2669 Returns:

2670 bool: True if there were any repeated atoms, False otherwise.

2671 """

2672 # Set two trackers for display

2673 repeated_atoms_count = 0

2674 example = None

2675 for residue in self.residues:

2676 # Iterate over the residue atoms counting how many repeated names we have

2677 current_names = []

2678 for atom in residue.atoms:

2679 # We check if the atom name already exists. If not, go to the next atom

2680 name = atom.name

2681 if name not in current_names:

2682 current_names.append(name)

2683 continue

2684 # When atom is repeated

2685 repeated_atoms_count += 1

2686 # If the fix was not requested we stop here

2687 if not fix_atoms:

2688 continue

2689 # We set the name of the atom as the element letter + the count of this element

2690 # If element is missing take the first character of the atom name

2691 initial = atom.element

2692 if not initial or initial == ' ':

2693 initial = name[0]

2694 number = 1

2695 new_name = initial + str(number)

2696 while new_name in current_names:

2697 number += 1

2698 if number > 999:

2699 raise ValueError('There are more than 999 atoms with the same name in the residue')

2700 new_name = initial + str(number)

2701 # Save an example for the logs if there is none yet

2702 if not example:

2703 example = f'{atom.name} renamed as {new_name} in residue {residue}'

2704 atom.name = new_name

2705 current_names.append(new_name)

2706 # Display the summary of repeated atoms if requested

2707 if display_summary:

2708 if repeated_atoms_count > 0:

2709 warn(f'There are repeated atoms ({repeated_atoms_count})')

2710 print(f' e.g. {example}')

2711 return repeated_atoms_count > 0

2712

2713 def is_missing_any_bonds (self) -> bool:

2714 return any(bond == MISSING_BONDS for bond in self.bonds)

2715

2716 def check_incoherent_bonds (self) -> bool:

2717 """ Check bonds to be incoherent i.e. check atoms not to have more or less

2718 bonds than expected according to their element.

2719 Return True if any incoherent bond is found. """

2720 # Find out if there are hydrogens in the structure

2721 # It may happen that we encounter an structure without hydrogens

2722 has_hydrogen = next(( True for atom in self.atoms if atom.element == 'H' ), False)

2723 coherent_bonds = coherent_bonds_with_hydrogen if has_hydrogen else coherent_bonds_without_hydrogen

2724 # Check coherent bonds atom by atom

2725 for atom in self.atoms:

2726 # Do not check this atom bonds if this may be an ion

2727 # Most authors "force" dummy bonds in these atoms to make them stable

2728 if atom.element in SUPPORTED_ION_ELEMENTS: continue

2729 # Ignore dummy atoms as well

2730 if atom.element == DUMMY_ATOM_ELEMENT: continue

2731 # Ignore coarse grain atoms as well

2732 if atom.element == CG_ATOM_ELEMENT: continue

2733 # Get actual number of bonds in the current atom both with and without ion bonds

2734 # LORE: This was a problem since some ions are force-bonded but bonds are actually not real

2735 # LORE: When an ion is forced we may end up with hyrogens with 2 bonds or carbons with 5 bonds

2736 # LORE: When an ions is really bonded we can not discard it or we may end up with orphan carbons (e.g. ligands)

2737 min_nbonds = len(atom.get_bonds(skip_ions = True, skip_dummies = True))

2738 max_nbonds = len(atom.get_bonds(skip_ions = False, skip_dummies = True))

2739 # Get the accepted range of number of bonds for the current atom according to its element

2740 element = atom.element

2741 element_coherent_bonds = coherent_bonds.get(element, None)

2742 # If there are no specficiations for the current atom element then skip it

2743 if not element_coherent_bonds:

2744 continue

2745 # Get the range of accepted number of bonds

2746 min_allowed_bonds = element_coherent_bonds['min']

2747 max_allowed_bonds = element_coherent_bonds['max']

2748 # Check the actual number of bonds is insdie the accepted range

2749 if max_nbonds < min_allowed_bonds or min_nbonds > max_allowed_bonds:

2750 if min_nbonds == max_nbonds:

2751 print(f' An atom with element {element} has {min_nbonds} bonds')

2752 else:

2753 print(f' An atom with element {element} has between {min_nbonds} bonds (withou ions) and {max_nbonds} bonds (with ions)')

2754 if min_allowed_bonds == max_allowed_bonds:

2755 plural_sufix = '' if min_allowed_bonds == 1 else 's'

2756 print(f' It should have {min_allowed_bonds} bond{plural_sufix}')

2757 else:

2758 print(f' It should have between {min_allowed_bonds} and {max_allowed_bonds} bonds')

2759 print(f' -> Atom {atom.label}')

2760 bond_label = ', '.join([ self.atoms[atom].label for atom in atom.get_bonds(skip_ions = False) ])

2761 print(f' -> Bonds {bond_label}')

2762 return True

2763 return False

2764

2765 def get_covalent_bonds (self, selection : Optional['Selection'] = None) -> list[ list[int] ]:

2766 """Get all atomic covalent (strong) bonds.

2767 Bonds are defined as a list of atom indices for each atom in the structure.

2768 Rely on VMD logic to do so."""

2769 # It is important to fix elements before trying to fix bonds, since elements have an impact on bonds

2770 # VMD logic to find bonds relies in the atom element to set the covalent bond distance cutoff

2771 self.fix_atom_elements()

2772 # Generate a pdb strucutre to feed vmd

2773 auxiliar_pdb_filepath = '.structure.pdb'

2774 self.generate_pdb_file(auxiliar_pdb_filepath)

2775 # Get covalent bonds between both residue atoms

2776 covalent_bonds = get_covalent_bonds(auxiliar_pdb_filepath, selection)

2777 # For every atom in CG, replace its bonds with a class which will raise and error when read

2778 # Thus we make sure using these wrong bonds anywhere further will result in failure

2779 for atom_index in self.select_cg().atom_indices:

2780 covalent_bonds[atom_index] = MISSING_BONDS

2781 # Remove the auxiliar pdb file

2782 os.remove(auxiliar_pdb_filepath)

2783 return covalent_bonds

2784

2785 def copy_bonds (self) -> list[list[int]]:

2786 """Make a copy of the bonds list."""

2787 new_bonds = []

2788 for atom_bonds in self.bonds:

2789 # Missing bonds coming from CG atoms are forwarded

2790 if atom_bonds == MISSING_BONDS:

2791 new_bonds.append(MISSING_BONDS)

2792 continue

2793 # Copy also the inner lists to avoid further mutation of the original structure

2794 new_bonds.append([ atom_index for atom_index in atom_bonds ])

2795 return new_bonds

2796

2797 def copy (self) -> 'Structure':

2798 """Make a copy of the current structure."""

2799 atom_copies = [ atom.copy() for atom in self.atoms ]

2800 residue_copies = [ residue.copy() for residue in self.residues ]

2801 chain_copies = [ chain.copy() for chain in self.chains ]

2802 structure_copy = Structure(atom_copies, residue_copies, chain_copies)

2803 structure_copy.bonds = self.copy_bonds()

2804 return structure_copy

2805

2806 # DANI: No lo he testeado en profundidad

2807 def merge (self, other : 'Structure') -> 'Structure':

2808 """Merge current structure with another structure."""

2809 # Copy self atoms, residues and chains

2810 self_atom_copies = [ atom.copy() for atom in self.atoms ]

2811 self_residue_copies = [ residue.copy() for residue in self.residues ]

2812 self_chain_copies = [ chain.copy() for chain in self.chains ]

2813 # Copy other atoms, residues and chains

2814 other_atom_copies = [ atom.copy() for atom in other.atoms ]

2815 other_residue_copies = [ residue.copy() for residue in other.residues ]

2816 other_chain_copies = [ chain.copy() for chain in other.chains ]

2817 # Adapt indices in other atoms, residues and chains

2818 atom_index_offset = len(self_atom_copies)

2819 residue_index_offset = len(self_residue_copies)

2820 chain_index_offset = len(self_chain_copies)

2821 for atom in other_atom_copies:

2822 atom._index += atom_index_offset

2823 atom._residue_index += residue_index_offset

2824 for residue in other_residue_copies:

2825 residue._index += residue_index_offset

2826 residue._atom_indices = [ i + atom_index_offset for i in residue._atom_indices ]

2827 residue._chain_index += chain_index_offset

2828 for chain in other_chain_copies:

2829 chain._index += chain_index_offset

2830 chain._residue_indices = [ i + residue_index_offset for i in chain._residue_indices ]

2831 # Merge self with other atoms, residues and chains and build the new structure

2832 merged_atoms = self_atom_copies + other_atom_copies

2833 merged_residues = self_residue_copies + other_residue_copies

2834 merged_chains = self_chain_copies + other_chain_copies

2835 return Structure(merged_atoms, merged_residues, merged_chains)

2836

2837 def find_rings (self, max_ring_size : int, selection : Optional[Selection] = None) -> list[ list[Atom] ]:

2838 """Find rings with a maximum specific size or less in the structure and yield them as they are found."""

2839 found_rings = []

2840 selected_atom_indices = selection.atom_indices if selection else None

2841 # Find rings recursively

2842 def find_rings_recursive (atom_path : list[Atom]) -> Generator[ tuple[Atom], None, None ]:

2843 # Get the current atom to continue the followup: the last atom

2844 current_atom = atom_path[-1] # Note that the list MUST always have at least 1 atom

2845 # Get bonded atoms to continue the path

2846 followup_atoms = current_atom.get_bonded_atoms()

2847 # Hydrogens are discarded since they have only 1 bond and thus they can only lead to a dead end

2848 followup_atoms = [ atom for atom in followup_atoms if atom.element != 'H' ]

2849 # In case there is a selection, check followup atoms not to be in the selection

2850 if selected_atom_indices:

2851 followup_atoms = [ atom for atom in followup_atoms if atom.index in selected_atom_indices ]

2852 # Remove the previous atom to avoid going back

2853 previous_atom = atom_path[-2] if len(atom_path) > 1 else None

2854 if previous_atom:

2855 # HARDCODE: This is a problem which should be studied and fixed

2856 # DANI: I temporatily bypass the problem to make it on time to a meeting

2857 if previous_atom not in followup_atoms: return

2858 followup_atoms.remove(previous_atom)

2859 # Iterate over the following bonded atoms

2860 for followup_atom in followup_atoms:

2861 # Check if the followup atom is in the path

2862 path_index = next(( index for index, atom in enumerate(atom_path) if atom == followup_atom ), None)

2863 # If so, we have found a ring

2864 if path_index != None:

2865 ring = atom_path[path_index:]

2866 ring.sort(key=lambda x: x.index)

2867 yield tuple(ring)

2868 continue

2869 # If the ring size is reached, we can not continue

2870 if len(atom_path) == max_ring_size: continue

2871 # Otherwise, keep searching

2872 new_path = atom_path + [followup_atom]

2873 for ring in find_rings_recursive(atom_path=new_path):

2874 yield ring

2875 # Find a starting atom and run the recursive logic

2876 candidate_atom_indices = selected_atom_indices if selected_atom_indices else range(self.atom_count)

2877 for candidate_atom_index in candidate_atom_indices:

2878 candidate_atom = self.atoms[candidate_atom_index]

2879 # It must be heavy atom

2880 if len(candidate_atom.bonds) < 2: continue

2881 # It must not be a dead end already

2882 bonded_atoms = candidate_atom.get_bonded_atoms()

2883 bonded_atoms = [ atom for atom in bonded_atoms if len(atom.bonds) >= 2 ]

2884 # In case there is a selection, check followup atoms not to be in the selection

2885 if selected_atom_indices:

2886 bonded_atoms = [ atom for atom in bonded_atoms if atom.index in selected_atom_indices ]

2887 for ring in find_rings_recursive(atom_path=[candidate_atom]):

2888 found_rings.append(ring)

2889 # Get unique rings

2890 unique_rings = set(found_rings)

2891 return list(unique_rings)

2892

2893 def get_selection_outer_bonds (self, selection : Selection) -> list[int]:

2894 """Given an atom selection, get all bonds between these atoms and any other atom in the structure.

2895 Note that inner bonds between atoms in the selection are discarded."""

2896 # Get bonds from all atoms in the selection

2897 bonds = set()

2898 for atom_index in selection.atom_indices:

2899 atom_bonds = set(self.bonds[atom_index])

2900 bonds = bonds.union(atom_bonds)

2901 # Discard selection atoms from the bonds list to discard inner bonds

2902 bonds -= set(selection.atom_indices)

2903 return list(bonds)

2904

2905 # Set the type of PTM according to the classification of the bonded residue

2906 ptm_options = {

2907 'protein': ValueError('A PTM residue must never be protein'),

2908 'dna': 'DNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama

2909 'rna': 'RNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama

2910 'carbohydrate': 'Glycosilation',

2911 'fatty': 'Lipidation',

2912 'steroid': 'Steroid linkage', # DANI: Esto es posible aunque muy raro y no se como se llama

2913 'ion': Warning('Ion is covalently bonded to protein'), # DANI: esto no es "correcto" pero si habitual

2914 'solvent': Warning('Solvent is covalently bonded to protein'), # DANI: probablemente sea un error

2915 'acetyl': 'Acetylation', # Typical N-capping terminals

2916 'amide': 'Amidation', # Typical C-capping terminals

2917 'other': Warning('Unknow type of PTM'), # Something not supported

2918 }

2919

2920 def find_ptms (self) -> Generator[dict, None, None]:

2921 """Find Post Translational Modifications (PTM) in the structure."""

2922 # Find bonds between protein and non-protein atoms

2923 # First get all protein atoms

2924 protein_selection = self.select_protein()

2925 if not protein_selection:

2926 return

2927 protein_atom_indices = set(protein_selection.atom_indices) # This is used later

2928 protein_outer_bonds = set(self.get_selection_outer_bonds(protein_selection))

2929 non_protein_selection = self.invert_selection(protein_selection)

2930 if not non_protein_selection:

2931 return

2932 non_protein_atom_indices = set(non_protein_selection.atom_indices)

2933 non_protein_atom_indices_bonded_to_protein = protein_outer_bonds.intersection(non_protein_atom_indices)

2934 # Get each residue bonded to the protein and based on its 'classification' set the name of the PTM

2935 for atom_index in non_protein_atom_indices_bonded_to_protein:

2936 atom = self.atoms[atom_index]

2937 residue = atom.residue

2938 residue_classification = residue.get_classification()

2939 ptm_classification = self.ptm_options[residue_classification]

2940 # If we found something impossible then raise the error

2941 if type(ptm_classification) == ValueError:

2942 print(f'Problematic residue: {residue}')

2943 raise ptm_classification

2944 # If we do not know what it is then do tag it as a PTM but print a warning

2945 if type(ptm_classification) == Warning:

2946 warn(f'{ptm_classification} -> {residue}')

2947 continue

2948 # At this point we have found a valid PTM

2949 # Find the protein residue linked to this PTM

2950 atom_bonds = self.bonds[atom_index]

2951 protein_bond = next((bond for bond in atom_bonds if bond in protein_atom_indices), None)

2952 if protein_bond == None:

2953 raise ValueError('There must be at least one protein bond to the current atom')

2954 protein_residue_index = self.atoms[protein_bond].residue_index

2955 # Set the PTM

2956 yield { 'name': ptm_classification, 'residue_index': protein_residue_index }

2957

2958 def has_cg (self) -> bool:

2959 """Ask if the structure has at least one coarse grain atom/residue."""

2960 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)

2961

2962

2963# =========================

2964# Related functions

2965# =========================

2966

2967def calculate_distance (atom_1 : Atom, atom_2 : Atom) -> float:

2968 """Calculate the distance between two atoms."""

2969 squared_distances_sum = 0

2970 for i in { 'x': 0, 'y': 1, 'z': 2 }.values():

2971 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i])**2

2972 return math.sqrt(squared_distances_sum)

2973

2974def calculate_angle (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom) -> float:

2975 """Calculate the angle between 3 atoms."""

2976 # From: https://stackoverflow.com/questions/35176451/python-code-to-calculate-angle-between-three-point-using-their-3d-coordinates

2977 # Get coordinates in numpy arrays, which allows to easily calculate the difference

2978 a = np.array(atom_1.coords)

2979 b = np.array(atom_2.coords)

2980 c = np.array(atom_3.coords)

2981 # Set the two vectors which make the angle

2982 ba = a - b

2983 bc = c - b

2984 # Calculate the angle between these 2 vectors

2985 cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))

2986 angle = np.arccos(cosine_angle)

2987 return np.degrees(angle)

2988

2989def calculate_torsion (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom, atom_4 : Atom) -> float:

2990 """Calculate the torsion between 4 atoms."""

2991 # From: https://stackoverflow.com/questions/20305272/dihedral-torsion-angle-from-four-points-in-cartesian-coordinates-in-python

2992 p0 = np.array(atom_1.coords)

2993 p1 = np.array(atom_2.coords)

2994 p2 = np.array(atom_3.coords)

2995 p3 = np.array(atom_4.coords)

2996 # Get some vectors

2997 b0 = -1.0*(p1 - p0)

2998 b1 = p2 - p1

2999 b2 = p3 - p2

3000 # normalize b1 so that it does not influence magnitude of vector

3001 # rejections that come next

3002 b1 /= np.linalg.norm(b1)

3003 # vector rejections

3004 # v = projection of b0 onto plane perpendicular to b1

3005 # = b0 minus component that aligns with b1

3006 # w = projection of b2 onto plane perpendicular to b1

3007 # = b2 minus component that aligns with b1

3008 v = b0 - np.dot(b0, b1)*b1

3009 w = b2 - np.dot(b2, b1)*b1

3010 # angle between v and w in a plane is the torsion angle

3011 # v and w may not be normalized but that's fine since tan is y/x

3012 x = np.dot(v, w)

3013 y = np.dot(np.cross(b1, v), w)

3014 return float(np.degrees(np.arctan2(y, x)))

3015

3016# =========================

3017# Auxiliar functions

3018# =========================

3019

3020def get_next_letter (letter : str) -> str:

3021 """Set a function to get the next letter from an input letter in alphabetic order."""

3022 if not letter:

3023 return 'A'

3024 if letter == 'z' or letter == 'Z':

3025 raise InputError("Limit of chain letters has been reached")

3026 next_letter = chr(ord(letter) + 1)

3027 return next_letter

3028

3029def first_cap_only (name : str) -> str:

3030 """Given a string with 1 or 2 characters, return a new string with

3031 the first letter cap and the second letter not cap (if any)"""

3032 if len(name) == 1:

3033 return name.upper()

3034 first_character = name[0].upper()

3035 second_character = name[1].lower()

3036 return first_character + second_character

3037

3038lower_numbers = {

3039 '0': '₀',

3040 '1': '₁',

3041 '2': '₂',

3042 '3': '₃',

3043 '4': '₄',

3044 '5': '₅',

3045 '6': '₆',

3046 '7': '₇',

3047 '8': '₈',

3048 '9': '₉',

3049}

3050def get_lower_numbers (numbers_text : str) -> str:

3051 """Convert numbers to lower text characters (chr 8020-8029)."""

3052 return ''.join([ lower_numbers[c] for c in numbers_text ])

3053

3054def filter_model (pdb_content : str, model : int) -> str:

3055 """Set a function to filter lines in PDB content for a specific model."""

3056 # Make sure the PDB content has multiple models

3057 # If not, then return the whole PDB content ignoring the flag

3058 generic_model_header_regex = r'\nMODEL\s*[0-9]*\s*\n'

3059 if not re.search(generic_model_header_regex, pdb_content):

3060 print(f'PDB content has no models at all so it will be loaded as is (ignored model "{model}").')

3061 return pdb_content

3062 # If a model was passed then it means we must filter the PDB content

3063 filtered_pdb_content = ''

3064 # Search PDB lines until we find our model's header

3065 model_header_regex = rf'^MODEL\s*{model}'

3066 pdb_lines = iter(pdb_content.split('\n'))

3067 for line in pdb_lines:

3068 if not re.match(model_header_regex, line): continue

3069 # If we just found the header

3070 filtered_pdb_content = line

3071 break

3072 # If we did not find the header then stop here

3073 if not filtered_pdb_content: raise RuntimeError(f'Could not find model "{model}" header')

3074 # Add every line to the filtered content until we find the tail

3075 model_footer_regex = r'^ENDMDL'

3076 for line in pdb_lines:

3077 filtered_pdb_content += '\n' + line

3078 if re.match(model_footer_regex, line): return filtered_pdb_content

3079 # If we did not find the footer then stop here

3080 raise RuntimeError(f'Could not find model "{model}" footer')