Coverage for mddb_workflow / utils / structures.py: 84%
1746 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 18:45 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 18:45 +0000
1# Main handler of the toolbelt
2import os
3import re
4import math
5import pytraj
6import MDAnalysis
7import numpy as np
8import networkx as nx
9from bisect import bisect
11from mddb_workflow.utils.file import File
12from mddb_workflow.utils.selections import Selection
13from mddb_workflow.utils.vmd_spells import get_vmd_selection_atom_indices, get_covalent_bonds
14from mddb_workflow.utils.mdt_spells import sort_trajectory_atoms
15from mddb_workflow.utils.gmx_spells import make_index, read_ndx
16from mddb_workflow.utils.auxiliar import InputError, MISSING_BONDS
17from mddb_workflow.utils.auxiliar import is_imported, residue_name_to_letter, otherwise, warn
18from mddb_workflow.utils.constants import SUPPORTED_ION_ELEMENTS, SUPPORTED_ELEMENTS
19from mddb_workflow.utils.constants import STANDARD_COUNTER_CATION_ATOM_NAMES, STANDARD_COUNTER_ANION_ATOM_NAMES
20from mddb_workflow.utils.constants import STANDARD_SOLVENT_RESIDUE_NAMES, STANDARD_COUNTER_ION_ATOM_NAMES
21from mddb_workflow.utils.constants import STANDARD_DUMMY_ATOM_NAMES, DUMMY_ATOM_ELEMENT, CG_ATOM_ELEMENT
22from mddb_workflow.utils.constants import PROTEIN_RESIDUE_NAME_LETTERS, NUCLEIC_RESIDUE_NAME_LETTERS
23from mddb_workflow.utils.constants import DNA_RESIDUE_NAME_LETTERS, RNA_RESIDUE_NAME_LETTERS
24from mddb_workflow.utils.constants import FATTY_RESIDUE_NAMES, STEROID_RESIDUE_NAMES
25from mddb_workflow.utils.type_hints import *
28# Set all available chains according to pdb standards
29AVAILABLE_CAPS = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
30AVAILABLE_LOWS = list('abcdefghijklmnopqrstuvwxyz')
31AVAILABLE_LETTERS = AVAILABLE_CAPS + AVAILABLE_LOWS
33# Set letters to be found in alphanumerical bases
34hexadecimal_letters = set(AVAILABLE_CAPS[:6] + AVAILABLE_LOWS[:6])
35alphanumerical_letters = set(AVAILABLE_CAPS[6:] + AVAILABLE_LOWS[6:])
37# Set the expected number of bonds for each atom according to its element
38coherent_bonds_with_hydrogen = {
39 'H': { 'min': 1, 'max': 1 },
40 'O': { 'min': 1, 'max': 2 },
41 'N': { 'min': 1, 'max': 4 },
42 'C': { 'min': 2, 'max': 4 },
43 'S': { 'min': 1, 'max': 6 },
44 'P': { 'min': 2, 'max': 6 },
45}
46# WARNING: Not deeply tested
47coherent_bonds_without_hydrogen = {
48 'O': { 'min': 0, 'max': 2 },
49 'N': { 'min': 1, 'max': 3 },
50 'C': { 'min': 1, 'max': 3 },
51 'S': { 'min': 1, 'max': 4 },
52 'P': { 'min': 2, 'max': 4 },
53}
55# Set the limit of residue numbers according to PDB format (4 characters, starts count at 1)
56# This means the last number is 9999 and it is equivalent to index 9998
57pdb_last_decimal_residue_index = 9998
59class Atom:
60 """An atom class."""
62 def __init__ (self,
63 name : Optional[str] = None,
64 element : Optional[str] = None,
65 coords : Optional[Coords] = None,
66 ):
67 self.name = name
68 self.element = element
69 self.coords = tuple(coords) if coords else None
70 # Set variables to store references to other related instances
71 # These variables will be set further by the structure
72 self._structure = None
73 self._index = None
74 self._residue_index = None
76 def __repr__ (self):
77 return '<Atom ' + self.name + '>'
79 def __eq__ (self, other):
80 if type(self) != type(other):
81 return False
82 return self._residue_index == other._residue_index and self.name == other.name
84 def __hash__ (self):
85 return hash((self.index))
87 def get_structure (self) -> Optional['Structure']:
88 """The parent structure (read only).
89 This value is set by the structure itself."""
90 return self._structure
91 structure: 'Structure' = property(get_structure, None, None, "The parent structure (read only)")
93 def get_index (self) -> Optional[int]:
94 """The residue index according to parent structure residues (read only).
95 This value is set by the structure itself."""
96 return self._index
97 index = property(get_index, None, None, "The residue index according to parent structure residues (read only)")
99 def get_residue_index (self) -> int:
100 """The atom residue index according to parent structure residues.
101 If residue index is set then make changes in all the structure to make this change coherent."""
102 return self._residue_index
103 def set_residue_index (self, new_residue_index : int):
104 # If the new residue index is the current residue index then do nothing
105 # WARNING: It is important to stop this here or it could delete a residue which is not to be deleted
106 if new_residue_index == self.residue_index:
107 return
108 # If there is not strucutre yet it means the residue is beeing set before the structure
109 # We just save the residue index and wait for the structure to be set
110 if not self.structure:
111 self._residue_index = new_residue_index
112 return
113 # Relational indices are updated through a top-down hierarchy
114 # Affected residues are the ones to update this atom internal residue index
115 new_residue = self.structure.residues[new_residue_index]
116 new_residue.add_atom(self)
117 residue_index = property(get_residue_index, set_residue_index, None,
118 "The atom residue index according to parent structure residues")
120 def get_residue (self) -> Optional['Residue']:
121 """The atom residue.
122 If residue is set then make changes in all the structure to make this change coherent."""
123 # If there is not strucutre yet it means the atom is beeing set before the structure
124 # In this case it is not possible to get related residues in the structure
125 # It also may happend that the residue is missing because the atom has been set rawly
126 if not self.structure or self.residue_index == None:
127 return None
128 # Get the residue in the structure according to the residue index
129 return self.structure.residues[self.residue_index]
130 def set_residue (self, new_residue : 'Residue'):
131 """Find the new residue index and set it as the atom residue index.
132 Note that the residue must be set in the structure already."""
133 new_residue_index = new_residue.index
134 if new_residue_index == None:
135 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure')
136 self.set_residue_index(new_residue_index)
137 residue = property(get_residue, set_residue, None, "The atom residue")
139 def get_chain_index (self) -> Optional[int]:
140 """Get the atom chain index according to parent structure chains."""
141 # The residue may be missing if the atom has been set rawly
142 if not self.residue:
143 return None
144 return self.residue.chain_index
145 chain_index = property(get_chain_index, None, None,
146 "The atom chain index according to parent structure chains (read only)")
148 def get_chain (self) -> Optional['Chain']:
149 """The atom chain (read only).
150 In order to change the chain it must be changed in the atom residue."""
151 # If there is not strucutre yet it means the atom is beeing set before the structure
152 # In this case it is not possible to get related residues in the structure
153 # It also may happend that the chain is missing because the atom has been set rawly
154 if not self.structure or self.chain_index == None:
155 return None
156 # Get the chain in the structure according to the chain index
157 return self.structure.chains[self.chain_index]
158 chain: 'Chain' = property(get_chain, None, None, "The atom chain (read only)")
160 def get_bonds (self, skip_ions : bool = False, skip_dummies : bool = False) -> Optional[ list[int] ]:
161 """Get indices of other atoms in the structure which are covalently bonded to this atom."""
162 if not self.structure:
163 raise ValueError('The atom has not a structure defined')
164 if self.index == None:
165 raise ValueError('The atom has not an index defined')
166 bonds = self.structure.bonds[self.index]
167 # If the skip ions argument is passed then remove atom indices of supported ion atoms
168 if skip_ions:
169 bonds = list(set(bonds) - self.structure.ion_atom_indices)
170 if skip_dummies:
171 bonds = list(set(bonds) - self.structure.dummy_atom_indices)
172 return bonds
174 # Atoms indices of atoms in the structure which are covalently bonded to this atom
175 bonds = property(get_bonds, None, None,
176 'Atoms indices of atoms in the structure which are covalently bonded to this atom')
178 def get_bonded_atoms (self) -> list['Atom']:
179 """Get bonded atoms."""
180 return [ self.structure.atoms[atom_index] for atom_index in self.bonds ]
182 def get_selection (self) -> 'Selection':
183 """Generate a selection for this atom."""
184 return Selection([self.index])
186 def copy (self) -> 'Atom':
187 """Make a copy of the current atom."""
188 atom_copy = Atom(self.name, self.element, self.coords)
189 atom_copy._structure = self._structure
190 atom_copy._index = self._index
191 atom_copy._residue_index = self._residue_index
192 return atom_copy
194 def is_fatty_candidate (self) -> bool:
195 """
196 Check if this atom meets specific criteria:
197 1. It is a carbon
198 2. It is connected only to other carbons and hydrogens
199 3. It is connected to 1 or 2 carbons
200 """
201 # Ignore non carbon atoms
202 if self.element != 'C':
203 return False
204 # Get bonded atom elements
205 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ]
206 # Check only carbon and hydrogen atoms to be bonded
207 if any((element != 'C' and element != 'H') for element in bonded_atoms_elements):
208 return False
209 # Check it is connected to 1 or 2 carbons
210 connected_carbons_count = bonded_atoms_elements.count('C')
211 if connected_carbons_count != 1 and connected_carbons_count != 2:
212 return False
213 return True
215 def is_carbohydrate_candidate (self) -> bool:
216 """
217 Check if this atom meets specific criteria:
218 1. It is a carbon
219 2. It is connected only to other carbons, hydrogens or oxygens
220 3. It is connected to 1 or 2 carbons
221 4. It is connected to 1 oxygen
222 """
223 # Ignore non carbon atoms
224 if self.element != 'C':
225 return False
226 # Get bonded atom elements
227 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ]
228 # Check only carbon and hydrogen atoms to be bonded
229 if any((element != 'C' and element != 'H' and element != 'O') for element in bonded_atoms_elements):
230 return False
231 # Check it is connected to 1 or 2 carbons
232 connected_carbons_count = bonded_atoms_elements.count('C')
233 if connected_carbons_count != 1 and connected_carbons_count != 2:
234 return False
235 # Check it is connected to 1 oxygen
236 connected_oxygens_count = bonded_atoms_elements.count('O')
237 if connected_oxygens_count != 1:
238 return False
239 return True
241 def is_ion (self) -> bool:
242 """Check if it is an ion by checking if it has no bonds with other atoms."""
243 return len(self.bonds) == 0
245 def guess_element (self) -> str:
246 """Guess an atom element from its name and number of bonds."""
247 # If the atom name is among the known dummy atoms then return a standard element for dummy atoms
248 if self.name.upper() in STANDARD_DUMMY_ATOM_NAMES:
249 return DUMMY_ATOM_ELEMENT
250 # If the element is already the "coarse grained" flag then return this very same flag
251 # WARNING: This has to be preset before guessing or the guess may fail
252 if self.is_cg():
253 return CG_ATOM_ELEMENT
254 # If the name is SOD and it is a lonely atom then it is clearly sodium, not sulfur
255 if self.name.upper() == 'SOD' and self.residue.atom_count == 1:
256 return 'Na'
257 # If the name is POT and it is a lonely atom then it is clearly potassium, not phosphor
258 if self.name.upper() == 'POT' and self.residue.atom_count == 1:
259 return 'K'
260 # Find a obvios element name in the atom name
261 element = self.get_name_suggested_element()
262 # CA may refer to calcium or alpha carbon, so the number of atoms in the residue is decisive
263 if element == 'Ca' and self.residue.atom_count >= 2:
264 return 'C'
265 # NA may refer to sodium or some ligand nitrogen, so the number of atoms in the residue is decisive
266 if element == 'Na' and self.residue.atom_count >= 2:
267 return 'N'
268 return element
270 def get_name_suggested_element (self) -> str:
271 """Guess an atom element from its name only."""
272 # Get the atom name and its characters length
273 name = self.name
274 length = len(name)
275 next_character = None
276 for i, character in enumerate(name):
277 # Get the next character, since element may be formed by 2 letters
278 if i < length - 1:
279 next_character = name[i+1]
280 # If next character is not a string then ignore it
281 if not next_character.isalpha():
282 next_character = None
283 # Try to get all possible matches between the characters and the supported atoms
284 # First letter is always caps
285 character = character.upper()
286 # First try to match both letters together
287 if next_character:
288 # Start with the second letter in caps
289 next_character = next_character.upper()
290 both = character + next_character
291 if both in SUPPORTED_ELEMENTS:
292 return both
293 # Continue with the second letter in lowers
294 next_character = next_character.lower()
295 both = character + next_character
296 if both in SUPPORTED_ELEMENTS:
297 return both
298 # Finally, try with the first character alone
299 if character in SUPPORTED_ELEMENTS:
300 return character
301 raise InputError(f"Cannot guess atom element from atom name '{name}'")
303 def get_label (self) -> str:
304 """Get a standard label."""
305 return f'{self.residue.label}.{self.name} (index {self.index})'
306 # The atom standard label (read only)
307 label = property(get_label, None, None)
309 # Ask if the current atom is in coarse grain
310 def is_cg (self) -> bool:
311 return self.element == CG_ATOM_ELEMENT
313class Residue:
314 """A residue class."""
315 def __init__ (self,
316 name : Optional[str] = None,
317 number : Optional[int] = None,
318 icode : Optional[str] = None,
319 ):
320 self.name = name
321 self.number = number
322 self.icode = icode
323 # Set variables to store references to other related instaces
324 # These variables will be set further by the structure
325 self._structure = None
326 self._index = None
327 self._atom_indices = []
328 self._chain_index = None
329 self._classification = None
331 def __repr__ (self):
332 return f'<Residue {self.label}>'
334 def __eq__ (self, other):
335 if type(self) != type(other):
336 return False
337 return (
338 self._chain_index == other._chain_index and
339 #self.name == other.name and
340 self.number == other.number and
341 self.icode == other.icode
342 )
344 def __hash__ (self):
345 # WARNING: This is susceptible to duplicated residues
346 return hash((self._chain_index, self.number, self.icode))
347 # WARNING: This is not susceptible to duplicated residues
348 #return hash(tuple(self._atom_indices))
350 def get_structure (self) -> Optional['Structure']:
351 """Get the parent structure (read only).
352 This value is set by the structure itself."""
353 return self._structure
354 structure: 'Structure' = property(get_structure, None, None,
355 "The parent structure (read only)")
357 def get_index (self) -> Optional[int]:
358 """Get the residue index according to parent structure residues (read only).
359 This value is set by the structure itself."""
360 return self._index
361 def set_index (self, index):
362 # Update residue atoms
363 for atom in self.atoms:
364 atom._residue_index = index
365 # Update residue chain
366 chain_residue_index = self.chain._residue_indices.index(self._index)
367 self.chain._residue_indices[chain_residue_index] = index
368 # Finally update self index
369 self._index = index
370 index = property(get_index, set_index, None,
371 "The residue index according to parent structure residues (read only)")
373 def get_atom_indices (self) -> list[int]:
374 """Get the atom indices according to parent structure atoms for atoms in this residue.
375 If atom indices are set then make changes in all the structure to make this change coherent."""
376 return self._atom_indices
377 def set_atom_indices (self, new_atom_indices : list[int]):
378 # If there is not strucutre yet it means the residue is beeing set before the structure
379 # We just save atom indices and wait for the structure to be set
380 if not self.structure:
381 self._atom_indices = new_atom_indices
382 return
383 # Update the current atoms
384 for atom in self.atoms:
385 atom._residue_index = None
386 # Update the new atoms
387 for index in new_atom_indices:
388 atom = self.structure.atoms[index]
389 atom._residue_index = self.index
390 # Now new indices are coherent and thus we can save them
391 self._atom_indices = new_atom_indices
392 atom_indices = property(get_atom_indices, set_atom_indices, None,
393 "The atom indices according to parent structure atoms for atoms in this residue")
395 def get_atoms (self) -> list['Atom']:
396 """Get the atoms in this residue.
397 If atoms are set then make changes in all the structure to make this change coherent."""
398 # If there is not strucutre yet it means the chain is beeing set before the structure
399 # In this case it is not possible to get related atoms in the structure
400 if not self.structure:
401 return []
402 # Get atoms in the structure according to atom indices
403 atoms = self.structure.atoms
404 return [ atoms[atom_index] for atom_index in self.atom_indices ]
405 def set_atoms (self, new_atoms : list['Atom']):
406 # Find indices for new atoms and set their indices as the new atom indices
407 # Note that atoms must be set in the structure already
408 new_atom_indices = []
409 for new_atom in new_atoms:
410 new_atom_index = new_atom.index
411 if new_atom_index == None:
412 raise ValueError('Atom ' + str(new_atom) + ' is not set in the structure')
413 new_atom_indices.append(new_atom_index)
414 self.set_atom_indices(new_atom_indices)
415 atoms: list['Atom'] = property(get_atoms, set_atoms, None, "The atoms in this residue")
417 # Get the number of atoms in the residue
418 def get_atom_count (self) -> int:
419 return len(self.atom_indices)
420 atom_count = property(get_atom_count, None, None, "The number of atoms in the residue (read only)")
422 def add_atom (self, new_atom : 'Atom'):
423 """Add an atom to the residue."""
424 # Remove the atom from its previous residue owner
425 if new_atom.residue_index:
426 new_atom.residue.remove_atom(new_atom)
427 # Insert the new atom index in the list of atom indices keeping the order
428 new_atom_index = new_atom.index
429 sorted_atom_index = bisect(self.atom_indices, new_atom_index)
430 self.atom_indices.insert(sorted_atom_index, new_atom_index)
431 # Update the atom internal index
432 new_atom._residue_index = self.index
434 def remove_atom (self, current_atom : 'Atom'):
435 """Remove an atom from the residue."""
436 # Remove the current atom index from the atom indices list
437 self.atom_indices.remove(current_atom.index) # This index MUST be in the list
438 # If we removed the last atom then this residue must be removed from its structure
439 if len(self.atom_indices) == 0 and self.structure:
440 self.structure.purge_residue(self)
441 # Update the atom internal index
442 current_atom._residue_index = None
444 # The residue chain index according to parent structure chains
445 # If chain index is set then make changes in all the structure to make this change coherent
446 def get_chain_index (self) -> int:
447 return self._chain_index
448 def set_chain_index (self, new_chain_index : int):
449 # If the new chain index is the current chain index do nothing
450 # WARNING: It is important to stop this here or it could delete a chain which is not to be deleted
451 if new_chain_index == self.chain_index:
452 return
453 # If there is not strucutre yet it means the chain is beeing set before the structure
454 # We just save the chain index and wait for the structure to be set
455 if not self.structure:
456 self._chain_index = new_chain_index
457 return
458 # Relational indices are updated through a top-down hierarchy
459 # Affected chains are the ones to update this residue internal chain index
460 # WARNING: It is critical to find the new chain before removing/adding residues
461 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged
462 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed
463 new_chain = self.structure.chains[new_chain_index]
464 if self.chain:
465 self.chain.remove_residue(self)
466 new_chain.add_residue(self)
467 chain_index = property(get_chain_index, set_chain_index, None,
468 "The residue chain index according to parent structure chains")
470 # The residue chain
471 # If chain is set then make changes in all the structure to make this change coherent
472 def get_chain (self) -> Optional['Chain']:
473 # If there is not strucutre yet then it means the residue is set before the structure
474 # In this case it is not possible to get related chain in the structure
475 if not self.structure or self.chain_index == None:
476 return None
477 # Get the chain in the structure according to the chain index
478 return self.structure.chains[self.chain_index]
479 def set_chain (self, new_chain : Union['Chain', str]):
480 # In case the chain is just a string we must find/create the corresponding chain
481 if type(new_chain) == str:
482 letter = new_chain
483 # Get the residue structure
484 structure = self.structure
485 if not structure:
486 raise ValueError(f'Cannot find the corresponding {new_chain} chain without the structure')
487 # Find if the letter belongs to an already existing chain
488 new_chain = structure.get_chain_by_name(letter)
489 # If the chain does not exist yet then create it
490 if not new_chain:
491 new_chain = Chain(name=letter)
492 structure.set_new_chain(new_chain)
493 # Find the new chain index and set it as the residue chain index
494 # Note that the chain must be set in the structure already
495 new_chain_index = new_chain.index
496 if new_chain_index == None:
497 raise ValueError(f'Chain {new_chain} is not set in the structure')
498 self.set_chain_index(new_chain_index)
499 chain: 'Chain' = property(get_chain, set_chain, None, "The residue chain")
501 def get_bonded_atom_indices (self) -> list[int]:
502 """Get atom indices from atoms bonded to this residue."""
503 # Get all bonds among residue atoms
504 all_bonds = []
505 for atom in self.atoms:
506 all_bonds += atom.bonds
507 # Remove self atom indices from the list
508 external_bonds = set(all_bonds) - set(self.atom_indices)
509 return list(external_bonds)
511 def get_bonded_atoms (self) -> list['Atom']:
512 """Get atoms bonded to this residue."""
513 return [ self.structure.atoms[atom_index] for atom_index in self.get_bonded_atom_indices() ]
515 def get_bonded_residue_indices (self) -> list[int]:
516 """Get residue indices from residues bonded to this residue."""
517 return list(set([ atom.residue_index for atom in self.get_bonded_atoms() ]))
519 def get_bonded_residues (self) -> list['Residue']:
520 """Get residues bonded to this residue."""
521 return [ self.structure.residues[residue_index] for residue_index in self.get_bonded_residue_indices() ]
523 def is_bonded_with_residue (self, other : 'Residue') -> bool:
524 """Given another residue, check if it is bonded with this residue."""
525 bonded_atom_indices = set(self.get_bonded_atom_indices())
526 if next((index for index in other.atom_indices if index in bonded_atom_indices), None) != None: return True
527 return False
529 def is_missing_any_bonds (self) -> bool:
530 return any(atom.bonds == MISSING_BONDS for atom in self.atoms)
532 def is_coherent (self) -> bool:
533 """Make sure atoms within the residue are all bonded."""
534 # If bonds are missing then just say everything is
535 if self.is_missing_any_bonds():
536 raise RuntimeError('Trying to check if residue with missing bonds is coherent')
537 residue_selection = self.get_selection()
538 residue_fragments = self.structure.find_fragments(residue_selection)
539 first_residue_fragment = next(residue_fragments)
540 return len(first_residue_fragment) == self.atom_count
542 def get_classification (self) -> str:
543 """
544 Get the residue biochemistry classification.
546 WARNING: Note that this logic will not work in a structure without hydrogens.
548 Available classifications:
549 - protein
550 - dna
551 - rna
552 - carbohydrate
553 - fatty
554 - steroid
555 - ion
556 - solvent
557 - acetyl
558 - amide
559 - other
560 """
561 # Return the internal value, if any
562 if self._classification:
563 return self._classification
564 # If this is a CG residue then we can not classify it
565 if self.is_cg(): return self.get_classification_by_name()
566 # If this residue is missing bonds then we can not classify it
567 # Some parts of the logic like the find_rings logic rely on bonds
568 if self.is_missing_any_bonds(): return self.get_classification_by_name()
569 # If we dont have a value then we must classify the residue
570 # -------------------------------------------------------------------------------------------------------
571 # Ions are single atom residues
572 if self.atom_count == 1:
573 self._classification = 'ion'
574 return self._classification
575 # -------------------------------------------------------------------------------------------------------
576 # At this point we need atoms to have elements fixed
577 # WARNING: missing elements would result in silent failure to recognize some classifications
578 if self.structure._fixed_atom_elements == False:
579 self.structure.fix_atom_elements()
580 # Solvent is water molecules
581 # First rely on the residue name
582 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES:
583 self._classification = 'solvent'
584 return self._classification
585 # It may be water with a not known name
586 # Literally check if its a molecule with 3 atoms: 2 hydrogens and 1 oxygen
587 if self.atom_count == 3:
588 atom_elements = [ atom.element for atom in self.atoms ]
589 if atom_elements.count('H') == 2 and atom_elements.count('O') == 1:
590 self._classification = 'solvent'
591 return self._classification
592 # -------------------------------------------------------------------------------------------------------
593 # Protein definition according to vmd:
594 # a residue with atoms named C, N, CA, and O
595 # In our case we accept OC1 and OC2 or OT1 and OT2 instead of O for terminal residues
596 atom_names = set([ atom.name for atom in self.atoms ])
597 if ( all((name in atom_names) for name in ['C', 'N', 'CA']) and (
598 'O' in atom_names or { 'OC1', 'OC2' } <= atom_names or { 'OT1', 'OT2' } <= atom_names
599 )):
600 self._classification = 'protein'
601 return self._classification
602 # -------------------------------------------------------------------------------------------------------
603 # Nucleic acids definition according to vmd:
604 # a residue with atoms named P, O1P, O2P and either O3’, C3’, C4’, C5’, O5’ or O3*, C3*, C4*, C5*, O5*
605 # Apparently it has been fixed so now a residue does not need to be phosphorylated to be considered nucleic
606 # LORE: This included the condition "all((name in atom_names) for name in ['P', 'OP1', 'OP2'])"
607 if (( all((name in atom_names) for name in ["O3'", "C3'", "C4'", "C5'", "O5'"]) or
608 all((name in atom_names) for name in ["O3*", "C3*", "C4*", "C5*", "O5*"]) )):
609 # At this point we know it is nucleic
610 # We must tell the difference between DNA and RNA
611 if "O2'" in atom_names or "O2*" in atom_names:
612 self._classification = 'rna'
613 else:
614 self._classification = 'dna'
615 return self._classification
616 # -------------------------------------------------------------------------------------------------------
617 # To define carbohydrates search for rings made of 1 oxygen and 'n' carbons
618 # WARNING: This logic may fail for some very specific molecules such as furine
619 # LIMITATION: This logic only aims for cyclical carbohydrates. The linear form of carbohydrates is not yet consider
620 rings = self.find_rings(6)
621 for ring in rings:
622 ring_elements = [ atom.element for atom in ring ]
623 # Check the ring has 1 oxygen
624 oxygen_count = ring_elements.count('O')
625 if oxygen_count != 1:
626 continue
627 # Check the rest are only carbon
628 carbon_count = ring_elements.count('C')
629 if carbon_count != len(ring) - 1:
630 continue
631 self._classification = 'carbohydrate'
632 return self._classification
633 # -------------------------------------------------------------------------------------------------------
634 # To define steroids search for 3 6-carbon rings and 1 5-carbon ring (at least)
635 # WARNING: According to this logic some exotical non-steroid molecules may result in false positives
636 num_6_carbon_rings = 0
637 num_5_carbon_rings = 0
638 for ring in rings:
639 ring_elements = [ atom.element for atom in ring ]
640 if any(element != 'C' for element in ring_elements):
641 continue
642 ring_lenght = len(ring)
643 if ring_lenght == 5:
644 num_5_carbon_rings += 1
645 if ring_lenght == 6:
646 num_6_carbon_rings += 1
647 if num_6_carbon_rings >= 3 and num_5_carbon_rings >= 1:
648 self._classification = 'steroid'
649 return self._classification
650 # -------------------------------------------------------------------------------------------------------
651 # To define fatty acids search for a series of carbon atoms connected together (3 at least)
652 # These carbon atoms must be connected only to hydrogen in addition to 1-2 carbon
653 # These carbons must not be bonded in a cycle so check atoms not be repeated in the series
654 bonded_fatty_atoms = []
655 def get_bonded_fatty_atoms_recursive (current_atom : Atom, previous_atom : Optional[Atom] = None) -> bool:
656 # Iterate over the input atom bonded atoms
657 for bonded_atom in current_atom.get_bonded_atoms():
658 # Discard the previous atom to avoid an infinite loop
659 if bonded_atom == previous_atom:
660 continue
661 # If we find an atom which is already in the fatty atoms list then it means we are cycling
662 if bonded_atom in bonded_fatty_atoms:
663 return False
664 # If this is not a fatty atom then discard it
665 if not bonded_atom.is_fatty_candidate():
666 continue
667 # Add the current bonded fatty atom to the list
668 bonded_fatty_atoms.append(bonded_atom)
669 # Find more bonded fatty atoms and add them to list as well
670 if get_bonded_fatty_atoms_recursive(current_atom=bonded_atom, previous_atom=current_atom) == False:
671 return False
672 return True
673 # Now check all atoms and try to set the series until we find one which works
674 already_checked_atoms = []
675 for atom in self.atoms:
676 # If we already checked this atom trough the recursive logic then skip it
677 if atom in already_checked_atoms:
678 continue
679 # If the atom is not a fatty candidate then skip it
680 if not atom.is_fatty_candidate():
681 continue
682 # Now that we have found a suitable candidate atom we start the series
683 bonded_fatty_atoms = [atom]
684 if get_bonded_fatty_atoms_recursive(atom) and len(bonded_fatty_atoms) >= 3:
685 self._classification = 'fatty'
686 return self._classification
687 already_checked_atoms += bonded_fatty_atoms
688 # -------------------------------------------------------------------------------------------------------
689 # Check if it is an acetylation capping terminal
690 if self.name == 'ACE':
691 self._classification = 'acetyl'
692 return self._classification
693 # -------------------------------------------------------------------------------------------------------
694 # Check if it is an N-methyl amide capping terminal
695 if self.name == 'NME':
696 self._classification = 'amide'
697 return self._classification
698 # -------------------------------------------------------------------------------------------------------
699 self._classification = 'other'
700 return self._classification
702 def get_classification_by_name (self) -> str:
703 """
704 Set an alternative function to "try" to classify the residues according only to its name.
705 This is useful for corase grain residues whose atoms may not reflect the real atoms.
706 WARNING: This logic is very limited and will return "unknown" most of the times.
707 WARNING: This logic relies mostly in atom names, which may be not standard.
708 """
709 if self.name in PROTEIN_RESIDUE_NAME_LETTERS:
710 return 'protein'
711 if self.name in DNA_RESIDUE_NAME_LETTERS:
712 return 'dna'
713 if self.name in RNA_RESIDUE_NAME_LETTERS:
714 return 'rna'
715 if self.name in NUCLEIC_RESIDUE_NAME_LETTERS:
716 return 'nucleic'
717 if self.name in FATTY_RESIDUE_NAMES:
718 return 'fatty'
719 if self.name in STEROID_RESIDUE_NAMES:
720 return 'steroid'
721 if self.name in STANDARD_COUNTER_ION_ATOM_NAMES:
722 return 'ion'
723 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES:
724 return 'solvent'
725 # If we do not know what it is
726 return 'unknown'
728 # The residue biochemistry classification (read only)
729 classification = property(get_classification, None, None)
731 def get_selection (self) -> 'Selection':
732 """Generate a selection for this residue."""
733 return Selection(self.atom_indices)
735 def copy (self) -> 'Residue':
736 """Make a copy of the current residue."""
737 residue_copy = Residue(self.name, self.number, self.icode)
738 residue_copy._structure = self._structure
739 residue_copy._index = self._index
740 residue_copy._atom_indices = self._atom_indices
741 residue_copy._chain_index = self._chain_index
742 return residue_copy
744 def get_letter (self) -> str:
745 """Get the residue equivalent single letter code.
746 Note that this makes sense for aminoacids and nucleotides only.
747 Non recognized residue names return 'X'."""
748 return residue_name_to_letter(self.name)
750 def get_formula (self) -> str:
751 """Get the formula of the residue."""
752 elements = [ atom.element for atom in self.atoms ]
753 unique_elements = list(set(elements))
754 formula = ''
755 for unique_element in unique_elements:
756 count = elements.count(unique_element)
757 number = get_lower_numbers(str(count)) if count > 1 else ''
758 formula += unique_element + number
759 return formula
761 def find_rings (self, max_ring_size : int) -> list[ list[Atom] ]:
762 """Find rings in the residue."""
763 # Create graph and add edges in one pass
764 G = nx.Graph()
765 G.add_edges_from(
766 (atom.index, bonded_atom)
767 for atom in self.atoms
768 for bonded_atom in atom.bonds
769 )
770 # Get cycles and convert to atoms in one comprehension
771 cycles = list(nx.simple_cycles(G, max_ring_size))
772 # Create a mapping from structure atom indices to residue-local atom indices
773 residue_atom_indices = {atom.index: i for i, atom in enumerate(self.atoms)}
774 # Return the rings as lists of Atom objects
775 return [
776 [self.atoms[residue_atom_indices[i]] for i in cycle]
777 for cycle in cycles
778 ]
780 def split (self,
781 first_residue_atom_indices : list[int],
782 second_residue_atom_indices : list[int],
783 first_residue_name : Optional[str] = None,
784 second_residue_name : Optional[str] = None,
785 first_residue_number : Optional[int] = None,
786 second_residue_number : Optional[int] = None,
787 first_residue_icode : Optional[str] = None,
788 second_residue_icode : Optional[str] = None,
789 ) -> tuple['Residue', 'Residue']:
790 """Split this residue in 2 residues and return them in a tuple.
791 Keep things coherent in the structure (renumerate all residues below this one).
792 Note that all residue atoms must be covered by the splits."""
793 # This function is expected to be called in a residue with an already set structure
794 if not self.structure:
795 raise InputError('The split function should be called when the residue has an already defined structure')
796 # Make sure all atoms in the residue are covered between both the first and the second residue
797 if set(self.atom_indices) != set(first_residue_atom_indices + second_residue_atom_indices):
798 print('Residue atoms: ' + ', '.join([ str(v) for v in set(self.atom_indices) ]))
799 print('Covered atoms: ' + ', '.join([ str(v) for v in set(first_residue_atom_indices + second_residue_atom_indices) ]))
800 raise InputError('All atom indices must be covered between both the first and the second residue')
801 # Reuse this first residue instead of creating a new one
802 if first_residue_name:
803 self.name = first_residue_name
804 if first_residue_number:
805 self.number = first_residue_number
806 if first_residue_icode:
807 self.icode = first_residue_icode
808 # Set the new second residue
809 _second_residue_name = second_residue_name if second_residue_name else self.name
810 _second_residue_number = second_residue_number if second_residue_number else self.number
811 _second_residue_icode = second_residue_icode if second_residue_icode else get_next_letter(self.icode)
812 second_residue = Residue(_second_residue_name, _second_residue_number, _second_residue_icode)
813 second_residue._structure = self.structure
814 new_residue_index = self.index + 1
815 second_residue._index = new_residue_index
816 # Insert the second residue in the structure residues list right after this residue
817 self.structure.residues.insert(new_residue_index, second_residue)
818 # Set the second residue index
819 # Update the index of all residues which have been shifted after the insertion
820 for residue_index in range(new_residue_index + 1, len(self.structure.residues)):
821 residue = self.structure.residues[residue_index]
822 residue.index = residue_index
823 # Now transfer atoms from residue 1 to residue 2 as it is specified
824 # Note that this will automatically update every atom
825 second_residue.atom_indices = second_residue_atom_indices
826 # Now add the new residue to the chain
827 self.chain.add_residue(second_residue)
829 def split_by_atom_names (self,
830 first_residue_atom_names : list[str],
831 second_residue_atom_names : list[str],
832 first_residue_name : Optional[str] = None,
833 second_residue_name : Optional[str] = None,
834 first_residue_number : Optional[int] = None,
835 second_residue_number : Optional[int] = None,
836 first_residue_icode : Optional[str] = None,
837 second_residue_icode : Optional[str] = None,
838 ) -> tuple['Residue', 'Residue']:
839 """Parse atom names to atom indices and then call the split function."""
840 # Check all atom names to exist in the residue
841 input_atom_names = set(first_residue_atom_names + second_residue_atom_names)
842 residue_atom_names = set([ atom.name for atom in self.atoms ])
843 if input_atom_names != residue_atom_names:
844 print(self)
845 print(self.atoms)
846 print('Input atom names: ' + ', '.join(input_atom_names))
847 print('Residue atom names: ' + ', '.join(residue_atom_names))
848 raise InputError('All residue atoms must be covered between both the first and the second residue atom names')
849 # Convert atom names to atom indices
850 first_residue_atom_indices = [ self.get_atom_by_name(name).index for name in first_residue_atom_names ]
851 second_residue_atom_indices = [ self.get_atom_by_name(name).index for name in second_residue_atom_names ]
852 # Call the actual split logic
853 return self.split(first_residue_atom_indices, second_residue_atom_indices, first_residue_name,
854 second_residue_name, first_residue_number, second_residue_number, first_residue_icode, second_residue_icode)
856 def get_atom_by_name (self, atom_name : str) -> 'Atom':
857 """Get a residue atom given its name."""
858 return next(( atom for atom in self.atoms if atom.name == atom_name ), None)
860 def get_label (self) -> str:
861 """Get a standard label."""
862 chainname = self.chain.name if self.chain.name.strip() else ''
863 return f'{chainname}{self.number}{self.icode}({self.name})'
864 label = property(get_label, None, None,
865 "The residue standard label (read only)")
867 def is_cg (self) -> bool:
868 """Ask if the current residue is in coarse grain.
869 Note that we assume there may be not hybrid aa/cg residues."""
870 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)
872class Chain:
873 """A chain of residues."""
874 def __init__ (self,
875 name : Optional[str] = None,
876 classification : Optional[str] = None,
877 ):
878 self.name = name
879 self._classification = classification
880 # Set variables to store references to other related instaces
881 # These variables will be set further by the structure
882 self._structure = None
883 self._index = None
884 self._residue_indices = []
886 def __repr__ (self):
887 return '<Chain ' + self.name + '>'
889 def __eq__ (self, other):
890 return self.name == other.name
892 def __hash__ (self):
893 return hash(self.name)
895 # The parent structure (read only)
896 # This value is set by the structure itself
897 def get_structure (self) -> Optional['Structure']:
898 return self._structure
899 structure: 'Structure' = property(get_structure, None, None, "The parent structure (read only)")
901 # The residue index according to parent structure residues (read only)
902 # This value is set by the structure itself
903 def get_index (self) -> Optional[int]:
904 return self._index
905 # When the index is set all residues are updated with the next chain index
906 def set_index (self, index : int):
907 for residue in self.residues:
908 residue._chain_index = index
909 self._index = index
910 index = property(get_index, set_index, None, "The residue index according to parent structure residues (read only)")
912 # The residue indices according to parent structure residues for residues in this chain
913 # If residue indices are set then make changes in all the structure to make this change coherent
914 def get_residue_indices (self) -> list[int]:
915 return self._residue_indices
916 def set_residue_indices (self, new_residue_indices : list[int]):
917 # If there is not strucutre yet it means the chain is beeing set before the structure
918 # We just save residue indices and wait for the structure to be set
919 if not self.structure:
920 self._residue_indices = new_residue_indices
921 return
922 # Update the current residues
923 for residue in self.residues:
924 residue._chain_index = None
925 # Update the new residues
926 for index in new_residue_indices:
927 residue = self.structure.residues[index]
928 residue._chain_index = self.index
929 # In case the new residue indices list is empty this chain must be removed from its structure
930 if len(new_residue_indices) == 0:
931 self.structure.purge_chain(self)
932 # Now new indices are coherent and thus we can save them
933 self._residue_indices = new_residue_indices
934 residue_indices = property(get_residue_indices, set_residue_indices, None, "The residue indices according to parent structure residues for residues in this residue")
936 def get_residues (self) -> list['Residue']:
937 """The residues in this chain.
938 If residues are set then make changes in all the structure to make this change coherent."""
939 # If there is not strucutre yet it means the chain is beeing set before the structure
940 # In this case it is not possible to get related residues in the structure
941 if not self.structure:
942 return []
943 # Get residues in the structure according to residue indices
944 residues = self.structure.residues
945 return [ residues[residue_index] for residue_index in self.residue_indices ]
947 def set_residues (self, new_residues : list['Residue']):
948 """Find indices for new residues and set their indices as the new residue indices.
949 Note that residues must be set in the structure already."""
950 new_residue_indices = []
951 for new_residue in new_residues:
952 new_residue_index = new_residue.index
953 if new_residue_index == None:
954 raise ValueError(f'Residue {new_residue} is not set in the structure')
955 new_residue_indices.append(new_residue_index)
956 self.set_residue_indices(new_residue_indices)
957 # Set the new residues chain index to this chain index
958 for residue in new_residues:
959 residue.chain_index = self.index
960 residues: list['Residue'] = property(get_residues, set_residues, None, "The residues in this chain")
962 def add_residue (self, residue : 'Residue'):
963 """Add a residue to the chain."""
964 # Insert the new residue index in the list of residue indices keeping the order
965 sorted_residue_index = bisect(self.residue_indices, residue.index)
966 self.residue_indices.insert(sorted_residue_index, residue.index)
967 # Update the residue internal chain index
968 residue._chain_index = self.index
970 def remove_residue (self, residue : 'Residue'):
971 """Remove a residue from the chain.
972 WARNING: Note that this function does not trigger the set_residue_indices."""
973 self.residue_indices.remove(residue.index) # This index MUST be in the list
974 # If we removed the last residue then this chain must be removed from its structure
975 if len(self.residue_indices) == 0 and self.structure:
976 self.structure.purge_chain(self)
977 # Update the residue internal chain index
978 residue._chain_index = None
980 def get_atom_indices (self) -> list[int]:
981 """Get atom indices for all atoms in the chain (read only).
982 In order to change atom indices they must be changed in their corresponding residues."""
983 return sum([ residue.atom_indices for residue in self.residues ], [])
984 atom_indices = property(get_atom_indices, None, None, "Atom indices for all atoms in the chain (read only)")
986 def get_atoms (self) -> list[int]:
987 """Get the atoms in the chain (read only).
988 In order to change atoms they must be changed in their corresponding residues."""
989 return sum([ residue.atoms for residue in self.residues ], [])
990 atoms: list['Atom'] = property(get_atoms, None, None, "Atoms in the chain (read only)")
992 def get_atom_count (self) -> int:
993 """Get the number of atoms in the chain (read only)."""
994 return len(self.atom_indices)
995 atom_count = property(get_atom_count, None, None, "Number of atoms in the chain (read only)")
997 def get_residue_count (self) -> int:
998 """Get the number of residues in the chain (read only)."""
999 return len(self._residue_indices)
1000 residue_count = property(get_residue_count, None, None, "Number of residues in the chain (read only)")
1002 def get_classification (self) -> str:
1003 """Get the chain classification."""
1004 if self._classification:
1005 return self._classification
1006 self._classification = self.structure.get_selection_classification(self.get_selection())
1007 return self._classification
1009 def set_classification (self, classification : str):
1010 """Force the chain classification."""
1011 self._classification = classification
1013 classification = property(get_classification, set_classification, None,
1014 "Classification of the chain (manual or automatic)")
1016 def get_sequence (self) -> str:
1017 """Get the residues sequence in one-letter code."""
1018 return ''.join([ residue_name_to_letter(residue.name) for residue in self.residues ])
1020 def get_selection (self) -> 'Selection':
1021 """Generate a selection for this chain."""
1022 return Selection(self.atom_indices)
1024 def copy (self) -> 'Chain':
1025 """Make a copy of the current chain."""
1026 chain_copy = Chain(self.name)
1027 chain_copy._structure = self._structure
1028 chain_copy._index = self._index
1029 chain_copy.residue_indices = self.residue_indices
1030 return chain_copy
1032 def has_cg (self) -> bool:
1033 """Ask if the current chain has at least one coarse grain atom/residue."""
1034 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)
1036 def is_missing_any_bonds (self) -> bool:
1037 return any(atom.bonds == MISSING_BONDS for atom in self.atoms)
1039 def find_residue (self, number : int, icode : str = '', index = None) -> Optional['Residue']:
1040 """Find a residue by its number and insertion code."""
1041 # Iterate chain residues
1042 for residue in self.residues:
1043 if residue.number == number and residue.icode == icode and (index is None or residue.index == index):
1044 return residue
1045 return None
1047class Structure:
1048 """A structure is a group of atoms organized in chains and residues."""
1049 def __init__ (self,
1050 atoms : list['Atom'] = [],
1051 residues : list['Residue'] = [],
1052 chains : list['Chain'] = [],
1053 residue_numeration_base : int = 10,
1054 ):
1055 self.atoms: list['Atom'] = []
1056 self.residues: list['Residue'] = []
1057 self.chains: list['Chain'] = []
1058 # Set references between instances
1059 for atom in atoms:
1060 self.set_new_atom(atom)
1061 for residue in residues:
1062 self.set_new_residue(residue)
1063 for chain in chains:
1064 self.set_new_chain(chain)
1065 # --- Set other internal variables ---
1066 # Set bonds between atoms
1067 self._bonds = None
1068 # Set fragments of bonded atoms
1069 self._fragments = None
1070 # --- Set other auxiliar variables ---
1071 # Trajectory atom sorter is a function used to sort coordinates in a trajectory file
1072 # This function is generated when sorting indices in the structure
1073 # Also the new atom order is saved
1074 self.trajectory_atom_sorter = None
1075 self.new_atom_order = None
1076 # Track when atom elements have been fixed
1077 self._fixed_atom_elements = False
1078 # Save indices of supported ion atoms
1079 self._ion_atom_indices = None
1080 self._dummy_atom_indices = None
1081 # Set the scale of the residue numbers
1082 # It may be decimal (10), hexadecimal(16) or alphanumeric(36)
1083 # Note that hybrid scales (1-9999 decimal and different further) are not explicitly supported
1084 # However, the scale is guessed on read and conserved on write, so the original numeration would be conserved
1085 # The default values is used only when the structure is not read from a PDB
1086 self.residue_numeration_base = residue_numeration_base
1088 def __repr__ (self):
1089 return f'<Structure ({self.atom_count} atoms)>'
1091 def get_bonds (self) -> list[ list[int] ]:
1092 """Get the bonds between atoms."""
1093 # Return the stored value, if exists
1094 if self._bonds:
1095 return self._bonds
1096 # If not, we must calculate the bonds using vmd
1097 self._bonds = self.get_covalent_bonds()
1098 return self._bonds
1099 def set_bonds (self, bonds : list[ list[int] ]):
1100 """Force specific bonds."""
1101 self._bonds = bonds
1102 # Reset fragments
1103 self._fragments = None
1104 bonds = property(get_bonds, set_bonds, None, "The structure bonds")
1106 def get_fragments (self) -> list['Selection']:
1107 """Get the groups of atoms which are covalently bonded."""
1108 # Return the stored value, if exists
1109 if self._fragments != None:
1110 return self._fragments
1111 # Otherwise, find fragments in all structure atoms
1112 self._fragments = list(self.find_fragments())
1113 return self._fragments
1114 # Fragments of covalently bonded atoms (read only)
1115 fragments: list['Selection'] = property(get_fragments, None, None, "The structure fragments (read only)")
1117 def find_fragments (self,
1118 selection : Optional['Selection'] = None,
1119 coherent : bool = True,
1120 atom_bonds : Optional[list[list[int]]] = None,
1121 ) -> Generator['Selection', None, None]:
1122 """Find fragments in a selection of atoms. A fragment is a selection of
1123 covalently bonded atoms. All atoms are searched if no selection is provided.
1125 WARNING: Note that fragments generated from a specific selection may not
1126 match the structure fragments. A selection including 2 separated regions of a
1127 structure fragment will yield 2 fragments.
1129 For convenience, bonds between non-consecutive residues are excluded from this logic.
1130 This is useful to ignore disulfide bonds.
1131 May also help to properly find chains in CG simulations where chains may be bonded.
1132 """
1133 # If there is no selection we consider all atoms
1134 if not selection: selection = self.select_all()
1135 # Get/Find covalent bonds between atoms in a new object avoid further corruption (deletion) of the original list
1136 safe_bonds = atom_bonds if atom_bonds else self.bonds
1137 atom_indexed_covalent_bonds = { atom_index: [ *safe_bonds[atom_index] ] for atom_index in selection.atom_indices }
1138 # Ignore bonds from non-consecutive residues
1139 if coherent:
1140 for atom_index, bonded_atom_indices in atom_indexed_covalent_bonds.items():
1141 atom = self.atoms[atom_index]
1142 residue_index = atom.residue_index
1143 consecutive_residue_index = residue_index + 1
1144 self_bonded_atoms = []
1145 consecutive_bonded_atoms = []
1146 for bonded_atom_index in bonded_atom_indices:
1147 bonded_atom = self.atoms[bonded_atom_index]
1148 bonded_residue_index = bonded_atom.residue_index
1149 if bonded_residue_index == residue_index:
1150 self_bonded_atoms.append(bonded_atom_index)
1151 continue
1152 if bonded_residue_index == consecutive_residue_index:
1153 consecutive_bonded_atoms.append(bonded_atom_index)
1154 continue
1155 new_bonded_atoms = self_bonded_atoms + consecutive_bonded_atoms
1156 atom_indexed_covalent_bonds[atom_index] = new_bonded_atoms
1157 # Group the connected atoms in "fragments"
1158 while len(atom_indexed_covalent_bonds) > 0:
1159 start_atom_index, bonds = next(iter(atom_indexed_covalent_bonds.items()))
1160 del atom_indexed_covalent_bonds[start_atom_index]
1161 fragment_atom_indices = [ start_atom_index ]
1162 while len(bonds) > 0:
1163 # Get the next bond atom and remove it from the bonds list
1164 next_atom_index = bonds[0]
1165 bonds.remove(next_atom_index)
1166 next_bonds = atom_indexed_covalent_bonds.get(next_atom_index, None)
1167 # If this atom is out of the selection then skip it
1168 if next_bonds == None: continue
1169 next_new_bonds = [ bond for bond in next_bonds if bond not in fragment_atom_indices + bonds ]
1170 bonds += next_new_bonds
1171 fragment_atom_indices.append(next_atom_index)
1172 del atom_indexed_covalent_bonds[next_atom_index]
1173 yield Selection(fragment_atom_indices)
1175 def find_whole_fragments (self, selection : 'Selection') -> Generator['Selection', None, None]:
1176 """Given a selection of atoms, find all whole structure fragments on them."""
1177 for fragment in self.fragments:
1178 if selection & fragment:
1179 yield fragment
1181 def name_selection (self, selection : 'Selection') -> str:
1182 """Name an atom selection depending on the chains it contains. This is used for debug purpouses."""
1183 atoms = [ self.atoms[index] for index in selection.atom_indices ]
1184 # Count atoms per chain
1185 atom_count_per_chain = { chain: 0 for chain in self.chains }
1186 for atom in atoms:
1187 atom_count_per_chain[atom.chain] += 1
1188 # Set labels accoridng to the coverage of every chain
1189 chain_labels = []
1190 for chain, atom_count in atom_count_per_chain.items():
1191 if atom_count == 0: continue
1192 coverage = atom_count / chain.atom_count
1193 label = f'chain {chain.name}'
1194 if coverage < 1:
1195 percent = round(coverage * 1000) / 10
1196 label += f' ({percent}%)'
1197 chain_labels.append(label)
1198 return ', '.join(chain_labels)
1200 def set_new_atom (self, atom : 'Atom'):
1201 """Set a new atom in the structure."""
1202 atom._structure = self
1203 new_atom_index = self.atom_count
1204 self.atoms.append(atom)
1205 atom._index = new_atom_index
1207 def set_new_residue (self, residue : 'Residue'):
1208 """
1209 Set a new residue in the structure.
1210 WARNING: Atoms must be set already before setting residues.
1211 """
1212 residue._structure = self
1213 new_residue_index = len(self.residues)
1214 self.residues.append(residue)
1215 residue._index = new_residue_index
1216 # In case the residue has atom indices, set relational indices on each atom
1217 for atom_index in residue.atom_indices:
1218 atom = self.atoms[atom_index]
1219 atom._residue_index = new_residue_index
1221 def purge_residue (self, residue : 'Residue'):
1222 """
1223 Purge residue from the structure and its chain.
1224 This can be done only when the residue has no atoms left in the structure.
1225 Renumerate all residue indices which have been offsetted as a result of the purge.
1226 """
1227 # Check the residue can be purged
1228 if residue not in self.residues:
1229 raise ValueError(f'{residue} is not in the structure already')
1230 if len(residue.atom_indices) > 0:
1231 raise ValueError(f'{residue} is still having atoms and thus it cannot be purged')
1232 # Remove the purged residue from its chain
1233 if residue.chain: residue.chain.remove_residue(residue)
1234 # Get the current index of the residue to be purged
1235 purged_index = residue.index
1236 # Residues and their atoms below this index are not to be modified
1237 # Residues and their atoms over this index must be renumerated
1238 for affected_residue in self.residues[purged_index+1:]:
1239 # Chaging the index automatically changes all residues atoms '_residue_index' values
1240 # Chaging the index automatically changes its corresponding index in residue chain '_residue_indices'
1241 affected_residue.index -= 1
1242 # Finally, remove the current residue from the list of residues in the structure
1243 del self.residues[purged_index]
1245 def set_new_chain (self, chain : 'Chain'):
1246 """Set a new chain in the structure.
1247 WARNING: Residues and atoms must be set already before setting chains."""
1248 chain._structure = self
1249 new_chain_index = len(self.chains)
1250 self.chains.append(chain)
1251 chain._index = new_chain_index
1252 # In case the chain has residue indices, set relational indices on each residue
1253 for residue_index in chain.residue_indices:
1254 residue = self.residues[residue_index]
1255 residue._chain_index = new_chain_index
1257 def purge_chain (self, chain : 'Chain'):
1258 """Purge chain from the structure.
1259 This can be done only when the chain has no residues left in the structure.
1260 Renumerate all chain indices which have been offsetted as a result of the purge."""
1261 # Check the chain can be purged
1262 if chain not in self.chains:
1263 raise ValueError(f'Chain {chain.name} is not in the structure already')
1264 if len(chain.residue_indices) > 0:
1265 raise ValueError(f'Chain {chain.name} is still having residues and thus it cannot be purged')
1266 # Get the current index of the chain to be purged
1267 purged_index = chain.index
1268 # Chains and their residues below this index are not to be modified
1269 # Chains and their residues over this index must be renumerated
1270 for affected_chain in self.chains[purged_index+1:]:
1271 # Chaging the index automatically changes all chain residues '_chain_index' values
1272 affected_chain.index -= 1
1273 # Finally, remove the current chain from the list of chains in the structure
1274 del self.chains[purged_index]
1276 @classmethod
1277 def from_pdb (cls, pdb_content : str, model : Optional[int] = None, flexible_numeration : bool = True):
1278 """Set the structure from a pdb file.
1279 You may filter the PDB content for a specific model.
1280 Some weird numeration systems are not supported and, when encountered, they are ignored.
1281 In these cases we set our own numeration system.
1282 Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead."""
1283 # Filter the PDB content in case a model was passed
1284 filtered_pdb_content = filter_model(pdb_content, model) if model else pdb_content
1285 # Split the PDB content in lines
1286 pdb_lines = filtered_pdb_content.split('\n')
1287 # Before we start, we must guess the numeration system
1288 # To do so mine all residue numbers
1289 all_residue_number_characters = set()
1290 for line in pdb_lines:
1291 # Parse atoms only
1292 start = line[0:6]
1293 is_atom = start == 'ATOM ' or start == 'HETATM'
1294 if not is_atom: continue
1295 # Mine all residue numbers
1296 residue_number = line[22:26]
1297 for character in residue_number:
1298 all_residue_number_characters.add(character)
1299 # Remove white spaces
1300 all_residue_number_characters.discard(' ')
1301 # If we find a non-numerical and non-alphabetical character then we assume it has a weird numeration system
1302 # Since we can not support every scenario, in this cases we set the numeration totally ignoring the one in the PDB
1303 weird_character = next((character for character in all_residue_number_characters if not (character.isalnum() or character == '-')), None)
1304 if weird_character:
1305 if flexible_numeration == False: raise InputError(f'Not supported numeration system including "{weird_character}" characters')
1306 warn(f'Weird residue numeration including "{weird_character}" characters found in the PDB file. Ignoring it.')
1307 residue_numeration_base = None
1308 # Search among all resiude numbers any letters (non-numerical characters)
1309 elif next((letter for letter in alphanumerical_letters if letter in all_residue_number_characters), None):
1310 residue_numeration_base = 36
1311 elif next((letter for letter in hexadecimal_letters if letter in all_residue_number_characters), None):
1312 residue_numeration_base = 16
1313 else:
1314 residue_numeration_base = 10
1315 # Read the pdb content line by line and set the parsed atoms, residues and chains
1316 parsed_atoms = []
1317 parsed_residues = []
1318 parsed_chains = []
1319 # Save chains and residues also in dictionaries only to find them faster
1320 name_chains = {}
1321 label_residues = {}
1322 # Keep track of the last issued atom and residue indices
1323 atom_index = -1
1324 residue_index = -1
1325 for line in pdb_lines:
1326 # Parse atoms only
1327 start = line[0:6]
1328 is_atom = start == 'ATOM ' or start == 'HETATM'
1329 if not is_atom:
1330 continue
1331 # Mine all atom data
1332 atom_name = line[11:16].strip()
1333 residue_name = line[17:21].strip()
1334 chain_name = line[21:22]
1335 residue_number = line[22:26]
1336 icode = line[26:27]
1337 if icode == ' ':
1338 icode = ''
1339 x_coord = float(line[30:38])
1340 y_coord = float(line[38:46])
1341 z_coord = float(line[46:54])
1342 element = line[76:78].strip()
1343 # Set the parsed atom
1344 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord))
1345 # Add the parsed atom to the list and update the current atom index
1346 parsed_atoms.append(parsed_atom)
1347 # Get the parsed chain
1348 # DANI: If we always check for an already existing chain there will never be repeated chains
1349 # DANI: However we may create chains with unconnected atoms silently (they were separated in the PDB)
1350 # DANI: For this reason we must only consider the last processed chain
1351 parsed_chain = name_chains.get(chain_name, None)
1352 if parsed_chain and parsed_chain != parsed_chains[-1]:
1353 parsed_chain = None
1354 # If the parsed chain was not yet instantiated then do it now
1355 if not parsed_chain:
1356 parsed_chain = Chain(name=chain_name)
1357 parsed_chains.append(parsed_chain)
1358 name_chains[chain_name] = parsed_chain
1359 # Get the parsed residue
1360 residue_label = (chain_name, residue_number, icode)
1361 # DANI: If we always check for an already existing residue there will never be repeated residues
1362 # DANI: However we may create residues with unconnected atoms silently (they were separated in the PDB)
1363 # DANI: For this reason we must only consider the last processed residue
1364 parsed_residue = label_residues.get(residue_label, None)
1365 if parsed_residue and parsed_residue != parsed_residues[-1]:
1366 parsed_residue = None
1367 # If the parsed residue was not yet instantiated then do it now
1368 if not parsed_residue:
1369 # Parse the residue number if it is to be parsed
1370 if residue_numeration_base:
1371 parsed_residue_number = int(residue_number, residue_numeration_base)
1372 # If we decided to ignore the numeration system then we just issue a new residue number
1373 # Use the last residue number from the current chain as reference
1374 else:
1375 chain_last_residue_index = parsed_chain.residue_indices[-1] if len(parsed_chain.residue_indices) > 0 else None
1376 chain_last_residue_number = parsed_residues[chain_last_residue_index].number if chain_last_residue_index != None else 0
1377 parsed_residue_number = chain_last_residue_number + 1
1378 # Instantiate the new parsed residue
1379 parsed_residue = Residue(name=residue_name, number=parsed_residue_number, icode=icode)
1380 parsed_residues.append(parsed_residue)
1381 label_residues[residue_label] = parsed_residue
1382 # Add current residue to the parsed chain
1383 residue_index += 1
1384 parsed_chain.residue_indices.append(residue_index)
1385 # Add current atom to the parsed residue
1386 atom_index += 1
1387 parsed_residue.atom_indices.append(atom_index)
1388 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains, residue_numeration_base=residue_numeration_base)
1390 @classmethod
1391 def from_pdb_file (cls, pdb_filepath : str, model : Optional[int] = None, flexible_numeration : bool = True):
1392 """Set the structure from a pdb file.
1393 You may filter the input PDB file for a specific model.
1394 Some weird numeration systems are not supported and, when encountered, they are ignored.
1395 In these cases we set our own numeration system.
1396 Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead."""
1397 pdb_file = File(pdb_filepath)
1398 if not pdb_file.exists:
1399 raise InputError(f'File "{pdb_filepath}" not found')
1400 if not pdb_file.format == 'pdb':
1401 raise InputError(f'"{pdb_filepath}" is not a path for a pdb file')
1402 # Read the pdb file
1403 pdb_content = None
1404 with open(pdb_filepath, 'r') as file:
1405 pdb_content = file.read()
1406 return cls.from_pdb(pdb_content, model, flexible_numeration)
1408 # https://biopandas.github.io/biopandas/tutorials/Working_with_mmCIF_Structures_in_DataFrames/
1409 @classmethod
1410 def from_mmcif (cls, mmcif_content : str, model : Optional[int] = None, author_notation : bool = False):
1411 """Set the structure from mmcif.
1412 You may filter the content for a specific model.
1413 You may ask for the author notation instead of the standarized notation for legacy reasons.
1414 This may have an effect in atom names, residue names, residue numbers and chain names.
1415 Read the pdb content line by line and set the parsed atoms, residues and chains.
1416 """
1417 parsed_atoms = []
1418 parsed_residues = []
1419 parsed_chains = []
1420 # Save chains and residues also in dictionaries only to find them faster
1421 name_chains = {}
1422 label_residues = {}
1423 # Keep track of the last issued atom and residue indices
1424 atom_index = -1
1425 residue_index = -1
1426 # Iterate the content line by line
1427 lines = iter(mmcif_content.split('\n'))
1428 # Now mine atoms data
1429 atom_headers = { 'ATOM', 'HETATM', 'ANISOU' }
1430 for line in lines:
1431 # Values are separated by spaces
1432 values = line.split()
1433 # If this is not atom line then we are done
1434 if len(values) == 0 or values[0] not in atom_headers: continue
1435 # Get the atom model number if a model number was passed
1436 # Note that then model is a number greater than 0
1437 if model:
1438 model_number = int(values[20])
1439 # If the model number odes not match then skip it
1440 if model != model_number: continue
1441 # Mine atom data
1442 # Next value is just the atom number, we do not need it
1443 # atom_number = values[1]
1444 # Mine the atom element
1445 element = values[2]
1446 # Mine the atom name
1447 atom_name = values[3].replace('"','')
1448 # Next value is a place holder to indicate alternate conformation, according to the docs
1449 # I don't know what this is but we do not need it
1450 # idk = values[4]
1451 # Mine the residue name
1452 residue_name = values[5]
1453 # Mine the chain name
1454 chain_name = values[6]
1455 # Next value is the chain number, we do not need it
1456 # chain_number = values[7]
1457 # Residue number is '.' for chains with only one residue
1458 residue_number = 1 if values[8] == '.' else int(values[8])
1459 icode = '' if values[9] == '?' else values[9]
1460 x_coord = float(values[10])
1461 y_coord = float(values[11])
1462 z_coord = float(values[12])
1463 # Next value is the occupancy, we do not need it
1464 # occupancy = float(values[13])
1465 # Next value is the isotropic displacement, we do not need it
1466 # isotropic = float(values[14])
1467 # Next value is the charge, we do not need it
1468 # charge = None if values[15] == '?' else float(values[15])
1469 # The rest of values are alternative author values
1470 if author_notation:
1471 residue_number = int(values[16])
1472 residue_name = values[17]
1473 chain_name = values[18]
1474 atom_name = values[19].replace('"','')
1475 # Set the parsed atom
1476 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord))
1477 # Add the parsed atom to the list and update the current atom index
1478 parsed_atoms.append(parsed_atom)
1479 # Get the parsed chain
1480 parsed_chain = name_chains.get(chain_name, None)
1481 # If the parsed chain was not yet instantiated then do it now
1482 if not parsed_chain:
1483 parsed_chain = Chain(name=chain_name)
1484 parsed_chains.append(parsed_chain)
1485 name_chains[chain_name] = parsed_chain
1486 # Get the parsed residue
1487 residue_label = (chain_name, residue_number, icode)
1488 parsed_residue = label_residues.get(residue_label, None)
1489 # If the parsed residue was not yet instantiated then do it now
1490 if not parsed_residue:
1491 # Instantiate the new parsed residue
1492 parsed_residue = Residue(name=residue_name, number=residue_number, icode=icode)
1493 parsed_residues.append(parsed_residue)
1494 label_residues[residue_label] = parsed_residue
1495 # Add current residue to the parsed chain
1496 residue_index += 1
1497 parsed_chain.residue_indices.append(residue_index)
1498 # Add current atom to the parsed residue
1499 atom_index += 1
1500 parsed_residue.atom_indices.append(atom_index)
1501 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)
1503 @classmethod
1504 def from_mmcif_file (cls, mmcif_filepath : str, model : Optional[int] = None, author_notation : bool = False):
1505 """Set the structure from a mmcif file."""
1506 mmcif_file = File(mmcif_filepath)
1507 if not mmcif_file.exists:
1508 raise InputError(f'File "{mmcif_filepath}" not found')
1509 if not mmcif_file.format == 'cif':
1510 raise InputError(f'"{mmcif_filepath}" is not a path for a mmcif file')
1511 # Read the mmcif file
1512 mmcif_content = None
1513 with open(mmcif_filepath, 'r') as file:
1514 mmcif_content = file.read()
1515 return cls.from_mmcif(mmcif_content, model, author_notation)
1517 @classmethod
1518 def from_mdanalysis (cls, mdanalysis_universe):
1519 """Set the structure from an MD analysis object."""
1520 # Set the final list of atoms to be included in the structure
1521 parsed_atoms = []
1522 parsed_residues = []
1523 parsed_chains = []
1524 # Setup atoms
1525 for atom in mdanalysis_universe.atoms:
1526 name = atom.name
1527 # WARNING: MDAnalysis cannot handle reading an element of an atom with no element
1528 element = atom.element if hasattr(atom, 'element') else '?'
1529 #coords = atom.position # DANI: Esto da error si no hay coordenadas
1530 parsed_atom = Atom(name=name, element=element)
1531 parsed_atoms.append(parsed_atom)
1532 # Setup residues
1533 for residue in mdanalysis_universe.residues:
1534 name = residue.resname
1535 number = residue.resnum
1536 icode = residue.icode if hasattr(residue, 'icode') else None # DANI: No se ha provado
1537 parsed_residue = Residue(name=name, number=number, icode=icode)
1538 atom_indices = [ atom.index for atom in residue.atoms ]
1539 parsed_residue.atom_indices = atom_indices
1540 parsed_residues.append(parsed_residue)
1541 # Setup chains
1542 for segment in mdanalysis_universe.segments:
1543 name = segment.segid
1544 parsed_chain = Chain(name=name)
1545 residue_indices = [ residue.resindex for residue in segment.residues ]
1546 parsed_chain.residue_indices = residue_indices
1547 parsed_chains.append(parsed_chain)
1548 # Setup the structure
1549 structure = cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)
1550 # Add bonds and charges which are also available in a topology
1551 atom_count = len(mdanalysis_universe.atoms)
1552 atom_bonds = [ [] for i in range(atom_count) ]
1553 for bond in mdanalysis_universe.bonds:
1554 a,b = bond.indices
1555 # Make sure atom indices are regular integers so they are JSON serializables
1556 atom_bonds[a].append(int(b))
1557 atom_bonds[b].append(int(a))
1558 structure.bonds = atom_bonds
1559 structure.charges = list(mdanalysis_universe._topology.charges.values)
1560 return structure
1562 @classmethod
1563 def from_prmtop_file (cls, prmtop_filepath : str):
1564 """Set the structure from a prmtop file."""
1565 # Make sure the input file exists and has the right format
1566 prmtop_file = File(prmtop_filepath)
1567 if not prmtop_file.exists:
1568 raise InputError(f'File "{prmtop_filepath}" not found')
1569 if not prmtop_file.format == 'prmtop':
1570 raise InputError(f'"{prmtop_filepath}" is not a name for a prmtop file')
1571 # In we do not have mdanalysis in our environment then we cannot proceed
1572 if not is_imported('MDAnalysis'):
1573 raise InputError('Missing dependency error: MDAnalysis')
1574 # Parse the topology using MDanalysis
1575 parser = MDAnalysis.topology.TOPParser.TOPParser(prmtop_filepath)
1576 mdanalysis_topology = parser.parse()
1577 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology)
1578 return cls.from_mdanalysis(mdanalysis_universe)
1580 @classmethod
1581 def from_tpr_file (cls, tpr_filepath : str):
1582 """Set the structure from a tpr file."""
1583 # Make sure the input file exists and has the right format
1584 tpr_file = File(tpr_filepath)
1585 if not tpr_file.exists:
1586 raise InputError(f'File "{tpr_filepath}" not found')
1587 if not tpr_file.format == 'tpr':
1588 raise InputError(f'"{tpr_filepath}" is not a name for a tpr file')
1589 # In we do not have mdanalysis in our environment then we cannot proceed
1590 if not is_imported('MDAnalysis'):
1591 raise InputError('Missing dependency error: MDAnalysis')
1592 # Parse the topology using MDanalysis
1593 parser = MDAnalysis.topology.TPRParser.TPRParser(tpr_filepath)
1594 mdanalysis_topology = parser.parse()
1595 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology)
1596 return cls.from_mdanalysis(mdanalysis_universe)
1598 @classmethod
1599 def from_file (cls, mysterious_filepath : str):
1600 """Set the structure from a file if the file format is supported."""
1601 mysterious_file = File(mysterious_filepath)
1602 if mysterious_file.format == 'pdb':
1603 return cls.from_pdb_file(mysterious_file.path)
1604 if mysterious_file.format == 'cif':
1605 return cls.from_mmcif_file(mysterious_file.path)
1606 if mysterious_file.format == 'prmtop':
1607 return cls.from_prmtop_file(mysterious_file.path)
1608 if mysterious_file.format == 'tpr':
1609 return cls.from_tpr_file(mysterious_file.path)
1610 raise InputError(f'Not supported format ({mysterious_file.format}) to setup a Structure')
1612 def get_atom_count (self) -> int:
1613 """Get the number of atoms in the structure."""
1614 return len(self.atoms)
1615 atom_count = property(get_atom_count, None, None, "The number of atoms in the structure (read only)")
1617 def get_residue_count (self) -> int:
1618 """Get the number of residues in the structure (read only)."""
1619 return len(self.residues)
1620 residue_count = property(get_residue_count, None, None, "Number of residues in the structure (read only)")
1622 def get_chain_count (self) -> int:
1623 """Get the number of chains in the structure."""
1624 return len(self.chains)
1625 chain_count = property(get_chain_count, None, None, "Number of chains in the structure (read only)")
1627 def fix_atom_elements (self, trust : bool = True) -> bool:
1628 """Fix atom elements by guessing them when missing.
1629 Set all elements with the first letter upper and the second (if any) lower.
1630 Also check if atom elements are coherent with atom names.
1632 Args:
1633 trust (bool): If 'trust' is set as False then we impose elements according to what we can guess from the atom name.
1635 Returns:
1636 bool:
1637 Return True if any element was modified or False if not.
1638 """
1639 modified = False
1640 added = False
1641 # Save the wrong guesses for a final report
1642 # This way we do not crowd the terminal with logs when a lot of atoms are affected
1643 reports = {}
1644 for atom in self.atoms:
1645 # Make sure elements have the first letter cap and the second letter not cap
1646 if atom.element:
1647 new_element = first_cap_only(atom.element)
1648 if atom.element != new_element:
1649 atom.element = new_element
1650 modified = True
1651 # Check the element to match what we would guess from the atom name
1652 # In case it does not just warn the user
1653 guess = atom.guess_element()
1654 if atom.element != guess:
1655 report = (atom.name, atom.element, guess)
1656 report_count = reports.get(report, 0)
1657 reports[report] = report_count + 1
1658 if not trust:
1659 atom.element = guess
1660 modified = True
1661 # If elements are missing then guess them from atom names
1662 else:
1663 atom.element = atom.guess_element()
1664 added = True
1665 # Warn the user about anormalies
1666 for report, count in reports.items():
1667 atom_name, atom_element, guess = report
1668 warn(f"Suspicious element for atom {atom_name}: {atom_element} -> shoudn't it be {guess}? ({count} occurrences)")
1669 # Warn the user that some elements were modified
1670 if modified: warn('Atom elements have been modified')
1671 if added: warn('Atom elements were missing and have been added')
1672 # Set atom elements as fixed in order to avoid repeating this process
1673 self._fixed_atom_elements = True
1674 return modified or added
1676 def set_new_coordinates (self, new_coordinates : list[Coords]):
1677 """Set new coordinates."""
1678 # Make sure the list of coordinates is as long as the number of atoms
1679 if len(new_coordinates) != self.atom_count:
1680 raise ValueError(f'The number of coordinates ({len(new_coordinates)}) does not match the number of atoms ({self.atom_count})')
1681 # Overwrite current coordinates with the new coordinates
1682 for i, atom in enumerate(self.atoms):
1683 atom.coords = tuple(new_coordinates[i])
1685 def get_ion_atom_indices (self) -> set:
1686 """Get all supported ion atom indices together in a set."""
1687 # If we already did this then return the stored value
1688 if self._ion_atom_indices != None:
1689 return self._ion_atom_indices
1690 # Find ion atom indices
1691 indices = set()
1692 for atom in self.atoms:
1693 if atom.element in SUPPORTED_ION_ELEMENTS:
1694 indices.add(atom.index)
1695 self._ion_atom_indices = indices
1696 return self._ion_atom_indices
1697 ion_atom_indices = property(get_ion_atom_indices, None, None, "Atom indices for what we consider supported ions")
1699 def get_dummy_atom_indices (self) -> set:
1700 """Get all dummy atom indices together in a set."""
1701 # If we already did this then return the stored value
1702 if self._dummy_atom_indices != None:
1703 return self._dummy_atom_indices
1704 # Find ion atom indices
1705 indices = set()
1706 for atom in self.atoms:
1707 if atom.element == DUMMY_ATOM_ELEMENT:
1708 indices.add(atom.index)
1709 self._dummy_atom_indices = indices
1710 return self._dummy_atom_indices
1711 dummy_atom_indices = property(get_dummy_atom_indices, None, None, "Atom indices for what we consider dummy atoms")
1713 def generate_pdb (self):
1714 """Generate a pdb file content with the current structure."""
1715 content = 'REMARK workflow generated pdb file\n'
1716 for a, atom in enumerate(self.atoms):
1717 residue = atom.residue
1718 index = str((a+1) % 100000).rjust(5)
1719 name = ' ' + atom.name.ljust(3) if len(atom.name) < 4 else atom.name
1720 residue_name = residue.name.ljust(4) if residue else 'XXX'.ljust(4)
1721 chain = atom.chain
1722 chain_name = chain.name if chain.name and len(chain.name) == 1 else 'X'
1723 residue_number = str(residue.number).rjust(4) if residue else '0'.rjust(4)
1724 # If residue number is longer than 4 characters then we must parse to hexadecimal
1725 if len(residue_number) > 4:
1726 residue_number = hex(residue.number)[2:].rjust(4)
1727 icode = residue.icode if residue.icode and len(residue.icode) else ' '
1728 # Make sure we have atom coordinates
1729 if atom.coords == None:
1730 raise InputError('Trying to write a PDB file from a structure with atoms without coordinates')
1731 x_coord, y_coord, z_coord = [ "{:.3f}".format(coord).rjust(8) for coord in atom.coords ]
1732 occupancy = '1.00' # Just a placeholder
1733 temp_factor = '0.00' # Just a placeholder
1734 element = atom.element.rjust(2)
1735 atom_line = ('ATOM ' + index + ' ' + name + ' ' + residue_name
1736 + chain_name + residue_number + icode + ' ' + x_coord + y_coord + z_coord
1737 + ' ' + occupancy + ' ' + temp_factor + ' ' + element).ljust(80) + '\n'
1738 content += atom_line
1739 return content
1741 def generate_pdb_file (self, pdb_filepath : str):
1742 """Generate a pdb file with current structure."""
1743 pdb_content = self.generate_pdb()
1744 with open(pdb_filepath, "w") as file:
1745 file.write(pdb_content)
1747 def get_pytraj_topology (self):
1748 """Get the structure equivalent pytraj topology."""
1749 # In we do not have pytraj in our environment then we cannot proceed
1750 if not is_imported('pytraj'):
1751 raise InputError('Missing dependency error: pytraj')
1752 # Generate a pdb file from the current structure to feed pytraj
1753 pdb_filepath = '.structure.pdb'
1754 self.generate_pdb_file(pdb_filepath)
1755 pytraj_topology = pytraj.load_topology(filename = pdb_filepath)
1756 os.remove(pdb_filepath)
1757 return pytraj_topology
1759 SUPPORTED_SELECTION_SYNTAXES = { 'vmd', 'pytraj', 'gmx' }
1760 def select (self, selection_string : str, syntax : str = 'vmd') -> Optional['Selection']:
1761 """Select atoms from the structure thus generating an atom indices list.
1762 Different tools may be used to make the selection:
1763 - vmd (default)
1764 - pytraj"""
1765 if syntax == 'vmd':
1766 # Generate a pdb for vmd to read it
1767 auxiliar_pdb_filepath = '.structure.pdb'
1768 self.generate_pdb_file(auxiliar_pdb_filepath)
1769 # Use vmd to find atom indices
1770 atom_indices = get_vmd_selection_atom_indices(auxiliar_pdb_filepath, selection_string)
1771 os.remove(auxiliar_pdb_filepath)
1772 if len(atom_indices) == 0:
1773 return Selection()
1774 return Selection(atom_indices)
1775 if syntax == 'pytraj':
1776 # In we do not have pytraj in our environment then we cannot proceed
1777 if not is_imported('pytraj'):
1778 raise InputError('Missing dependency error: pytraj')
1779 pytraj_topology = self.get_pytraj_topology()
1780 pytraj_selection = pytraj_topology[selection_string]
1781 atom_indices = [ atom.index for atom in pytraj_selection.atoms ]
1782 if len(atom_indices) == 0:
1783 return Selection()
1784 return Selection(atom_indices)
1785 if syntax == 'gmx':
1786 # Generate a pdb strucutre to feed gmx
1787 auxiliar_pdb_filepath = '.structure.pdb'
1788 self.generate_pdb_file(auxiliar_pdb_filepath)
1789 auxiliar_pdb_file = File(auxiliar_pdb_filepath)
1790 # Create the index file with the current atom selection
1791 index_filepath = f'.structure.ndx'
1792 index_file = File(index_filepath)
1793 selection_name, selection_exists = make_index(auxiliar_pdb_file, index_file, selection_string)
1794 if not selection_exists:
1795 available_selections = ', '.join(list(index_groups.keys()))
1796 raise InputError(f'Something was wrong with the selection. Available gromacs selections: {available_selections}')
1797 # Read the index file to capture the selection of atoms
1798 index_groups = read_ndx(index_file)
1799 indices = index_groups.get(selection_name, None)
1800 if indices == None: raise RuntimeError('Atom group must exist at this point')
1801 auxiliar_pdb_file.remove()
1802 index_file.remove()
1803 # Return the parsed selection
1804 return Selection(indices)
1806 options = ', '.join(self.SUPPORTED_SELECTION_SYNTAXES)
1807 raise InputError(f'Syntax "{syntax}" is not supported. Choose one of the following: {options}')
1809 def select_atom_indices (self, atom_indices : list[int]) -> 'Selection':
1810 """Set a function to make selections using atom indices."""
1811 # Check atom indices to be in the structure
1812 atom_count = self.atom_count
1813 for atom_index in atom_indices:
1814 if atom_index >= atom_count:
1815 raise InputError(f'Atom index {atom_index} is out of range ({atom_count})')
1816 return Selection(atom_indices)
1818 def select_residue_indices (self, residue_indices : list[int]) -> 'Selection':
1819 """Set a function to make selections using residue indices."""
1820 # WARNING: The following line gets stucked sometimes, idk why
1821 #atom_indices = sum([ self.residues[index].atom_indices for index in residue_indices ], [])
1822 atom_indices = []
1823 for i, index in enumerate(residue_indices):
1824 atom_indices += self.residues[index].atom_indices
1825 return Selection(atom_indices)
1827 def select_all (self) -> 'Selection':
1828 """Get a selection with all atoms."""
1829 return Selection(list(range(self.atom_count)))
1831 def select_by_classification (self, classification : str) -> 'Selection':
1832 """Select atoms according to the classification of its residue."""
1833 atom_indices = []
1834 for residue in self.residues:
1835 if residue.classification == classification:
1836 atom_indices += residue.atom_indices
1837 return Selection(atom_indices)
1839 def select_water (self) -> 'Selection':
1840 """Select water atoms.
1841 WARNING: This logic is a bit guessy and it may fail for non-standard residue named structures
1842 """
1843 return self.select_by_classification('solvent')
1845 def select_ions (self) -> 'Selection':
1846 """Select ions."""
1847 return self.select_by_classification('ion')
1849 def select_counter_ions (self, charge : Optional[str] = None) -> 'Selection':
1850 """Select counter ion atoms.
1851 WARNING: This logic is a bit guessy and it may fail for non-standard atom named structures
1852 """
1853 counter_ion_indices = []
1854 # Set the accepted names accoridng to the charge
1855 if charge == None:
1856 accepted_names = STANDARD_COUNTER_ION_ATOM_NAMES
1857 elif charge == '+':
1858 accepted_names = STANDARD_COUNTER_CATION_ATOM_NAMES
1859 elif charge == '-':
1860 accepted_names = STANDARD_COUNTER_ANION_ATOM_NAMES
1861 else:
1862 raise ValueError('Not supported charge')
1863 # Iterate atoms
1864 for atom in self.atoms:
1865 # If the residue has not one single atom then it is not an ion
1866 if len(atom.residue.atoms) != 1: continue
1867 # Get a simplified version of the atom name
1868 # Set all letters upper and remove non-letter characters (e.g. '+' and '-' symbols)
1869 simple_atom_name = ''.join(filter(str.isalpha, atom.name.upper()))
1870 if simple_atom_name in accepted_names:
1871 counter_ion_indices.append(atom.index)
1872 return Selection(counter_ion_indices)
1874 def select_water_and_counter_ions (self) -> 'Selection':
1875 """Select both water and counter ions."""
1876 return self.select_water() + self.select_counter_ions()
1878 def select_heavy_atoms (self) -> 'Selection':
1879 """Select heavy atoms."""
1880 atom_indices = []
1881 for atom in self.atoms:
1882 # If the atom is not an hydrogen then add it to the list
1883 if atom.element != 'H':
1884 atom_indices.append(atom.index)
1885 return Selection(atom_indices)
1887 def select_protein (self) -> 'Selection':
1888 """Select protein atoms.
1889 WARNING: Note that there is a small difference between VMD protein and our protein.
1890 This function is improved to consider terminal residues as proteins.
1891 VMD considers protein any residue including N, C, CA and O while terminals may have OC1 and OC2 instead of O.
1892 """
1893 return self.select_by_classification('protein')
1895 def select_nucleic (self) -> 'Selection':
1896 """Select nucleic atoms."""
1897 return self.select_by_classification('dna') + self.select_by_classification('rna')
1899 def select_lipids (self) -> 'Selection':
1900 """Select lipids."""
1901 return self.select_by_classification('fatty') + self.select_by_classification('steroid')
1903 def select_carbohydrates (self) -> 'Selection':
1904 """Select carbohydrates."""
1905 return self.select_by_classification('carbohydrate')
1907 def select_pbc_guess (self) -> 'Selection':
1908 """Return a selection of the typical PBC atoms: solvent, counter ions and lipids.
1909 WARNING: This is just a guess."""
1910 return self.select_water() + self.select_counter_ions() + self.select_lipids()
1912 def select_cg (self) -> 'Selection':
1913 """Select coarse grain atoms."""
1914 return Selection([ atom.index for atom in self.atoms if atom.element == CG_ATOM_ELEMENT ])
1916 def select_missing_bonds (self) -> 'Selection':
1917 return Selection([ index for index, bonds in enumerate(self.bonds) if bonds == MISSING_BONDS ])
1919 def select_cartoon (self, include_terminals : bool = False) -> 'Selection':
1920 """Select cartoon representable regions for VMD.
1922 Rules are:
1923 1. Residues must be protein (i.e. must contain C, CA, N and O atoms) or nucleic (P, OP1, OP2, O3', C3', C4', C5', O5')
1924 2. There must be at least 3 covalently bonded residues
1926 It does not matter their chain, numeration or even index order as long as they are bonded.
1927 * Note that we can represent cartoon while we display one residue alone, but it must be connected anyway.
1928 Also, we have the option to include terminals in the cartoon selection although they are not representable.
1929 This is helpful for the screenshot: terminals are better hidden than represented as ligands."""
1930 # Set fragments which are candidate to be cartoon representable
1931 fragments = []
1932 # Get protein fragments according to VMD
1933 protein_selection = self.select_protein() - self.select_cg() if include_terminals else self.select('protein', syntax='vmd')
1934 if protein_selection:
1935 fragments += list(self.find_fragments(protein_selection))
1936 # Get nucleic fragments according to VMD
1937 nucleic_selection = self.select_nucleic() - self.select_cg()
1938 if nucleic_selection:
1939 fragments += list(self.find_fragments(nucleic_selection))
1940 # Set the final selection including all valid fragments
1941 cartoon_selection = Selection()
1942 # Iterate over every fragment
1943 for fragment in fragments:
1944 # Make sure the fragment has at least 3 residues before adding it to the cartoon selection
1945 fragment_residue_indices = self.get_selection_residue_indices(fragment)
1946 if len(fragment_residue_indices) >= 3:
1947 cartoon_selection += fragment
1948 return cartoon_selection
1950 def invert_selection (self, selection : 'Selection') -> 'Selection':
1951 """Invert a selection."""
1952 atom_indices = list(range(self.atom_count))
1953 for atom_index in selection.atom_indices:
1954 atom_indices[atom_index] = None
1955 return Selection([ atom_index for atom_index in atom_indices if atom_index != None ])
1957 def get_selection_residue_indices (self, selection : 'Selection') -> list[int]:
1958 """Given a selection, get a list of residue indices for residues implicated.
1959 Note that if a single atom from the residue is in the selection then the residue index is returned."""
1960 return list(set([ self.atoms[atom_index].residue_index for atom_index in selection.atom_indices ]))
1962 def get_selection_residues (self, selection : 'Selection') -> list['Residue']:
1963 """Given a selection, get a list of residues implicated.
1964 Note that if a single atom from the residue is in the selection then the residue is returned."""
1965 residue_indices = self.get_selection_residue_indices(selection)
1966 return [ self.residues[index] for index in residue_indices ]
1968 def get_selection_chain_indices (self, selection : 'Selection') -> list[int]:
1969 """Given a selection, get a list of chain indices for chains implicated.
1970 Note that if a single atom from the chain is in the selection then the chain index is returned."""
1971 return list(set([ self.atoms[atom_index].chain_index for atom_index in selection.atom_indices ]))
1973 def get_selection_chains (self, selection : 'Selection') -> list['Chain']:
1974 """Given a selection, get a list of chains implicated.
1975 Note that if a single atom from the chain is in the selection then the chain is returned."""
1976 chain_indices = self.get_selection_chain_indices(selection)
1977 return [ self.chains[index] for index in chain_indices ]
1979 def get_selection_classification (self, selection : 'Selection') -> str:
1980 """Get type of the chain."""
1981 # Get selection residues
1982 selection_residue_indices = self.get_selection_residue_indices(selection)
1984 # Inicializar contadores para cada tipo de residuo
1985 residue_counts = {}
1987 # Count the residues of each type
1988 for residue_index in selection_residue_indices:
1989 residue = self.residues[residue_index]
1990 res_class = residue.classification
1991 if res_class in residue_counts:
1992 residue_counts[res_class] += 1
1993 else:
1994 # If the classification is not recognized, count it as "other"
1995 residue_counts[res_class] = 1
1997 # Count the total number of residues in the selection
1998 total_residues = sum(residue_counts.values())
1999 if total_residues == 0: raise ValueError('Should have residues at this point')
2001 # Calculate the proportion of each type of residue
2002 proportions = { k: v / total_residues for k, v in residue_counts.items() }
2004 # If one type of residue dominates, return it
2005 primary_type = max(proportions, key=proportions.get)
2006 # We establish a threshold of 80% to consider a chain as a single type
2007 if proportions[primary_type] >= 0.8:
2008 return primary_type
2010 # Special cases
2011 relevant_threshold = 0.3
2012 if proportions.get("dna", 0) > relevant_threshold and proportions.get("rna", 0) > relevant_threshold:
2013 return "nucleic"
2014 if proportions.get("carbohydrate", 0) > relevant_threshold and proportions.get("protein", 0) > relevant_threshold:
2015 return "glycoprotein"
2016 if proportions.get("fatty", 0) > relevant_threshold and proportions.get("steroid", 0) > relevant_threshold:
2017 return "lipid"
2019 # Any other combinations of different main proportions
2020 main_proportions = { k: v for k, v in proportions.items() if v > relevant_threshold }
2021 # Nothing is above the threshold
2022 if len(main_proportions) == 0:
2023 return "mix"
2024 # There is only one thing above threshold
2025 elif len(main_proportions) == 1:
2026 return f"{primary_type}/mix"
2027 # There are two things above threshold
2028 elif len(main_proportions) == 2:
2029 other_type = next(key for key in main_proportions.keys() if key != primary_type)
2030 return f"{primary_type}/{other_type}"
2031 # There are three things above the threshold (more is not possible)
2032 else:
2033 return "mix"
2035 def filter (self, selection : Union['Selection', str], selection_syntax : str = 'vmd') -> 'Structure':
2036 """Create a new structure from the current using a selection to filter atoms."""
2037 if not selection: raise RuntimeError('No selection was passed')
2038 # In case the selection is not an actual Selection, but a string, parse the string into a Selection
2039 if type(selection) == str:
2040 selection = self.select(selection, selection_syntax)
2041 new_atoms = []
2042 new_residues = []
2043 new_chains = []
2044 # Get the selected atoms
2045 # Generate dictionaries with new indexes as keys and previous indexes as values for atoms, residues and chains
2046 # This is done with this structure for the residues and chains further to find the new indices fast
2047 new_atom_indices = {}
2048 # Collect also original indices to related atom residues and chains
2049 original_atom_residue_indices = []
2050 for new_index, original_index in enumerate(selection.atom_indices):
2051 # Make a copy of the selected atom in order to not modify the original one
2052 original_atom = self.atoms[original_index]
2053 new_atom = Atom(
2054 name=original_atom.name,
2055 element=original_atom.element,
2056 coords=original_atom.coords,
2057 )
2058 new_atoms.append(new_atom)
2059 # Save old and new indices in order to reconfigure all indices later
2060 new_atom_indices[original_index] = new_index
2061 original_atom_residue_indices.append(original_atom.residue_index)
2062 # Find the selected residues
2063 selected_residue_indices = list(set(original_atom_residue_indices))
2064 # Repeat the original/new indices backup we did before
2065 new_residue_indices = {}
2066 original_residue_atom_indices = []
2067 original_residue_chain_indices = []
2068 for new_index, original_index in enumerate(selected_residue_indices):
2069 # Make a copy of the selected residue in order to not modify the original one
2070 original_residue = self.residues[original_index]
2071 new_residue = Residue(
2072 name=original_residue.name,
2073 number=original_residue.number,
2074 icode=original_residue.icode,
2075 )
2076 new_residues.append(new_residue)
2077 # Save old and new indices in order to reconfigure all indices later
2078 new_residue_indices[original_index] = new_index
2079 original_residue_atom_indices.append(original_residue.atom_indices)
2080 original_residue_chain_indices.append(original_residue.chain_index)
2081 # Find the selected chains
2082 selected_chain_indices = list(set(original_residue_chain_indices))
2083 # Repeat the original/new indices backup we did before
2084 new_chain_indices = {}
2085 original_chain_residue_indices = []
2086 for new_index, original_index in enumerate(selected_chain_indices):
2087 # Make a copy of the selected chain in order to not modify the original one
2088 original_chain = self.chains[original_index]
2089 new_chain = Chain(
2090 name=original_chain.name,
2091 )
2092 new_chains.append(new_chain)
2093 # Save old and new indices in order to reconfigure all indices later
2094 new_chain_indices[original_index] = new_index
2095 original_chain_residue_indices.append(original_chain.residue_indices)
2096 # Finally, reset indices in all instances
2097 for a, atom in enumerate(new_atoms):
2098 atom.residue_index = new_residue_indices[ original_atom_residue_indices[a] ]
2099 for r, residue in enumerate(new_residues):
2100 atom_indices = []
2101 for original_index in original_residue_atom_indices[r]:
2102 new_index = new_atom_indices.get(original_index, None)
2103 if new_index != None:
2104 atom_indices.append(new_index)
2105 residue.atom_indices = atom_indices
2106 residue.chain_index = new_chain_indices[ original_residue_chain_indices[r] ]
2107 for c, chain in enumerate(new_chains):
2108 residue_indices = []
2109 for original_index in original_chain_residue_indices[c]:
2110 new_index = new_residue_indices.get(original_index, None)
2111 if new_index != None:
2112 residue_indices.append(new_index)
2113 chain.residue_indices = residue_indices
2114 return Structure(atoms=new_atoms, residues=new_residues, chains=new_chains)
2116 def chainer (self,
2117 selection : Optional['Selection'] = None,
2118 letter : Optional[str] = None,
2119 whole_fragments : bool = False):
2120 """Set chains on demand.
2121 If no selection is passed then the whole structure will be affected.
2122 If no chain is passed then a "chain by fragment" logic will be applied."""
2123 # If there is no selection we consider all atoms
2124 if selection == None:
2125 selection = self.select_all()
2126 # If the selection is empty then there is nothing to do here
2127 if len(selection) == 0: return
2128 # If a letter is specified then the logic is way simpler
2129 if letter and not whole_fragments:
2130 self.set_selection_chain_name(selection, letter)
2131 return
2132 # If a letter is not specified we run the "fragments" logic
2133 fragment_getter = self.find_whole_fragments if whole_fragments else self.find_fragments
2134 for fragment in fragment_getter(selection):
2135 chain_name = letter if letter else self.get_available_chain_name()
2136 self.set_selection_chain_name(fragment, chain_name)
2138 def auto_chainer (self, verbose : bool = False):
2139 """Smart function to set chains automatically.
2140 Original chains will be overwritten."""
2141 if verbose: print('Running auto-chainer')
2142 # Set all chains to X
2143 self.chainer(letter='X')
2144 # Set solvent and ions as a unique chain
2145 ion_selection = self.select_ions()
2146 if verbose: print(f' Ion atoms: {len(ion_selection)}')
2147 solvent_selection = self.select_water()
2148 if verbose: print(f' Solvent atoms: {len(solvent_selection)}')
2149 ion_and_indices_selection = ion_selection + solvent_selection
2150 self.chainer(selection=ion_and_indices_selection, letter='S')
2151 # Set fatty acids and steroids as a unique chain
2152 # RUBEN: con whole_fragments algunos carbohidratos se sobreescriben como glucolípidos
2153 # DANI: Se podrían descartarían residuos que no pertenezcan a la membrana por proximidad
2154 membrane_selection = self.select_lipids()
2155 if verbose: print(f' Membrane atoms: {len(membrane_selection)}')
2156 self.chainer(selection=membrane_selection, letter='M', whole_fragments=True)
2157 # Set carbohydrates as a unique chain as well, just in case we have glycans
2158 # Note that in case glycan atoms are mixed with protein atoms glycan chains will be overwritten
2159 # However this is not a problem. It is indeed the best solution if we don't want ro resort atoms
2160 carbohydrate_selection = self.select_carbohydrates()
2161 if verbose: print(f' Carbohydrate atoms: {len(carbohydrate_selection)}')
2162 self.chainer(selection=carbohydrate_selection, letter='H')
2163 # Add a chain per fragment for both proteins and nucleic acids
2164 protein_selection = self.select_protein()
2165 if verbose: print(f' Protein atoms: {len(protein_selection)}')
2166 nucleic_selection = self.select_nucleic()
2167 if verbose: print(f' Nucleic acid atoms: {len(nucleic_selection)}')
2168 protein_and_nucleic_selection = protein_selection + nucleic_selection
2169 self.chainer(selection=protein_and_nucleic_selection)
2170 # At this point we should have covered most of the molecules in the structure
2171 # However, in case there are more molecules, we have already set them all as a single chain ('X')
2172 # Here we do not apply the chain per fragment logic since it may be dangerous
2173 # Note that we may have a lot of independent residues (and thus a los of small fragments)
2174 # This would make us run out of letters in the alphabet and thus there would be no more chains
2175 # As the last step, fix repeated chains
2176 # RUBEN: sacar esta parte ya que se hace luego en el structure_corrector?
2177 self.check_repeated_chains(fix_chains=True)
2179 def raw_protein_chainer (self):
2180 """This is an alternative system to find protein chains (anything else is chained as 'X').
2181 This system does not depend on VMD.
2182 It totally overrides previous chains since it is expected to be used only when chains are missing."""
2183 current_chain = self.get_available_chain_name()
2184 previous_alpha_carbon = None
2185 for residue in self.residues:
2186 alpha_carbon = next((atom for atom in residue.atoms if atom.name == 'CA'), None)
2187 if not alpha_carbon:
2188 residue.set_chain('X')
2189 continue
2190 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms
2191 residues_are_connected = previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4
2192 if not residues_are_connected:
2193 current_chain = self.get_available_chain_name()
2194 residue.set_chain(current_chain)
2195 previous_alpha_carbon = alpha_carbon
2197 def set_selection_chain_name (self, selection : 'Selection', letter : str):
2198 """
2199 Given an atom selection, set the chain for all these atoms.
2200 Note that the chain is changed in every whole residue, no
2201 matter if only one atom was selected.
2202 """
2203 # Find if the letter belongs to an already existing chain
2204 chain = self.get_chain_by_name(letter)
2205 if not chain:
2206 chain = Chain(name=letter)
2207 self.set_new_chain(chain)
2208 # Get the selection residue indices
2209 selection_residue_indices = self.get_selection_residue_indices(selection)
2210 # Set the chain index of every residue to the new chain
2211 # WARNING:
2212 # This may have to be done atom by atom but we forgot why so we change
2213 # it so see what was the problem. Doing it atom by atom is not efficient
2214 # and was causing a bug where residues with same number could change their name
2215 # to that of the first residue found with that number.
2216 for residue_index in selection_residue_indices:
2217 # WARNING: Chain index has to be read every iteration since it may change. Do not save it
2218 residue = self.residues[residue_index]
2219 residue.set_chain_index(chain.index)
2221 def check_available_chains(self):
2222 """Check if there are more chains than available letters."""
2223 n_chains = len(self.chains)
2224 n_available_letters = len(AVAILABLE_LETTERS)
2225 if n_chains <= n_available_letters: return
2226 raise InputError(f'There are {n_chains} chains in the structure so far. If this is not expected then there may be a problem.\n' +
2227 f' If this is expected then unfortunatelly there is a limit of {n_available_letters} available chain letters.\n' +
2228 ' Please manually set the chains from scratch or merge together some chains to reduce the number.\n' +
2229 ' Customize a PDB file using the command "mwf chainer". Then use the customized PDB as input structure (-stru).')
2231 def get_available_chain_name (self) -> str:
2232 """Get an available chain name.
2233 Find alphabetically the first letter which is not yet used as a chain name.
2234 If all letters in the alphabet are used already then raise an error."""
2235 self.check_available_chains()
2236 current_chain_names = [ chain.name for chain in self.chains ]
2237 next_available_chain_name = next((name for name in AVAILABLE_LETTERS if name not in current_chain_names), None)
2238 return next_available_chain_name
2240 def get_next_available_chain_name (self, anterior : str) -> str:
2241 """
2242 Get the next available chain name.
2244 Args:
2245 anterior (str): The last chain name used, which is expected to be a single letter
2247 Raises:
2248 ValueError: If the anterior is not a letter or if there are more chains than available
2249 """
2250 self.check_available_chains()
2251 current_chain_names = set([ chain.name for chain in self.chains ])
2252 # If anterior is cap then try caps first, if anterior is lower then try lowers first
2253 if anterior.isupper():
2254 first_group, second_group = AVAILABLE_CAPS, AVAILABLE_LOWS
2255 elif anterior.islower():
2256 first_group, second_group = AVAILABLE_LOWS, AVAILABLE_CAPS
2257 else: raise ValueError(f'Is "{anterior}" even a letter?')
2258 # Reorder letters in the first group, so the anterior is the last letter
2259 anterior_position = first_group.index(anterior)
2260 next_position = anterior_position + 1
2261 reordered_group = first_group[next_position:] + first_group[0:next_position]
2262 next_letter = next((letter for letter in reordered_group if letter not in current_chain_names), None)
2263 if next_letter: return next_letter
2264 # If there is not available letters is the first group then return the first available in the second
2265 next_letter = next((letter for letter in second_group if letter not in current_chain_names), None)
2266 return next_letter
2268 def get_chain_by_name (self, name : str) -> Optional['Chain']:
2269 """Get a chain by its name."""
2270 return next((c for c in self.chains if c.name == name), None)
2272 def find_residue (self, chain_name : str, number : int, icode : str = '' ) -> Optional['Residue']:
2273 """Find a residue by its chain, number and insertion code."""
2274 # Get the corresponding chain
2275 target_chain = self.get_chain_by_name(chain_name)
2276 if not target_chain: return None
2277 # Find the residue in the target chain
2278 target_chain.find_residue(number, icode)
2280 def display_summary (self):
2281 """Get a summary of the structure."""
2282 print(f'Atoms: {self.atom_count}')
2283 print(f'Residues: {len(self.residues)}')
2284 print(f'Chains: {len(self.chains)}')
2285 for chain in self.chains:
2286 print(f'Chain {chain.name} ({len(chain.residue_indices)} residues)')
2287 print(' -> ' + chain.get_sequence())
2289 def check_repeated_chains (self, fix_chains : bool = False, display_summary : bool = False) -> bool:
2290 """
2291 There may be chains which are equal in the structure (i.e. same chain name).
2292 This means we have a duplicated/splitted chain.
2293 Repeated chains are usual and they are usually supported but with some problems.
2294 Also, repeated chains usually come with repeated residues, which means more problems (see explanation below).
2296 In the context of this structure class we may have 2 different problems with a different solution each:
2297 1. There is more than one chain with the same letter (repeated chain) -> rename the duplicated chains
2298 2. There is a chain with atom indices which are not consecutive (splitted chain) -> create new chains
2300 Rename repeated chains or create new chains if the fix_chains argument is True.
2302 WARNING: These fixes are possible only if there are less chains than the number of letters in the alphabet.
2303 Although there is no limitation in this code for chain names, setting long chain names is not compatible with pdb format.
2305 Check splitted chains (a chains with non consecutive residues) and try to fix them if requested.
2306 Check repeated chains (two chains with the same name) and return True if there were any repeats.
2307 """
2308 # Order chains according to their names
2309 # Save also those chains which have a previous duplicate
2310 name_chains = {}
2311 repeated_chains = []
2312 for chain in self.chains:
2313 chain_name = chain.name
2314 current_name_chains = name_chains.get(chain_name, None)
2315 if not current_name_chains:
2316 name_chains[chain_name] = [chain]
2317 else:
2318 name_chains[chain_name].append(chain)
2319 repeated_chains.append(chain)
2320 # Display the summary of repeated chains if requested
2321 if display_summary:
2322 if len(repeated_chains) > 0:
2323 warn('There are repeated chains:')
2324 for chain_name, chains in name_chains.items():
2325 chains_count = len(chains)
2326 if chains_count > 1:
2327 print(f'- Chain {chain_name} has {chains_count} repeats')
2328 # Rename repeated chains if requested
2329 if len(repeated_chains) > 0 and fix_chains:
2330 self.check_available_chains()
2331 current_letters = list(name_chains.keys())
2332 for repeated_chain in repeated_chains:
2333 last_chain_letter = repeated_chain.name
2334 while last_chain_letter in current_letters:
2335 last_chain_letter = get_next_letter(last_chain_letter)
2336 repeated_chain.name = last_chain_letter
2337 current_letters.append(last_chain_letter)
2338 # Check if there are splitted chains
2339 # RUBEN: sacar esta parte ya que esta self.check_splitted_chains
2340 for chain in self.chains:
2341 residue_indices = sorted(chain.residue_indices)
2342 # Check if residue indices are consecutive
2343 if residue_indices[-1] - residue_indices[0] + 1 == len(residue_indices):
2344 continue
2345 warn(f'Splitted chain {chain.name}')
2346 # If indices are not consecutive then we must find ranges of consecutive residues and create new chains for them
2347 previous_residue_index = residue_indices[0]
2348 consecutive_residues = [previous_residue_index]
2349 overall_consecutive_residues = []
2350 for residue_index in residue_indices[1:]:
2351 # If next residue is consecutive
2352 if residue_index == previous_residue_index + 1:
2353 consecutive_residues.append(residue_index)
2354 # If next residue is NOT consecutive
2355 else:
2356 overall_consecutive_residues.append(consecutive_residues)
2357 consecutive_residues = [residue_index]
2358 # Update the previous index
2359 previous_residue_index = residue_index
2360 # Add the last split
2361 overall_consecutive_residues.append(consecutive_residues)
2362 # Now create new chains and reasign residues
2363 # Skip the first set of consecutive residues since they will stay in the original chain
2364 for residues_indices in overall_consecutive_residues[1:]:
2365 chain_name = self.get_available_chain_name()
2366 residues_selection = self.select_residue_indices(residues_indices)
2367 self.set_selection_chain_name(residues_selection, chain_name)
2369 # Fix repeated chains if requested
2370 return len(repeated_chains) > 0
2372 def check_splitted_chains (self, fix_chains : bool = False, display_summary : bool = False) -> bool:
2373 """Check if non-consecutive atoms belong to the same chain.
2374 If so, separate pieces of non-consecuite atoms in different chains.
2375 Note that the new chains will be duplicated, so you will need to run check_repeated_chains after.
2377 Args:
2378 fix_chains (bool): If True then the splitted chains will be fixed.
2379 display_summary (bool): If True then a summary of the splitted chains will be displayed.
2381 Returns:
2382 bool:
2383 True if we encountered splitted chains and false otherwise.
2384 """
2385 splitted_fragments: list[tuple[str, list[int]]] = []
2386 # Keep track of already checked chains
2387 checked_chains = set()
2388 last_chain = None
2389 last_fragment_start = None
2390 for atom_index, atom in enumerate(self.atoms):
2391 # Get the chain name
2392 chain_name = atom.chain.name
2393 # Skip the atom if it belong to the previous chain
2394 if chain_name == last_chain: continue
2395 # If we were in a splitted fragment then end it here
2396 if last_fragment_start != None:
2397 new_fragment_indices = list(range(last_fragment_start, atom_index))
2398 new_fragment = (last_chain, new_fragment_indices)
2399 splitted_fragments.append(new_fragment)
2400 last_fragment_start = None
2401 # Check if the chain was already found
2402 if chain_name in checked_chains:
2403 # Start a new fragment
2404 last_fragment_start = atom_index
2405 # Update last chain
2406 last_chain = chain_name
2407 # Add the new chain to the set of already checked chains
2408 checked_chains.add(chain_name)
2410 # Make a summary of the splitted chains if requested
2411 if display_summary and len(splitted_fragments) > 0:
2412 warn(f'Found {len(splitted_fragments)} splitted fragments')
2413 affected_chains = sorted(list(set([ fragment[0] for fragment in splitted_fragments ])))
2414 print(f' We are having splits in chains {", ".join(affected_chains)}')
2416 # Fix chains if requested
2417 if fix_chains:
2418 for chain_name, fragment_atom_indices in splitted_fragments:
2419 # Create a new chain
2420 new_chain = Chain(name=chain_name)
2421 self.set_new_chain(new_chain)
2422 # Ge the selection residue indices for the fragment
2423 fragment_selection = self.select_atom_indices(fragment_atom_indices)
2424 fragment_residue_indices = self.get_selection_residue_indices(fragment_selection)
2425 # Move atoms in the fragment to the new chain
2426 for residue_index in fragment_residue_indices:
2427 residue = self.residues[residue_index]
2428 residue.set_chain_index(new_chain.index)
2429 return len(splitted_fragments) > 0
2431 def sort_residues (self):
2432 """Coherently sort residues according to the indices of the atoms they hold."""
2433 # Set a function to sort atoms and residues by index
2434 def by_first_atom_index (residue):
2435 return min(residue.atom_indices)
2436 # Sort residues according to their first atom index
2437 sorted_residues = sorted(self.residues, key = by_first_atom_index)
2438 # Iterate sorted residues letting them know their new index
2439 for r, residue in enumerate(sorted_residues):
2440 residue.index = r
2441 # Finally update the structure's residues list
2442 self.residues = sorted_residues
2444 def check_merged_residues (self, fix_residues : bool = False, display_summary : bool = False) -> bool:
2445 """
2446 There may be residues which contain unconnected (unbonded) atoms. They are not allowed.
2447 They may come from a wrong parsing and be indeed duplicated residues.
2449 Search for merged residues.
2450 Create new residues for every group of connected atoms if the fix_residues argument is True.
2451 Note that the new residues will be repeated, so you will need to run check_repeated_residues after.
2452 Return True if there were any merged residues.
2453 """
2454 # Get the list of merged residues we encounter
2455 merged_residues = []
2456 # Iterate residues
2457 for residue in self.residues:
2458 residue_selection = residue.get_selection()
2459 residue_fragments = list(self.find_fragments(residue_selection))
2460 if len(residue_fragments) <= 1: continue
2461 # If current residue has more than 1 fragment then it is a merged residue
2462 merged_residues.append(residue)
2463 if not fix_residues: continue
2464 # If the residue is to be fixed then let the first fragment as the current residue
2465 # Then create a new residue for every other fragment
2466 for extra_fragment in residue_fragments[1:]:
2467 # Set a new residue identical to the current one
2468 new_residue = Residue(residue.name, residue.number, residue.icode)
2469 self.set_new_residue(new_residue)
2470 # Add it to the same chain
2471 residue.chain.add_residue(new_residue)
2472 # Add atoms to it
2473 for atom_index in extra_fragment.atom_indices:
2474 atom = self.atoms[atom_index]
2475 new_residue.add_atom(atom)
2476 # Now that we have new resiudes, sort all residues to keep them coherent
2477 self.sort_residues()
2478 # Count how many merged residues we encountered
2479 merged_residues_count = len(merged_residues)
2480 # Log some details if the summary is requested
2481 if display_summary and merged_residues_count > 0:
2482 print(f'Found {merged_residues_count} merged residues')
2483 print(f' e.g. {merged_residues[0]}')
2484 # Return if we found merged residues
2485 return merged_residues_count > 0
2487 def check_repeated_residues (self, fix_residues : bool = False, display_summary : bool = False) -> bool:
2488 """
2489 There may be residues which are equal in the structure (i.e. same chain, number and icode).
2490 In case 2 residues in the structure are equal we must check distance between their atoms.
2491 If atoms are far it means they are different residues with the same notation (duplicated residues).
2492 If atoms are close it means they are indeed the same residue (splitted residue).
2494 Splitted residues are found in some pdbs and they are supported by some tools.
2495 These tools consider all atoms with the same 'record' as the same residue.
2496 However, there are other tools which would consider the splitted residue as two different residues.
2497 This causes inconsistency along different tools besides a long list of problems.
2498 The only possible is fix is changing the order of atoms in the topology.
2499 Note that this is a breaking change for associated trajectories, which must change the order of coordinates.
2500 However here we provide tools to fix associates trajectories as well.
2502 Duplicated residues are usual and they are usually supported but with some problems.
2503 For example, pytraj analysis outputs use to sort results by residues and each residue is tagged.
2504 If there are duplicated residues with the same tag it may be not possible to know which result belongs to each residue.
2505 Another example are NGL selections once in the web client.
2506 If you select residue ':A and 1' and there are multiple residues 1 in chain A all of them will be displayed.
2508 Check residues to search for duplicated and splitted residues.
2509 Renumerate repeated residues if the fix_residues argument is True.
2510 Return True if there were any repeats.
2511 """
2512 # Track if residues have to be changed or not
2513 modified = False
2514 # Group all residues in the structure according to their chain, number and icode
2515 grouped_residues = {}
2516 # Check repeated residues which are one after the other
2517 # Note that these residues MUST have a different name
2518 # Otherwise they would have not been considered different residues
2519 # For these rare cases we use icodes to solve the problem
2520 non_icoded_residues = []
2521 last_residue = None
2522 for residue in self.residues:
2523 # Check residue to be equal than the previous residue
2524 if residue == last_residue:
2525 non_icoded_residues.append(residue)
2526 last_residue = residue
2527 continue
2528 last_residue = residue
2529 # Add residue to the groups of repeated residues
2530 current_residue_repeats = grouped_residues.get(residue, None)
2531 if not current_residue_repeats:
2532 grouped_residues[residue] = [ residue ]
2533 else:
2534 current_residue_repeats.append(residue)
2535 # In case we have non icoded residues
2536 if len(non_icoded_residues) > 0:
2537 if display_summary:
2538 print(f'There are non-icoded residues ({len(non_icoded_residues)})')
2539 # Set new icodes for non icoded residues
2540 if fix_residues:
2541 print(' Non icoded residues will recieve an icode')
2542 for residue in non_icoded_residues:
2543 repeated_residues_group = grouped_residues[residue]
2544 current_icodes = [ residue.icode for residue in repeated_residues_group if residue.icode ]
2545 next_icode = next((cap for cap in AVAILABLE_CAPS if cap not in current_icodes), None)
2546 if not next_icode:
2547 raise ValueError('There are no more icodes available')
2548 # print('Setting icode ' + next_icode + ' to residue ' + str(residue))
2549 residue.icode = next_icode
2550 modified = True
2551 # Grouped residues with more than 1 result are considered as repeated
2552 repeated_residues = [ residues for residues in grouped_residues.values() if len(residues) > 1 ]
2553 if len(repeated_residues) == 0:
2554 return modified
2555 # In case we have repeated residues...
2556 if display_summary:
2557 warn(f'There are {len(repeated_residues)} different groups of repeated residues')
2558 print(f' e.g. {repeated_residues[0][0]}')
2559 if len(repeated_residues) == 9999 or len(repeated_residues) == 10000:
2560 print(' Probably you have more residues than the PDB numeration limit (1-9999)')
2561 # Now for each repeated residue, find out which are splitted and which are duplicated
2562 covalent_bonds = self.bonds
2563 overall_splitted_residues = []
2564 overall_duplicated_residues = []
2565 for residues in repeated_residues:
2566 # Iterate over repeated residues and check if residues are covalently bonded
2567 # If any pair of residues are bonded add them both to the splitted residues list
2568 # At the end, all non-splitted residues will be considered duplicated residues
2569 # WARNING: Using a set here is not possible since repeated residues have the same hash
2570 # WARNING: Also comparing residues themselves is not advisable, so we use indices at this point
2571 splitted_residue_indices = set()
2572 for residue, other_residues in otherwise(residues):
2573 if residue.index in splitted_residue_indices:
2574 continue
2575 # Get atom indices for all atoms connected to the current residue
2576 residue_bonds = sum([ covalent_bonds[index] for index in residue.atom_indices ], [])
2577 for other_residue in other_residues:
2578 # Get all atom indices for each other residue and collate with the current residue bonds
2579 if any( index in residue_bonds for index in other_residue.atom_indices ):
2580 splitted_residue_indices.add(residue.index)
2581 splitted_residue_indices.add(other_residue.index)
2582 # Finally obtain the splitted residues from its indices
2583 splitted_residues = [ self.residues[index] for index in splitted_residue_indices ]
2584 # Repeated residues which are not splitted are thus duplicated
2585 duplicated_residues = [ residue for residue in residues if residue.index not in splitted_residue_indices ]
2586 # Update the overall lists
2587 if len(splitted_residues) > 0:
2588 overall_splitted_residues.append(splitted_residues)
2589 if len(duplicated_residues) > 0:
2590 overall_duplicated_residues.append(duplicated_residues)
2591 # In case we have splitted residues
2592 if len(overall_splitted_residues) > 0:
2593 if display_summary:
2594 print(f' There are splitted residues ({len(overall_splitted_residues)})')
2595 # Fix splitted residues
2596 if fix_residues:
2597 print(' Splitted residues will be merged')
2598 # Set a function to sort atoms and residues by index
2599 by_index = lambda v: v._index
2600 # Merge splitted residues
2601 # WARNING: Merging residues without sorting atoms is possible, but this would be lost after exporting to pdb
2602 for splitted_residues in overall_splitted_residues:
2603 # The first residue (i.e. the residue with the lower index) will be the one which remains
2604 # It will absorb other residue atom indices
2605 splitted_residues.sort(key=by_index) # Residues are not sorted by default, this is mandatory
2606 first_residue = splitted_residues[0]
2607 other_residues = splitted_residues[1:]
2608 for residue in other_residues:
2609 for atom in residue.atoms:
2610 atom.residue = first_residue
2611 print(' Atoms will be sorted to be together by residues')
2612 print(' NEVER FORGET: This will break any associated trajectory if coordinates are not sorted as well')
2613 # Sort atoms to group residue atoms together
2614 # Note that each atom index must be updated
2615 new_atom_indices = sum([ residue.atom_indices for residue in self.residues ], [])
2616 for i, new_atom_index in enumerate(new_atom_indices):
2617 atom = self.atoms[new_atom_index]
2618 atom._index = i
2619 self.atoms.sort(key=by_index)
2620 # Also residue 'atom_indices' must be updated
2621 for residue in self.residues:
2622 residue._atom_indices = [ new_atom_indices.index(atom_index) for atom_index in residue._atom_indices ]
2623 # Bonds must be reset since atom indices have changes
2624 self._bonds = None
2625 # Prepare the trajectory atom sorter which must be returned
2626 # Include atom indices already so the user has to provide only the structure and trajectory filepaths
2627 def trajectory_atom_sorter (
2628 input_structure_file : 'File',
2629 input_trajectory_file : 'File',
2630 output_trajectory_file : 'File'
2631 ):
2632 sort_trajectory_atoms(
2633 input_structure_file,
2634 input_trajectory_file,
2635 output_trajectory_file,
2636 new_atom_indices
2637 )
2638 self.trajectory_atom_sorter = trajectory_atom_sorter
2639 self.new_atom_order = new_atom_indices
2640 modified = True
2641 # In case we have duplicated residues
2642 duplicated_residues_count = len(overall_duplicated_residues)
2643 if duplicated_residues_count > 0:
2644 if display_summary:
2645 warn(f'There are {duplicated_residues_count} different groups of duplicated residues')
2646 print(f' e.g. {overall_duplicated_residues[0][0]}')
2647 # Renumerate duplicated residues if requested
2648 if fix_residues:
2649 # First of all, from each group of repeated residues, discard the first residue
2650 # The first residue will remain as it is and the rest will be renumerated
2651 # Join all residues to be renumerated in a single list
2652 residue_to_renumerate = []
2653 for residues in overall_duplicated_residues:
2654 residue_to_renumerate += residues[1:]
2655 # Now group residues per chain since the renumeration is done by chains
2656 chain_residues = {}
2657 for residue in residue_to_renumerate:
2658 chain = residue.chain
2659 current_chain_residues = chain_residues.get(chain, None)
2660 if current_chain_residues: current_chain_residues.append(residue)
2661 else: chain_residues[chain] = [ residue ]
2662 # Iterate residue chain groups
2663 for chain, residues in chain_residues.items():
2664 # Find the last residue number in the current chain
2665 maximum_chain_number = max([ residue.number for residue in chain.residues ])
2666 # Sort residues by index
2667 residues.sort(key=lambda x: x.index, reverse=True)
2668 for residue in residues:
2669 # Set the number of the residue as the next available number
2670 residue.number = maximum_chain_number + 1
2671 # Update the maximum number
2672 maximum_chain_number = residue.number
2673 modified = True
2674 return modified
2676 # DANI: No recuerdo los problemas que daba tener átomos repetidos
2677 def check_repeated_atoms (self, fix_atoms : bool = False, display_summary : bool = False) -> bool:
2678 """Check atoms to search for repeated atoms.
2679 Atoms with identical chain, residue and name are considered repeated atoms.
2681 Args:
2682 fix_atoms (bool): If True, rename repeated atoms.
2683 display_summary (bool): If True, display a summary of repeated atoms.
2685 Returns:
2686 bool: True if there were any repeated atoms, False otherwise.
2687 """
2688 # Set two trackers for display
2689 repeated_atoms_count = 0
2690 example = None
2691 for residue in self.residues:
2692 # Iterate over the residue atoms counting how many repeated names we have
2693 current_names = []
2694 for atom in residue.atoms:
2695 # We check if the atom name already exists. If not, go to the next atom
2696 name = atom.name
2697 if name not in current_names:
2698 current_names.append(name)
2699 continue
2700 # When atom is repeated
2701 repeated_atoms_count += 1
2702 # If the fix was not requested we stop here
2703 if not fix_atoms:
2704 continue
2705 # We set the name of the atom as the element letter + the count of this element
2706 # If element is missing take the first character of the atom name
2707 initial = atom.element
2708 if not initial or initial == ' ':
2709 initial = name[0]
2710 number = 1
2711 new_name = initial + str(number)
2712 while new_name in current_names:
2713 number += 1
2714 if number > 999:
2715 raise ValueError('There are more than 999 atoms with the same name in the residue')
2716 new_name = initial + str(number)
2717 # Save an example for the logs if there is none yet
2718 if not example:
2719 example = f'{atom.name} renamed as {new_name} in residue {residue}'
2720 atom.name = new_name
2721 current_names.append(new_name)
2722 # Display the summary of repeated atoms if requested
2723 if display_summary:
2724 if repeated_atoms_count > 0:
2725 warn(f'There are repeated atoms ({repeated_atoms_count})')
2726 print(f' e.g. {example}')
2727 return repeated_atoms_count > 0
2729 def is_missing_any_bonds (self) -> bool:
2730 return any(bond == MISSING_BONDS for bond in self.bonds)
2732 def check_incoherent_bonds (self) -> bool:
2733 """ Check bonds to be incoherent i.e. check atoms not to have more or less
2734 bonds than expected according to their element.
2735 Return True if any incoherent bond is found. """
2736 # Find out if there are hydrogens in the structure
2737 # It may happen that we encounter an structure without hydrogens
2738 has_hydrogen = next(( True for atom in self.atoms if atom.element == 'H' ), False)
2739 coherent_bonds = coherent_bonds_with_hydrogen if has_hydrogen else coherent_bonds_without_hydrogen
2740 # Check coherent bonds atom by atom
2741 for atom in self.atoms:
2742 # Do not check this atom bonds if this may be an ion
2743 # Most authors "force" dummy bonds in these atoms to make them stable
2744 if atom.element in SUPPORTED_ION_ELEMENTS: continue
2745 # Ignore dummy atoms as well
2746 if atom.element == DUMMY_ATOM_ELEMENT: continue
2747 # Ignore coarse grain atoms as well
2748 if atom.element == CG_ATOM_ELEMENT: continue
2749 # Get actual number of bonds in the current atom both with and without ion bonds
2750 # LORE: This was a problem since some ions are force-bonded but bonds are actually not real
2751 # LORE: When an ion is forced we may end up with hyrogens with 2 bonds or carbons with 5 bonds
2752 # LORE: When an ions is really bonded we can not discard it or we may end up with orphan carbons (e.g. ligands)
2753 min_nbonds = len(atom.get_bonds(skip_ions = True, skip_dummies = True))
2754 max_nbonds = len(atom.get_bonds(skip_ions = False, skip_dummies = True))
2755 # Get the accepted range of number of bonds for the current atom according to its element
2756 element = atom.element
2757 element_coherent_bonds = coherent_bonds.get(element, None)
2758 # If there are no specficiations for the current atom element then skip it
2759 if not element_coherent_bonds:
2760 continue
2761 # Get the range of accepted number of bonds
2762 min_allowed_bonds = element_coherent_bonds['min']
2763 max_allowed_bonds = element_coherent_bonds['max']
2764 # Check the actual number of bonds is insdie the accepted range
2765 if max_nbonds < min_allowed_bonds or min_nbonds > max_allowed_bonds:
2766 if min_nbonds == max_nbonds:
2767 print(f' An atom with element {element} has {min_nbonds} bonds')
2768 else:
2769 print(f' An atom with element {element} has between {min_nbonds} bonds (withou ions) and {max_nbonds} bonds (with ions)')
2770 if min_allowed_bonds == max_allowed_bonds:
2771 plural_sufix = '' if min_allowed_bonds == 1 else 's'
2772 print(f' It should have {min_allowed_bonds} bond{plural_sufix}')
2773 else:
2774 print(f' It should have between {min_allowed_bonds} and {max_allowed_bonds} bonds')
2775 print(f' -> Atom {atom.label}')
2776 bond_label = ', '.join([ self.atoms[atom].label for atom in atom.get_bonds(skip_ions = False) ])
2777 print(f' -> Bonds {bond_label}')
2778 return True
2779 return False
2781 def get_covalent_bonds (self, selection : Optional['Selection'] = None) -> list[ list[int] ]:
2782 """Get all atomic covalent (strong) bonds.
2783 Bonds are defined as a list of atom indices for each atom in the structure.
2784 Rely on VMD logic to do so."""
2785 # It is important to fix elements before trying to fix bonds, since elements have an impact on bonds
2786 # VMD logic to find bonds relies in the atom element to set the covalent bond distance cutoff
2787 self.fix_atom_elements()
2788 # Generate a pdb strucutre to feed vmd
2789 auxiliar_pdb_filepath = '.structure.pdb'
2790 self.generate_pdb_file(auxiliar_pdb_filepath)
2791 # Get covalent bonds between both residue atoms
2792 covalent_bonds = get_covalent_bonds(auxiliar_pdb_filepath, selection)
2793 # For every atom in CG, replace its bonds with a class which will raise and error when read
2794 # Thus we make sure using these wrong bonds anywhere further will result in failure
2795 for atom_index in self.select_cg().atom_indices:
2796 covalent_bonds[atom_index] = MISSING_BONDS
2797 # Remove the auxiliar pdb file
2798 os.remove(auxiliar_pdb_filepath)
2799 return covalent_bonds
2801 def copy_bonds (self) -> list[list[int]]:
2802 """Make a copy of the bonds list."""
2803 new_bonds = []
2804 for atom_bonds in self.bonds:
2805 # Missing bonds coming from CG atoms are forwarded
2806 if atom_bonds == MISSING_BONDS:
2807 new_bonds.append(MISSING_BONDS)
2808 continue
2809 # Copy also the inner lists to avoid further mutation of the original structure
2810 new_bonds.append([ atom_index for atom_index in atom_bonds ])
2811 return new_bonds
2813 def copy (self) -> 'Structure':
2814 """Make a copy of the current structure."""
2815 atom_copies = [ atom.copy() for atom in self.atoms ]
2816 residue_copies = [ residue.copy() for residue in self.residues ]
2817 chain_copies = [ chain.copy() for chain in self.chains ]
2818 structure_copy = Structure(atom_copies, residue_copies, chain_copies)
2819 structure_copy.bonds = self.copy_bonds()
2820 return structure_copy
2822 # DANI: No lo he testeado en profundidad
2823 def merge (self, other : 'Structure') -> 'Structure':
2824 """Merge current structure with another structure."""
2825 # Copy self atoms, residues and chains
2826 self_atom_copies = [ atom.copy() for atom in self.atoms ]
2827 self_residue_copies = [ residue.copy() for residue in self.residues ]
2828 self_chain_copies = [ chain.copy() for chain in self.chains ]
2829 # Copy other atoms, residues and chains
2830 other_atom_copies = [ atom.copy() for atom in other.atoms ]
2831 other_residue_copies = [ residue.copy() for residue in other.residues ]
2832 other_chain_copies = [ chain.copy() for chain in other.chains ]
2833 # Adapt indices in other atoms, residues and chains
2834 atom_index_offset = len(self_atom_copies)
2835 residue_index_offset = len(self_residue_copies)
2836 chain_index_offset = len(self_chain_copies)
2837 for atom in other_atom_copies:
2838 atom._index += atom_index_offset
2839 atom._residue_index += residue_index_offset
2840 for residue in other_residue_copies:
2841 residue._index += residue_index_offset
2842 residue._atom_indices = [ i + atom_index_offset for i in residue._atom_indices ]
2843 residue._chain_index += chain_index_offset
2844 for chain in other_chain_copies:
2845 chain._index += chain_index_offset
2846 chain._residue_indices = [ i + residue_index_offset for i in chain._residue_indices ]
2847 # Merge self with other atoms, residues and chains and build the new structure
2848 merged_atoms = self_atom_copies + other_atom_copies
2849 merged_residues = self_residue_copies + other_residue_copies
2850 merged_chains = self_chain_copies + other_chain_copies
2851 return Structure(merged_atoms, merged_residues, merged_chains)
2853 def find_rings (self, max_ring_size : int, selection : Optional[Selection] = None) -> list[ list[Atom] ]:
2854 """Find rings with a maximum specific size or less in the structure and yield them as they are found."""
2855 # Make sure the selection does not include regions without bonds
2856 if selection & self.select_missing_bonds():
2857 raise RuntimeError('The find rings logic can not be used when we are missing bonds')
2860 def get_selection_outer_bonds (self, selection : Selection) -> list[int]:
2861 """Given an atom selection, get all bonds between these atoms and any other atom in the structure.
2862 Note that inner bonds between atoms in the selection are discarded."""
2863 # Get bonds from all atoms in the selection
2864 bonds = set()
2865 for atom_index in selection.atom_indices:
2866 atom_bonds = set(self.bonds[atom_index])
2867 bonds = bonds.union(atom_bonds)
2868 # Discard selection atoms from the bonds list to discard inner bonds
2869 bonds -= set(selection.atom_indices)
2870 return list(bonds)
2872 # Set the type of PTM according to the classification of the bonded residue
2873 ptm_options = {
2874 'protein': ValueError('A PTM residue must never be protein'),
2875 'dna': 'DNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama
2876 'rna': 'RNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama
2877 'carbohydrate': 'Glycosilation',
2878 'fatty': 'Lipidation',
2879 'steroid': 'Steroid linkage', # DANI: Esto es posible aunque muy raro y no se como se llama
2880 'ion': Warning('Ion is covalently bonded to protein'), # DANI: esto no es "correcto" pero si habitual
2881 'solvent': Warning('Solvent is covalently bonded to protein'), # DANI: probablemente sea un error
2882 'acetyl': 'Acetylation', # Typical N-capping terminals
2883 'amide': 'Amidation', # Typical C-capping terminals
2884 'other': Warning('Unknow type of PTM'), # Something not supported
2885 }
2887 def find_ptms (self) -> Generator[dict, None, None]:
2888 """Find Post Translational Modifications (PTM) in the structure."""
2889 # Find bonds between protein and non-protein atoms
2890 # First get all protein atoms with bonds
2891 protein_selection = self.select_protein() - self.select_missing_bonds()
2892 if not protein_selection: return
2893 protein_atom_indices = set(protein_selection.atom_indices) # This is used later
2894 protein_outer_bonds = set(self.get_selection_outer_bonds(protein_selection))
2895 non_protein_selection = self.invert_selection(protein_selection)
2896 if not non_protein_selection: return
2897 non_protein_atom_indices = set(non_protein_selection.atom_indices)
2898 non_protein_atom_indices_bonded_to_protein = protein_outer_bonds.intersection(non_protein_atom_indices)
2899 # Get each residue bonded to the protein and based on its 'classification' set the name of the PTM
2900 for atom_index in non_protein_atom_indices_bonded_to_protein:
2901 atom = self.atoms[atom_index]
2902 residue = atom.residue
2903 residue_classification = residue.get_classification()
2904 ptm_classification = self.ptm_options[residue_classification]
2905 # If we found something impossible then raise the error
2906 if type(ptm_classification) == ValueError:
2907 print(f'Problematic residue: {residue}')
2908 raise ptm_classification
2909 # If we do not know what it is then do tag it as a PTM but print a warning
2910 if type(ptm_classification) == Warning:
2911 warn(f'{ptm_classification} -> {residue}')
2912 continue
2913 # At this point we have found a valid PTM
2914 # Find the protein residue linked to this PTM
2915 atom_bonds = self.bonds[atom_index]
2916 protein_bond = next((bond for bond in atom_bonds if bond in protein_atom_indices), None)
2917 if protein_bond == None:
2918 raise ValueError('There must be at least one protein bond to the current atom')
2919 protein_residue_index = self.atoms[protein_bond].residue_index
2920 # Set the PTM
2921 yield { 'name': ptm_classification, 'residue_index': protein_residue_index }
2923 def has_cg (self) -> bool:
2924 """Ask if the structure has at least one coarse grain atom/residue."""
2925 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)
2928# =========================
2929# Related functions
2930# =========================
2932def calculate_distance (atom_1 : Atom, atom_2 : Atom) -> float:
2933 """Calculate the distance between two atoms."""
2934 squared_distances_sum = 0
2935 for i in { 'x': 0, 'y': 1, 'z': 2 }.values():
2936 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i])**2
2937 return math.sqrt(squared_distances_sum)
2939def calculate_angle (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom) -> float:
2940 """Calculate the angle between 3 atoms."""
2941 # From: https://stackoverflow.com/questions/35176451/python-code-to-calculate-angle-between-three-point-using-their-3d-coordinates
2942 # Get coordinates in numpy arrays, which allows to easily calculate the difference
2943 a = np.array(atom_1.coords)
2944 b = np.array(atom_2.coords)
2945 c = np.array(atom_3.coords)
2946 # Set the two vectors which make the angle
2947 ba = a - b
2948 bc = c - b
2949 # Calculate the angle between these 2 vectors
2950 cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
2951 angle = np.arccos(cosine_angle)
2952 return np.degrees(angle)
2954def calculate_torsion (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom, atom_4 : Atom) -> float:
2955 """Calculate the torsion between 4 atoms."""
2956 # From: https://stackoverflow.com/questions/20305272/dihedral-torsion-angle-from-four-points-in-cartesian-coordinates-in-python
2957 p0 = np.array(atom_1.coords)
2958 p1 = np.array(atom_2.coords)
2959 p2 = np.array(atom_3.coords)
2960 p3 = np.array(atom_4.coords)
2961 # Get some vectors
2962 b0 = -1.0*(p1 - p0)
2963 b1 = p2 - p1
2964 b2 = p3 - p2
2965 # normalize b1 so that it does not influence magnitude of vector
2966 # rejections that come next
2967 b1 /= np.linalg.norm(b1)
2968 # vector rejections
2969 # v = projection of b0 onto plane perpendicular to b1
2970 # = b0 minus component that aligns with b1
2971 # w = projection of b2 onto plane perpendicular to b1
2972 # = b2 minus component that aligns with b1
2973 v = b0 - np.dot(b0, b1)*b1
2974 w = b2 - np.dot(b2, b1)*b1
2975 # angle between v and w in a plane is the torsion angle
2976 # v and w may not be normalized but that's fine since tan is y/x
2977 x = np.dot(v, w)
2978 y = np.dot(np.cross(b1, v), w)
2979 return float(np.degrees(np.arctan2(y, x)))
2981# =========================
2982# Auxiliar functions
2983# =========================
2985def get_next_letter (letter : str) -> str:
2986 """Set a function to get the next letter from an input letter in alphabetic order."""
2987 if not letter:
2988 return 'A'
2989 if letter == 'z' or letter == 'Z':
2990 raise InputError("Limit of chain letters has been reached")
2991 next_letter = chr(ord(letter) + 1)
2992 return next_letter
2994def first_cap_only (name : str) -> str:
2995 """Given a string with 1 or 2 characters, return a new string with
2996 the first letter cap and the second letter not cap (if any)"""
2997 if len(name) == 1:
2998 return name.upper()
2999 first_character = name[0].upper()
3000 second_character = name[1].lower()
3001 return first_character + second_character
3003lower_numbers = {
3004 '0': '₀',
3005 '1': '₁',
3006 '2': '₂',
3007 '3': '₃',
3008 '4': '₄',
3009 '5': '₅',
3010 '6': '₆',
3011 '7': '₇',
3012 '8': '₈',
3013 '9': '₉',
3014}
3015def get_lower_numbers (numbers_text : str) -> str:
3016 """Convert numbers to lower text characters (chr 8020-8029)."""
3017 return ''.join([ lower_numbers[c] for c in numbers_text ])
3019def filter_model (pdb_content : str, model : int) -> str:
3020 """Set a function to filter lines in PDB content for a specific model."""
3021 # Make sure the PDB content has multiple models
3022 # If not, then return the whole PDB content ignoring the flag
3023 generic_model_header_regex = r'\nMODEL\s*[0-9]*\s*\n'
3024 if not re.search(generic_model_header_regex, pdb_content):
3025 print(f'PDB content has no models at all so it will be loaded as is (ignored model "{model}").')
3026 return pdb_content
3027 # If a model was passed then it means we must filter the PDB content
3028 filtered_pdb_content = ''
3029 # Search PDB lines until we find our model's header
3030 model_header_regex = rf'^MODEL\s*{model}'
3031 pdb_lines = iter(pdb_content.split('\n'))
3032 for line in pdb_lines:
3033 if not re.match(model_header_regex, line): continue
3034 # If we just found the header
3035 filtered_pdb_content = line
3036 break
3037 # If we did not find the header then stop here
3038 if not filtered_pdb_content: raise RuntimeError(f'Could not find model "{model}" header')
3039 # Add every line to the filtered content until we find the tail
3040 model_footer_regex = r'^ENDMDL'
3041 for line in pdb_lines:
3042 filtered_pdb_content += '\n' + line
3043 if re.match(model_footer_regex, line): return filtered_pdb_content
3044 # If we did not find the footer then stop here
3045 raise RuntimeError(f'Could not find model "{model}" footer')