Coverage for model_workflow/utils/structures.py: 87%
1674 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
1# Main handler of the toolbelt
2import os
3import re
4import math
5import pytraj
6import MDAnalysis
7import numpy as np
8from scipy.special import comb # DANI: Substituye al math.comb porque fué añadido en python 3.8 y nosotros seguimos en 3.7
9from bisect import bisect
11from model_workflow.utils.file import File
12from model_workflow.utils.selections import Selection
13from model_workflow.utils.vmd_spells import get_vmd_selection_atom_indices, get_covalent_bonds
14from model_workflow.utils.mdt_spells import sort_trajectory_atoms
15from model_workflow.utils.auxiliar import InputError, MISSING_BONDS
16from model_workflow.utils.auxiliar import is_imported, residue_name_to_letter, otherwise, warn
17from model_workflow.utils.constants import SUPPORTED_ION_ELEMENTS, SUPPORTED_ELEMENTS
18from model_workflow.utils.constants import STANDARD_COUNTER_CATION_ATOM_NAMES, STANDARD_COUNTER_ANION_ATOM_NAMES
19from model_workflow.utils.constants import STANDARD_SOLVENT_RESIDUE_NAMES, STANDARD_COUNTER_ION_ATOM_NAMES
20from model_workflow.utils.constants import STANDARD_DUMMY_ATOM_NAMES, DUMMY_ATOM_ELEMENT, CG_ATOM_ELEMENT
21from model_workflow.utils.constants import PROTEIN_RESIDUE_NAME_LETTERS, NUCLEIC_RESIDUE_NAME_LETTERS
22from model_workflow.utils.constants import DNA_RESIDUE_NAME_LETTERS, RNA_RESIDUE_NAME_LETTERS
23from model_workflow.utils.constants import FATTY_RESIDUE_NAMES, STEROID_RESIDUE_NAMES
24from model_workflow.utils.type_hints import *
25Coords = Tuple[float, float, float]
28# Set all available chains according to pdb standards
29AVAILABLE_CAPS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
30 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
31AVAILABLE_LOWS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
32 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
33AVAILABLE_LETTERS = AVAILABLE_CAPS + AVAILABLE_LOWS
35# Set letters to be found in alphanumerical bases
36hexadecimal_letters = set(AVAILABLE_CAPS[0:6] + AVAILABLE_LOWS[0:6])
37alphanumerical_letters = set(AVAILABLE_CAPS[6:] + AVAILABLE_LOWS[6:])
39# Set the expected number of bonds for each atom according to its element
40coherent_bonds_with_hydrogen = {
41 'H': { 'min': 1, 'max': 1 },
42 'O': { 'min': 1, 'max': 2 },
43 'N': { 'min': 1, 'max': 4 },
44 'C': { 'min': 2, 'max': 4 },
45 'S': { 'min': 1, 'max': 6 },
46 'P': { 'min': 2, 'max': 6 },
47}
48# WARNING: Not deeply tested
49coherent_bonds_without_hydrogen = {
50 'O': { 'min': 0, 'max': 2 },
51 'N': { 'min': 1, 'max': 3 },
52 'C': { 'min': 1, 'max': 3 },
53 'S': { 'min': 1, 'max': 4 },
54 'P': { 'min': 2, 'max': 4 },
55}
57# Set the limit of residue numbers according to PDB format (4 characters, starts count at 1)
58# This means the last number is 9999 and it is equivalent to index 9998
59pdb_last_decimal_residue_index = 9998
61class Atom:
62 """An atom class."""
64 def __init__ (self,
65 name : Optional[str] = None,
66 element : Optional[str] = None,
67 coords : Optional[Coords] = None,
68 ):
69 self.name = name
70 self.element = element
71 self.coords = tuple(coords) if coords else None
72 # Set variables to store references to other related instances
73 # These variables will be set further by the structure
74 self._structure = None
75 self._index = None
76 self._residue_index = None
78 def __repr__ (self):
79 return '<Atom ' + self.name + '>'
81 def __eq__ (self, other):
82 if type(self) != type(other):
83 return False
84 return self._residue_index == other._residue_index and self.name == other.name
86 def __hash__ (self):
87 return hash((self.index))
89 # The parent structure (read only)
90 # This value is set by the structure itself
91 def get_structure (self) -> Optional['Structure']:
92 return self._structure
93 structure = property(get_structure, None, None, "The parent structure (read only)")
95 # The residue index according to parent structure residues (read only)
96 # This value is set by the structure itself
97 def get_index (self) -> Optional[int]:
98 return self._index
99 index = property(get_index, None, None, "The residue index according to parent structure residues (read only)")
101 # The atom residue index according to parent structure residues
102 # If residue index is set then make changes in all the structure to make this change coherent
103 def get_residue_index (self) -> int:
104 return self._residue_index
105 def set_residue_index (self, new_residue_index : int):
106 # If the new residue index is the current residue index then do nothing
107 # WARNING: It is important to stop this here or it could delete a residue which is not to be deleted
108 if new_residue_index == self.residue_index:
109 return
110 # If there is not strucutre yet it means the residue is beeing set before the structure
111 # We just save the residue index and wait for the structure to be set
112 if not self.structure:
113 self._residue_index = new_residue_index
114 return
115 # Relational indices are updated through a top-down hierarchy
116 # Affected residues are the ones to update this atom internal residue index
117 current_residue = self.residue
118 current_residue.remove_atom(self)
119 new_residue = self.structure.residues[new_residue_index]
120 new_residue.add_atom(self)
121 residue_index = property(get_residue_index, set_residue_index, None, "The atom residue index according to parent structure residues")
123 # The atom residue
124 # If residue is set then make changes in all the structure to make this change coherent
125 def get_residue (self) -> Optional['Residue']:
126 # If there is not strucutre yet it means the atom is beeing set before the structure
127 # In this case it is not possible to get related residues in the structure
128 # It also may happend that the residue is missing because the atom has been set rawly
129 if not self.structure or self.residue_index == None:
130 return None
131 # Get the residue in the structure according to the residue index
132 return self.structure.residues[self.residue_index]
133 def set_residue (self, new_residue : 'Residue'):
134 # Find the new residue index and set it as the atom residue index
135 # Note that the residue must be set in the structure already
136 new_residue_index = new_residue.index
137 if new_residue_index == None:
138 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure')
139 self.set_residue_index(new_residue_index)
140 residue = property(get_residue, set_residue, None, "The atom residue")
142 def get_chain_index (self) -> Optional[int]:
143 """Get the atom chain index according to parent structure chains."""
144 # The residue may be missing if the atom has been set rawly
145 if not self.residue:
146 return None
147 return self.residue.chain_index
149 def set_chain_index (self, new_chain_index : int):
150 """Set a new chain index.
151 WARNING: You may create a new residue in this process."""
152 # If the new chain index is the current chain index do nothing
153 # WARNING: It is important to stop this here or it could delete a chain which is not to be deleted
154 if new_chain_index == self.chain_index:
155 return
156 # If there is not strucutre yet it means the chain is beeing set before the structure
157 # We just save the chain index and wait for the structure to be set
158 if not self.structure:
159 self._chain_index = new_chain_index
160 return
161 # Relational indices are updated through a top-down hierarchy
162 # Affected chains are the ones to update this residue internal chain index
163 # WARNING: It is critical to find the new chain before removing/adding residues
164 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged
165 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed
166 new_chain = self.structure.chains[new_chain_index]
167 # set the new residue where this atom will be appended
168 new_residue = new_chain.find_residue(self.residue.number, self.residue.icode)
169 # If it does not exist yet then create it
170 # It will be a copy of the current residue (name, number and icode) but in the new chain
171 if not new_residue:
172 new_residue = Residue(self.residue.name, self.residue.number, self.residue.icode)
173 self.structure.set_new_residue(new_residue)
174 new_chain.add_residue(new_residue)
175 # Now remove this atom from its current residue and move it to the new chain residue
176 self.residue.remove_atom(self)
177 new_residue.add_atom(self)
178 chain_index = property(get_chain_index, set_chain_index, None, "The atom chain index according to parent structure chains (read only)")
180 # The atom chain (read only)
181 # In order to change the chain it must be changed in the atom residue
182 def get_chain (self) -> Optional['Chain']:
183 # If there is not strucutre yet it means the atom is beeing set before the structure
184 # In this case it is not possible to get related residues in the structure
185 # It also may happend that the chain is missing because the atom has been set rawly
186 if not self.structure or self.chain_index == None:
187 return None
188 # Get the chain in the structure according to the chain index
189 return self.structure.chains[self.chain_index]
190 chain = property(get_chain, None, None, "The atom chain (read only)")
192 def get_bonds (self, skip_ions : bool = False, skip_dummies : bool = False) -> Optional[ List[int] ]:
193 """Get indices of other atoms in the structure which are covalently bonded to this atom."""
194 if not self.structure:
195 raise ValueError('The atom has not a structure defined')
196 if self.index == None:
197 raise ValueError('The atom has not an index defined')
198 bonds = self.structure.bonds[self.index]
199 # If the skip ions argument is passed then remove atom indices of supported ion atoms
200 if skip_ions:
201 bonds = list(set(bonds) - self.structure.ion_atom_indices)
202 if skip_dummies:
203 bonds = list(set(bonds) - self.structure.dummy_atom_indices)
204 return bonds
206 # Atoms indices of atoms in the structure which are covalently bonded to this atom
207 bonds = property(get_bonds, None, None, 'Atoms indices of atoms in the structure which are covalently bonded to this atom')
209 def get_bonded_atoms (self) -> List['Atom']:
210 """Get bonded atoms."""
211 return [ self.structure.atoms[atom_index] for atom_index in self.bonds ]
213 def get_selection (self) -> 'Selection':
214 """Generate a selection for this atom."""
215 return Selection([self.index])
217 def copy (self) -> 'Atom':
218 """Make a copy of the current atom."""
219 atom_copy = Atom(self.name, self.element, self.coords)
220 atom_copy._structure = self._structure
221 atom_copy._index = self._index
222 atom_copy._residue_index = self._residue_index
223 return atom_copy
225 def is_fatty_candidate (self) -> bool:
226 """
227 Check if this atom meets specific criteria:
228 1. It is a carbon
229 2. It is connected only to other carbons and hydrogens
230 3. It is connected to 1 or 2 carbons
231 """
232 # Ignore non carbon atoms
233 if self.element != 'C':
234 return False
235 # Get bonded atom elements
236 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ]
237 # Check only carbon and hydrogen atoms to be bonded
238 if any((element != 'C' and element != 'H') for element in bonded_atoms_elements):
239 return False
240 # Check it is connected to 1 or 2 carbons
241 connected_carbons_count = bonded_atoms_elements.count('C')
242 if connected_carbons_count != 1 and connected_carbons_count != 2:
243 return False
244 return True
246 def is_carbohydrate_candidate (self) -> bool:
247 """
248 Check if this atom meets specific criteria:
249 1. It is a carbon
250 2. It is connected only to other carbons, hydrogens or oxygens
251 3. It is connected to 1 or 2 carbons
252 4. It is connected to 1 oxygen
253 """
254 # Ignore non carbon atoms
255 if self.element != 'C':
256 return False
257 # Get bonded atom elements
258 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ]
259 # Check only carbon and hydrogen atoms to be bonded
260 if any((element != 'C' and element != 'H' and element != 'O') for element in bonded_atoms_elements):
261 return False
262 # Check it is connected to 1 or 2 carbons
263 connected_carbons_count = bonded_atoms_elements.count('C')
264 if connected_carbons_count != 1 and connected_carbons_count != 2:
265 return False
266 # Check it is connected to 1 oxygen
267 connected_oxygens_count = bonded_atoms_elements.count('O')
268 if connected_oxygens_count != 1:
269 return False
270 return True
272 def is_ion (self) -> bool:
273 """Check if it is an ion by checking if it has no bonds with other atoms."""
274 return len(self.bonds) == 0
276 def guess_element (self) -> str:
277 """Guess an atom element from its name and number of bonds."""
278 # If the atom name is among the known dummy atoms then return a standard element for dummy atoms
279 if self.name.upper() in STANDARD_DUMMY_ATOM_NAMES:
280 return DUMMY_ATOM_ELEMENT
281 # If the element is already the "coarse grained" flag then return this very same flag
282 # WARNING: This has to be preset before guessing or the guess may fail
283 if self.is_cg():
284 return CG_ATOM_ELEMENT
285 # If the name is SOD and it is a lonely atom then it is clearly sodium, not sulfur
286 if self.name.upper() == 'SOD' and self.residue.atom_count == 1:
287 return 'Na'
288 # Find a obvios element name in the atom name
289 element = self.get_name_suggested_element()
290 # CA may refer to calcium or alpha carbon, so the number of atoms in the residue is decisive
291 if element == 'Ca' and self.residue.atom_count >= 2:
292 return 'C'
293 # NA may refer to sodium or some ligand nitrogen, so the number of atoms in the residue is decisive
294 if element == 'Na' and self.residue.atom_count >= 2:
295 return 'N'
296 return element
298 def get_name_suggested_element (self) -> str:
299 """Guess an atom element from its name only."""
300 # Get the atom name and its characters length
301 name = self.name
302 length = len(name)
303 next_character = None
304 for i, character in enumerate(name):
305 # Get the next character, since element may be formed by 2 letters
306 if i < length - 1:
307 next_character = name[i+1]
308 # If next character is not a string then ignore it
309 if not next_character.isalpha():
310 next_character = None
311 # Try to get all possible matches between the characters and the supported atoms
312 # First letter is always caps
313 character = character.upper()
314 # First try to match both letters together
315 if next_character:
316 # Start with the second letter in caps
317 next_character = next_character.upper()
318 both = character + next_character
319 if both in SUPPORTED_ELEMENTS:
320 return both
321 # Continue with the second letter in lowers
322 next_character = next_character.lower()
323 both = character + next_character
324 if both in SUPPORTED_ELEMENTS:
325 return both
326 # Finally, try with the first character alone
327 if character in SUPPORTED_ELEMENTS:
328 return character
329 raise InputError(f"Cannot guess atom element from atom name '{name}'")
331 # Get a standard label
332 def get_label (self) -> str:
333 return f'{self.residue.label}.{self.name} (index {self.index})'
334 # The atom standard label (read only)
335 label = property(get_label, None, None)
337 # Ask if the current atom is in coarse grain
338 def is_cg (self) -> bool:
339 return self.element == CG_ATOM_ELEMENT
341# A residue
342class Residue:
343 def __init__ (self,
344 name : Optional[str] = None,
345 number : Optional[int] = None,
346 icode : Optional[str] = None,
347 ):
348 self.name = name
349 self.number = number
350 self.icode = icode
351 # Set variables to store references to other related instaces
352 # These variables will be set further by the structure
353 self._structure = None
354 self._index = None
355 self._atom_indices = []
356 self._chain_index = None
357 self._classification = None
359 def __repr__ (self):
360 return '<Residue ' + self.name + str(self.number) + (self.icode if self.icode else '') + '>'
362 def __eq__ (self, other):
363 if type(self) != type(other):
364 return False
365 return (
366 self._chain_index == other._chain_index and
367 #self.name == other.name and
368 self.number == other.number and
369 self.icode == other.icode
370 )
372 def __hash__ (self):
373 # WARNING: This is susceptible to duplicated residues
374 return hash((self._chain_index, self.number, self.icode))
375 # WARNING: This is not susceptible to duplicated residues
376 #return hash(tuple(self._atom_indices))
378 # The parent structure (read only)
379 # This value is set by the structure itself
380 def get_structure (self) -> Optional['Structure']:
381 return self._structure
382 structure = property(get_structure, None, None, "The parent structure (read only)")
384 # The residue index according to parent structure residues (read only)
385 # This value is set by the structure itself
386 def get_index (self) -> Optional[int]:
387 return self._index
388 def set_index (self, index):
389 # Update residue atoms
390 for atom in self.atoms:
391 atom._residue_index = index
392 # Update residue chain
393 chain_residue_index = self.chain._residue_indices.index(self._index)
394 self.chain._residue_indices[chain_residue_index] = index
395 # Finally update self index
396 self._index = index
397 index = property(get_index, set_index, None, "The residue index according to parent structure residues (read only)")
399 # The atom indices according to parent structure atoms for atoms in this residue
400 # If atom indices are set then make changes in all the structure to make this change coherent
401 def get_atom_indices (self) -> List[int]:
402 return self._atom_indices
403 def set_atom_indices (self, new_atom_indices : List[int]):
404 # If there is not strucutre yet it means the residue is beeing set before the structure
405 # We just save atom indices and wait for the structure to be set
406 if not self.structure:
407 self._atom_indices = new_atom_indices
408 return
409 # Update the current atoms
410 for atom in self.atoms:
411 atom._residue_index = None
412 # Update the new atoms
413 for index in new_atom_indices:
414 atom = self.structure.atoms[index]
415 atom._residue_index = self.index
416 # Now new indices are coherent and thus we can save them
417 self._atom_indices = new_atom_indices
418 atom_indices = property(get_atom_indices, set_atom_indices, None, "The atom indices according to parent structure atoms for atoms in this residue")
420 # The atoms in this residue
421 # If atoms are set then make changes in all the structure to make this change coherent
422 def get_atoms (self) -> List['Atom']:
423 # If there is not strucutre yet it means the chain is beeing set before the structure
424 # In this case it is not possible to get related atoms in the structure
425 if not self.structure:
426 return []
427 # Get atoms in the structure according to atom indices
428 atoms = self.structure.atoms
429 return [ atoms[atom_index] for atom_index in self.atom_indices ]
430 def set_atoms (self, new_atoms : List['Atom']):
431 # Find indices for new atoms and set their indices as the new atom indices
432 # Note that atoms must be set in the structure already
433 new_atom_indices = []
434 for new_atom in new_atoms:
435 new_atom_index = new_atom.index
436 if new_atom_index == None:
437 raise ValueError('Atom ' + str(new_atom) + ' is not set in the structure')
438 new_atom_indices.append(new_atom_index)
439 self.set_atom_indices(new_atom_indices)
440 atoms = property(get_atoms, set_atoms, None, "The atoms in this residue")
442 # Get the number of atoms in the residue
443 def get_atom_count (self) -> int:
444 return len(self.atom_indices)
445 atom_count = property(get_atom_count, None, None, "The number of atoms in the residue (read only)")
447 # Add an atom to the residue
448 def add_atom (self, new_atom : 'Atom'):
449 # Insert the new atom index in the list of atom indices keeping the order
450 new_atom_index = new_atom.index
451 sorted_atom_index = bisect(self.atom_indices, new_atom_index)
452 self.atom_indices.insert(sorted_atom_index, new_atom_index)
453 # Update the atom internal index
454 new_atom._residue_index = self.index
456 # Remove an atom from the residue
457 def remove_atom (self, current_atom : 'Atom'):
458 # Remove the current atom index from the atom indices list
459 self.atom_indices.remove(current_atom.index) # This index MUST be in the list
460 # If we removed the last atom then this residue must be removed from its structure
461 if len(self.atom_indices) == 0 and self.structure:
462 self.structure.purge_residue(self)
463 # Update the atom internal index
464 current_atom._residue_index = None
466 # The residue chain index according to parent structure chains
467 # If chain index is set then make changes in all the structure to make this change coherent
468 def get_chain_index (self) -> int:
469 return self._chain_index
470 def set_chain_index (self, new_chain_index : int):
471 # If the new chain index is the current chain index do nothing
472 # WARNING: It is important to stop this here or it could delete a chain which is not to be deleted
473 if new_chain_index == self.chain_index:
474 return
475 # If there is not strucutre yet it means the chain is beeing set before the structure
476 # We just save the chain index and wait for the structure to be set
477 if not self.structure:
478 self._chain_index = new_chain_index
479 return
480 # Relational indices are updated through a top-down hierarchy
481 # Affected chains are the ones to update this residue internal chain index
482 # WARNING: It is critical to find the new chain before removing/adding residues
483 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged
484 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed
485 new_chain = self.structure.chains[new_chain_index]
486 if self.chain:
487 self.chain.remove_residue(self)
488 new_chain.add_residue(self)
489 chain_index = property(get_chain_index, set_chain_index, None, "The residue chain index according to parent structure chains")
491 # The residue chain
492 # If chain is set then make changes in all the structure to make this change coherent
493 def get_chain (self) -> Optional['Chain']:
494 # If there is not strucutre yet then it means the residue is set before the structure
495 # In this case it is not possible to get related chain in the structure
496 if not self.structure or self.chain_index == None:
497 return None
498 # Get the chain in the structure according to the chain index
499 return self.structure.chains[self.chain_index]
500 def set_chain (self, new_chain : Union['Chain', str]):
501 # In case the chain is just a string we must find/create the corresponding chain
502 if type(new_chain) == str:
503 letter = new_chain
504 # Get the residue structure
505 structure = self.structure
506 if not structure:
507 raise ValueError(f'Cannot find the corresponding {new_chain} chain without the structure')
508 # Find if the letter belongs to an already existing chain
509 new_chain = structure.get_chain_by_name(letter)
510 # If the chain does not exist yet then create it
511 if not new_chain:
512 new_chain = Chain(name=letter)
513 structure.set_new_chain(new_chain)
514 # Find the new chain index and set it as the residue chain index
515 # Note that the chain must be set in the structure already
516 new_chain_index = new_chain.index
517 if new_chain_index == None:
518 raise ValueError(f'Chain {new_chain} is not set in the structure')
519 self.set_chain_index(new_chain_index)
520 chain = property(get_chain, set_chain, None, "The residue chain")
522 # Get atom indices from atoms bonded to this residue
523 def get_bonded_atom_indices (self) -> List[int]:
524 # Get all bonds among residue atoms
525 all_bonds = []
526 for atom in self.atoms:
527 all_bonds += atom.bonds
528 # Remove self atom indices from the list
529 external_bonds = set(all_bonds) - set(self.atom_indices)
530 return list(external_bonds)
532 def get_bonded_atoms (self) -> List['Atom']:
533 """Get atoms bonded to this residue."""
534 return [ self.structure.atoms[atom_index] for atom_index in self.get_bonded_atom_indices() ]
536 def get_bonded_residue_indices (self) -> List[int]:
537 """Get residue indices from residues bonded to this residue."""
538 return list(set([ atom.residue_index for atom in self.get_bonded_atoms() ]))
540 def get_bonded_residues (self) -> List['Residue']:
541 """Get residues bonded to this residue."""
542 return [ self.structure.residues[residue_index] for residue_index in self.get_bonded_residue_indices() ]
544 def is_bonded_with_residue (self, other : 'Residue') -> bool:
545 """Given another residue, check if it is bonded with this residue."""
546 bonded_atom_indices = set(self.get_bonded_atom_indices())
547 if next((index for index in other.atom_indices if index in bonded_atom_indices), None) != None: return True
548 return False
550 def get_classification (self) -> str:
551 """
552 Get the residue biochemistry classification.
554 WARNING: Note that this logic will not work in a structure without hydrogens.
556 Available classifications:
557 - protein
558 - dna
559 - rna
560 - carbohydrate
561 - fatty
562 - steroid
563 - ion
564 - solvent
565 - acetyl
566 - amide
567 - other
568 """
569 # Return the internal value, if any
570 if self._classification:
571 return self._classification
572 # If this is a CG residue then we can not classify it
573 if self.is_cg(): return self.get_classification_by_name()
574 # If we dont have a value then we must classify the residue
575 # -------------------------------------------------------------------------------------------------------
576 # Ions are single atom residues
577 if self.atom_count == 1:
578 self._classification = 'ion'
579 return self._classification
580 # -------------------------------------------------------------------------------------------------------
581 # At this point we need atoms to have elements fixed
582 # WARNING: missing elements would result in silent failure to recognize some classifications
583 if self.structure._fixed_atom_elements == False:
584 self.structure.fix_atom_elements()
585 # Solvent is water molecules
586 # First rely on the residue name
587 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES:
588 self._classification = 'solvent'
589 return self._classification
590 # It may be water with a not known name
591 # Literally check if its a molecule with 3 atoms: 2 hydrogens and 1 oxygen
592 if self.atom_count == 3:
593 atom_elements = [ atom.element for atom in self.atoms ]
594 if atom_elements.count('H') == 2 and atom_elements.count('O') == 1:
595 self._classification = 'solvent'
596 return self._classification
597 # -------------------------------------------------------------------------------------------------------
598 # Protein definition according to vmd:
599 # a residue with atoms named C, N, CA, and O
600 # In our case we accept OC1 and OC2 or OT1 and OT2 instead of O for terminal residues
601 atom_names = set([ atom.name for atom in self.atoms ])
602 if ( all((name in atom_names) for name in ['C', 'N', 'CA']) and (
603 'O' in atom_names or { 'OC1', 'OC2' } <= atom_names or { 'OT1', 'OT2' } <= atom_names
604 )):
605 self._classification = 'protein'
606 return self._classification
607 # -------------------------------------------------------------------------------------------------------
608 # Nucleic acids definition according to vmd:
609 # a residue with atoms named P, O1P, O2P and either O3’, C3’, C4’, C5’, O5’ or O3*, C3*, C4*, C5*, O5*
610 # Apparently it has been fixed so now a residue does not need to be phosphorylated to be considered nucleic
611 # LORE: This included the condition "all((name in atom_names) for name in ['P', 'OP1', 'OP2'])"
612 if (( all((name in atom_names) for name in ["O3'", "C3'", "C4'", "C5'", "O5'"]) or
613 all((name in atom_names) for name in ["O3*", "C3*", "C4*", "C5*", "O5*"]) )):
614 # At this point we know it is nucleic
615 # We must tell the difference between DNA and RNA
616 if "O2'" in atom_names or "O2*" in atom_names:
617 self._classification = 'rna'
618 else:
619 self._classification = 'dna'
620 return self._classification
621 # -------------------------------------------------------------------------------------------------------
622 # To define carbohydrates search for rings made of 1 oxygen and 'n' carbons
623 # WARNING: This logic may fail for some very specific molecules such as furine
624 # LIMITATION: This logic only aims for cyclical carbohydrates. The linear form of carbohydrates is not yet consider
625 rings = list(self.find_rings())
626 for ring in rings:
627 ring_elements = [ atom.element for atom in ring ]
628 # Check the ring has 1 oxygen
629 oxygen_count = ring_elements.count('O')
630 if oxygen_count != 1:
631 continue
632 # Check the rest are only carbon
633 carbon_count = ring_elements.count('C')
634 if carbon_count != len(ring) - 1:
635 continue
636 self._classification = 'carbohydrate'
637 return self._classification
638 # -------------------------------------------------------------------------------------------------------
639 # To define steroids search for 3 6-carbon rings and 1 5-carbon ring (at least)
640 # WARNING: According to this logic some exotical non-steroid molecules may result in false positives
641 num_6_carbon_rings = 0
642 num_5_carbon_rings = 0
643 for ring in rings:
644 ring_elements = [ atom.element for atom in ring ]
645 if any(element != 'C' for element in ring_elements):
646 continue
647 ring_lenght = len(ring)
648 if ring_lenght == 5:
649 num_5_carbon_rings += 1
650 if ring_lenght == 6:
651 num_6_carbon_rings += 1
652 if num_6_carbon_rings >= 3 and num_5_carbon_rings >= 1:
653 self._classification = 'steroid'
654 return self._classification
655 # -------------------------------------------------------------------------------------------------------
656 # To define fatty acids search for a series of carbon atoms connected together (3 at least)
657 # These carbon atoms must be connected only to hydrogen in addition to 1-2 carbon
658 # These carbons must not be bonded in a cycle so check atoms not be repeated in the series
659 bonded_fatty_atoms = []
660 def get_bonded_fatty_atoms_recursive (current_atom : Atom, previous_atom : Optional[Atom] = None) -> bool:
661 # Iterate over the input atom bonded atoms
662 for bonded_atom in current_atom.get_bonded_atoms():
663 # Discard the previous atom to avoid an infinite loop
664 if bonded_atom == previous_atom:
665 continue
666 # If we find an atom which is already in the fatty atoms list then it means we are cycling
667 if bonded_atom in bonded_fatty_atoms:
668 return False
669 # If this is not a fatty atom then discard it
670 if not bonded_atom.is_fatty_candidate():
671 continue
672 # Add the current bonded fatty atom to the list
673 bonded_fatty_atoms.append(bonded_atom)
674 # Find more bonded fatty atoms and add them to list as well
675 if get_bonded_fatty_atoms_recursive(current_atom=bonded_atom, previous_atom=current_atom) == False:
676 return False
677 return True
678 # Now check all atoms and try to set the series until we find one which works
679 already_checked_atoms = []
680 for atom in self.atoms:
681 # If we already checked this atom trough the recursive logic then skip it
682 if atom in already_checked_atoms:
683 continue
684 # If the atom is not a fatty candidate then skip it
685 if not atom.is_fatty_candidate():
686 continue
687 # Now that we have found a suitable candidate atom we start the series
688 bonded_fatty_atoms = [atom]
689 if get_bonded_fatty_atoms_recursive(atom) and len(bonded_fatty_atoms) >= 3:
690 self._classification = 'fatty'
691 return self._classification
692 already_checked_atoms += bonded_fatty_atoms
693 # -------------------------------------------------------------------------------------------------------
694 # Check if it is an acetylation capping terminal
695 if self.name == 'ACE':
696 self._classification = 'acetyl'
697 return self._classification
698 # -------------------------------------------------------------------------------------------------------
699 # Check if it is an N-methyl amide capping terminal
700 if self.name == 'NME':
701 self._classification = 'amide'
702 return self._classification
703 # -------------------------------------------------------------------------------------------------------
704 self._classification = 'other'
705 return self._classification
707 # Set an alternative function to "try" to classify the residues according only to its name
708 # This is useful for corase grain residues whose atoms may not reflect the real atoms
709 # WARNING: This logic is very limited and will return "unknown" most of the times
710 # WARNING: This logic relies mostly in atom names, which may be not standard
711 def get_classification_by_name (self) -> str:
712 if self.name in PROTEIN_RESIDUE_NAME_LETTERS:
713 return 'protein'
714 if self.name in DNA_RESIDUE_NAME_LETTERS:
715 return 'dna'
716 if self.name in RNA_RESIDUE_NAME_LETTERS:
717 return 'rna'
718 if self.name in NUCLEIC_RESIDUE_NAME_LETTERS:
719 return 'nucleic'
720 if self.name in FATTY_RESIDUE_NAMES:
721 return 'fatty'
722 if self.name in STEROID_RESIDUE_NAMES:
723 return 'steroid'
724 if self.name in STANDARD_COUNTER_ION_ATOM_NAMES:
725 return 'ion'
726 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES:
727 return 'solvent'
728 # If we do not know what it is
729 return 'unknown'
731 # The residue biochemistry classification (read only)
732 classification = property(get_classification, None, None)
734 # Generate a selection for this residue
735 def get_selection (self) -> 'Selection':
736 return Selection(self.atom_indices)
738 # Make a copy of the current residue
739 def copy (self) -> 'Residue':
740 residue_copy = Residue(self.name, self.number, self.icode)
741 residue_copy._structure = self._structure
742 residue_copy._index = self._index
743 residue_copy._atom_indices = self._atom_indices
744 residue_copy._chain_index = self._chain_index
745 return residue_copy
747 # Get the residue equivalent single letter code
748 # Note that this makes sense for aminoacids and nucleotides only
749 # Non recognized residue names return 'X'
750 def get_letter (self) -> str:
751 return residue_name_to_letter(self.name)
753 # Get the formula of the residue
754 def get_formula (self) -> str:
755 elements = [ atom.element for atom in self.atoms ]
756 unique_elements = list(set(elements))
757 formula = ''
758 for unique_element in unique_elements:
759 count = elements.count(unique_element)
760 number = get_lower_numbers(str(count)) if count > 1 else ''
761 formula += unique_element + number
762 return formula
764 # Find rings in the residue
765 def find_rings (self) -> Generator[ List[Atom], None, None ]:
766 return self.structure.find_rings(selection=self.get_selection())
769 def split (self,
770 first_residue_atom_indices : List[int],
771 second_residue_atom_indices : List[int],
772 first_residue_name : Optional[str] = None,
773 second_residue_name : Optional[str] = None,
774 first_residue_number : Optional[int] = None,
775 second_residue_number : Optional[int] = None,
776 first_residue_icode : Optional[str] = None,
777 second_residue_icode : Optional[str] = None,
778 ) -> Tuple['Residue', 'Residue']:
779 """
780 Split this residue in 2 residues and return them in a tuple.
781 Keep things coherent in the structure (renumerate all residues below this one).
782 Note that all residue atoms must be covered by the splits.
783 """
784 # This function is expected to be called in a residue with an already set structure
785 if not self.structure:
786 raise InputError('The split function should be called when the residue has an already defined structure')
787 # Make sure all atoms in the residue are covered between both the first and the second residue
788 if set(self.atom_indices) != set(first_residue_atom_indices + second_residue_atom_indices):
789 print('Residue atoms: ' + ', '.join([ str(v) for v in set(self.atom_indices) ]))
790 print('Covered atoms: ' + ', '.join([ str(v) for v in set(first_residue_atom_indices + second_residue_atom_indices) ]))
791 raise InputError('All atom indices must be covered between both the first and the second residue')
792 # Reuse this first residue instead of creating a new one
793 if first_residue_name:
794 self.name = first_residue_name
795 if first_residue_number:
796 self.number = first_residue_number
797 if first_residue_icode:
798 self.icode = first_residue_icode
799 # Set the new second residue
800 _second_residue_name = second_residue_name if second_residue_name else self.name
801 _second_residue_number = second_residue_number if second_residue_number else self.number
802 _second_residue_icode = second_residue_icode if second_residue_icode else get_next_letter(self.icode)
803 second_residue = Residue(_second_residue_name, _second_residue_number, _second_residue_icode)
804 second_residue._structure = self.structure
805 new_residue_index = self.index + 1
806 second_residue._index = new_residue_index
807 # Insert the second residue in the structure residues list right after this residue
808 self.structure.residues.insert(new_residue_index, second_residue)
809 # Set the second residue index
810 # Update the index of all residues which have been shifted after the insertion
811 for residue_index in range(new_residue_index + 1, len(self.structure.residues)):
812 residue = self.structure.residues[residue_index]
813 residue.index = residue_index
814 # Now transfer atoms from residue 1 to residue 2 as it is specified
815 # Note that this will automatically update every atom
816 second_residue.atom_indices = second_residue_atom_indices
817 # Now add the new residue to the chain
818 self.chain.add_residue(second_residue)
820 # Parse atom names to atom indices and then call the split function
821 def split_by_atom_names (self,
822 first_residue_atom_names : List[str],
823 second_residue_atom_names : List[str],
824 first_residue_name : Optional[str] = None,
825 second_residue_name : Optional[str] = None,
826 first_residue_number : Optional[int] = None,
827 second_residue_number : Optional[int] = None,
828 first_residue_icode : Optional[str] = None,
829 second_residue_icode : Optional[str] = None,
830 ) -> Tuple['Residue', 'Residue']:
831 # Check all atom names to exist in the residue
832 input_atom_names = set(first_residue_atom_names + second_residue_atom_names)
833 residue_atom_names = set([ atom.name for atom in self.atoms ])
834 if input_atom_names != residue_atom_names:
835 print(self)
836 print(self.atoms)
837 print('Input atom names: ' + ', '.join(input_atom_names))
838 print('Residue atom names: ' + ', '.join(residue_atom_names))
839 raise InputError('All residue atoms must be covered between both the first and the second residue atom names')
840 # Convert atom names to atom indices
841 first_residue_atom_indices = [ self.get_atom_by_name(name).index for name in first_residue_atom_names ]
842 second_residue_atom_indices = [ self.get_atom_by_name(name).index for name in second_residue_atom_names ]
843 # Call the actual split logic
844 return self.split(first_residue_atom_indices, second_residue_atom_indices, first_residue_name,
845 second_residue_name, first_residue_number, second_residue_number, first_residue_icode, second_residue_icode)
847 # Get a residue atom given its name
848 def get_atom_by_name (self, atom_name : str) -> 'Atom':
849 return next(( atom for atom in self.atoms if atom.name == atom_name ), None)
851 # Get a standard label
852 def get_label (self) -> str:
853 chainname = self.chain.name if self.chain.name.strip() else ''
854 return f'{chainname}{self.number}{self.icode}({self.name})'
855 # The residue standard label (read only)
856 label = property(get_label, None, None)
858 # Ask if the current residue is in coarse grain
859 # Note that we asome there may be not hybrid aa/cg residues
860 def is_cg (self) -> bool:
861 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)
863# A chain
864class Chain:
865 def __init__ (self,
866 name : Optional[str] = None,
867 classification : Optional[str] = None,
868 ):
869 self.name = name
870 self._classification = classification
871 # Set variables to store references to other related instaces
872 # These variables will be set further by the structure
873 self._structure = None
874 self._index = None
875 self._residue_indices = []
877 def __repr__ (self):
878 return '<Chain ' + self.name + '>'
880 def __eq__ (self, other):
881 return self.name == other.name
883 def __hash__ (self):
884 return hash(self.name)
886 # The parent structure (read only)
887 # This value is set by the structure itself
888 def get_structure (self) -> Optional['Structure']:
889 return self._structure
890 structure = property(get_structure, None, None, "The parent structure (read only)")
892 # The residue index according to parent structure residues (read only)
893 # This value is set by the structure itself
894 def get_index (self) -> Optional[int]:
895 return self._index
896 # When the index is set all residues are updated with the next chain index
897 def set_index (self, index : int):
898 for residue in self.residues:
899 residue._chain_index = index
900 self._index = index
901 index = property(get_index, set_index, None, "The residue index according to parent structure residues (read only)")
903 # The residue indices according to parent structure residues for residues in this chain
904 # If residue indices are set then make changes in all the structure to make this change coherent
905 def get_residue_indices (self) -> List[int]:
906 return self._residue_indices
907 def set_residue_indices (self, new_residue_indices : List[int]):
908 # If there is not strucutre yet it means the chain is beeing set before the structure
909 # We just save residue indices and wait for the structure to be set
910 if not self.structure:
911 self._residue_indices = new_residue_indices
912 return
913 # Update the current residues
914 for residue in self.residues:
915 residue._chain_index = None
916 # Update the new residues
917 for index in new_residue_indices:
918 residue = self.structure.residues[index]
919 residue._chain_index = self.index
920 # In case the new residue indices list is empty this chain must be removed from its structure
921 if len(new_residue_indices) == 0:
922 self.structure.purge_chain(self)
923 # Now new indices are coherent and thus we can save them
924 self._residue_indices = new_residue_indices
925 residue_indices = property(get_residue_indices, set_residue_indices, None, "The residue indices according to parent structure residues for residues in this residue")
927 # The residues in this chain
928 # If residues are set then make changes in all the structure to make this change coherent
929 def get_residues (self) -> List['Residue']:
930 # If there is not strucutre yet it means the chain is beeing set before the structure
931 # In this case it is not possible to get related residues in the structure
932 if not self.structure:
933 return []
934 # Get residues in the structure according to residue indices
935 residues = self.structure.residues
936 return [ residues[residue_index] for residue_index in self.residue_indices ]
938 def set_residues (self, new_residues : List['Residue']):
939 """Find indices for new residues and set their indices as the new residue indices.
940 Note that residues must be set in the structure already."""
941 new_residue_indices = []
942 for new_residue in new_residues:
943 new_residue_index = new_residue.index
944 if new_residue_index == None:
945 raise ValueError(f'Residue {new_residue} is not set in the structure')
946 new_residue_indices.append(new_residue_index)
947 self.set_residue_indices(new_residue_indices)
948 residues = property(get_residues, set_residues, None, "The residues in this chain")
950 # Add a residue to the chain
951 def add_residue (self, residue : 'Residue'):
952 # Insert the new residue index in the list of residue indices keeping the order
953 sorted_residue_index = bisect(self.residue_indices, residue.index)
954 self.residue_indices.insert(sorted_residue_index, residue.index)
955 # Update the residue internal chain index
956 residue._chain_index = self.index
958 # Remove a residue from the chain
959 # WARNING: Note that this function does not trigger the set_residue_indices
960 def remove_residue (self, residue : 'Residue'):
961 self.residue_indices.remove(residue.index) # This index MUST be in the list
962 # If we removed the last residue then this chain must be removed from its structure
963 if len(self.residue_indices) == 0 and self.structure:
964 self.structure.purge_chain(self)
965 # Update the residue internal chain index
966 residue._chain_index = None
968 # Atom indices for all atoms in the chain (read only)
969 # In order to change atom indices they must be changed in their corresponding residues
970 def get_atom_indices (self) -> List[int]:
971 return sum([ residue.atom_indices for residue in self.residues ], [])
972 atom_indices = property(get_atom_indices, None, None, "Atom indices for all atoms in the chain (read only)")
974 # Atoms in the chain (read only)
975 # In order to change atoms they must be changed in their corresponding residues
976 def get_atoms (self) -> List[int]:
977 return sum([ residue.atoms for residue in self.residues ], [])
978 atoms = property(get_atoms, None, None, "Atoms in the chain (read only)")
980 # Get the number of atoms in the chain (read only)
981 def get_atom_count (self) -> int:
982 return len(self.atom_indices)
983 atom_count = property(get_atom_count, None, None, "Number of atoms in the chain (read only)")
985 # Get the number of residues in the chain (read only)
986 def get_residue_count (self) -> int:
987 return len(self._residue_indices)
988 residue_count = property(get_residue_count, None, None, "Number of residues in the chain (read only)")
990 # Get the chain classification
991 def get_classification (self) -> str:
992 if self._classification:
993 return self._classification
994 self._classification = self.structure.get_selection_classification(self.get_selection())
995 return self._classification
997 # Force the chain classification
998 def set_classification (self, classification : str):
999 self._classification = classification
1001 # Chain classification
1002 classification = property(get_classification, set_classification, None, "Classification of the chain (manual or automatic)")
1004 # Get the residues sequence in one-letter code
1005 def get_sequence (self) -> str:
1006 return ''.join([ residue_name_to_letter(residue.name) for residue in self.residues ])
1008 # Generate a selection for this chain
1009 def get_selection (self) -> 'Selection':
1010 return Selection(self.atom_indices)
1012 # Make a copy of the current chain
1013 def copy (self) -> 'Chain':
1014 chain_copy = Chain(self.name)
1015 chain_copy._structure = self._structure
1016 chain_copy._index = self._index
1017 chain_copy.residue_indices = self.residue_indices
1018 return chain_copy
1020 # Ask if the current chain has at least one coarse grain atom/residue
1021 def has_cg (self) -> bool:
1022 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)
1024 # Find a residue by its number and insertion code
1025 def find_residue (self, number : int, icode : str = '') -> Optional['Residue']:
1026 # Iterate chain residues
1027 for residue in self.residues:
1028 if residue.number == number and residue.icode == icode:
1029 return residue
1030 return None
1032# A structure is a group of atoms organized in chains and residues
1033class Structure:
1034 def __init__ (self,
1035 atoms : List['Atom'] = [],
1036 residues : List['Residue'] = [],
1037 chains : List['Chain'] = [],
1038 residue_numeration_base : int = 10,
1039 ):
1040 self.atoms = []
1041 self.residues = []
1042 self.chains = []
1043 # Set references between instances
1044 for atom in atoms:
1045 self.set_new_atom(atom)
1046 for residue in residues:
1047 self.set_new_residue(residue)
1048 for chain in chains:
1049 self.set_new_chain(chain)
1050 # --- Set other internal variables ---
1051 # Set bonds between atoms
1052 self._bonds = None
1053 # Set fragments of bonded atoms
1054 self._fragments = None
1055 # --- Set other auxiliar variables ---
1056 # Trajectory atom sorter is a function used to sort coordinates in a trajectory file
1057 # This function is generated when sorting indices in the structure
1058 # Also the new atom order is saved
1059 self.trajectory_atom_sorter = None
1060 self.new_atom_order = None
1061 # Track when atom elements have been fixed
1062 self._fixed_atom_elements = False
1063 # Save indices of supported ion atoms
1064 self._ion_atom_indices = None
1065 self._dummy_atom_indices = None
1066 # Set the scale of the residue numbers
1067 # It may be decimal (10), hexadecimal(16) or alphanumeric(36)
1068 # Note that hybrid scales (1-9999 decimal and different further) are not explicitly supported
1069 # However, the scale is guessed on read and conserved on write, so the original numeration would be conserved
1070 # The default values is used only when the structure is not read from a PDB
1071 self.residue_numeration_base = residue_numeration_base
1073 def __repr__ (self):
1074 return f'<Structure ({self.atom_count} atoms)>'
1076 # The bonds between atoms
1077 def get_bonds (self) -> List[ List[int] ]:
1078 # Return the stored value, if exists
1079 if self._bonds:
1080 return self._bonds
1081 # If not, we must calculate the bonds using vmd
1082 self._bonds = self.get_covalent_bonds()
1083 return self._bonds
1084 # Force specific bonds
1085 def set_bonds (self, bonds : List[ List[int] ]):
1086 self._bonds = bonds
1087 # Reset fragments
1088 self._fragments = None
1089 bonds = property(get_bonds, set_bonds, None, "The structure bonds")
1091 # Get the groups of atoms which are covalently bonded
1092 def get_fragments (self) -> List['Selection']:
1093 # Return the stored value, if exists
1094 if self._fragments != None:
1095 return self._fragments
1096 # Otherwise, find fragments in all structure atoms
1097 self._fragments = list(self.find_fragments())
1098 return self._fragments
1099 # Fragments of covalently bonded atoms (read only)
1100 fragments = property(get_fragments, None, None, "The structure fragments (read only)")
1102 # Find fragments* in a selection of atoms
1103 # * A fragment is a selection of colvalently bonded atoms
1104 # All atoms are searched if no selection is provided
1105 # WARNING: Note that fragments generated from a specific selection may not match the structure fragments
1106 # A selection including 2 separated regions of a structure fragment will yield 2 fragments
1107 def find_fragments (self, selection : Optional['Selection'] = None) -> Generator['Selection', None, None]:
1108 # If there is no selection we consider all atoms
1109 if not selection:
1110 selection = self.select_all()
1111 # Get/Find covalent bonds between atoms in a new object avoid further corruption (deletion) of the original list
1112 atom_indexed_covalent_bonds = { atom_index: [ *self.bonds[atom_index] ] for atom_index in selection.atom_indices }
1113 # Group the connected atoms in "fragments"
1114 while len(atom_indexed_covalent_bonds) > 0:
1115 start_atom_index, bonds = next(iter(atom_indexed_covalent_bonds.items()))
1116 del atom_indexed_covalent_bonds[start_atom_index]
1117 fragment_atom_indices = [ start_atom_index ]
1118 while len(bonds) > 0:
1119 # Get the next bond atom and remove it from the bonds list
1120 next_atom_index = bonds[0]
1121 bonds.remove(next_atom_index)
1122 next_bonds = atom_indexed_covalent_bonds.get(next_atom_index, None)
1123 # If this atom is out of the selection then skip it
1124 if next_bonds == None:
1125 continue
1126 next_new_bonds = [ bond for bond in next_bonds if bond not in fragment_atom_indices + bonds ]
1127 bonds += next_new_bonds
1128 fragment_atom_indices.append(next_atom_index)
1129 del atom_indexed_covalent_bonds[next_atom_index]
1130 yield Selection(fragment_atom_indices)
1132 # Given a selection of atoms, find all whole structure fragments on them
1133 def find_whole_fragments (self, selection : 'Selection') -> Generator['Selection', None, None]:
1134 for fragment in self.fragments:
1135 if selection & fragment:
1136 yield fragment
1138 def name_selection (self, selection : 'Selection') -> str:
1139 """Name an atom selection depending on the chains it contains. This is used for debug purpouses."""
1140 atoms = [ self.atoms[index] for index in selection.atom_indices ]
1141 # Count atoms per chain
1142 atom_count_per_chain = { chain: 0 for chain in self.chains }
1143 for atom in atoms:
1144 atom_count_per_chain[atom.chain] += 1
1145 # Set labels accoridng to the coverage of every chain
1146 chain_labels = []
1147 for chain, atom_count in atom_count_per_chain.items():
1148 if atom_count == 0: continue
1149 coverage = atom_count / chain.atom_count
1150 label = f'chain {chain.name}'
1151 if coverage < 1:
1152 percent = round(coverage * 1000) / 10
1153 label += f' ({percent}%)'
1154 chain_labels.append(label)
1155 return ', '.join(chain_labels)
1157 # Set a new atom in the structure
1158 def set_new_atom (self, atom : 'Atom'):
1159 atom._structure = self
1160 new_atom_index = self.atom_count
1161 self.atoms.append(atom)
1162 atom._index = new_atom_index
1164 # Set a new residue in the structure
1165 # WARNING: Atoms must be set already before setting residues
1166 def set_new_residue (self, residue : 'Residue'):
1167 residue._structure = self
1168 new_residue_index = len(self.residues)
1169 self.residues.append(residue)
1170 residue._index = new_residue_index
1171 # In case the residue has atom indices, set relational indices on each atom
1172 for atom_index in residue.atom_indices:
1173 atom = self.atoms[atom_index]
1174 atom._residue_index = new_residue_index
1176 # Purge residue from the structure and its chain
1177 # This can be done only when the residue has no atoms left in the structure
1178 # Renumerate all residue indices which have been offsetted as a result of the purge
1179 def purge_residue (self, residue : 'Residue'):
1180 # Check the residue can be purged
1181 if residue not in self.residues:
1182 raise ValueError(f'{residue} is not in the structure already')
1183 if len(residue.atom_indices) > 0:
1184 raise ValueError(f'{residue} is still having atoms and thus it cannot be purged')
1185 # Remove the purged residue from its chain
1186 if residue.chain: residue.chain.remove_residue(residue)
1187 # Get the current index of the residue to be purged
1188 purged_index = residue.index
1189 # Residues and their atoms below this index are not to be modified
1190 # Residues and their atoms over this index must be renumerated
1191 for affected_residue in self.residues[purged_index+1:]:
1192 # Chaging the index automatically changes all residues atoms '_residue_index' values
1193 # Chaging the index automatically changes its corresponding index in residue chain '_residue_indices'
1194 affected_residue.index -= 1
1195 # Finally, remove the current residue from the list of residues in the structure
1196 del self.residues[purged_index]
1198 # Set a new chain in the structure
1199 # WARNING: Residues and atoms must be set already before setting chains
1200 def set_new_chain (self, chain : 'Chain'):
1201 chain._structure = self
1202 new_chain_index = len(self.chains)
1203 self.chains.append(chain)
1204 chain._index = new_chain_index
1205 # In case the chain has residue indices, set relational indices on each residue
1206 for residue_index in chain.residue_indices:
1207 residue = self.residues[residue_index]
1208 residue._chain_index = new_chain_index
1210 # Purge chain from the structure
1211 # This can be done only when the chain has no residues left in the structure
1212 # Renumerate all chain indices which have been offsetted as a result of the purge
1213 def purge_chain (self, chain : 'Chain'):
1214 # Check the chain can be purged
1215 if chain not in self.chains:
1216 raise ValueError(f'Chain {chain.name} is not in the structure already')
1217 if len(chain.residue_indices) > 0:
1218 raise ValueError(f'Chain {chain.name} is still having residues and thus it cannot be purged')
1219 # Get the current index of the chain to be purged
1220 purged_index = chain.index
1221 # Chains and their residues below this index are not to be modified
1222 # Chains and their residues over this index must be renumerated
1223 for affected_chain in self.chains[purged_index+1:]:
1224 # Chaging the index automatically changes all chain residues '_chain_index' values
1225 affected_chain.index -= 1
1226 # Finally, remove the current chain from the list of chains in the structure
1227 del self.chains[purged_index]
1229 # Set the structure from a pdb file
1230 # You may filter the PDB content for a specific model
1231 # Some weird numeration systems are not supported and, when encountered, they are ignored
1232 # In these cases we set our own numeration system
1233 # Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead
1234 @classmethod
1235 def from_pdb (cls, pdb_content : str, model : Optional[int] = None, flexible_numeration : bool = True):
1236 # Filter the PDB content in case a model was passed
1237 filtered_pdb_content = filter_model(pdb_content, model) if model else pdb_content
1238 # Split the PDB content in lines
1239 pdb_lines = filtered_pdb_content.split('\n')
1240 # Before we start, we must guess the numeration system
1241 # To do so mine all residue numbers
1242 all_residue_number_characters = set()
1243 for line in pdb_lines:
1244 # Parse atoms only
1245 start = line[0:6]
1246 is_atom = start == 'ATOM ' or start == 'HETATM'
1247 if not is_atom: continue
1248 # Mine all resiude numbers
1249 residue_number = line[22:26]
1250 for character in residue_number:
1251 all_residue_number_characters.add(character)
1252 # Remove white spaces
1253 all_residue_number_characters.discard(' ')
1254 # If we find a non-numerical and non-alphabetical character then we assume it has a weird numeration system
1255 # Since we can not support every scenario, in this cases we set the numeration totally ignoring the one in the PDB
1256 weird_character = next((character for character in all_residue_number_characters if not (character.isalnum() or character == '-')), None)
1257 if weird_character:
1258 if flexible_numeration == False: raise InputError(f'Not supported numeration system including "{weird_character}" characters')
1259 warn(f'Weird residue numeration including "{weird_character}" characters found in the PDB file. Ignoring it.')
1260 residue_numeration_base = None
1261 # Search among all resiude numbers any letters (non-numerical characters)
1262 elif next((letter for letter in alphanumerical_letters if letter in all_residue_number_characters), None):
1263 residue_numeration_base = 36
1264 elif next((letter for letter in hexadecimal_letters if letter in all_residue_number_characters), None):
1265 residue_numeration_base = 16
1266 else:
1267 residue_numeration_base = 10
1268 # Read the pdb content line by line and set the parsed atoms, residues and chains
1269 parsed_atoms = []
1270 parsed_residues = []
1271 parsed_chains = []
1272 # Save chains and residues also in dictionaries only to find them faster
1273 name_chains = {}
1274 label_residues = {}
1275 # Keep track of the last issued atom and residue indices
1276 atom_index = -1
1277 residue_index = -1
1278 for line in pdb_lines:
1279 # Parse atoms only
1280 start = line[0:6]
1281 is_atom = start == 'ATOM ' or start == 'HETATM'
1282 if not is_atom:
1283 continue
1284 # Mine all atom data
1285 atom_name = line[11:16].strip()
1286 residue_name = line[17:21].strip()
1287 chain_name = line[21:22]
1288 residue_number = line[22:26]
1289 icode = line[26:27]
1290 if icode == ' ':
1291 icode = ''
1292 x_coord = float(line[30:38])
1293 y_coord = float(line[38:46])
1294 z_coord = float(line[46:54])
1295 element = line[76:78].strip()
1296 # Set the parsed atom
1297 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord))
1298 # Add the parsed atom to the list and update the current atom index
1299 parsed_atoms.append(parsed_atom)
1300 # Get the parsed chain
1301 # DANI: If we always check for an already existing chain there will never be repeated chains
1302 # DANI: However we may create chains with unconnected atoms silently (they were separated in the PDB)
1303 # DANI: For this reason we must only consider the last processed chain
1304 parsed_chain = name_chains.get(chain_name, None)
1305 if parsed_chain and parsed_chain != parsed_chains[-1]:
1306 parsed_chain = None
1307 # If the parsed chain was not yet instantiated then do it now
1308 if not parsed_chain:
1309 parsed_chain = Chain(name=chain_name)
1310 parsed_chains.append(parsed_chain)
1311 name_chains[chain_name] = parsed_chain
1312 # Get the parsed residue
1313 residue_label = (chain_name, residue_number, icode)
1314 # DANI: If we always check for an already existing residue there will never be repeated residues
1315 # DANI: However we may create residues with unconnected atoms silently (they were separated in the PDB)
1316 # DANI: For this reason we must only consider the last processed residue
1317 parsed_residue = label_residues.get(residue_label, None)
1318 if parsed_residue and parsed_residue != parsed_residues[-1]:
1319 parsed_residue = None
1320 # If the parsed residue was not yet instantiated then do it now
1321 if not parsed_residue:
1322 # Parse the residue number if it is to be parsed
1323 if residue_numeration_base:
1324 parsed_residue_number = int(residue_number, residue_numeration_base)
1325 # If we decided to ignore the numeration system then we just issue a new residue number
1326 # Use the last residue number from the current chain as reference
1327 else:
1328 chain_last_residue_index = parsed_chain.residue_indices[-1] if len(parsed_chain.residue_indices) > 0 else None
1329 chain_last_residue_number = parsed_residues[chain_last_residue_index].number if chain_last_residue_index != None else 0
1330 parsed_residue_number = chain_last_residue_number + 1
1331 # Instantiate the new parsed residue
1332 parsed_residue = Residue(name=residue_name, number=parsed_residue_number, icode=icode)
1333 parsed_residues.append(parsed_residue)
1334 label_residues[residue_label] = parsed_residue
1335 # Add current residue to the parsed chain
1336 residue_index += 1
1337 parsed_chain.residue_indices.append(residue_index)
1338 # Add current atom to the parsed residue
1339 atom_index += 1
1340 parsed_residue.atom_indices.append(atom_index)
1341 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains, residue_numeration_base=residue_numeration_base)
1343 # Set the structure from a pdb file
1344 # You may filter the input PDB file for a specific model
1345 # Some weird numeration systems are not supported and, when encountered, they are ignored
1346 # In these cases we set our own numeration system
1347 # Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead
1348 @classmethod
1349 def from_pdb_file (cls, pdb_filepath : str, model : Optional[int] = None, flexible_numeration : bool = True):
1350 pdb_file = File(pdb_filepath)
1351 if not pdb_file.exists:
1352 raise InputError(f'File "{pdb_filepath}" not found')
1353 if not pdb_file.format == 'pdb':
1354 raise InputError(f'"{pdb_filepath}" is not a path for a pdb file')
1355 # Read the pdb file
1356 pdb_content = None
1357 with open(pdb_filepath, 'r') as file:
1358 pdb_content = file.read()
1359 return cls.from_pdb(pdb_content, model, flexible_numeration)
1361 # Set the structure from mmcif
1362 # https://biopandas.github.io/biopandas/tutorials/Working_with_mmCIF_Structures_in_DataFrames/
1363 # You may filter the content for a specific model
1364 # You may ask for the author notation instead of the standarized notation for legacy reasons
1365 # This may have an effect in atom names, residue names, residue numbers and chain names
1366 @classmethod
1367 def from_mmcif (cls, mmcif_content : str, model : Optional[int] = None, author_notation : bool = False):
1368 # Read the pdb content line by line and set the parsed atoms, residues and chains
1369 parsed_atoms = []
1370 parsed_residues = []
1371 parsed_chains = []
1372 # Save chains and residues also in dictionaries only to find them faster
1373 name_chains = {}
1374 label_residues = {}
1375 # Keep track of the last issued atom and residue indices
1376 atom_index = -1
1377 residue_index = -1
1378 # Iterate the content line by line
1379 lines = iter(mmcif_content.split('\n'))
1380 # Now mine atoms data
1381 atom_headers = { 'ATOM', 'HETATM', 'ANISOU' }
1382 for line in lines:
1383 # Values are separated by spaces
1384 values = line.split()
1385 # If this is not atom line then we are done
1386 if len(values) == 0 or values[0] not in atom_headers: continue
1387 # Get the atom model number if a model number was passed
1388 # Note that then model is a number greater than 0
1389 if model:
1390 model_number = int(values[20])
1391 # If the model number odes not match then skip it
1392 if model != model_number: continue
1393 # Mine atom data
1394 # Next value is just the atom number, we do not need it
1395 # atom_number = values[1]
1396 # Mine the atom element
1397 element = values[2]
1398 # Mine the atom name
1399 atom_name = values[3].replace('"','')
1400 # Next value is a place holder to indicate alternate conformation, according to the docs
1401 # I don't know what this is but we do not need it
1402 # idk = values[4]
1403 # Mine the residue name
1404 residue_name = values[5]
1405 # Mine the chain name
1406 chain_name = values[6]
1407 # Next value is the chain number, we do not need it
1408 # chain_number = values[7]
1409 # Residue number is '.' for chains with only one residue
1410 residue_number = 1 if values[8] == '.' else int(values[8])
1411 icode = '' if values[9] == '?' else values[9]
1412 x_coord = float(values[10])
1413 y_coord = float(values[11])
1414 z_coord = float(values[12])
1415 # Next value is the occupancy, we do not need it
1416 # occupancy = float(values[13])
1417 # Next value is the isotropic displacement, we do not need it
1418 # isotropic = float(values[14])
1419 # Next value is the charge, we do not need it
1420 # charge = None if values[15] == '?' else float(values[15])
1421 # The rest of values are alternative author values
1422 if author_notation:
1423 residue_number = int(values[16])
1424 residue_name = values[17]
1425 chain_name = values[18]
1426 atom_name = values[19].replace('"','')
1427 # Set the parsed atom
1428 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord))
1429 # Add the parsed atom to the list and update the current atom index
1430 parsed_atoms.append(parsed_atom)
1431 # Get the parsed chain
1432 parsed_chain = name_chains.get(chain_name, None)
1433 # If the parsed chain was not yet instantiated then do it now
1434 if not parsed_chain:
1435 parsed_chain = Chain(name=chain_name)
1436 parsed_chains.append(parsed_chain)
1437 name_chains[chain_name] = parsed_chain
1438 # Get the parsed residue
1439 residue_label = (chain_name, residue_number, icode)
1440 parsed_residue = label_residues.get(residue_label, None)
1441 # If the parsed residue was not yet instantiated then do it now
1442 if not parsed_residue:
1443 # Instantiate the new parsed residue
1444 parsed_residue = Residue(name=residue_name, number=residue_number, icode=icode)
1445 parsed_residues.append(parsed_residue)
1446 label_residues[residue_label] = parsed_residue
1447 # Add current residue to the parsed chain
1448 residue_index += 1
1449 parsed_chain.residue_indices.append(residue_index)
1450 # Add current atom to the parsed residue
1451 atom_index += 1
1452 parsed_residue.atom_indices.append(atom_index)
1453 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)
1455 # Set the structure from a mmcif file
1456 @classmethod
1457 def from_mmcif_file (cls, mmcif_filepath : str, model : Optional[int] = None, author_notation : bool = False):
1458 mmcif_file = File(mmcif_filepath)
1459 if not mmcif_file.exists:
1460 raise InputError(f'File "{mmcif_filepath}" not found')
1461 if not mmcif_file.format == 'cif':
1462 raise InputError(f'"{mmcif_filepath}" is not a path for a mmcif file')
1463 # Read the mmcif file
1464 mmcif_content = None
1465 with open(mmcif_filepath, 'r') as file:
1466 mmcif_content = file.read()
1467 return cls.from_mmcif(mmcif_content, model, author_notation)
1469 # Set the structure from an MD analysis object
1470 @classmethod
1471 def from_mdanalysis (cls, mdanalysis_universe):
1472 # Set the final list of atoms to be included in the structure
1473 parsed_atoms = []
1474 parsed_residues = []
1475 parsed_chains = []
1476 # Setup atoms
1477 for atom in mdanalysis_universe.atoms:
1478 name = atom.name
1479 # WARNING: MDAnalysis cannot handle reading an element of an atom with no element
1480 element = atom.element if hasattr(atom, 'element') else '?'
1481 #coords = atom.position # DANI: Esto da error si no hay coordenadas
1482 parsed_atom = Atom(name=name, element=element)
1483 parsed_atoms.append(parsed_atom)
1484 # Setup residues
1485 for residue in mdanalysis_universe.residues:
1486 name = residue.resname
1487 number = residue.resnum
1488 icode = residue.icode if hasattr(residue, 'icode') else None # DANI: No se ha provado
1489 parsed_residue = Residue(name=name, number=number, icode=icode)
1490 atom_indices = [ atom.index for atom in residue.atoms ]
1491 parsed_residue.atom_indices = atom_indices
1492 parsed_residues.append(parsed_residue)
1493 # Setup chains
1494 for segment in mdanalysis_universe.segments:
1495 name = segment.segid
1496 parsed_chain = Chain(name=name)
1497 residue_indices = [ residue.resindex for residue in segment.residues ]
1498 parsed_chain.residue_indices = residue_indices
1499 parsed_chains.append(parsed_chain)
1500 # Setup the structure
1501 structure = cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)
1502 # Add bonds and charges which are also available in a topology
1503 atom_count = len(mdanalysis_universe.atoms)
1504 atom_bonds = [ [] for i in range(atom_count) ]
1505 for bond in mdanalysis_universe.bonds:
1506 a,b = bond.indices
1507 # Make sure atom indices are regular integers so they are JSON serializables
1508 atom_bonds[a].append(int(b))
1509 atom_bonds[b].append(int(a))
1510 structure.bonds = atom_bonds
1511 structure.charges = list(mdanalysis_universe._topology.charges.values)
1512 return structure
1514 # Set the structure from a prmtop file
1515 @classmethod
1516 def from_prmtop_file (cls, prmtop_filepath : str):
1517 # Make sure the input file exists and has the right format
1518 prmtop_file = File(prmtop_filepath)
1519 if not prmtop_file.exists:
1520 raise InputError(f'File "{prmtop_filepath}" not found')
1521 if not prmtop_file.format == 'prmtop':
1522 raise InputError(f'"{prmtop_filepath}" is not a name for a prmtop file')
1523 # In we do not have mdanalysis in our environment then we cannot proceed
1524 if not is_imported('MDAnalysis'):
1525 raise InputError('Missing dependency error: MDAnalysis')
1526 # Parse the topology using MDanalysis
1527 parser = MDAnalysis.topology.TOPParser.TOPParser(prmtop_filepath)
1528 mdanalysis_topology = parser.parse()
1529 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology)
1530 return cls.from_mdanalysis(mdanalysis_universe)
1532 # Set the structure from a tpr file
1533 @classmethod
1534 def from_tpr_file (cls, tpr_filepath : str):
1535 # Make sure the input file exists and has the right format
1536 tpr_file = File(tpr_filepath)
1537 if not tpr_file.exists:
1538 raise InputError(f'File "{tpr_filepath}" not found')
1539 if not tpr_file.format == 'tpr':
1540 raise InputError(f'"{tpr_filepath}" is not a name for a tpr file')
1541 # In we do not have mdanalysis in our environment then we cannot proceed
1542 if not is_imported('MDAnalysis'):
1543 raise InputError('Missing dependency error: MDAnalysis')
1544 # Parse the topology using MDanalysis
1545 parser = MDAnalysis.topology.TPRParser.TPRParser(tpr_filepath)
1546 mdanalysis_topology = parser.parse()
1547 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology)
1548 return cls.from_mdanalysis(mdanalysis_universe)
1550 # Set the structure from a file if the file format is supported
1551 @classmethod
1552 def from_file (cls, mysterious_filepath : str):
1553 mysterious_file = File(mysterious_filepath)
1554 if mysterious_file.format == 'pdb':
1555 return cls.from_pdb_file(mysterious_file.path)
1556 if mysterious_file.format == 'cif':
1557 return cls.from_mmcif_file(mysterious_file.path)
1558 if mysterious_file.format == 'prmtop':
1559 return cls.from_prmtop_file(mysterious_file.path)
1560 if mysterious_file.format == 'tpr':
1561 return cls.from_tpr_file(mysterious_file.path)
1562 raise InputError(f'Not supported format ({mysterious_file.format}) to setup a Structure')
1564 # Get the number of atoms in the structure
1565 def get_atom_count (self) -> int:
1566 return len(self.atoms)
1567 atom_count = property(get_atom_count, None, None, "The number of atoms in the structure (read only)")
1569 # Get the number of residues in the structure (read only)
1570 def get_residue_count (self) -> int:
1571 return len(self.residues)
1572 residue_count = property(get_residue_count, None, None, "Number of residues in the structure (read only)")
1574 # Get the number of chains in the structure (read only)
1575 def get_chain_count (self) -> int:
1576 return len(self.chains)
1577 chain_count = property(get_chain_count, None, None, "Number of chains in the structure (read only)")
1579 # Fix atom elements by guessing them when missing
1580 # Set all elements with the first letter upper and the second (if any) lower
1581 # Also check if atom elements are coherent with atom names
1582 # If 'trust' is set as False then we impose elements according to what we can guess from the atom name
1583 # Return True if any element was modified or False if not
1584 def fix_atom_elements (self, trust : bool = True) -> bool:
1585 modified = False
1586 added = False
1587 # Save the wrong guesses for a final report
1588 # This way we do not crowd the terminal with logs when a lot of atoms are affected
1589 reports = {}
1590 for atom in self.atoms:
1591 # Make sure elements have the first letter cap and the second letter not cap
1592 if atom.element:
1593 new_element = first_cap_only(atom.element)
1594 if atom.element != new_element:
1595 atom.element = new_element
1596 modified = True
1597 # Check the element to match what we would guess from the atom name
1598 # In case it does not just warn the user
1599 guess = atom.guess_element()
1600 if atom.element != guess:
1601 report = (atom.name, atom.element, guess)
1602 report_count = reports.get(report, 0)
1603 reports[report] = report_count + 1
1604 if not trust:
1605 atom.element = guess
1606 modified = True
1607 # If elements are missing then guess them from atom names
1608 else:
1609 atom.element = atom.guess_element()
1610 added = True
1611 # Warn the user about anormalies
1612 for report, count in reports.items():
1613 atom_name, atom_element, guess = report
1614 warn(f"Suspicious element for atom {atom_name}: {atom_element} -> shoudn't it be {guess}? ({count} occurrences)")
1615 # Warn the user that some elements were modified
1616 if modified: warn('Atom elements have been modified')
1617 if added: warn('Atom elements were missing and have been added')
1618 # Set atom elements as fixed in order to avoid repeating this process
1619 self._fixed_atom_elements = True
1620 return modified or added
1622 # Set new coordinates
1623 def set_new_coordinates (self, new_coordinates : List[Coords]):
1624 # Make sure the list of coordinates is as long as the number of atoms
1625 if len(new_coordinates) != self.atom_count:
1626 raise ValueError(f'The number of coordinates ({len(new_coordinates)}) does not match the number of atoms ({self.atom_count})')
1627 # Overwrite current coordinates with the new coordinates
1628 for i, atom in enumerate(self.atoms):
1629 atom.coords = tuple(new_coordinates[i])
1631 # Get all supported ion atom indices together in a set
1632 def get_ion_atom_indices (self) -> Set:
1633 # If we already did this then return the stored value
1634 if self._ion_atom_indices != None:
1635 return self._ion_atom_indices
1636 # Find ion atom indices
1637 indices = set()
1638 for atom in self.atoms:
1639 if atom.element in SUPPORTED_ION_ELEMENTS:
1640 indices.add(atom.index)
1641 self._ion_atom_indices = indices
1642 return self._ion_atom_indices
1643 ion_atom_indices = property(get_ion_atom_indices, None, None, "Atom indices for what we consider supported ions")
1645 # Get all dummy atom indices together in a set
1646 def get_dummy_atom_indices (self) -> Set:
1647 # If we already did this then return the stored value
1648 if self._dummy_atom_indices != None:
1649 return self._dummy_atom_indices
1650 # Find ion atom indices
1651 indices = set()
1652 for atom in self.atoms:
1653 if atom.element == DUMMY_ATOM_ELEMENT:
1654 indices.add(atom.index)
1655 self._dummy_atom_indices = indices
1656 return self._dummy_atom_indices
1657 dummy_atom_indices = property(get_dummy_atom_indices, None, None, "Atom indices for what we consider dummy atoms")
1659 # Generate a pdb file content with the current structure
1660 def generate_pdb (self):
1661 content = 'REMARK workflow generated pdb file\n'
1662 for a, atom in enumerate(self.atoms):
1663 residue = atom.residue
1664 index = str((a+1) % 100000).rjust(5)
1665 name = ' ' + atom.name.ljust(3) if len(atom.name) < 4 else atom.name
1666 residue_name = residue.name.ljust(4) if residue else 'XXX'.ljust(4)
1667 chain = atom.chain
1668 chain_name = chain.name if chain.name and len(chain.name) == 1 else 'X'
1669 residue_number = str(residue.number).rjust(4) if residue else '0'.rjust(4)
1670 # If residue number is longer than 4 characters then we must parse to hexadecimal
1671 if len(residue_number) > 4:
1672 residue_number = hex(residue.number)[2:].rjust(4)
1673 icode = residue.icode if residue.icode and len(residue.icode) else ' '
1674 # Make sure we have atom coordinates
1675 if atom.coords == None:
1676 raise InputError('Trying to write a PDB file from a structure with atoms without coordinates')
1677 x_coord, y_coord, z_coord = [ "{:.3f}".format(coord).rjust(8) for coord in atom.coords ]
1678 occupancy = '1.00' # Just a placeholder
1679 temp_factor = '0.00' # Just a placeholder
1680 element = atom.element.rjust(2)
1681 atom_line = ('ATOM ' + index + ' ' + name + ' ' + residue_name
1682 + chain_name + residue_number + icode + ' ' + x_coord + y_coord + z_coord
1683 + ' ' + occupancy + ' ' + temp_factor + ' ' + element).ljust(80) + '\n'
1684 content += atom_line
1685 return content
1687 # Generate a pdb file with current structure
1688 def generate_pdb_file (self, pdb_filepath : str):
1689 pdb_content = self.generate_pdb()
1690 with open(pdb_filepath, "w") as file:
1691 file.write(pdb_content)
1693 # Get the structure equivalent pytraj topology
1694 def get_pytraj_topology (self):
1695 # In we do not have pytraj in our environment then we cannot proceed
1696 if not is_imported('pytraj'):
1697 raise InputError('Missing dependency error: pytraj')
1698 # Generate a pdb file from the current structure to feed pytraj
1699 pdb_filepath = '.structure.pdb'
1700 self.generate_pdb_file(pdb_filepath)
1701 pytraj_topology = pytraj.load_topology(filename = pdb_filepath)
1702 os.remove(pdb_filepath)
1703 return pytraj_topology
1705 # Select atoms from the structure thus generating an atom indices list
1706 # Different tools may be used to make the selection:
1707 # - vmd (default)
1708 # - pytraj
1709 SUPPORTED_SELECTION_SYNTAXES = { 'vmd', 'pytraj' }
1710 def select (self, selection_string : str, syntax : str = 'vmd') -> Optional['Selection']:
1711 if syntax == 'vmd':
1712 # Generate a pdb for vmd to read it
1713 pdb_filepath = '.structure.pdb'
1714 self.generate_pdb_file(pdb_filepath)
1715 # Use vmd to find atom indices
1716 atom_indices = get_vmd_selection_atom_indices(pdb_filepath, selection_string)
1717 os.remove(pdb_filepath)
1718 if len(atom_indices) == 0:
1719 return Selection()
1720 return Selection(atom_indices)
1721 if syntax == 'pytraj':
1722 # In we do not have pytraj in our environment then we cannot proceed
1723 if not is_imported('pytraj'):
1724 raise InputError('Missing dependency error: pytraj')
1725 pytraj_topology = self.get_pytraj_topology()
1726 pytraj_selection = pytraj_topology[selection_string]
1727 atom_indices = [ atom.index for atom in pytraj_selection.atoms ]
1728 if len(atom_indices) == 0:
1729 return Selection()
1730 return Selection(atom_indices)
1732 options = ', '.join(self.SUPPORTED_SELECTION_SYNTAXES)
1733 raise InputError(f'Syntax "{syntax}" is not supported. Choose one of the following: {options}')
1735 # Set a function to make selections using atom indices
1736 def select_atom_indices (self, atom_indices : List[int]) -> 'Selection':
1737 # Check atom indices to be in the structure
1738 atom_count = self.atom_count
1739 for atom_index in atom_indices:
1740 if atom_index >= atom_count:
1741 raise InputError(f'Atom index {atom_index} is out of range ({atom_count})')
1742 return Selection(atom_indices)
1744 # Set a function to make selections using residue indices
1745 def select_residue_indices (self, residue_indices : List[int]) -> 'Selection':
1746 # WARNING: The following line gets stucked sometimes, idk why
1747 #atom_indices = sum([ self.residues[index].atom_indices for index in residue_indices ], [])
1748 atom_indices = []
1749 for i, index in enumerate(residue_indices):
1750 atom_indices += self.residues[index].atom_indices
1751 return Selection(atom_indices)
1753 # Get a selection with all atoms
1754 def select_all (self) -> 'Selection':
1755 return Selection(list(range(self.atom_count)))
1757 # Select atoms according to the classification of its residue
1758 def select_by_classification (self, classification : str) -> 'Selection':
1759 atom_indices = []
1760 for residue in self.residues:
1761 if residue.classification == classification:
1762 atom_indices += residue.atom_indices
1763 return Selection(atom_indices)
1765 # Select water atoms
1766 # WARNING: This logic is a bit guessy and it may fail for non-standard residue named structures
1767 def select_water (self) -> 'Selection':
1768 return self.select_by_classification('solvent')
1770 # Select ions
1771 def select_ions (self) -> 'Selection':
1772 return self.select_by_classification('ion')
1774 # Select counter ion atoms
1775 # WARNING: This logic is a bit guessy and it may fail for non-standard atom named structures
1776 def select_counter_ions (self, charge : Optional[str] = None) -> 'Selection':
1777 counter_ion_indices = []
1778 # Set the accepted names accoridng to the charge
1779 if charge == None:
1780 accepted_names = STANDARD_COUNTER_ION_ATOM_NAMES
1781 elif charge == '+':
1782 accepted_names = STANDARD_COUNTER_CATION_ATOM_NAMES
1783 elif charge == '-':
1784 accepted_names = STANDARD_COUNTER_ANION_ATOM_NAMES
1785 else:
1786 raise ValueError('Not supported charge')
1787 # Iterate atoms
1788 for atom in self.atoms:
1789 # If the residue has not one single atom then it is not an ion
1790 if len(atom.residue.atoms) != 1: continue
1791 # Get a simplified version of the atom name
1792 # Set all letters upper and remove non-letter characters (e.g. '+' and '-' symbols)
1793 simple_atom_name = ''.join(filter(str.isalpha, atom.name.upper()))
1794 if simple_atom_name in accepted_names:
1795 counter_ion_indices.append(atom.index)
1796 return Selection(counter_ion_indices)
1798 # Select both water and counter ions
1799 def select_water_and_counter_ions (self) -> 'Selection':
1800 return self.select_water() + self.select_counter_ions()
1802 # Select heavy atoms
1803 def select_heavy_atoms (self) -> 'Selection':
1804 atom_indices = []
1805 for atom in self.atoms:
1806 # If the atom is not an hydrogen then add it to the list
1807 if atom.element != 'H':
1808 atom_indices.append(atom.index)
1809 return Selection(atom_indices)
1811 # Select protein atoms
1812 # WARNING: Note that there is a small difference between VMD protein and our protein
1813 # This function is improved to consider terminal residues as proteins
1814 # VMD considers protein any resiude including N, C, CA and O while terminals may have OC1 and OC2 instead of O
1815 def select_protein (self) -> 'Selection':
1816 return self.select_by_classification('protein')
1818 # Select nucleic atoms
1819 def select_nucleic (self) -> 'Selection':
1820 return self.select_by_classification('dna') + self.select_by_classification('rna')
1822 # Select lipids
1823 def select_lipids (self) -> 'Selection':
1824 return self.select_by_classification('fatty') + self.select_by_classification('steroid')
1826 # Select carbohydrates
1827 def select_carbohydrates (self) -> 'Selection':
1828 return self.select_by_classification('carbohydrate')
1830 # Return a selection of the typical PBC atoms: solvent, counter ions and lipids
1831 # WARNING: This is just a guess
1832 def select_pbc_guess (self) -> 'Selection':
1833 return self.select_water() + self.select_counter_ions() + self.select_lipids()
1835 def select_cg (self) -> 'Selection':
1836 """Select coarse grain atoms."""
1837 return Selection([ atom.index for atom in self.atoms if atom.element == CG_ATOM_ELEMENT ])
1839 def select_cartoon (self, include_terminals : bool = False) -> 'Selection':
1840 """Select cartoon representable regions for VMD.
1842 Rules are:
1843 1. Residues must be protein (i.e. must contain C, CA, N and O atoms) or nucleic (P, OP1, OP2, O3', C3', C4', C5', O5')
1844 2. There must be at least 3 covalently bonded residues
1846 It does not matter their chain, numeration or even index order as long as they are bonded.
1847 * Note that we can represent cartoon while we display one residue alone, but it must be connected anyway.
1848 Also, we have the option to include terminals in the cartoon selection although they are not representable.
1849 This is helpful for the screenshot: terminals are better hidden than represented as ligands."""
1850 # Set fragments which are candidate to be cartoon representable
1851 fragments = []
1852 # Get protein fragments according to VMD
1853 protein_selection = self.select_protein() - self.select_cg() if include_terminals else self.select('protein', syntax='vmd')
1854 if protein_selection:
1855 fragments += list(self.find_fragments(protein_selection))
1856 # Get nucleic fragments according to VMD
1857 nucleic_selection = self.select_nucleic() - self.select_cg()
1858 if nucleic_selection:
1859 fragments += list(self.find_fragments(nucleic_selection))
1860 # Set the final selection including all valid fragments
1861 cartoon_selection = Selection()
1862 # Iterate over every fragment
1863 for fragment in fragments:
1864 # Make sure the fragment has at least 3 residues before adding it to the cartoon selection
1865 fragment_residue_indices = self.get_selection_residue_indices(fragment)
1866 if len(fragment_residue_indices) >= 3:
1867 cartoon_selection += fragment
1868 return cartoon_selection
1870 # Invert a selection
1871 def invert_selection (self, selection : 'Selection') -> 'Selection':
1872 atom_indices = list(range(self.atom_count))
1873 for atom_index in selection.atom_indices:
1874 atom_indices[atom_index] = None
1875 return Selection([ atom_index for atom_index in atom_indices if atom_index != None ])
1877 # Given a selection, get a list of residue indices for residues implicated
1878 # Note that if a single atom from the residue is in the selection then the residue index is returned
1879 def get_selection_residue_indices (self, selection : 'Selection') -> List[int]:
1880 return list(set([ self.atoms[atom_index].residue_index for atom_index in selection.atom_indices ]))
1882 # Given a selection, get a list of residues implicated
1883 # Note that if a single atom from the residue is in the selection then the residue is returned
1884 def get_selection_residues (self, selection : 'Selection') -> List['Residue']:
1885 residue_indices = self.get_selection_residue_indices(selection)
1886 return [ self.residues[index] for index in residue_indices ]
1888 # Given a selection, get a list of chain indices for chains implicated
1889 # Note that if a single atom from the chain is in the selection then the chain index is returned
1890 def get_selection_chain_indices (self, selection : 'Selection') -> List[int]:
1891 return list(set([ self.atoms[atom_index].chain_index for atom_index in selection.atom_indices ]))
1893 # Given a selection, get a list of chains implicated
1894 # Note that if a single atom from the chain is in the selection then the chain is returned
1895 def get_selection_chains (self, selection : 'Selection') -> List['Chain']:
1896 chain_indices = self.get_selection_chain_indices(selection)
1897 return [ self.chains[index] for index in chain_indices ]
1899 # Get type of the chain
1900 def get_selection_classification (self, selection : 'Selection') -> str:
1902 # Get selection residues
1903 selection_residue_indices = self.get_selection_residue_indices(selection)
1905 # Inicializar contadores para cada tipo de residuo
1906 residue_counts = {}
1908 # Count the residues of each type
1909 for residue_index in selection_residue_indices:
1910 residue = self.residues[residue_index]
1911 res_class = residue.classification
1912 if res_class in residue_counts:
1913 residue_counts[res_class] += 1
1914 else:
1915 # If the classification is not recognized, count it as "other"
1916 residue_counts[res_class] = 1
1918 # Count the total number of residues in the selection
1919 total_residues = sum(residue_counts.values())
1920 if total_residues == 0: raise ValueError('Should have residues at this point')
1922 # Calculate the proportion of each type of residue
1923 proportions = { k: v / total_residues for k, v in residue_counts.items() }
1925 # If one type of residue dominates, return it
1926 primary_type = max(proportions, key=proportions.get)
1927 # We establish a threshold of 80% to consider a chain as a single type
1928 if proportions[primary_type] >= 0.8:
1929 return primary_type
1931 # Special cases
1932 relevant_threshold = 0.3
1933 if proportions.get("dna", 0) > relevant_threshold and proportions.get("rna", 0) > relevant_threshold:
1934 return "nucleic"
1935 if proportions.get("carbohydrate", 0) > relevant_threshold and proportions.get("protein", 0) > relevant_threshold:
1936 return "glycoprotein"
1937 if proportions.get("fatty", 0) > relevant_threshold and proportions.get("steroid", 0) > relevant_threshold:
1938 return "lipid"
1940 # Any other combinations of different main proportions
1941 main_proportions = { k: v for k, v in proportions.items() if v > relevant_threshold }
1942 # Nothing is above the threshold
1943 if len(main_proportions) == 0:
1944 return "mix"
1945 # There is only one thing above threshold
1946 elif len(main_proportions) == 1:
1947 return f"{primary_type}/mix"
1948 # There are two things above threshold
1949 elif len(main_proportions) == 2:
1950 other_type = next(key for key in main_proportions.keys() if key != primary_type)
1951 return f"{primary_type}/{other_type}"
1952 # There are three things above the threshold (more is not possible)
1953 else:
1954 return "mix"
1956 # Create a new structure from the current using a selection to filter atoms
1957 def filter (self, selection : Union['Selection', str], selection_syntax : str = 'vmd') -> 'Structure':
1958 if not selection:
1959 raise InputError('No selection was passed')
1960 # In case the selection is not an actual Selection, but a string, parse the string into a Selection
1961 if type(selection) == str:
1962 selection = self.select(selection, selection_syntax)
1963 new_atoms = []
1964 new_residues = []
1965 new_chains = []
1966 # Get the selected atoms
1967 # Generate dictionaries with new indexes as keys and previous indexes as values for atoms, residues and chains
1968 # This is done with this structure for the residues and chains further to find the new indices fast
1969 new_atom_indices = {}
1970 # Collect also original indices to related atom residues and chains
1971 original_atom_residue_indices = []
1972 for new_index, original_index in enumerate(selection.atom_indices):
1973 # Make a copy of the selected atom in order to not modify the original one
1974 original_atom = self.atoms[original_index]
1975 new_atom = Atom(
1976 name=original_atom.name,
1977 element=original_atom.element,
1978 coords=original_atom.coords,
1979 )
1980 new_atoms.append(new_atom)
1981 # Save old and new indices in order to reconfigure all indices later
1982 new_atom_indices[original_index] = new_index
1983 original_atom_residue_indices.append(original_atom.residue_index)
1984 # Find the selected residues
1985 selected_residue_indices = list(set(original_atom_residue_indices))
1986 # Repeat the original/new indices backup we did before
1987 new_residue_indices = {}
1988 original_residue_atom_indices = []
1989 original_residue_chain_indices = []
1990 for new_index, original_index in enumerate(selected_residue_indices):
1991 # Make a copy of the selected residue in order to not modify the original one
1992 original_residue = self.residues[original_index]
1993 new_residue = Residue(
1994 name=original_residue.name,
1995 number=original_residue.number,
1996 icode=original_residue.icode,
1997 )
1998 new_residues.append(new_residue)
1999 # Save old and new indices in order to reconfigure all indices later
2000 new_residue_indices[original_index] = new_index
2001 original_residue_atom_indices.append(original_residue.atom_indices)
2002 original_residue_chain_indices.append(original_residue.chain_index)
2003 # Find the selected chains
2004 selected_chain_indices = list(set(original_residue_chain_indices))
2005 # Repeat the original/new indices backup we did before
2006 new_chain_indices = {}
2007 original_chain_residue_indices = []
2008 for new_index, original_index in enumerate(selected_chain_indices):
2009 # Make a copy of the selected chain in order to not modify the original one
2010 original_chain = self.chains[original_index]
2011 new_chain = Chain(
2012 name=original_chain.name,
2013 )
2014 new_chains.append(new_chain)
2015 # Save old and new indices in order to reconfigure all indices later
2016 new_chain_indices[original_index] = new_index
2017 original_chain_residue_indices.append(original_chain.residue_indices)
2018 # Finally, reset indices in all instances
2019 for a, atom in enumerate(new_atoms):
2020 atom.residue_index = new_residue_indices[ original_atom_residue_indices[a] ]
2021 for r, residue in enumerate(new_residues):
2022 atom_indices = []
2023 for original_index in original_residue_atom_indices[r]:
2024 new_index = new_atom_indices.get(original_index, None)
2025 if new_index != None:
2026 atom_indices.append(new_index)
2027 residue.atom_indices = atom_indices
2028 residue.chain_index = new_chain_indices[ original_residue_chain_indices[r] ]
2029 for c, chain in enumerate(new_chains):
2030 residue_indices = []
2031 for original_index in original_chain_residue_indices[c]:
2032 new_index = new_residue_indices.get(original_index, None)
2033 if new_index != None:
2034 residue_indices.append(new_index)
2035 chain.residue_indices = residue_indices
2036 return Structure(atoms=new_atoms, residues=new_residues, chains=new_chains)
2039 def chainer (self, selection : Optional['Selection'] = None, letter : Optional[str] = None, whole_fragments : bool = True):
2040 """Set chains on demand.
2041 If no selection is passed then the whole structure will be affected.
2042 If no chain is passed then a "chain by fragment" logic will be applied."""
2043 # If there is no selection we consider all atoms
2044 if selection == None:
2045 selection = self.select_all()
2046 # If the selection is empty then there is nothing to do here
2047 if len(selection) == 0: return
2048 # If a letter is specified then the logic is way simpler
2049 if letter:
2050 self.set_selection_chain_name(selection, letter)
2051 return
2052 # If a letter is not specified we run the "fragments" logic
2053 fragment_getter = self.find_whole_fragments if whole_fragments else self.find_fragments
2054 for fragment in fragment_getter(selection):
2055 chain_name = self.get_available_chain_name()
2056 self.set_selection_chain_name(fragment, chain_name)
2058 # Smart function to set chains automatically
2059 # Original chains will be overwritten
2060 def auto_chainer (self):
2061 # Reset chains
2062 self.chainer(letter='X')
2063 # Set solvent and ions as a unique chain
2064 ion_selection = self.select_ions()
2065 solvent_selection = self.select_water()
2066 ion_and_indices_selection = ion_selection + solvent_selection
2067 self.chainer(selection=ion_and_indices_selection, letter='S')
2068 # Set fatty acids and steroids as a unique chain
2069 # DANI: Se podrían incluir algunos carbohydrates (glycans)
2070 # DANI: Se podrían descartarían residuos que no pertenezcan a la membrana por proximidad
2071 membrane_selection = self.select_lipids()
2072 self.chainer(selection=membrane_selection, letter='M')
2073 # Set carbohydrates as a unique chain as well, just in case we have glycans
2074 # Note that in case glycan atoms are mixed with protein atoms glycan chains will be overwritten
2075 # However this is not a problem. It is indeed the best solution if we don't want ro resort atoms
2076 carbohydrate_selection = self.select_carbohydrates()
2077 self.chainer(selection=carbohydrate_selection, letter='H')
2078 # Add a chain per fragment for both proteins and nucleic acids
2079 protein_selection = self.select_protein()
2080 nucleic_selection = self.select_nucleic()
2081 protein_and_nucleic_selection = protein_selection + nucleic_selection
2082 self.chainer(selection=protein_and_nucleic_selection)
2083 # At this point we should have covered most of the molecules in the structure
2084 # However, in case there are more molecules, we have already set them all as a single chain ('X')
2085 # Here we do not apply the chain per fragment logic since it may be dangerous
2086 # Note that we may have a lot of independent residues (and thus a los of small fragments)
2087 # This would make us run out of letters in the alphabet and thus there would be no more chains
2088 # As the last step, fix repeated chains
2089 self.check_repeated_chains(fix_chains=True)
2091 # This is an alternative system to find protein chains (anything else is chained as 'X')
2092 # This system does not depend on VMD
2093 # It totally overrides previous chains since it is expected to be used only when chains are missing
2094 def raw_protein_chainer (self):
2095 current_chain = self.get_available_chain_name()
2096 previous_alpha_carbon = None
2097 for residue in self.residues:
2098 alpha_carbon = next((atom for atom in residue.atoms if atom.name == 'CA'), None)
2099 if not alpha_carbon:
2100 residue.set_chain('X')
2101 continue
2102 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms
2103 residues_are_connected = previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4
2104 if not residues_are_connected:
2105 current_chain = self.get_available_chain_name()
2106 residue.set_chain(current_chain)
2107 previous_alpha_carbon = alpha_carbon
2109 # Given an atom selection, set the chain for all these atoms
2110 # Note that the chain is changed in every whole residue, no matter if only one atom was selected
2111 def set_selection_chain_name (self, selection : 'Selection', letter : str):
2112 # Find if the letter belongs to an already existing chain
2113 chain = self.get_chain_by_name(letter)
2114 # If the chain does not exist yet then create it
2115 if not chain:
2116 chain = Chain(name=letter)
2117 self.set_new_chain(chain)
2118 # Set the chain index of every residue to the new chain
2119 for atom_index in selection.atom_indices:
2120 # WARNING: Chain index has to be read every iteration since it may change. Do not save it
2121 atom = self.atoms[atom_index]
2122 atom.set_chain_index(chain.index)
2124 # Get an available chain name
2125 # Find alphabetically the first letter which is not yet used as a chain name
2126 # If all letters in the alphabet are used already then raise an error
2127 def get_available_chain_name (self) -> str:
2128 current_chain_names = [ chain.name for chain in self.chains ]
2129 next_available_chain_name = next((name for name in AVAILABLE_LETTERS if name not in current_chain_names), None)
2130 if next_available_chain_name == None:
2131 raise InputError(f'There are more chains than available chain letters ({len(AVAILABLE_LETTERS)})')
2132 return next_available_chain_name
2134 def get_next_available_chain_name (self, anterior : str) -> str:
2135 """
2136 Get the next available chain name.
2138 Args:
2139 anterior (str): The last chain name used, which is expected to be a single letter
2141 Raises:
2142 ValueError: If the anterior is not a letter or if there are more chains than available
2143 """
2144 current_chain_names = set([ chain.name for chain in self.chains ])
2145 # If anterior is cap then try caps first, if anterior is lower then try lowers first
2146 if anterior.isupper():
2147 first_group, second_group = AVAILABLE_CAPS, AVAILABLE_LOWS
2148 elif anterior.islower():
2149 first_group, second_group = AVAILABLE_LOWS, AVAILABLE_CAPS
2150 else: raise ValueError(f'Is "{anterior}" even a letter?')
2151 # Reorder letters in the first group, so the anterior is the last letter
2152 anterior_position = first_group.index(anterior)
2153 next_position = anterior_position + 1
2154 reordered_group = first_group[next_position:] + first_group[0:next_position]
2155 next_letter = next((letter for letter in reordered_group if letter not in current_chain_names), None)
2156 if next_letter: return next_letter
2157 # If there is not available letters is the first group then return the first available in the second
2158 next_letter = next((letter for letter in second_group if letter not in current_chain_names), None)
2159 if next_letter: return next_letter
2160 raise ValueError(f'There are more chains than available chain letters ({len(AVAILABLE_LETTERS)})')
2162 # Get a chain by its name
2163 def get_chain_by_name (self, name : str) -> Optional['Chain']:
2164 return next((c for c in self.chains if c.name == name), None)
2166 # Find a residue by its chain, number and insertion code
2167 def find_residue (self, chain_name : str, number : int, icode : str = '') -> Optional['Residue']:
2168 # Get the corresponding chain
2169 target_chain = self.get_chain_by_name(chain_name)
2170 if not target_chain: return None
2171 # Find the residue in the target chain
2172 target_chain.find_residue(number, icode)
2174 # Get a summary of the structure
2175 def display_summary (self):
2176 print(f'Atoms: {self.atom_count}')
2177 print(f'Residues: {len(self.residues)}')
2178 print(f'Chains: {len(self.chains)}')
2179 for chain in self.chains:
2180 print(f'Chain {chain.name} ({len(chain.residue_indices)} residues)')
2181 print(' -> ' + chain.get_sequence())
2184 def check_repeated_chains (self, fix_chains : bool = False, display_summary : bool = False) -> bool:
2185 """
2186 There may be chains which are equal in the structure (i.e. same chain name).
2187 This means we have a duplicated/splitted chain.
2188 Repeated chains are usual and they are usually supported but with some problems.
2189 Also, repeated chains usually come with repeated residues, which means more problems (see explanation below).
2191 In the context of this structure class we may have 2 different problems with a different solution each:
2192 1. There is more than one chain with the same letter (repeated chain) -> rename the duplicated chains
2193 2. There is a chain with atom indices which are not consecutive (splitted chain) -> create new chains
2195 Rename repeated chains or create new chains if the fix_chains argument is True.
2197 WARNING: These fixes are possible only if there are less chains than the number of letters in the alphabet.
2198 Although there is no limitation in this code for chain names, setting long chain names is not compatible with pdb format.
2200 Check splitted chains (a chains with non consecutive residues) and try to fix them if requested.
2201 Check repeated chains (two chains with the same name) and return True if there were any repeats.
2202 """
2203 # Order chains according to their names
2204 # Save also those chains which have a previous duplicate
2205 name_chains = {}
2206 repeated_chains = []
2207 for chain in self.chains:
2208 chain_name = chain.name
2209 current_name_chains = name_chains.get(chain_name, None)
2210 if not current_name_chains:
2211 name_chains[chain_name] = [chain]
2212 else:
2213 name_chains[chain_name].append(chain)
2214 repeated_chains.append(chain)
2215 # Display the summary of repeated chains if requested
2216 if display_summary:
2217 if len(repeated_chains) > 0:
2218 warn('There are repeated chains:')
2219 for chain_name, chains in name_chains.items():
2220 chains_count = len(chains)
2221 if chains_count > 1:
2222 print(f'- Chain {chain_name} has {chains_count} repeats')
2223 # Rename repeated chains if requested
2224 if len(repeated_chains) > 0 and fix_chains:
2225 n_chains = len(self.chains)
2226 n_available_letters = len(AVAILABLE_LETTERS)
2227 if n_chains > n_available_letters:
2228 # for chain in self.chains:
2229 # print(str(chain) + ': ' + str(chain.atom_indices[0]) + ' to ' + str(chain.atom_indices[-1]))
2230 raise ValueError(f'There are more chains ({n_chains}) than available chain letters ({n_available_letters})')
2231 current_letters = list(name_chains.keys())
2232 for repeated_chain in repeated_chains:
2233 last_chain_letter = repeated_chain.name
2234 while last_chain_letter in current_letters:
2235 last_chain_letter = get_next_letter(last_chain_letter)
2236 repeated_chain.name = last_chain_letter
2237 current_letters.append(last_chain_letter)
2238 # Check if there are splitted chains
2239 for chain in self.chains:
2240 residue_indices = sorted(chain.residue_indices)
2241 # Check if residue indices are consecutive
2242 if residue_indices[-1] - residue_indices[0] + 1 == len(residue_indices):
2243 continue
2244 warn(f'Splitted chain {chain.name}')
2245 # If indices are not consecutive then we must find ranges of consecutive residues and create new chains for them
2246 previous_residue_index = residue_indices[0]
2247 consecutive_residues = [previous_residue_index]
2248 overall_consecutive_residues = []
2249 for residue_index in residue_indices[1:]:
2250 # If next residue is consecutive
2251 if residue_index == previous_residue_index + 1:
2252 consecutive_residues.append(residue_index)
2253 # If next residue is NOT consecutive
2254 else:
2255 overall_consecutive_residues.append(consecutive_residues)
2256 consecutive_residues = [residue_index]
2257 # Update the previous index
2258 previous_residue_index = residue_index
2259 # Add the last split
2260 overall_consecutive_residues.append(consecutive_residues)
2261 # Now create new chains and reasign residues
2262 # Skip the first set of consecutive residues since they will stay in the original chain
2263 for residues_indices in overall_consecutive_residues[1:]:
2264 chain_name = self.get_available_chain_name()
2265 residues_selection = self.select_residue_indices(residues_indices)
2266 self.set_selection_chain_name(residues_selection, chain_name)
2268 # Fix repeated chains if requested
2269 return len(repeated_chains) > 0
2272 def check_repeated_residues (self, fix_residues : bool = False, display_summary : bool = False) -> bool:
2273 """
2274 There may be residues which are equal in the structure (i.e. same chain, name, number and icode).
2275 In case 2 residues in the structure are equal we must check distance between their atoms.
2276 If atoms are far it means they are different residues with the same notation (duplicated residues).
2277 If atoms are close it means they are indeed the same residue (splitted residue).
2279 Splitted residues are found in some pdbs and they are supported by some tools.
2280 These tools consider all atoms with the same 'record' as the same residue.
2281 However, there are other tools which would consider the splitted residue as two different residues.
2282 This causes inconsistency along different tools besides a long list of problems.
2283 The only possible is fix is changing the order of atoms in the topology.
2284 Note that this is a breaking change for associated trajectories, which must change the order of coordinates.
2285 However here we provide tools to fix associates trajectories as well.
2287 Duplicated residues are usual and they are usually supported but with some problems.
2288 For example, pytraj analysis outputs use to sort results by residues and each residue is tagged.
2289 If there are duplicated residues with the same tag it may be not possible to know which result belongs to each residue.
2290 Another example are NGL selections once in the web client.
2291 If you select residue ':A and 1' and there are multiple residues 1 in chain A all of them will be displayed.
2293 Check residues to search for duplicated and splitted residues.
2294 Renumerate repeated residues if the fix_residues argument is True.
2295 Return True if there were any repeats.
2296 """
2297 # Track if residues have to be changed or not
2298 modified = False
2299 # Group all residues in the structure according to their chain, number and icode
2300 grouped_residues = {}
2301 # Check repeated residues which are one after the other
2302 # Note that these residues MUST have a different name
2303 # Otherwise they would have not been considered different residues
2304 # For these rare cases we use icodes to solve the problem
2305 non_icoded_residues = []
2306 last_residue = None
2307 for residue in self.residues:
2308 # Check residue to be equal than the previous residue
2309 if residue == last_residue:
2310 non_icoded_residues.append(residue)
2311 last_residue = residue
2312 continue
2313 last_residue = residue
2314 # Add residue to the groups of repeated residues
2315 current_residue_repeats = grouped_residues.get(residue, None)
2316 if not current_residue_repeats:
2317 grouped_residues[residue] = [ residue ]
2318 else:
2319 current_residue_repeats.append(residue)
2320 # In case we have non icoded residues
2321 if len(non_icoded_residues) > 0:
2322 if display_summary:
2323 print(f'There are non-icoded residues ({len(non_icoded_residues)})')
2324 # Set new icodes for non icoded residues
2325 if fix_residues:
2326 print(' Non icoded residues will recieve an icode')
2327 for residue in non_icoded_residues:
2328 repeated_residues_group = grouped_residues[residue]
2329 current_icodes = [ residue.icode for residue in repeated_residues_group if residue.icode ]
2330 next_icode = next((cap for cap in AVAILABLE_CAPS if cap not in current_icodes), None)
2331 if not next_icode:
2332 raise ValueError('There are no more icodes available')
2333 # print('Setting icode ' + next_icode + ' to residue ' + str(residue))
2334 residue.icode = next_icode
2335 modified = True
2336 # Grouped residues with more than 1 result are considered as repeated
2337 repeated_residues = [ residues for residues in grouped_residues.values() if len(residues) > 1 ]
2338 if len(repeated_residues) == 0:
2339 return modified
2340 # In case we have repeated residues...
2341 if display_summary:
2342 warn(f'There are {len(repeated_residues)} different groups of repeated residues')
2343 print(f' e.g. {repeated_residues[0][0]}')
2344 if len(repeated_residues) == 9999 or len(repeated_residues) == 10000:
2345 print(' Probably you have more residues than the PDB numeration limit (1-9999)')
2346 # Now for each repeated residue, find out which are splitted and which are duplicated
2347 covalent_bonds = self.bonds
2348 overall_splitted_residues = []
2349 overall_duplicated_residues = []
2350 for residues in repeated_residues:
2351 # Iterate over repeated residues and check if residues are covalently bonded
2352 # If any pair of residues are bonded add them both to the splitted residues list
2353 # At the end, all non-splitted residues will be considered duplicated residues
2354 # WARNING: Using a set here is not possible since repeated residues have the same hash
2355 # WARNING: Also comparing residues themselves is not advisable, so we use indices at this point
2356 splitted_residue_indices = set()
2357 for residue, other_residues in otherwise(residues):
2358 if residue.index in splitted_residue_indices:
2359 continue
2360 # Get atom indices for all atoms connected to the current residue
2361 residue_bonds = sum([ covalent_bonds[index] for index in residue.atom_indices ], [])
2362 for other_residue in other_residues:
2363 # Get all atom indices for each other residue and collate with the current residue bonds
2364 if any( index in residue_bonds for index in other_residue.atom_indices ):
2365 splitted_residue_indices.add(residue.index)
2366 splitted_residue_indices.add(other_residue.index)
2367 # Finally obtain the splitted residues from its indices
2368 splitted_residues = [ self.residues[index] for index in splitted_residue_indices ]
2369 # Repeated residues which are not splitted are thus duplicated
2370 duplicated_residues = [ residue for residue in residues if residue.index not in splitted_residue_indices ]
2371 # Update the overall lists
2372 if len(splitted_residues) > 0:
2373 overall_splitted_residues.append(splitted_residues)
2374 if len(duplicated_residues) > 0:
2375 overall_duplicated_residues.append(duplicated_residues)
2376 # In case we have splitted residues
2377 if len(overall_splitted_residues) > 0:
2378 if display_summary:
2379 print(f' There are splitted residues ({len(overall_splitted_residues)})')
2380 # Fix splitted residues
2381 if fix_residues:
2382 print(' Splitted residues will be merged')
2383 # Set a function to sort atoms and residues by index
2384 def by_index (v):
2385 return v._index
2386 # Merge splitted residues
2387 # WARNING: Merging residues without sorting atoms is possible, but this would be lost after exporting to pdb
2388 for splitted_residues in overall_splitted_residues:
2389 # The first residue (i.e. the residue with the lower index) will be the one which remains
2390 # It will absorb other residue atom indices
2391 splitted_residues.sort(key=by_index) # Residues are not sorted by default, this is mandatory
2392 first_residue = splitted_residues[0]
2393 other_residues = splitted_residues[1:]
2394 for residue in other_residues:
2395 for atom in residue.atoms:
2396 atom.residue = first_residue
2397 # Then these other residues are eliminated
2398 self.purge_residue(residue)
2399 print(' Atoms will be sorted to be together by residues')
2400 print(' NEVER FORGET: This will break any associated trajectory if coordinates are not sorted as well')
2401 # Sort atoms to group residue atoms together
2402 # Note that each atom index must be updated
2403 new_atom_indices = sum([ residue.atom_indices for residue in self.residues ], [])
2404 for i, new_atom_index in enumerate(new_atom_indices):
2405 atom = self.atoms[new_atom_index]
2406 atom._index = i
2407 self.atoms.sort(key=by_index)
2408 # Also residue 'atom_indices' must be updated
2409 for residue in self.residues:
2410 residue._atom_indices = [ new_atom_indices.index(atom_index) for atom_index in residue._atom_indices ]
2411 # Bonds must be reset since atom indices have changes
2412 self._bonds = None
2413 # Prepare the trajectory atom sorter which must be returned
2414 # Include atom indices already so the user has to provide only the structure and trajectory filepaths
2415 def trajectory_atom_sorter (
2416 input_structure_file : 'File',
2417 input_trajectory_file : 'File',
2418 output_trajectory_file : 'File'
2419 ):
2420 sort_trajectory_atoms(
2421 input_structure_file,
2422 input_trajectory_file,
2423 output_trajectory_file,
2424 new_atom_indices
2425 )
2426 self.trajectory_atom_sorter = trajectory_atom_sorter
2427 self.new_atom_order = new_atom_indices
2428 modified = True
2429 # In case we have duplicated residues
2430 duplicated_residues_count = len(overall_duplicated_residues)
2431 if duplicated_residues_count > 0:
2432 if display_summary:
2433 warn(f'There are {duplicated_residues_count} different groups of duplicated residues')
2434 # Renumerate duplicated residues if requested
2435 if fix_residues:
2436 # First of all, from each group of repeated residues, discard the first residue
2437 # The first residue will remain as it is and the rest will be renumerated
2438 # Join all residues to be renumerated in a single list
2439 residue_to_renumerate = []
2440 for residues in overall_duplicated_residues:
2441 residue_to_renumerate += residues[1:]
2442 # Now group residues per chain since the renumeration is done by chains
2443 chain_residues = {}
2444 for residue in residue_to_renumerate:
2445 chain = residue.chain
2446 current_chain_residues = chain_residues.get(chain, None)
2447 if current_chain_residues: current_chain_residues.append(residue)
2448 else: chain_residues[chain] = [ residue ]
2449 # Iterate residue chain groups
2450 for chain, residues in chain_residues.items():
2451 # Find the last residue number in the current chain
2452 maximum_chain_number = max([ residue.number for residue in chain.residues ])
2453 # Sort residues by index
2454 residues.sort(key=lambda x: x.index, reverse=True)
2455 for residue in residues:
2456 # Set the number of the residue as the next available number
2457 residue.number = maximum_chain_number + 1
2458 # Update the maximum number
2459 maximum_chain_number = residue.number
2460 modified = True
2461 return modified
2463 # Atoms with identical chain, residue and name are considered repeated atoms
2464 # DANI: No recuerdo los problemas que daba tener átomos repetidos
2466 # Check atoms to search for repeated atoms
2467 # Rename repeated atoms if the fix_atoms argument is True
2468 # Return True if there were any repeats
2469 def check_repeated_atoms (self, fix_atoms : bool = False, display_summary : bool = False) -> bool:
2470 # Set two trackers for display
2471 repeated_atoms_count = 0
2472 example = None
2473 for residue in self.residues:
2474 # Iterate over the residue atoms counting how many repeated names we have
2475 current_names = []
2476 for atom in residue.atoms:
2477 # We check if the atom name already exists. If not, go to the next atom
2478 name = atom.name
2479 if name not in current_names:
2480 current_names.append(name)
2481 continue
2482 # When atom is repeated
2483 repeated_atoms_count += 1
2484 # If the fix was not requested we stop here
2485 if not fix_atoms:
2486 continue
2487 # We set the name of the atom as the element letter + the count of this element
2488 # If element is missing take the first character of the atom name
2489 initial = atom.element
2490 if not initial or initial == ' ':
2491 initial = name[0]
2492 number = 1
2493 new_name = initial + str(number)
2494 while new_name in current_names:
2495 number += 1
2496 new_name = initial + str(number)
2497 # Save an example for the logs if there is none yet
2498 if not example:
2499 example = f'{atom.name} renamed as {new_name} in residue {residue}'
2500 atom.name = new_name
2501 current_names.append(new_name)
2502 # Display the summary of repeated atoms if requested
2503 if display_summary:
2504 if repeated_atoms_count > 0:
2505 warn(f'There are repeated atoms ({repeated_atoms_count})')
2506 print(f' e.g. {example}')
2507 return repeated_atoms_count > 0
2509 # Check bonds to be incoherent
2510 # i.e. check atoms not to have more or less bonds than expected according to their element
2511 def check_incoherent_bonds (self) -> bool:
2512 # Find out if there are hydrogens in the structure
2513 # It may happen that we encounter an structure without hydrogens
2514 has_hydrogen = next(( True for atom in self.atoms if atom.element == 'H' ), False)
2515 coherent_bonds = coherent_bonds_with_hydrogen if has_hydrogen else coherent_bonds_without_hydrogen
2516 # Check coherent bonds atom by atom
2517 for atom in self.atoms:
2518 # Do not check this atom bonds if this may be an ion
2519 # Most authors "force" dummy bonds in these atoms to make them stable
2520 if atom.element in SUPPORTED_ION_ELEMENTS: continue
2521 # Ignore dummy atoms as well
2522 if atom.element == DUMMY_ATOM_ELEMENT: continue
2523 # Ignore coarse grain atoms as well
2524 if atom.element == CG_ATOM_ELEMENT: continue
2525 # Get actual number of bonds in the current atom both with and without ion bonds
2526 # LORE: This was a problem since some ions are force-bonded but bonds are actually not real
2527 # LORE: When an ion is forced we may end up with hyrogens with 2 bonds or carbons with 5 bonds
2528 # LORE: When an ions is really bonded we can not discard it or we may end up with orphan carbons (e.g. ligands)
2529 min_nbonds = len(atom.get_bonds(skip_ions = True, skip_dummies = True))
2530 max_nbonds = len(atom.get_bonds(skip_ions = False, skip_dummies = True))
2531 # Get the accepted range of number of bonds for the current atom according to its element
2532 element = atom.element
2533 element_coherent_bonds = coherent_bonds.get(element, None)
2534 # If there are no specficiations for the current atom element then skip it
2535 if not element_coherent_bonds:
2536 continue
2537 # Get the range of accepted number of bonds
2538 min_allowed_bonds = element_coherent_bonds['min']
2539 max_allowed_bonds = element_coherent_bonds['max']
2540 # Check the actual number of bonds is insdie the accepted range
2541 if max_nbonds < min_allowed_bonds or min_nbonds > max_allowed_bonds:
2542 if min_nbonds == max_nbonds:
2543 print(f' An atom with element {element} has {min_nbonds} bonds')
2544 else:
2545 print(f' An atom with element {element} has between {min_nbonds} bonds (withou ions) and {max_nbonds} bonds (with ions)')
2546 if min_allowed_bonds == max_allowed_bonds:
2547 plural_sufix = '' if min_allowed_bonds == 1 else 's'
2548 print(f' It should have {min_allowed_bonds} bond{plural_sufix}')
2549 else:
2550 print(f' It should have between {min_allowed_bonds} and {max_allowed_bonds} bonds')
2551 print(f' -> Atom {atom.label}')
2552 bond_label = ', '.join([ self.atoms[atom].label for atom in atom.get_bonds(skip_ions = False) ])
2553 print(f' -> Bonds {bond_label}')
2554 return True
2555 return False
2557 # Get all atomic covalent (strong) bonds
2558 # Bonds are defined as a list of atom indices for each atom in the structure
2559 # Rely on VMD logic to do so
2560 def get_covalent_bonds (self, selection : Optional['Selection'] = None) -> List[ List[int] ]:
2561 # It is important to fix elements before trying to fix bonds, since elements have an impact on bonds
2562 # VMD logic to find bonds relies in the atom element to set the covalent bond distance cutoff
2563 self.fix_atom_elements()
2564 # Generate a pdb strucutre to feed vmd
2565 auxiliar_pdb_filepath = '.structure.pdb'
2566 self.generate_pdb_file(auxiliar_pdb_filepath)
2567 # Get covalent bonds between both residue atoms
2568 covalent_bonds = get_covalent_bonds(auxiliar_pdb_filepath, selection)
2569 # For every atom in CG, replace its bonds with a class which will raise and error when read
2570 # Thus we make sure using these wrong bonds anywhere further will result in failure
2571 for atom_index in self.select_cg().atom_indices:
2572 covalent_bonds[atom_index] = MISSING_BONDS
2573 # Remove the auxiliar pdb file
2574 os.remove(auxiliar_pdb_filepath)
2575 return covalent_bonds
2577 # Make a copy of the bonds list
2578 def copy_bonds (self) -> List[List[int]]:
2579 new_bonds = []
2580 for atom_bonds in self.bonds:
2581 # Missing bonds coming from CG atoms are forwarded
2582 if atom_bonds == MISSING_BONDS:
2583 new_bonds.append(MISSING_BONDS)
2584 continue
2585 # Copy also the inner lists to avoid further mutation of the original structure
2586 new_bonds.append([ atom_index for atom_index in atom_bonds ])
2587 return new_bonds
2589 # Make a copy of the current structure
2590 def copy (self) -> 'Structure':
2591 atom_copies = [ atom.copy() for atom in self.atoms ]
2592 residue_copies = [ residue.copy() for residue in self.residues ]
2593 chain_copies = [ chain.copy() for chain in self.chains ]
2594 structure_copy = Structure(atom_copies, residue_copies, chain_copies)
2595 structure_copy.bonds = self.copy_bonds()
2596 return structure_copy
2598 # DANI: No lo he testeado en profundidad
2599 def merge (self, other : 'Structure') -> 'Structure':
2600 """Merge current structure with another structure."""
2601 # Copy self atoms, residues and chains
2602 self_atom_copies = [ atom.copy() for atom in self.atoms ]
2603 self_residue_copies = [ residue.copy() for residue in self.residues ]
2604 self_chain_copies = [ chain.copy() for chain in self.chains ]
2605 # Copy other atoms, residues and chains
2606 other_atom_copies = [ atom.copy() for atom in other.atoms ]
2607 other_residue_copies = [ residue.copy() for residue in other.residues ]
2608 other_chain_copies = [ chain.copy() for chain in other.chains ]
2609 # Adapt indices in other atoms, residues and chains
2610 atom_index_offset = len(self_atom_copies)
2611 residue_index_offset = len(self_residue_copies)
2612 chain_index_offset = len(self_chain_copies)
2613 for atom in other_atom_copies:
2614 atom._index += atom_index_offset
2615 atom._residue_index += residue_index_offset
2616 for residue in other_residue_copies:
2617 residue._index += residue_index_offset
2618 residue._atom_indices = [ i + atom_index_offset for i in residue._atom_indices ]
2619 residue._chain_index += chain_index_offset
2620 for chain in other_chain_copies:
2621 chain._index += chain_index_offset
2622 chain._residue_indices = [ i + residue_index_offset for i in chain._residue_indices ]
2623 # Merge self with other atoms, residues and chains and build the new structure
2624 merged_atoms = self_atom_copies + other_atom_copies
2625 merged_residues = self_residue_copies + other_residue_copies
2626 merged_chains = self_chain_copies + other_chain_copies
2627 return Structure(merged_atoms, merged_residues, merged_chains)
2629 # Find rings in the structure and yield them as they are found
2630 # LIMITATION: Wrong rings may be found including smaller rings
2631 # LIMITATION: Use this function to see if a specific ring exists, but not to count rings
2632 def find_rings (self, selection : Optional[Selection] = None) -> Generator[ List[Atom], None, None ]:
2633 selected_atom_indices = selection.atom_indices if selection else None
2634 # We will use a logic which finds long paths of bonded heavy atoms
2635 # These paths have no 'branches' and thus, when there are 2 possible paths then both paths are explored independently
2636 # Save already searched atoms and the number of times it has been explored already
2637 # Note that atoms may be visited as many times as their number of heavy atom bonds -1
2638 already_searched_atoms = {}
2639 def find_rings_recursive (atom_path : List[Atom]) -> Generator[ List[Atom], None, None ]:
2640 # Get the current atom to continue the followup: the last atom
2641 current_atom = atom_path[-1] # Note that the list MUST always have at least 1 atom
2642 # Get bonded atoms to continue the path
2643 followup_atoms = current_atom.get_bonded_atoms()
2644 # Hydrogens are discarded since they have only 1 bond and thus they can only lead to a dead end
2645 followup_atoms = [ atom for atom in followup_atoms if atom.element != 'H' ]
2646 # In case there is a selection, check followup atoms not to be in the selection
2647 if selected_atom_indices:
2648 followup_atoms = [ atom for atom in followup_atoms if atom.index in selected_atom_indices ]
2649 # Check if this atom has been visited already too many times and, if so, stop here
2650 # Allowed search times is the posible number of rings where this atom can belong
2651 allowed_search_times = comb(len(followup_atoms), 2)
2652 already_search_times = already_searched_atoms.get(current_atom, 0)
2653 if already_search_times == allowed_search_times:
2654 return
2655 already_searched_atoms[current_atom] = already_search_times + 1
2656 # Remove the previous atom to avoid going back
2657 previous_atom = atom_path[-2] if len(atom_path) > 1 else None
2658 if previous_atom:
2659 followup_atoms.remove(previous_atom)
2660 # Iterate over the following bonded atoms
2661 for followup_atom in followup_atoms:
2662 # Check if the followup atom is in the path
2663 path_index = next(( index for index, atom in enumerate(atom_path) if atom == followup_atom ), None)
2664 # If so, we have found a ring
2665 if path_index != None:
2666 ring = atom_path[path_index:]
2667 yield ring
2668 continue
2669 # Otherwise, keep searching
2670 new_path = atom_path + [followup_atom]
2671 for ring in find_rings_recursive(atom_path=new_path):
2672 yield ring
2673 # Find a starting atom and run the recursive logic
2674 candidate_atom_indices = selected_atom_indices if selected_atom_indices else range(self.atom_count)
2675 for candidate_atom_index in candidate_atom_indices:
2676 candidate_atom = self.atoms[candidate_atom_index]
2677 # It must be heavy atom
2678 if candidate_atom.element == 'H':
2679 continue
2680 # It must not be a dead end already
2681 bonded_atoms = candidate_atom.get_bonded_atoms()
2682 bonded_atoms = [ atom for atom in bonded_atoms if atom.element != 'H' ]
2683 # In case there is a selection, check followup atoms not to be in the selection
2684 if selected_atom_indices:
2685 bonded_atoms = [ atom for atom in bonded_atoms if atom.index in selected_atom_indices ]
2686 # Check this atom is not a dead end already
2687 allowed_search_times = comb(len(bonded_atoms), 2)
2688 if allowed_search_times < 1:
2689 continue
2690 for ring in find_rings_recursive(atom_path=[candidate_atom]):
2691 yield ring
2693 # Given an atom selection, get all bonds between these atoms and any other atom in the structure
2694 # Note that inner bonds between atoms in the selection are discarded
2695 def get_selection_outer_bonds (self, selection : Selection) -> List[int]:
2696 # Get bonds from all atoms in the selection
2697 bonds = set()
2698 for atom_index in selection.atom_indices:
2699 atom_bonds = set(self.bonds[atom_index])
2700 bonds = bonds.union(atom_bonds)
2701 # Discard selection atoms from the bonds list to discard inner bonds
2702 bonds -= set(selection.atom_indices)
2703 return list(bonds)
2705 # Set the type of PTM according to the classification of the bonded residue
2706 ptm_options = {
2707 'protein': ValueError('A PTM residue must never be protein'),
2708 'dna': 'DNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama
2709 'rna': 'RNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama
2710 'carbohydrate': 'Glycosilation',
2711 'fatty': 'Lipidation',
2712 'steroid': 'Steroid linkage', # DANI: Esto es posible aunque muy raro y no se como se llama
2713 'ion': Warning('Ion is covalently bonded to protein'), # DANI: esto no es "correcto" pero si habitual
2714 'solvent': Warning('Solvent is covalently bonded to protein'), # DANI: probablemente sea un error
2715 'acetyl': 'Acetylation', # Typical N-capping terminals
2716 'amide': 'Amidation', # Typical C-capping terminals
2717 'other': Warning('Unknow type of PTM'), # Something not supported
2718 }
2720 # Find Post Translational Modifications (PTM) in the structure
2721 def find_ptms (self) -> Generator[dict, None, None]:
2722 # Find bonds between protein and non-protein atoms
2723 # First get all protein atoms
2724 protein_selection = self.select_protein()
2725 if not protein_selection:
2726 return
2727 protein_atom_indices = set(protein_selection.atom_indices) # This is used later
2728 protein_outer_bonds = set(self.get_selection_outer_bonds(protein_selection))
2729 non_protein_selection = self.invert_selection(protein_selection)
2730 if not non_protein_selection:
2731 return
2732 non_protein_atom_indices = set(non_protein_selection.atom_indices)
2733 non_protein_atom_indices_bonded_to_protein = protein_outer_bonds.intersection(non_protein_atom_indices)
2734 # Get each residue bonded to the protein and based on its 'classification' set the name of the PTM
2735 for atom_index in non_protein_atom_indices_bonded_to_protein:
2736 atom = self.atoms[atom_index]
2737 residue = atom.residue
2738 residue_classification = residue.get_classification()
2739 ptm_classification = self.ptm_options[residue_classification]
2740 # If we found something impossible then raise the error
2741 if type(ptm_classification) == ValueError:
2742 print(f'Problematic residue: {residue}')
2743 raise ptm_classification
2744 # If we do not know what it is then do tag it as a PTM but print a warning
2745 if type(ptm_classification) == Warning:
2746 warn(f'{ptm_classification} -> {residue}')
2747 continue
2748 # At this point we have found a valid PTM
2749 # Find the protein residue linked to this PTM
2750 atom_bonds = self.bonds[atom_index]
2751 protein_bond = next((bond for bond in atom_bonds if bond in protein_atom_indices), None)
2752 if protein_bond == None:
2753 raise ValueError('There must be at least one protein bond to the current atom')
2754 protein_residue_index = self.atoms[protein_bond].residue_index
2755 # Set the PTM
2756 yield { 'name': ptm_classification, 'residue_index': protein_residue_index }
2758 # Ask if the structure has at least one coarse grain atom/residue
2759 def has_cg (self) -> bool:
2760 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)
2762### Related functions ###
2764# Calculate the distance between two atoms
2765def calculate_distance (atom_1 : Atom, atom_2 : Atom) -> float:
2766 squared_distances_sum = 0
2767 for i in { 'x': 0, 'y': 1, 'z': 2 }.values():
2768 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i])**2
2769 return math.sqrt(squared_distances_sum)
2771# Calculate the angle between 3 atoms
2772# https://stackoverflow.com/questions/35176451/python-code-to-calculate-angle-between-three-point-using-their-3d-coordinates
2773def calculate_angle (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom) -> float:
2774 # Get coordinates in numpy arrays, which allows to easily calculate the difference
2775 a = np.array(atom_1.coords)
2776 b = np.array(atom_2.coords)
2777 c = np.array(atom_3.coords)
2778 # Set the two vectors which make the angle
2779 ba = a - b
2780 bc = c - b
2781 # Calculate the angle between these 2 vectors
2782 cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
2783 angle = np.arccos(cosine_angle)
2784 return np.degrees(angle)
2786# Calculate the torsion between 4 atoms
2787# https://stackoverflow.com/questions/20305272/dihedral-torsion-angle-from-four-points-in-cartesian-coordinates-in-python
2788def calculate_torsion (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom, atom_4 : Atom) -> float:
2789 p0 = np.array(atom_1.coords)
2790 p1 = np.array(atom_2.coords)
2791 p2 = np.array(atom_3.coords)
2792 p3 = np.array(atom_4.coords)
2793 # Ge some vectors
2794 b0 = -1.0*(p1 - p0)
2795 b1 = p2 - p1
2796 b2 = p3 - p2
2797 # normalize b1 so that it does not influence magnitude of vector
2798 # rejections that come next
2799 b1 /= np.linalg.norm(b1)
2800 # vector rejections
2801 # v = projection of b0 onto plane perpendicular to b1
2802 # = b0 minus component that aligns with b1
2803 # w = projection of b2 onto plane perpendicular to b1
2804 # = b2 minus component that aligns with b1
2805 v = b0 - np.dot(b0, b1)*b1
2806 w = b2 - np.dot(b2, b1)*b1
2807 # angle between v and w in a plane is the torsion angle
2808 # v and w may not be normalized but that's fine since tan is y/x
2809 x = np.dot(v, w)
2810 y = np.dot(np.cross(b1, v), w)
2811 return float(np.degrees(np.arctan2(y, x)))
2813### Auxiliar functions ###
2815# Set a function to get the next letter from an input letter in alphabetic order
2816def get_next_letter (letter : str) -> str:
2817 if not letter:
2818 return 'A'
2819 if letter == 'z' or letter == 'Z':
2820 raise InputError("Limit of chain letters has been reached")
2821 next_letter = chr(ord(letter) + 1)
2822 return next_letter
2824# Given a string with 1 or 2 characters, return a new string with the first letter cap and the second letter not cap (if any)
2825def first_cap_only (name : str) -> str:
2826 if len(name) == 1:
2827 return name.upper()
2828 first_character = name[0].upper()
2829 second_character = name[1].lower()
2830 return first_character + second_character
2832# Convert numbers to lower text characters (chr 8020-8029)
2833lower_numbers = {
2834 '0': '₀',
2835 '1': '₁',
2836 '2': '₂',
2837 '3': '₃',
2838 '4': '₄',
2839 '5': '₅',
2840 '6': '₆',
2841 '7': '₇',
2842 '8': '₈',
2843 '9': '₉',
2844}
2845def get_lower_numbers (numbers_text : str) -> str:
2846 return ''.join([ lower_numbers[c] for c in numbers_text ])
2848# Set a function to filter lines in PDB content for a specific model
2849def filter_model (pdb_content : str, model : int) -> str:
2850 # Make sure the PDB content has multiple models
2851 # If not, then teturn the whole PDB content ignoring the flag
2852 generic_model_header_regex = r'\nMODEL\s*[0-9]*\s*\n'
2853 if not re.search(generic_model_header_regex, pdb_content):
2854 print(f'PDB content has no models at all so it will be loaded as is (ignored model "{model}").')
2855 return pdb_content
2856 # If a model was passed then it means we must filter the PDB content
2857 filtered_pdb_content = ''
2858 # Search PDB lines until we find our model's header
2859 model_header_regex = rf'^MODEL\s*{model}'
2860 pdb_lines = iter(pdb_content.split('\n'))
2861 for line in pdb_lines:
2862 if not re.match(model_header_regex, line): continue
2863 # If we just found the header
2864 filtered_pdb_content = line
2865 break
2866 # If we did not find the header then stop here
2867 if not filtered_pdb_content: raise RuntimeError(f'Could not find model "{model}" header')
2868 # Add every line to the filtered content until we find the tail
2869 model_footer_regex = r'^ENDMDL'
2870 for line in pdb_lines:
2871 filtered_pdb_content += '\n' + line
2872 if re.match(model_footer_regex, line): return filtered_pdb_content
2873 # If we did not find the footer then stop here
2874 raise RuntimeError(f'Could not find model "{model}" footer')