Coverage for mddb_workflow/utils/structures.py: 84%
1771 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-29 15:48 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-29 15:48 +0000
1# Main handler of the toolbelt
2import os
3import re
4import math
5import pytraj
6import MDAnalysis
7import numpy as np
8from bisect import bisect
10from mddb_workflow.utils.file import File
11from mddb_workflow.utils.selections import Selection
12from mddb_workflow.utils.vmd_spells import get_vmd_selection_atom_indices, get_covalent_bonds
13from mddb_workflow.utils.mdt_spells import sort_trajectory_atoms
14from mddb_workflow.utils.gmx_spells import make_index, read_ndx
15from mddb_workflow.utils.auxiliar import InputError, MISSING_BONDS
16from mddb_workflow.utils.auxiliar import is_imported, residue_name_to_letter, otherwise, warn
17from mddb_workflow.utils.constants import SUPPORTED_ION_ELEMENTS, SUPPORTED_ELEMENTS
18from mddb_workflow.utils.constants import STANDARD_COUNTER_CATION_ATOM_NAMES, STANDARD_COUNTER_ANION_ATOM_NAMES
19from mddb_workflow.utils.constants import STANDARD_SOLVENT_RESIDUE_NAMES, STANDARD_COUNTER_ION_ATOM_NAMES
20from mddb_workflow.utils.constants import STANDARD_DUMMY_ATOM_NAMES, DUMMY_ATOM_ELEMENT, CG_ATOM_ELEMENT
21from mddb_workflow.utils.constants import PROTEIN_RESIDUE_NAME_LETTERS, NUCLEIC_RESIDUE_NAME_LETTERS
22from mddb_workflow.utils.constants import DNA_RESIDUE_NAME_LETTERS, RNA_RESIDUE_NAME_LETTERS
23from mddb_workflow.utils.constants import FATTY_RESIDUE_NAMES, STEROID_RESIDUE_NAMES
24from mddb_workflow.utils.type_hints import *
27# Set all available chains according to pdb standards
28AVAILABLE_CAPS = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
29AVAILABLE_LOWS = list('abcdefghijklmnopqrstuvwxyz')
30AVAILABLE_LETTERS = AVAILABLE_CAPS + AVAILABLE_LOWS
32# Set letters to be found in alphanumerical bases
33hexadecimal_letters = set(AVAILABLE_CAPS[:6] + AVAILABLE_LOWS[:6])
34alphanumerical_letters = set(AVAILABLE_CAPS[6:] + AVAILABLE_LOWS[6:])
36# Set the expected number of bonds for each atom according to its element
37coherent_bonds_with_hydrogen = {
38 'H': { 'min': 1, 'max': 1 },
39 'O': { 'min': 1, 'max': 2 },
40 'N': { 'min': 1, 'max': 4 },
41 'C': { 'min': 2, 'max': 4 },
42 'S': { 'min': 1, 'max': 6 },
43 'P': { 'min': 2, 'max': 6 },
44}
45# WARNING: Not deeply tested
46coherent_bonds_without_hydrogen = {
47 'O': { 'min': 0, 'max': 2 },
48 'N': { 'min': 1, 'max': 3 },
49 'C': { 'min': 1, 'max': 3 },
50 'S': { 'min': 1, 'max': 4 },
51 'P': { 'min': 2, 'max': 4 },
52}
54# Set the limit of residue numbers according to PDB format (4 characters, starts count at 1)
55# This means the last number is 9999 and it is equivalent to index 9998
56pdb_last_decimal_residue_index = 9998
58class Atom:
59 """An atom class."""
61 def __init__ (self,
62 name : Optional[str] = None,
63 element : Optional[str] = None,
64 coords : Optional[Coords] = None,
65 ):
66 self.name = name
67 self.element = element
68 self.coords = tuple(coords) if coords else None
69 # Set variables to store references to other related instances
70 # These variables will be set further by the structure
71 self._structure = None
72 self._index = None
73 self._residue_index = None
75 def __repr__ (self):
76 return '<Atom ' + self.name + '>'
78 def __eq__ (self, other):
79 if type(self) != type(other):
80 return False
81 return self._residue_index == other._residue_index and self.name == other.name
83 def __hash__ (self):
84 return hash((self.index))
86 def get_structure (self) -> Optional['Structure']:
87 """The parent structure (read only).
88 This value is set by the structure itself."""
89 return self._structure
90 structure: 'Structure' = property(get_structure, None, None, "The parent structure (read only)")
92 def get_index (self) -> Optional[int]:
93 """The residue index according to parent structure residues (read only).
94 This value is set by the structure itself."""
95 return self._index
96 index = property(get_index, None, None, "The residue index according to parent structure residues (read only)")
98 def get_residue_index (self) -> int:
99 """The atom residue index according to parent structure residues.
100 If residue index is set then make changes in all the structure to make this change coherent."""
101 return self._residue_index
102 def set_residue_index (self, new_residue_index : int):
103 # If the new residue index is the current residue index then do nothing
104 # WARNING: It is important to stop this here or it could delete a residue which is not to be deleted
105 if new_residue_index == self.residue_index:
106 return
107 # If there is not strucutre yet it means the residue is beeing set before the structure
108 # We just save the residue index and wait for the structure to be set
109 if not self.structure:
110 self._residue_index = new_residue_index
111 return
112 # Relational indices are updated through a top-down hierarchy
113 # Affected residues are the ones to update this atom internal residue index
114 new_residue = self.structure.residues[new_residue_index]
115 new_residue.add_atom(self)
116 residue_index = property(get_residue_index, set_residue_index, None,
117 "The atom residue index according to parent structure residues")
119 def get_residue (self) -> Optional['Residue']:
120 """The atom residue.
121 If residue is set then make changes in all the structure to make this change coherent."""
122 # If there is not strucutre yet it means the atom is beeing set before the structure
123 # In this case it is not possible to get related residues in the structure
124 # It also may happend that the residue is missing because the atom has been set rawly
125 if not self.structure or self.residue_index == None:
126 return None
127 # Get the residue in the structure according to the residue index
128 return self.structure.residues[self.residue_index]
129 def set_residue (self, new_residue : 'Residue'):
130 """Find the new residue index and set it as the atom residue index.
131 Note that the residue must be set in the structure already."""
132 new_residue_index = new_residue.index
133 if new_residue_index == None:
134 raise ValueError('Residue ' + str(new_residue) + ' is not set in the structure')
135 self.set_residue_index(new_residue_index)
136 residue = property(get_residue, set_residue, None, "The atom residue")
138 def get_chain_index (self) -> Optional[int]:
139 """Get the atom chain index according to parent structure chains."""
140 # The residue may be missing if the atom has been set rawly
141 if not self.residue:
142 return None
143 return self.residue.chain_index
144 chain_index = property(get_chain_index, None, None,
145 "The atom chain index according to parent structure chains (read only)")
147 def get_chain (self) -> Optional['Chain']:
148 """The atom chain (read only).
149 In order to change the chain it must be changed in the atom residue."""
150 # If there is not strucutre yet it means the atom is beeing set before the structure
151 # In this case it is not possible to get related residues in the structure
152 # It also may happend that the chain is missing because the atom has been set rawly
153 if not self.structure or self.chain_index == None:
154 return None
155 # Get the chain in the structure according to the chain index
156 return self.structure.chains[self.chain_index]
157 chain: 'Chain' = property(get_chain, None, None, "The atom chain (read only)")
159 def get_bonds (self, skip_ions : bool = False, skip_dummies : bool = False) -> Optional[ list[int] ]:
160 """Get indices of other atoms in the structure which are covalently bonded to this atom."""
161 if not self.structure:
162 raise ValueError('The atom has not a structure defined')
163 if self.index == None:
164 raise ValueError('The atom has not an index defined')
165 bonds = self.structure.bonds[self.index]
166 # If the skip ions argument is passed then remove atom indices of supported ion atoms
167 if skip_ions:
168 bonds = list(set(bonds) - self.structure.ion_atom_indices)
169 if skip_dummies:
170 bonds = list(set(bonds) - self.structure.dummy_atom_indices)
171 return bonds
173 # Atoms indices of atoms in the structure which are covalently bonded to this atom
174 bonds = property(get_bonds, None, None,
175 'Atoms indices of atoms in the structure which are covalently bonded to this atom')
177 def get_bonded_atoms (self) -> list['Atom']:
178 """Get bonded atoms."""
179 return [ self.structure.atoms[atom_index] for atom_index in self.bonds ]
181 def get_selection (self) -> 'Selection':
182 """Generate a selection for this atom."""
183 return Selection([self.index])
185 def copy (self) -> 'Atom':
186 """Make a copy of the current atom."""
187 atom_copy = Atom(self.name, self.element, self.coords)
188 atom_copy._structure = self._structure
189 atom_copy._index = self._index
190 atom_copy._residue_index = self._residue_index
191 return atom_copy
193 def is_fatty_candidate (self) -> bool:
194 """
195 Check if this atom meets specific criteria:
196 1. It is a carbon
197 2. It is connected only to other carbons and hydrogens
198 3. It is connected to 1 or 2 carbons
199 """
200 # Ignore non carbon atoms
201 if self.element != 'C':
202 return False
203 # Get bonded atom elements
204 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ]
205 # Check only carbon and hydrogen atoms to be bonded
206 if any((element != 'C' and element != 'H') for element in bonded_atoms_elements):
207 return False
208 # Check it is connected to 1 or 2 carbons
209 connected_carbons_count = bonded_atoms_elements.count('C')
210 if connected_carbons_count != 1 and connected_carbons_count != 2:
211 return False
212 return True
214 def is_carbohydrate_candidate (self) -> bool:
215 """
216 Check if this atom meets specific criteria:
217 1. It is a carbon
218 2. It is connected only to other carbons, hydrogens or oxygens
219 3. It is connected to 1 or 2 carbons
220 4. It is connected to 1 oxygen
221 """
222 # Ignore non carbon atoms
223 if self.element != 'C':
224 return False
225 # Get bonded atom elements
226 bonded_atoms_elements = [ atom.element for atom in self.get_bonded_atoms() ]
227 # Check only carbon and hydrogen atoms to be bonded
228 if any((element != 'C' and element != 'H' and element != 'O') for element in bonded_atoms_elements):
229 return False
230 # Check it is connected to 1 or 2 carbons
231 connected_carbons_count = bonded_atoms_elements.count('C')
232 if connected_carbons_count != 1 and connected_carbons_count != 2:
233 return False
234 # Check it is connected to 1 oxygen
235 connected_oxygens_count = bonded_atoms_elements.count('O')
236 if connected_oxygens_count != 1:
237 return False
238 return True
240 def is_ion (self) -> bool:
241 """Check if it is an ion by checking if it has no bonds with other atoms."""
242 return len(self.bonds) == 0
244 def guess_element (self) -> str:
245 """Guess an atom element from its name and number of bonds."""
246 # If the atom name is among the known dummy atoms then return a standard element for dummy atoms
247 if self.name.upper() in STANDARD_DUMMY_ATOM_NAMES:
248 return DUMMY_ATOM_ELEMENT
249 # If the element is already the "coarse grained" flag then return this very same flag
250 # WARNING: This has to be preset before guessing or the guess may fail
251 if self.is_cg():
252 return CG_ATOM_ELEMENT
253 # If the name is SOD and it is a lonely atom then it is clearly sodium, not sulfur
254 if self.name.upper() == 'SOD' and self.residue.atom_count == 1:
255 return 'Na'
256 # If the name is POT and it is a lonely atom then it is clearly potassium, not phosphor
257 if self.name.upper() == 'POT' and self.residue.atom_count == 1:
258 return 'K'
259 # Find a obvios element name in the atom name
260 element = self.get_name_suggested_element()
261 # CA may refer to calcium or alpha carbon, so the number of atoms in the residue is decisive
262 if element == 'Ca' and self.residue.atom_count >= 2:
263 return 'C'
264 # NA may refer to sodium or some ligand nitrogen, so the number of atoms in the residue is decisive
265 if element == 'Na' and self.residue.atom_count >= 2:
266 return 'N'
267 return element
269 def get_name_suggested_element (self) -> str:
270 """Guess an atom element from its name only."""
271 # Get the atom name and its characters length
272 name = self.name
273 length = len(name)
274 next_character = None
275 for i, character in enumerate(name):
276 # Get the next character, since element may be formed by 2 letters
277 if i < length - 1:
278 next_character = name[i+1]
279 # If next character is not a string then ignore it
280 if not next_character.isalpha():
281 next_character = None
282 # Try to get all possible matches between the characters and the supported atoms
283 # First letter is always caps
284 character = character.upper()
285 # First try to match both letters together
286 if next_character:
287 # Start with the second letter in caps
288 next_character = next_character.upper()
289 both = character + next_character
290 if both in SUPPORTED_ELEMENTS:
291 return both
292 # Continue with the second letter in lowers
293 next_character = next_character.lower()
294 both = character + next_character
295 if both in SUPPORTED_ELEMENTS:
296 return both
297 # Finally, try with the first character alone
298 if character in SUPPORTED_ELEMENTS:
299 return character
300 raise InputError(f"Cannot guess atom element from atom name '{name}'")
302 def get_label (self) -> str:
303 """Get a standard label."""
304 return f'{self.residue.label}.{self.name} (index {self.index})'
305 # The atom standard label (read only)
306 label = property(get_label, None, None)
308 # Ask if the current atom is in coarse grain
309 def is_cg (self) -> bool:
310 return self.element == CG_ATOM_ELEMENT
312class Residue:
313 """A residue class."""
314 def __init__ (self,
315 name : Optional[str] = None,
316 number : Optional[int] = None,
317 icode : Optional[str] = None,
318 ):
319 self.name = name
320 self.number = number
321 self.icode = icode
322 # Set variables to store references to other related instaces
323 # These variables will be set further by the structure
324 self._structure = None
325 self._index = None
326 self._atom_indices = []
327 self._chain_index = None
328 self._classification = None
330 def __repr__ (self):
331 return f'<Residue {self.label}>'
333 def __eq__ (self, other):
334 if type(self) != type(other):
335 return False
336 return (
337 self._chain_index == other._chain_index and
338 #self.name == other.name and
339 self.number == other.number and
340 self.icode == other.icode
341 )
343 def __hash__ (self):
344 # WARNING: This is susceptible to duplicated residues
345 return hash((self._chain_index, self.number, self.icode))
346 # WARNING: This is not susceptible to duplicated residues
347 #return hash(tuple(self._atom_indices))
349 def get_structure (self) -> Optional['Structure']:
350 """Get the parent structure (read only).
351 This value is set by the structure itself."""
352 return self._structure
353 structure: 'Structure' = property(get_structure, None, None,
354 "The parent structure (read only)")
356 def get_index (self) -> Optional[int]:
357 """Get the residue index according to parent structure residues (read only).
358 This value is set by the structure itself."""
359 return self._index
360 def set_index (self, index):
361 # Update residue atoms
362 for atom in self.atoms:
363 atom._residue_index = index
364 # Update residue chain
365 chain_residue_index = self.chain._residue_indices.index(self._index)
366 self.chain._residue_indices[chain_residue_index] = index
367 # Finally update self index
368 self._index = index
369 index = property(get_index, set_index, None,
370 "The residue index according to parent structure residues (read only)")
372 def get_atom_indices (self) -> list[int]:
373 """Get the atom indices according to parent structure atoms for atoms in this residue.
374 If atom indices are set then make changes in all the structure to make this change coherent."""
375 return self._atom_indices
376 def set_atom_indices (self, new_atom_indices : list[int]):
377 # If there is not strucutre yet it means the residue is beeing set before the structure
378 # We just save atom indices and wait for the structure to be set
379 if not self.structure:
380 self._atom_indices = new_atom_indices
381 return
382 # Update the current atoms
383 for atom in self.atoms:
384 atom._residue_index = None
385 # Update the new atoms
386 for index in new_atom_indices:
387 atom = self.structure.atoms[index]
388 atom._residue_index = self.index
389 # Now new indices are coherent and thus we can save them
390 self._atom_indices = new_atom_indices
391 atom_indices = property(get_atom_indices, set_atom_indices, None,
392 "The atom indices according to parent structure atoms for atoms in this residue")
394 def get_atoms (self) -> list['Atom']:
395 """Get the atoms in this residue.
396 If atoms are set then make changes in all the structure to make this change coherent."""
397 # If there is not strucutre yet it means the chain is beeing set before the structure
398 # In this case it is not possible to get related atoms in the structure
399 if not self.structure:
400 return []
401 # Get atoms in the structure according to atom indices
402 atoms = self.structure.atoms
403 return [ atoms[atom_index] for atom_index in self.atom_indices ]
404 def set_atoms (self, new_atoms : list['Atom']):
405 # Find indices for new atoms and set their indices as the new atom indices
406 # Note that atoms must be set in the structure already
407 new_atom_indices = []
408 for new_atom in new_atoms:
409 new_atom_index = new_atom.index
410 if new_atom_index == None:
411 raise ValueError('Atom ' + str(new_atom) + ' is not set in the structure')
412 new_atom_indices.append(new_atom_index)
413 self.set_atom_indices(new_atom_indices)
414 atoms: list['Atom'] = property(get_atoms, set_atoms, None, "The atoms in this residue")
416 # Get the number of atoms in the residue
417 def get_atom_count (self) -> int:
418 return len(self.atom_indices)
419 atom_count = property(get_atom_count, None, None, "The number of atoms in the residue (read only)")
421 def add_atom (self, new_atom : 'Atom'):
422 """Add an atom to the residue."""
423 # Remove the atom from its previous residue owner
424 if new_atom.residue_index:
425 new_atom.residue.remove_atom(new_atom)
426 # Insert the new atom index in the list of atom indices keeping the order
427 new_atom_index = new_atom.index
428 sorted_atom_index = bisect(self.atom_indices, new_atom_index)
429 self.atom_indices.insert(sorted_atom_index, new_atom_index)
430 # Update the atom internal index
431 new_atom._residue_index = self.index
433 def remove_atom (self, current_atom : 'Atom'):
434 """Remove an atom from the residue."""
435 # Remove the current atom index from the atom indices list
436 self.atom_indices.remove(current_atom.index) # This index MUST be in the list
437 # If we removed the last atom then this residue must be removed from its structure
438 if len(self.atom_indices) == 0 and self.structure:
439 self.structure.purge_residue(self)
440 # Update the atom internal index
441 current_atom._residue_index = None
443 # The residue chain index according to parent structure chains
444 # If chain index is set then make changes in all the structure to make this change coherent
445 def get_chain_index (self) -> int:
446 return self._chain_index
447 def set_chain_index (self, new_chain_index : int):
448 # If the new chain index is the current chain index do nothing
449 # WARNING: It is important to stop this here or it could delete a chain which is not to be deleted
450 if new_chain_index == self.chain_index:
451 return
452 # If there is not strucutre yet it means the chain is beeing set before the structure
453 # We just save the chain index and wait for the structure to be set
454 if not self.structure:
455 self._chain_index = new_chain_index
456 return
457 # Relational indices are updated through a top-down hierarchy
458 # Affected chains are the ones to update this residue internal chain index
459 # WARNING: It is critical to find the new chain before removing/adding residues
460 # WARNING: It may happend that we remove the last residue in the current chain and the current chain is purged
461 # WARNING: Thus the 'new_chain_index' would be obsolete since the structure.chains list would have changed
462 new_chain = self.structure.chains[new_chain_index]
463 if self.chain:
464 self.chain.remove_residue(self)
465 new_chain.add_residue(self)
466 chain_index = property(get_chain_index, set_chain_index, None,
467 "The residue chain index according to parent structure chains")
469 # The residue chain
470 # If chain is set then make changes in all the structure to make this change coherent
471 def get_chain (self) -> Optional['Chain']:
472 # If there is not strucutre yet then it means the residue is set before the structure
473 # In this case it is not possible to get related chain in the structure
474 if not self.structure or self.chain_index == None:
475 return None
476 # Get the chain in the structure according to the chain index
477 return self.structure.chains[self.chain_index]
478 def set_chain (self, new_chain : Union['Chain', str]):
479 # In case the chain is just a string we must find/create the corresponding chain
480 if type(new_chain) == str:
481 letter = new_chain
482 # Get the residue structure
483 structure = self.structure
484 if not structure:
485 raise ValueError(f'Cannot find the corresponding {new_chain} chain without the structure')
486 # Find if the letter belongs to an already existing chain
487 new_chain = structure.get_chain_by_name(letter)
488 # If the chain does not exist yet then create it
489 if not new_chain:
490 new_chain = Chain(name=letter)
491 structure.set_new_chain(new_chain)
492 # Find the new chain index and set it as the residue chain index
493 # Note that the chain must be set in the structure already
494 new_chain_index = new_chain.index
495 if new_chain_index == None:
496 raise ValueError(f'Chain {new_chain} is not set in the structure')
497 self.set_chain_index(new_chain_index)
498 chain: 'Chain' = property(get_chain, set_chain, None, "The residue chain")
500 def get_bonded_atom_indices (self) -> list[int]:
501 """Get atom indices from atoms bonded to this residue."""
502 # Get all bonds among residue atoms
503 all_bonds = []
504 for atom in self.atoms:
505 all_bonds += atom.bonds
506 # Remove self atom indices from the list
507 external_bonds = set(all_bonds) - set(self.atom_indices)
508 return list(external_bonds)
510 def get_bonded_atoms (self) -> list['Atom']:
511 """Get atoms bonded to this residue."""
512 return [ self.structure.atoms[atom_index] for atom_index in self.get_bonded_atom_indices() ]
514 def get_bonded_residue_indices (self) -> list[int]:
515 """Get residue indices from residues bonded to this residue."""
516 return list(set([ atom.residue_index for atom in self.get_bonded_atoms() ]))
518 def get_bonded_residues (self) -> list['Residue']:
519 """Get residues bonded to this residue."""
520 return [ self.structure.residues[residue_index] for residue_index in self.get_bonded_residue_indices() ]
522 def is_bonded_with_residue (self, other : 'Residue') -> bool:
523 """Given another residue, check if it is bonded with this residue."""
524 bonded_atom_indices = set(self.get_bonded_atom_indices())
525 if next((index for index in other.atom_indices if index in bonded_atom_indices), None) != None: return True
526 return False
528 def is_missing_any_bonds (self) -> bool:
529 return any(atom.bonds == MISSING_BONDS for atom in self.atoms)
531 def is_coherent (self) -> bool:
532 """Make sure atoms within the residue are all bonded."""
533 # If bonds are missing then just say everything is
534 if self.is_missing_any_bonds():
535 raise RuntimeError('Trying to check if residue with missing bonds is coherent')
536 residue_selection = self.get_selection()
537 residue_fragments = self.structure.find_fragments(residue_selection)
538 first_residue_fragment = next(residue_fragments)
539 return len(first_residue_fragment) == self.atom_count
541 def get_classification (self) -> str:
542 """
543 Get the residue biochemistry classification.
545 WARNING: Note that this logic will not work in a structure without hydrogens.
547 Available classifications:
548 - protein
549 - dna
550 - rna
551 - carbohydrate
552 - fatty
553 - steroid
554 - ion
555 - solvent
556 - acetyl
557 - amide
558 - other
559 """
560 # Return the internal value, if any
561 if self._classification:
562 return self._classification
563 # If this is a CG residue then we can not classify it
564 if self.is_cg(): return self.get_classification_by_name()
565 # If we dont have a value then we must classify the residue
566 # -------------------------------------------------------------------------------------------------------
567 # Ions are single atom residues
568 if self.atom_count == 1:
569 self._classification = 'ion'
570 return self._classification
571 # -------------------------------------------------------------------------------------------------------
572 # At this point we need atoms to have elements fixed
573 # WARNING: missing elements would result in silent failure to recognize some classifications
574 if self.structure._fixed_atom_elements == False:
575 self.structure.fix_atom_elements()
576 # Solvent is water molecules
577 # First rely on the residue name
578 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES:
579 self._classification = 'solvent'
580 return self._classification
581 # It may be water with a not known name
582 # Literally check if its a molecule with 3 atoms: 2 hydrogens and 1 oxygen
583 if self.atom_count == 3:
584 atom_elements = [ atom.element for atom in self.atoms ]
585 if atom_elements.count('H') == 2 and atom_elements.count('O') == 1:
586 self._classification = 'solvent'
587 return self._classification
588 # -------------------------------------------------------------------------------------------------------
589 # Protein definition according to vmd:
590 # a residue with atoms named C, N, CA, and O
591 # In our case we accept OC1 and OC2 or OT1 and OT2 instead of O for terminal residues
592 atom_names = set([ atom.name for atom in self.atoms ])
593 if ( all((name in atom_names) for name in ['C', 'N', 'CA']) and (
594 'O' in atom_names or { 'OC1', 'OC2' } <= atom_names or { 'OT1', 'OT2' } <= atom_names
595 )):
596 self._classification = 'protein'
597 return self._classification
598 # -------------------------------------------------------------------------------------------------------
599 # Nucleic acids definition according to vmd:
600 # a residue with atoms named P, O1P, O2P and either O3’, C3’, C4’, C5’, O5’ or O3*, C3*, C4*, C5*, O5*
601 # Apparently it has been fixed so now a residue does not need to be phosphorylated to be considered nucleic
602 # LORE: This included the condition "all((name in atom_names) for name in ['P', 'OP1', 'OP2'])"
603 if (( all((name in atom_names) for name in ["O3'", "C3'", "C4'", "C5'", "O5'"]) or
604 all((name in atom_names) for name in ["O3*", "C3*", "C4*", "C5*", "O5*"]) )):
605 # At this point we know it is nucleic
606 # We must tell the difference between DNA and RNA
607 if "O2'" in atom_names or "O2*" in atom_names:
608 self._classification = 'rna'
609 else:
610 self._classification = 'dna'
611 return self._classification
612 # -------------------------------------------------------------------------------------------------------
613 # To define carbohydrates search for rings made of 1 oxygen and 'n' carbons
614 # WARNING: This logic may fail for some very specific molecules such as furine
615 # LIMITATION: This logic only aims for cyclical carbohydrates. The linear form of carbohydrates is not yet consider
616 rings = self.find_rings(6)
617 for ring in rings:
618 ring_elements = [ atom.element for atom in ring ]
619 # Check the ring has 1 oxygen
620 oxygen_count = ring_elements.count('O')
621 if oxygen_count != 1:
622 continue
623 # Check the rest are only carbon
624 carbon_count = ring_elements.count('C')
625 if carbon_count != len(ring) - 1:
626 continue
627 self._classification = 'carbohydrate'
628 return self._classification
629 # -------------------------------------------------------------------------------------------------------
630 # To define steroids search for 3 6-carbon rings and 1 5-carbon ring (at least)
631 # WARNING: According to this logic some exotical non-steroid molecules may result in false positives
632 num_6_carbon_rings = 0
633 num_5_carbon_rings = 0
634 for ring in rings:
635 ring_elements = [ atom.element for atom in ring ]
636 if any(element != 'C' for element in ring_elements):
637 continue
638 ring_lenght = len(ring)
639 if ring_lenght == 5:
640 num_5_carbon_rings += 1
641 if ring_lenght == 6:
642 num_6_carbon_rings += 1
643 if num_6_carbon_rings >= 3 and num_5_carbon_rings >= 1:
644 self._classification = 'steroid'
645 return self._classification
646 # -------------------------------------------------------------------------------------------------------
647 # To define fatty acids search for a series of carbon atoms connected together (3 at least)
648 # These carbon atoms must be connected only to hydrogen in addition to 1-2 carbon
649 # These carbons must not be bonded in a cycle so check atoms not be repeated in the series
650 bonded_fatty_atoms = []
651 def get_bonded_fatty_atoms_recursive (current_atom : Atom, previous_atom : Optional[Atom] = None) -> bool:
652 # Iterate over the input atom bonded atoms
653 for bonded_atom in current_atom.get_bonded_atoms():
654 # Discard the previous atom to avoid an infinite loop
655 if bonded_atom == previous_atom:
656 continue
657 # If we find an atom which is already in the fatty atoms list then it means we are cycling
658 if bonded_atom in bonded_fatty_atoms:
659 return False
660 # If this is not a fatty atom then discard it
661 if not bonded_atom.is_fatty_candidate():
662 continue
663 # Add the current bonded fatty atom to the list
664 bonded_fatty_atoms.append(bonded_atom)
665 # Find more bonded fatty atoms and add them to list as well
666 if get_bonded_fatty_atoms_recursive(current_atom=bonded_atom, previous_atom=current_atom) == False:
667 return False
668 return True
669 # Now check all atoms and try to set the series until we find one which works
670 already_checked_atoms = []
671 for atom in self.atoms:
672 # If we already checked this atom trough the recursive logic then skip it
673 if atom in already_checked_atoms:
674 continue
675 # If the atom is not a fatty candidate then skip it
676 if not atom.is_fatty_candidate():
677 continue
678 # Now that we have found a suitable candidate atom we start the series
679 bonded_fatty_atoms = [atom]
680 if get_bonded_fatty_atoms_recursive(atom) and len(bonded_fatty_atoms) >= 3:
681 self._classification = 'fatty'
682 return self._classification
683 already_checked_atoms += bonded_fatty_atoms
684 # -------------------------------------------------------------------------------------------------------
685 # Check if it is an acetylation capping terminal
686 if self.name == 'ACE':
687 self._classification = 'acetyl'
688 return self._classification
689 # -------------------------------------------------------------------------------------------------------
690 # Check if it is an N-methyl amide capping terminal
691 if self.name == 'NME':
692 self._classification = 'amide'
693 return self._classification
694 # -------------------------------------------------------------------------------------------------------
695 self._classification = 'other'
696 return self._classification
698 def get_classification_by_name (self) -> str:
699 """
700 Set an alternative function to "try" to classify the residues according only to its name.
701 This is useful for corase grain residues whose atoms may not reflect the real atoms.
702 WARNING: This logic is very limited and will return "unknown" most of the times.
703 WARNING: This logic relies mostly in atom names, which may be not standard.
704 """
705 if self.name in PROTEIN_RESIDUE_NAME_LETTERS:
706 return 'protein'
707 if self.name in DNA_RESIDUE_NAME_LETTERS:
708 return 'dna'
709 if self.name in RNA_RESIDUE_NAME_LETTERS:
710 return 'rna'
711 if self.name in NUCLEIC_RESIDUE_NAME_LETTERS:
712 return 'nucleic'
713 if self.name in FATTY_RESIDUE_NAMES:
714 return 'fatty'
715 if self.name in STEROID_RESIDUE_NAMES:
716 return 'steroid'
717 if self.name in STANDARD_COUNTER_ION_ATOM_NAMES:
718 return 'ion'
719 if self.name in STANDARD_SOLVENT_RESIDUE_NAMES:
720 return 'solvent'
721 # If we do not know what it is
722 return 'unknown'
724 # The residue biochemistry classification (read only)
725 classification = property(get_classification, None, None)
727 def get_selection (self) -> 'Selection':
728 """Generate a selection for this residue."""
729 return Selection(self.atom_indices)
731 def copy (self) -> 'Residue':
732 """Make a copy of the current residue."""
733 residue_copy = Residue(self.name, self.number, self.icode)
734 residue_copy._structure = self._structure
735 residue_copy._index = self._index
736 residue_copy._atom_indices = self._atom_indices
737 residue_copy._chain_index = self._chain_index
738 return residue_copy
740 def get_letter (self) -> str:
741 """Get the residue equivalent single letter code.
742 Note that this makes sense for aminoacids and nucleotides only.
743 Non recognized residue names return 'X'."""
744 return residue_name_to_letter(self.name)
746 def get_formula (self) -> str:
747 """Get the formula of the residue."""
748 elements = [ atom.element for atom in self.atoms ]
749 unique_elements = list(set(elements))
750 formula = ''
751 for unique_element in unique_elements:
752 count = elements.count(unique_element)
753 number = get_lower_numbers(str(count)) if count > 1 else ''
754 formula += unique_element + number
755 return formula
757 def find_rings (self, max_ring_size : int) -> list[ list[Atom] ]:
758 """Find rings in the residue."""
759 return self.structure.find_rings(max_ring_size, selection=self.get_selection())
761 def split (self,
762 first_residue_atom_indices : list[int],
763 second_residue_atom_indices : list[int],
764 first_residue_name : Optional[str] = None,
765 second_residue_name : Optional[str] = None,
766 first_residue_number : Optional[int] = None,
767 second_residue_number : Optional[int] = None,
768 first_residue_icode : Optional[str] = None,
769 second_residue_icode : Optional[str] = None,
770 ) -> tuple['Residue', 'Residue']:
771 """Split this residue in 2 residues and return them in a tuple.
772 Keep things coherent in the structure (renumerate all residues below this one).
773 Note that all residue atoms must be covered by the splits."""
774 # This function is expected to be called in a residue with an already set structure
775 if not self.structure:
776 raise InputError('The split function should be called when the residue has an already defined structure')
777 # Make sure all atoms in the residue are covered between both the first and the second residue
778 if set(self.atom_indices) != set(first_residue_atom_indices + second_residue_atom_indices):
779 print('Residue atoms: ' + ', '.join([ str(v) for v in set(self.atom_indices) ]))
780 print('Covered atoms: ' + ', '.join([ str(v) for v in set(first_residue_atom_indices + second_residue_atom_indices) ]))
781 raise InputError('All atom indices must be covered between both the first and the second residue')
782 # Reuse this first residue instead of creating a new one
783 if first_residue_name:
784 self.name = first_residue_name
785 if first_residue_number:
786 self.number = first_residue_number
787 if first_residue_icode:
788 self.icode = first_residue_icode
789 # Set the new second residue
790 _second_residue_name = second_residue_name if second_residue_name else self.name
791 _second_residue_number = second_residue_number if second_residue_number else self.number
792 _second_residue_icode = second_residue_icode if second_residue_icode else get_next_letter(self.icode)
793 second_residue = Residue(_second_residue_name, _second_residue_number, _second_residue_icode)
794 second_residue._structure = self.structure
795 new_residue_index = self.index + 1
796 second_residue._index = new_residue_index
797 # Insert the second residue in the structure residues list right after this residue
798 self.structure.residues.insert(new_residue_index, second_residue)
799 # Set the second residue index
800 # Update the index of all residues which have been shifted after the insertion
801 for residue_index in range(new_residue_index + 1, len(self.structure.residues)):
802 residue = self.structure.residues[residue_index]
803 residue.index = residue_index
804 # Now transfer atoms from residue 1 to residue 2 as it is specified
805 # Note that this will automatically update every atom
806 second_residue.atom_indices = second_residue_atom_indices
807 # Now add the new residue to the chain
808 self.chain.add_residue(second_residue)
810 def split_by_atom_names (self,
811 first_residue_atom_names : list[str],
812 second_residue_atom_names : list[str],
813 first_residue_name : Optional[str] = None,
814 second_residue_name : Optional[str] = None,
815 first_residue_number : Optional[int] = None,
816 second_residue_number : Optional[int] = None,
817 first_residue_icode : Optional[str] = None,
818 second_residue_icode : Optional[str] = None,
819 ) -> tuple['Residue', 'Residue']:
820 """Parse atom names to atom indices and then call the split function."""
821 # Check all atom names to exist in the residue
822 input_atom_names = set(first_residue_atom_names + second_residue_atom_names)
823 residue_atom_names = set([ atom.name for atom in self.atoms ])
824 if input_atom_names != residue_atom_names:
825 print(self)
826 print(self.atoms)
827 print('Input atom names: ' + ', '.join(input_atom_names))
828 print('Residue atom names: ' + ', '.join(residue_atom_names))
829 raise InputError('All residue atoms must be covered between both the first and the second residue atom names')
830 # Convert atom names to atom indices
831 first_residue_atom_indices = [ self.get_atom_by_name(name).index for name in first_residue_atom_names ]
832 second_residue_atom_indices = [ self.get_atom_by_name(name).index for name in second_residue_atom_names ]
833 # Call the actual split logic
834 return self.split(first_residue_atom_indices, second_residue_atom_indices, first_residue_name,
835 second_residue_name, first_residue_number, second_residue_number, first_residue_icode, second_residue_icode)
837 def get_atom_by_name (self, atom_name : str) -> 'Atom':
838 """Get a residue atom given its name."""
839 return next(( atom for atom in self.atoms if atom.name == atom_name ), None)
841 def get_label (self) -> str:
842 """Get a standard label."""
843 chainname = self.chain.name if self.chain.name.strip() else ''
844 return f'{chainname}{self.number}{self.icode}({self.name})'
845 label = property(get_label, None, None,
846 "The residue standard label (read only)")
848 def is_cg (self) -> bool:
849 """Ask if the current residue is in coarse grain.
850 Note that we assume there may be not hybrid aa/cg residues."""
851 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)
853class Chain:
854 """A chain of residues."""
855 def __init__ (self,
856 name : Optional[str] = None,
857 classification : Optional[str] = None,
858 ):
859 self.name = name
860 self._classification = classification
861 # Set variables to store references to other related instaces
862 # These variables will be set further by the structure
863 self._structure = None
864 self._index = None
865 self._residue_indices = []
867 def __repr__ (self):
868 return '<Chain ' + self.name + '>'
870 def __eq__ (self, other):
871 return self.name == other.name
873 def __hash__ (self):
874 return hash(self.name)
876 # The parent structure (read only)
877 # This value is set by the structure itself
878 def get_structure (self) -> Optional['Structure']:
879 return self._structure
880 structure: 'Structure' = property(get_structure, None, None, "The parent structure (read only)")
882 # The residue index according to parent structure residues (read only)
883 # This value is set by the structure itself
884 def get_index (self) -> Optional[int]:
885 return self._index
886 # When the index is set all residues are updated with the next chain index
887 def set_index (self, index : int):
888 for residue in self.residues:
889 residue._chain_index = index
890 self._index = index
891 index = property(get_index, set_index, None, "The residue index according to parent structure residues (read only)")
893 # The residue indices according to parent structure residues for residues in this chain
894 # If residue indices are set then make changes in all the structure to make this change coherent
895 def get_residue_indices (self) -> list[int]:
896 return self._residue_indices
897 def set_residue_indices (self, new_residue_indices : list[int]):
898 # If there is not strucutre yet it means the chain is beeing set before the structure
899 # We just save residue indices and wait for the structure to be set
900 if not self.structure:
901 self._residue_indices = new_residue_indices
902 return
903 # Update the current residues
904 for residue in self.residues:
905 residue._chain_index = None
906 # Update the new residues
907 for index in new_residue_indices:
908 residue = self.structure.residues[index]
909 residue._chain_index = self.index
910 # In case the new residue indices list is empty this chain must be removed from its structure
911 if len(new_residue_indices) == 0:
912 self.structure.purge_chain(self)
913 # Now new indices are coherent and thus we can save them
914 self._residue_indices = new_residue_indices
915 residue_indices = property(get_residue_indices, set_residue_indices, None, "The residue indices according to parent structure residues for residues in this residue")
917 def get_residues (self) -> list['Residue']:
918 """The residues in this chain.
919 If residues are set then make changes in all the structure to make this change coherent."""
920 # If there is not strucutre yet it means the chain is beeing set before the structure
921 # In this case it is not possible to get related residues in the structure
922 if not self.structure:
923 return []
924 # Get residues in the structure according to residue indices
925 residues = self.structure.residues
926 return [ residues[residue_index] for residue_index in self.residue_indices ]
928 def set_residues (self, new_residues : list['Residue']):
929 """Find indices for new residues and set their indices as the new residue indices.
930 Note that residues must be set in the structure already."""
931 new_residue_indices = []
932 for new_residue in new_residues:
933 new_residue_index = new_residue.index
934 if new_residue_index == None:
935 raise ValueError(f'Residue {new_residue} is not set in the structure')
936 new_residue_indices.append(new_residue_index)
937 self.set_residue_indices(new_residue_indices)
938 residues: list['Residue'] = property(get_residues, set_residues, None, "The residues in this chain")
940 def add_residue (self, residue : 'Residue'):
941 """Add a residue to the chain."""
942 # Insert the new residue index in the list of residue indices keeping the order
943 sorted_residue_index = bisect(self.residue_indices, residue.index)
944 self.residue_indices.insert(sorted_residue_index, residue.index)
945 # Update the residue internal chain index
946 residue._chain_index = self.index
948 def remove_residue (self, residue : 'Residue'):
949 """Remove a residue from the chain.
950 WARNING: Note that this function does not trigger the set_residue_indices."""
951 self.residue_indices.remove(residue.index) # This index MUST be in the list
952 # If we removed the last residue then this chain must be removed from its structure
953 if len(self.residue_indices) == 0 and self.structure:
954 self.structure.purge_chain(self)
955 # Update the residue internal chain index
956 residue._chain_index = None
958 def get_atom_indices (self) -> list[int]:
959 """Get atom indices for all atoms in the chain (read only).
960 In order to change atom indices they must be changed in their corresponding residues."""
961 return sum([ residue.atom_indices for residue in self.residues ], [])
962 atom_indices = property(get_atom_indices, None, None, "Atom indices for all atoms in the chain (read only)")
964 def get_atoms (self) -> list[int]:
965 """Get the atoms in the chain (read only).
966 In order to change atoms they must be changed in their corresponding residues."""
967 return sum([ residue.atoms for residue in self.residues ], [])
968 atoms: list['Atom'] = property(get_atoms, None, None, "Atoms in the chain (read only)")
970 def get_atom_count (self) -> int:
971 """Get the number of atoms in the chain (read only)."""
972 return len(self.atom_indices)
973 atom_count = property(get_atom_count, None, None, "Number of atoms in the chain (read only)")
975 def get_residue_count (self) -> int:
976 """Get the number of residues in the chain (read only)."""
977 return len(self._residue_indices)
978 residue_count = property(get_residue_count, None, None, "Number of residues in the chain (read only)")
980 def get_classification (self) -> str:
981 """Get the chain classification."""
982 if self._classification:
983 return self._classification
984 self._classification = self.structure.get_selection_classification(self.get_selection())
985 return self._classification
987 def set_classification (self, classification : str):
988 """Force the chain classification."""
989 self._classification = classification
991 classification = property(get_classification, set_classification, None,
992 "Classification of the chain (manual or automatic)")
994 def get_sequence (self) -> str:
995 """Get the residues sequence in one-letter code."""
996 return ''.join([ residue_name_to_letter(residue.name) for residue in self.residues ])
998 def get_selection (self) -> 'Selection':
999 """Generate a selection for this chain."""
1000 return Selection(self.atom_indices)
1002 def copy (self) -> 'Chain':
1003 """Make a copy of the current chain."""
1004 chain_copy = Chain(self.name)
1005 chain_copy._structure = self._structure
1006 chain_copy._index = self._index
1007 chain_copy.residue_indices = self.residue_indices
1008 return chain_copy
1010 def has_cg (self) -> bool:
1011 """Ask if the current chain has at least one coarse grain atom/residue."""
1012 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)
1014 def is_missing_any_bonds (self) -> bool:
1015 return any(atom.bonds == MISSING_BONDS for atom in self.atoms)
1017 def find_residue (self, number : int, icode : str = '', index = None) -> Optional['Residue']:
1018 """Find a residue by its number and insertion code."""
1019 # Iterate chain residues
1020 for residue in self.residues:
1021 if residue.number == number and residue.icode == icode and (index is None or residue.index == index):
1022 return residue
1023 return None
1025class Structure:
1026 """A structure is a group of atoms organized in chains and residues."""
1027 def __init__ (self,
1028 atoms : list['Atom'] = [],
1029 residues : list['Residue'] = [],
1030 chains : list['Chain'] = [],
1031 residue_numeration_base : int = 10,
1032 ):
1033 self.atoms: list['Atom'] = []
1034 self.residues: list['Residue'] = []
1035 self.chains: list['Chain'] = []
1036 # Set references between instances
1037 for atom in atoms:
1038 self.set_new_atom(atom)
1039 for residue in residues:
1040 self.set_new_residue(residue)
1041 for chain in chains:
1042 self.set_new_chain(chain)
1043 # --- Set other internal variables ---
1044 # Set bonds between atoms
1045 self._bonds = None
1046 # Set fragments of bonded atoms
1047 self._fragments = None
1048 # --- Set other auxiliar variables ---
1049 # Trajectory atom sorter is a function used to sort coordinates in a trajectory file
1050 # This function is generated when sorting indices in the structure
1051 # Also the new atom order is saved
1052 self.trajectory_atom_sorter = None
1053 self.new_atom_order = None
1054 # Track when atom elements have been fixed
1055 self._fixed_atom_elements = False
1056 # Save indices of supported ion atoms
1057 self._ion_atom_indices = None
1058 self._dummy_atom_indices = None
1059 # Set the scale of the residue numbers
1060 # It may be decimal (10), hexadecimal(16) or alphanumeric(36)
1061 # Note that hybrid scales (1-9999 decimal and different further) are not explicitly supported
1062 # However, the scale is guessed on read and conserved on write, so the original numeration would be conserved
1063 # The default values is used only when the structure is not read from a PDB
1064 self.residue_numeration_base = residue_numeration_base
1066 def __repr__ (self):
1067 return f'<Structure ({self.atom_count} atoms)>'
1069 def get_bonds (self) -> list[ list[int] ]:
1070 """Get the bonds between atoms."""
1071 # Return the stored value, if exists
1072 if self._bonds:
1073 return self._bonds
1074 # If not, we must calculate the bonds using vmd
1075 self._bonds = self.get_covalent_bonds()
1076 return self._bonds
1077 def set_bonds (self, bonds : list[ list[int] ]):
1078 """Force specific bonds."""
1079 self._bonds = bonds
1080 # Reset fragments
1081 self._fragments = None
1082 bonds = property(get_bonds, set_bonds, None, "The structure bonds")
1084 def get_fragments (self) -> list['Selection']:
1085 """Get the groups of atoms which are covalently bonded."""
1086 # Return the stored value, if exists
1087 if self._fragments != None:
1088 return self._fragments
1089 # Otherwise, find fragments in all structure atoms
1090 self._fragments = list(self.find_fragments())
1091 return self._fragments
1092 # Fragments of covalently bonded atoms (read only)
1093 fragments: list['Selection'] = property(get_fragments, None, None, "The structure fragments (read only)")
1095 def find_fragments (self,
1096 selection : Optional['Selection'] = None,
1097 exclude_disulfide : bool = True,
1098 atom_bonds : Optional[list[list[int]]] = None,
1099 ) -> Generator['Selection', None, None]:
1100 """Find fragments in a selection of atoms. A fragment is a selection of
1101 covalently bonded atoms. All atoms are searched if no selection is provided.
1103 WARNING: Note that fragments generated from a specific selection may not
1104 match the structure fragments. A selection including 2 separated regions of a
1105 structure fragment will yield 2 fragments.
1107 For convenience, protein disulfide bonds are not considered in this logic by default.
1108 """
1109 # If there is no selection we consider all atoms
1110 if not selection: selection = self.select_all()
1111 # Get/Find covalent bonds between atoms in a new object avoid further corruption (deletion) of the original list
1112 safe_bonds = atom_bonds if atom_bonds else self.bonds
1113 atom_indexed_covalent_bonds = { atom_index: [ *safe_bonds[atom_index] ] for atom_index in selection.atom_indices }
1114 # Exclude disulfide bonds from
1115 if exclude_disulfide:
1116 # First search for protein disulfide bonds
1117 excluded = set()
1118 # Iterate atom bonds
1119 for atom_index, bonds in atom_indexed_covalent_bonds.items():
1120 # Get the actual atom
1121 atom = self.atoms[atom_index]
1122 # If it is not sulfur then continue
1123 if atom.element != 'S': continue
1124 # Iterate bonded atoms
1125 for bonded_atom_index in bonds:
1126 # Bonds with atoms out of the selection are not considered
1127 if bonded_atom_index not in atom_indexed_covalent_bonds: continue
1128 # Get the bonded atom
1129 bonded_atom = self.atoms[bonded_atom_index]
1130 if bonded_atom.element != 'S': continue
1131 # We found a disulfide bond here
1132 # Make sure this is protein
1133 if atom.residue.classification != 'protein': continue
1134 if bonded_atom.residue.classification != 'protein': continue
1135 # We found a protein disulfide bond, we must exclude it
1136 bond = tuple(sorted([atom_index, bonded_atom_index]))
1137 excluded.add(bond)
1138 # Now remove the corresponding bonds
1139 for bond in excluded:
1140 first_atom_index, second_atom_index = bond
1141 atom_indexed_covalent_bonds[first_atom_index].remove(second_atom_index)
1142 atom_indexed_covalent_bonds[second_atom_index].remove(first_atom_index)
1143 # Group the connected atoms in "fragments"
1144 while len(atom_indexed_covalent_bonds) > 0:
1145 start_atom_index, bonds = next(iter(atom_indexed_covalent_bonds.items()))
1146 del atom_indexed_covalent_bonds[start_atom_index]
1147 fragment_atom_indices = [ start_atom_index ]
1148 while len(bonds) > 0:
1149 # Get the next bond atom and remove it from the bonds list
1150 next_atom_index = bonds[0]
1151 bonds.remove(next_atom_index)
1152 next_bonds = atom_indexed_covalent_bonds.get(next_atom_index, None)
1153 # If this atom is out of the selection then skip it
1154 if next_bonds == None:
1155 continue
1156 next_new_bonds = [ bond for bond in next_bonds if bond not in fragment_atom_indices + bonds ]
1157 bonds += next_new_bonds
1158 fragment_atom_indices.append(next_atom_index)
1159 del atom_indexed_covalent_bonds[next_atom_index]
1160 yield Selection(fragment_atom_indices)
1162 def find_whole_fragments (self, selection : 'Selection') -> Generator['Selection', None, None]:
1163 """Given a selection of atoms, find all whole structure fragments on them."""
1164 for fragment in self.fragments:
1165 if selection & fragment:
1166 yield fragment
1168 def name_selection (self, selection : 'Selection') -> str:
1169 """Name an atom selection depending on the chains it contains. This is used for debug purpouses."""
1170 atoms = [ self.atoms[index] for index in selection.atom_indices ]
1171 # Count atoms per chain
1172 atom_count_per_chain = { chain: 0 for chain in self.chains }
1173 for atom in atoms:
1174 atom_count_per_chain[atom.chain] += 1
1175 # Set labels accoridng to the coverage of every chain
1176 chain_labels = []
1177 for chain, atom_count in atom_count_per_chain.items():
1178 if atom_count == 0: continue
1179 coverage = atom_count / chain.atom_count
1180 label = f'chain {chain.name}'
1181 if coverage < 1:
1182 percent = round(coverage * 1000) / 10
1183 label += f' ({percent}%)'
1184 chain_labels.append(label)
1185 return ', '.join(chain_labels)
1187 def set_new_atom (self, atom : 'Atom'):
1188 """Set a new atom in the structure."""
1189 atom._structure = self
1190 new_atom_index = self.atom_count
1191 self.atoms.append(atom)
1192 atom._index = new_atom_index
1194 def set_new_residue (self, residue : 'Residue'):
1195 """
1196 Set a new residue in the structure.
1197 WARNING: Atoms must be set already before setting residues.
1198 """
1199 residue._structure = self
1200 new_residue_index = len(self.residues)
1201 self.residues.append(residue)
1202 residue._index = new_residue_index
1203 # In case the residue has atom indices, set relational indices on each atom
1204 for atom_index in residue.atom_indices:
1205 atom = self.atoms[atom_index]
1206 atom._residue_index = new_residue_index
1208 def purge_residue (self, residue : 'Residue'):
1209 """
1210 Purge residue from the structure and its chain.
1211 This can be done only when the residue has no atoms left in the structure.
1212 Renumerate all residue indices which have been offsetted as a result of the purge.
1213 """
1214 # Check the residue can be purged
1215 if residue not in self.residues:
1216 raise ValueError(f'{residue} is not in the structure already')
1217 if len(residue.atom_indices) > 0:
1218 raise ValueError(f'{residue} is still having atoms and thus it cannot be purged')
1219 # Remove the purged residue from its chain
1220 if residue.chain: residue.chain.remove_residue(residue)
1221 # Get the current index of the residue to be purged
1222 purged_index = residue.index
1223 # Residues and their atoms below this index are not to be modified
1224 # Residues and their atoms over this index must be renumerated
1225 for affected_residue in self.residues[purged_index+1:]:
1226 # Chaging the index automatically changes all residues atoms '_residue_index' values
1227 # Chaging the index automatically changes its corresponding index in residue chain '_residue_indices'
1228 affected_residue.index -= 1
1229 # Finally, remove the current residue from the list of residues in the structure
1230 del self.residues[purged_index]
1232 def set_new_chain (self, chain : 'Chain'):
1233 """Set a new chain in the structure.
1234 WARNING: Residues and atoms must be set already before setting chains."""
1235 chain._structure = self
1236 new_chain_index = len(self.chains)
1237 self.chains.append(chain)
1238 chain._index = new_chain_index
1239 # In case the chain has residue indices, set relational indices on each residue
1240 for residue_index in chain.residue_indices:
1241 residue = self.residues[residue_index]
1242 residue._chain_index = new_chain_index
1244 def purge_chain (self, chain : 'Chain'):
1245 """Purge chain from the structure.
1246 This can be done only when the chain has no residues left in the structure.
1247 Renumerate all chain indices which have been offsetted as a result of the purge."""
1248 # Check the chain can be purged
1249 if chain not in self.chains:
1250 raise ValueError(f'Chain {chain.name} is not in the structure already')
1251 if len(chain.residue_indices) > 0:
1252 raise ValueError(f'Chain {chain.name} is still having residues and thus it cannot be purged')
1253 # Get the current index of the chain to be purged
1254 purged_index = chain.index
1255 # Chains and their residues below this index are not to be modified
1256 # Chains and their residues over this index must be renumerated
1257 for affected_chain in self.chains[purged_index+1:]:
1258 # Chaging the index automatically changes all chain residues '_chain_index' values
1259 affected_chain.index -= 1
1260 # Finally, remove the current chain from the list of chains in the structure
1261 del self.chains[purged_index]
1263 @classmethod
1264 def from_pdb (cls, pdb_content : str, model : Optional[int] = None, flexible_numeration : bool = True):
1265 """Set the structure from a pdb file.
1266 You may filter the PDB content for a specific model.
1267 Some weird numeration systems are not supported and, when encountered, they are ignored.
1268 In these cases we set our own numeration system.
1269 Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead."""
1270 # Filter the PDB content in case a model was passed
1271 filtered_pdb_content = filter_model(pdb_content, model) if model else pdb_content
1272 # Split the PDB content in lines
1273 pdb_lines = filtered_pdb_content.split('\n')
1274 # Before we start, we must guess the numeration system
1275 # To do so mine all residue numbers
1276 all_residue_number_characters = set()
1277 for line in pdb_lines:
1278 # Parse atoms only
1279 start = line[0:6]
1280 is_atom = start == 'ATOM ' or start == 'HETATM'
1281 if not is_atom: continue
1282 # Mine all residue numbers
1283 residue_number = line[22:26]
1284 for character in residue_number:
1285 all_residue_number_characters.add(character)
1286 # Remove white spaces
1287 all_residue_number_characters.discard(' ')
1288 # If we find a non-numerical and non-alphabetical character then we assume it has a weird numeration system
1289 # Since we can not support every scenario, in this cases we set the numeration totally ignoring the one in the PDB
1290 weird_character = next((character for character in all_residue_number_characters if not (character.isalnum() or character == '-')), None)
1291 if weird_character:
1292 if flexible_numeration == False: raise InputError(f'Not supported numeration system including "{weird_character}" characters')
1293 warn(f'Weird residue numeration including "{weird_character}" characters found in the PDB file. Ignoring it.')
1294 residue_numeration_base = None
1295 # Search among all resiude numbers any letters (non-numerical characters)
1296 elif next((letter for letter in alphanumerical_letters if letter in all_residue_number_characters), None):
1297 residue_numeration_base = 36
1298 elif next((letter for letter in hexadecimal_letters if letter in all_residue_number_characters), None):
1299 residue_numeration_base = 16
1300 else:
1301 residue_numeration_base = 10
1302 # Read the pdb content line by line and set the parsed atoms, residues and chains
1303 parsed_atoms = []
1304 parsed_residues = []
1305 parsed_chains = []
1306 # Save chains and residues also in dictionaries only to find them faster
1307 name_chains = {}
1308 label_residues = {}
1309 # Keep track of the last issued atom and residue indices
1310 atom_index = -1
1311 residue_index = -1
1312 for line in pdb_lines:
1313 # Parse atoms only
1314 start = line[0:6]
1315 is_atom = start == 'ATOM ' or start == 'HETATM'
1316 if not is_atom:
1317 continue
1318 # Mine all atom data
1319 atom_name = line[11:16].strip()
1320 residue_name = line[17:21].strip()
1321 chain_name = line[21:22]
1322 residue_number = line[22:26]
1323 icode = line[26:27]
1324 if icode == ' ':
1325 icode = ''
1326 x_coord = float(line[30:38])
1327 y_coord = float(line[38:46])
1328 z_coord = float(line[46:54])
1329 element = line[76:78].strip()
1330 # Set the parsed atom
1331 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord))
1332 # Add the parsed atom to the list and update the current atom index
1333 parsed_atoms.append(parsed_atom)
1334 # Get the parsed chain
1335 # DANI: If we always check for an already existing chain there will never be repeated chains
1336 # DANI: However we may create chains with unconnected atoms silently (they were separated in the PDB)
1337 # DANI: For this reason we must only consider the last processed chain
1338 parsed_chain = name_chains.get(chain_name, None)
1339 if parsed_chain and parsed_chain != parsed_chains[-1]:
1340 parsed_chain = None
1341 # If the parsed chain was not yet instantiated then do it now
1342 if not parsed_chain:
1343 parsed_chain = Chain(name=chain_name)
1344 parsed_chains.append(parsed_chain)
1345 name_chains[chain_name] = parsed_chain
1346 # Get the parsed residue
1347 residue_label = (chain_name, residue_number, icode)
1348 # DANI: If we always check for an already existing residue there will never be repeated residues
1349 # DANI: However we may create residues with unconnected atoms silently (they were separated in the PDB)
1350 # DANI: For this reason we must only consider the last processed residue
1351 parsed_residue = label_residues.get(residue_label, None)
1352 if parsed_residue and parsed_residue != parsed_residues[-1]:
1353 parsed_residue = None
1354 # If the parsed residue was not yet instantiated then do it now
1355 if not parsed_residue:
1356 # Parse the residue number if it is to be parsed
1357 if residue_numeration_base:
1358 parsed_residue_number = int(residue_number, residue_numeration_base)
1359 # If we decided to ignore the numeration system then we just issue a new residue number
1360 # Use the last residue number from the current chain as reference
1361 else:
1362 chain_last_residue_index = parsed_chain.residue_indices[-1] if len(parsed_chain.residue_indices) > 0 else None
1363 chain_last_residue_number = parsed_residues[chain_last_residue_index].number if chain_last_residue_index != None else 0
1364 parsed_residue_number = chain_last_residue_number + 1
1365 # Instantiate the new parsed residue
1366 parsed_residue = Residue(name=residue_name, number=parsed_residue_number, icode=icode)
1367 parsed_residues.append(parsed_residue)
1368 label_residues[residue_label] = parsed_residue
1369 # Add current residue to the parsed chain
1370 residue_index += 1
1371 parsed_chain.residue_indices.append(residue_index)
1372 # Add current atom to the parsed residue
1373 atom_index += 1
1374 parsed_residue.atom_indices.append(atom_index)
1375 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains, residue_numeration_base=residue_numeration_base)
1377 @classmethod
1378 def from_pdb_file (cls, pdb_filepath : str, model : Optional[int] = None, flexible_numeration : bool = True):
1379 """Set the structure from a pdb file.
1380 You may filter the input PDB file for a specific model.
1381 Some weird numeration systems are not supported and, when encountered, they are ignored.
1382 In these cases we set our own numeration system.
1383 Set the flexible numeration argument as false to avoid this behaviour, thus crashing instead."""
1384 pdb_file = File(pdb_filepath)
1385 if not pdb_file.exists:
1386 raise InputError(f'File "{pdb_filepath}" not found')
1387 if not pdb_file.format == 'pdb':
1388 raise InputError(f'"{pdb_filepath}" is not a path for a pdb file')
1389 # Read the pdb file
1390 pdb_content = None
1391 with open(pdb_filepath, 'r') as file:
1392 pdb_content = file.read()
1393 return cls.from_pdb(pdb_content, model, flexible_numeration)
1395 # https://biopandas.github.io/biopandas/tutorials/Working_with_mmCIF_Structures_in_DataFrames/
1396 @classmethod
1397 def from_mmcif (cls, mmcif_content : str, model : Optional[int] = None, author_notation : bool = False):
1398 """Set the structure from mmcif.
1399 You may filter the content for a specific model.
1400 You may ask for the author notation instead of the standarized notation for legacy reasons.
1401 This may have an effect in atom names, residue names, residue numbers and chain names.
1402 Read the pdb content line by line and set the parsed atoms, residues and chains.
1403 """
1404 parsed_atoms = []
1405 parsed_residues = []
1406 parsed_chains = []
1407 # Save chains and residues also in dictionaries only to find them faster
1408 name_chains = {}
1409 label_residues = {}
1410 # Keep track of the last issued atom and residue indices
1411 atom_index = -1
1412 residue_index = -1
1413 # Iterate the content line by line
1414 lines = iter(mmcif_content.split('\n'))
1415 # Now mine atoms data
1416 atom_headers = { 'ATOM', 'HETATM', 'ANISOU' }
1417 for line in lines:
1418 # Values are separated by spaces
1419 values = line.split()
1420 # If this is not atom line then we are done
1421 if len(values) == 0 or values[0] not in atom_headers: continue
1422 # Get the atom model number if a model number was passed
1423 # Note that then model is a number greater than 0
1424 if model:
1425 model_number = int(values[20])
1426 # If the model number odes not match then skip it
1427 if model != model_number: continue
1428 # Mine atom data
1429 # Next value is just the atom number, we do not need it
1430 # atom_number = values[1]
1431 # Mine the atom element
1432 element = values[2]
1433 # Mine the atom name
1434 atom_name = values[3].replace('"','')
1435 # Next value is a place holder to indicate alternate conformation, according to the docs
1436 # I don't know what this is but we do not need it
1437 # idk = values[4]
1438 # Mine the residue name
1439 residue_name = values[5]
1440 # Mine the chain name
1441 chain_name = values[6]
1442 # Next value is the chain number, we do not need it
1443 # chain_number = values[7]
1444 # Residue number is '.' for chains with only one residue
1445 residue_number = 1 if values[8] == '.' else int(values[8])
1446 icode = '' if values[9] == '?' else values[9]
1447 x_coord = float(values[10])
1448 y_coord = float(values[11])
1449 z_coord = float(values[12])
1450 # Next value is the occupancy, we do not need it
1451 # occupancy = float(values[13])
1452 # Next value is the isotropic displacement, we do not need it
1453 # isotropic = float(values[14])
1454 # Next value is the charge, we do not need it
1455 # charge = None if values[15] == '?' else float(values[15])
1456 # The rest of values are alternative author values
1457 if author_notation:
1458 residue_number = int(values[16])
1459 residue_name = values[17]
1460 chain_name = values[18]
1461 atom_name = values[19].replace('"','')
1462 # Set the parsed atom
1463 parsed_atom = Atom(name=atom_name, element=element, coords=(x_coord, y_coord, z_coord))
1464 # Add the parsed atom to the list and update the current atom index
1465 parsed_atoms.append(parsed_atom)
1466 # Get the parsed chain
1467 parsed_chain = name_chains.get(chain_name, None)
1468 # If the parsed chain was not yet instantiated then do it now
1469 if not parsed_chain:
1470 parsed_chain = Chain(name=chain_name)
1471 parsed_chains.append(parsed_chain)
1472 name_chains[chain_name] = parsed_chain
1473 # Get the parsed residue
1474 residue_label = (chain_name, residue_number, icode)
1475 parsed_residue = label_residues.get(residue_label, None)
1476 # If the parsed residue was not yet instantiated then do it now
1477 if not parsed_residue:
1478 # Instantiate the new parsed residue
1479 parsed_residue = Residue(name=residue_name, number=residue_number, icode=icode)
1480 parsed_residues.append(parsed_residue)
1481 label_residues[residue_label] = parsed_residue
1482 # Add current residue to the parsed chain
1483 residue_index += 1
1484 parsed_chain.residue_indices.append(residue_index)
1485 # Add current atom to the parsed residue
1486 atom_index += 1
1487 parsed_residue.atom_indices.append(atom_index)
1488 return cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)
1490 @classmethod
1491 def from_mmcif_file (cls, mmcif_filepath : str, model : Optional[int] = None, author_notation : bool = False):
1492 """Set the structure from a mmcif file."""
1493 mmcif_file = File(mmcif_filepath)
1494 if not mmcif_file.exists:
1495 raise InputError(f'File "{mmcif_filepath}" not found')
1496 if not mmcif_file.format == 'cif':
1497 raise InputError(f'"{mmcif_filepath}" is not a path for a mmcif file')
1498 # Read the mmcif file
1499 mmcif_content = None
1500 with open(mmcif_filepath, 'r') as file:
1501 mmcif_content = file.read()
1502 return cls.from_mmcif(mmcif_content, model, author_notation)
1504 @classmethod
1505 def from_mdanalysis (cls, mdanalysis_universe):
1506 """Set the structure from an MD analysis object."""
1507 # Set the final list of atoms to be included in the structure
1508 parsed_atoms = []
1509 parsed_residues = []
1510 parsed_chains = []
1511 # Setup atoms
1512 for atom in mdanalysis_universe.atoms:
1513 name = atom.name
1514 # WARNING: MDAnalysis cannot handle reading an element of an atom with no element
1515 element = atom.element if hasattr(atom, 'element') else '?'
1516 #coords = atom.position # DANI: Esto da error si no hay coordenadas
1517 parsed_atom = Atom(name=name, element=element)
1518 parsed_atoms.append(parsed_atom)
1519 # Setup residues
1520 for residue in mdanalysis_universe.residues:
1521 name = residue.resname
1522 number = residue.resnum
1523 icode = residue.icode if hasattr(residue, 'icode') else None # DANI: No se ha provado
1524 parsed_residue = Residue(name=name, number=number, icode=icode)
1525 atom_indices = [ atom.index for atom in residue.atoms ]
1526 parsed_residue.atom_indices = atom_indices
1527 parsed_residues.append(parsed_residue)
1528 # Setup chains
1529 for segment in mdanalysis_universe.segments:
1530 name = segment.segid
1531 parsed_chain = Chain(name=name)
1532 residue_indices = [ residue.resindex for residue in segment.residues ]
1533 parsed_chain.residue_indices = residue_indices
1534 parsed_chains.append(parsed_chain)
1535 # Setup the structure
1536 structure = cls(atoms=parsed_atoms, residues=parsed_residues, chains=parsed_chains)
1537 # Add bonds and charges which are also available in a topology
1538 atom_count = len(mdanalysis_universe.atoms)
1539 atom_bonds = [ [] for i in range(atom_count) ]
1540 for bond in mdanalysis_universe.bonds:
1541 a,b = bond.indices
1542 # Make sure atom indices are regular integers so they are JSON serializables
1543 atom_bonds[a].append(int(b))
1544 atom_bonds[b].append(int(a))
1545 structure.bonds = atom_bonds
1546 structure.charges = list(mdanalysis_universe._topology.charges.values)
1547 return structure
1549 @classmethod
1550 def from_prmtop_file (cls, prmtop_filepath : str):
1551 """Set the structure from a prmtop file."""
1552 # Make sure the input file exists and has the right format
1553 prmtop_file = File(prmtop_filepath)
1554 if not prmtop_file.exists:
1555 raise InputError(f'File "{prmtop_filepath}" not found')
1556 if not prmtop_file.format == 'prmtop':
1557 raise InputError(f'"{prmtop_filepath}" is not a name for a prmtop file')
1558 # In we do not have mdanalysis in our environment then we cannot proceed
1559 if not is_imported('MDAnalysis'):
1560 raise InputError('Missing dependency error: MDAnalysis')
1561 # Parse the topology using MDanalysis
1562 parser = MDAnalysis.topology.TOPParser.TOPParser(prmtop_filepath)
1563 mdanalysis_topology = parser.parse()
1564 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology)
1565 return cls.from_mdanalysis(mdanalysis_universe)
1567 @classmethod
1568 def from_tpr_file (cls, tpr_filepath : str):
1569 """Set the structure from a tpr file."""
1570 # Make sure the input file exists and has the right format
1571 tpr_file = File(tpr_filepath)
1572 if not tpr_file.exists:
1573 raise InputError(f'File "{tpr_filepath}" not found')
1574 if not tpr_file.format == 'tpr':
1575 raise InputError(f'"{tpr_filepath}" is not a name for a tpr file')
1576 # In we do not have mdanalysis in our environment then we cannot proceed
1577 if not is_imported('MDAnalysis'):
1578 raise InputError('Missing dependency error: MDAnalysis')
1579 # Parse the topology using MDanalysis
1580 parser = MDAnalysis.topology.TPRParser.TPRParser(tpr_filepath)
1581 mdanalysis_topology = parser.parse()
1582 mdanalysis_universe = MDAnalysis.Universe(mdanalysis_topology)
1583 return cls.from_mdanalysis(mdanalysis_universe)
1585 @classmethod
1586 def from_file (cls, mysterious_filepath : str):
1587 """Set the structure from a file if the file format is supported."""
1588 mysterious_file = File(mysterious_filepath)
1589 if mysterious_file.format == 'pdb':
1590 return cls.from_pdb_file(mysterious_file.path)
1591 if mysterious_file.format == 'cif':
1592 return cls.from_mmcif_file(mysterious_file.path)
1593 if mysterious_file.format == 'prmtop':
1594 return cls.from_prmtop_file(mysterious_file.path)
1595 if mysterious_file.format == 'tpr':
1596 return cls.from_tpr_file(mysterious_file.path)
1597 raise InputError(f'Not supported format ({mysterious_file.format}) to setup a Structure')
1599 def get_atom_count (self) -> int:
1600 """Get the number of atoms in the structure."""
1601 return len(self.atoms)
1602 atom_count = property(get_atom_count, None, None, "The number of atoms in the structure (read only)")
1604 def get_residue_count (self) -> int:
1605 """Get the number of residues in the structure (read only)."""
1606 return len(self.residues)
1607 residue_count = property(get_residue_count, None, None, "Number of residues in the structure (read only)")
1609 def get_chain_count (self) -> int:
1610 """Get the number of chains in the structure."""
1611 return len(self.chains)
1612 chain_count = property(get_chain_count, None, None, "Number of chains in the structure (read only)")
1614 def fix_atom_elements (self, trust : bool = True) -> bool:
1615 """Fix atom elements by guessing them when missing.
1616 Set all elements with the first letter upper and the second (if any) lower.
1617 Also check if atom elements are coherent with atom names.
1619 Args:
1620 trust (bool): If 'trust' is set as False then we impose elements according to what we can guess from the atom name.
1622 Returns:
1623 bool:
1624 Return True if any element was modified or False if not.
1625 """
1626 modified = False
1627 added = False
1628 # Save the wrong guesses for a final report
1629 # This way we do not crowd the terminal with logs when a lot of atoms are affected
1630 reports = {}
1631 for atom in self.atoms:
1632 # Make sure elements have the first letter cap and the second letter not cap
1633 if atom.element:
1634 new_element = first_cap_only(atom.element)
1635 if atom.element != new_element:
1636 atom.element = new_element
1637 modified = True
1638 # Check the element to match what we would guess from the atom name
1639 # In case it does not just warn the user
1640 guess = atom.guess_element()
1641 if atom.element != guess:
1642 report = (atom.name, atom.element, guess)
1643 report_count = reports.get(report, 0)
1644 reports[report] = report_count + 1
1645 if not trust:
1646 atom.element = guess
1647 modified = True
1648 # If elements are missing then guess them from atom names
1649 else:
1650 atom.element = atom.guess_element()
1651 added = True
1652 # Warn the user about anormalies
1653 for report, count in reports.items():
1654 atom_name, atom_element, guess = report
1655 warn(f"Suspicious element for atom {atom_name}: {atom_element} -> shoudn't it be {guess}? ({count} occurrences)")
1656 # Warn the user that some elements were modified
1657 if modified: warn('Atom elements have been modified')
1658 if added: warn('Atom elements were missing and have been added')
1659 # Set atom elements as fixed in order to avoid repeating this process
1660 self._fixed_atom_elements = True
1661 return modified or added
1663 def set_new_coordinates (self, new_coordinates : list[Coords]):
1664 """Set new coordinates."""
1665 # Make sure the list of coordinates is as long as the number of atoms
1666 if len(new_coordinates) != self.atom_count:
1667 raise ValueError(f'The number of coordinates ({len(new_coordinates)}) does not match the number of atoms ({self.atom_count})')
1668 # Overwrite current coordinates with the new coordinates
1669 for i, atom in enumerate(self.atoms):
1670 atom.coords = tuple(new_coordinates[i])
1672 def get_ion_atom_indices (self) -> set:
1673 """Get all supported ion atom indices together in a set."""
1674 # If we already did this then return the stored value
1675 if self._ion_atom_indices != None:
1676 return self._ion_atom_indices
1677 # Find ion atom indices
1678 indices = set()
1679 for atom in self.atoms:
1680 if atom.element in SUPPORTED_ION_ELEMENTS:
1681 indices.add(atom.index)
1682 self._ion_atom_indices = indices
1683 return self._ion_atom_indices
1684 ion_atom_indices = property(get_ion_atom_indices, None, None, "Atom indices for what we consider supported ions")
1686 def get_dummy_atom_indices (self) -> set:
1687 """Get all dummy atom indices together in a set."""
1688 # If we already did this then return the stored value
1689 if self._dummy_atom_indices != None:
1690 return self._dummy_atom_indices
1691 # Find ion atom indices
1692 indices = set()
1693 for atom in self.atoms:
1694 if atom.element == DUMMY_ATOM_ELEMENT:
1695 indices.add(atom.index)
1696 self._dummy_atom_indices = indices
1697 return self._dummy_atom_indices
1698 dummy_atom_indices = property(get_dummy_atom_indices, None, None, "Atom indices for what we consider dummy atoms")
1700 def generate_pdb (self):
1701 """Generate a pdb file content with the current structure."""
1702 content = 'REMARK workflow generated pdb file\n'
1703 for a, atom in enumerate(self.atoms):
1704 residue = atom.residue
1705 index = str((a+1) % 100000).rjust(5)
1706 name = ' ' + atom.name.ljust(3) if len(atom.name) < 4 else atom.name
1707 residue_name = residue.name.ljust(4) if residue else 'XXX'.ljust(4)
1708 chain = atom.chain
1709 chain_name = chain.name if chain.name and len(chain.name) == 1 else 'X'
1710 residue_number = str(residue.number).rjust(4) if residue else '0'.rjust(4)
1711 # If residue number is longer than 4 characters then we must parse to hexadecimal
1712 if len(residue_number) > 4:
1713 residue_number = hex(residue.number)[2:].rjust(4)
1714 icode = residue.icode if residue.icode and len(residue.icode) else ' '
1715 # Make sure we have atom coordinates
1716 if atom.coords == None:
1717 raise InputError('Trying to write a PDB file from a structure with atoms without coordinates')
1718 x_coord, y_coord, z_coord = [ "{:.3f}".format(coord).rjust(8) for coord in atom.coords ]
1719 occupancy = '1.00' # Just a placeholder
1720 temp_factor = '0.00' # Just a placeholder
1721 element = atom.element.rjust(2)
1722 atom_line = ('ATOM ' + index + ' ' + name + ' ' + residue_name
1723 + chain_name + residue_number + icode + ' ' + x_coord + y_coord + z_coord
1724 + ' ' + occupancy + ' ' + temp_factor + ' ' + element).ljust(80) + '\n'
1725 content += atom_line
1726 return content
1728 def generate_pdb_file (self, pdb_filepath : str):
1729 """Generate a pdb file with current structure."""
1730 pdb_content = self.generate_pdb()
1731 with open(pdb_filepath, "w") as file:
1732 file.write(pdb_content)
1734 def get_pytraj_topology (self):
1735 """Get the structure equivalent pytraj topology."""
1736 # In we do not have pytraj in our environment then we cannot proceed
1737 if not is_imported('pytraj'):
1738 raise InputError('Missing dependency error: pytraj')
1739 # Generate a pdb file from the current structure to feed pytraj
1740 pdb_filepath = '.structure.pdb'
1741 self.generate_pdb_file(pdb_filepath)
1742 pytraj_topology = pytraj.load_topology(filename = pdb_filepath)
1743 os.remove(pdb_filepath)
1744 return pytraj_topology
1746 SUPPORTED_SELECTION_SYNTAXES = { 'vmd', 'pytraj', 'gmx' }
1747 def select (self, selection_string : str, syntax : str = 'vmd') -> Optional['Selection']:
1748 """Select atoms from the structure thus generating an atom indices list.
1749 Different tools may be used to make the selection:
1750 - vmd (default)
1751 - pytraj"""
1752 if syntax == 'vmd':
1753 # Generate a pdb for vmd to read it
1754 auxiliar_pdb_filepath = '.structure.pdb'
1755 self.generate_pdb_file(auxiliar_pdb_filepath)
1756 # Use vmd to find atom indices
1757 atom_indices = get_vmd_selection_atom_indices(auxiliar_pdb_filepath, selection_string)
1758 os.remove(auxiliar_pdb_filepath)
1759 if len(atom_indices) == 0:
1760 return Selection()
1761 return Selection(atom_indices)
1762 if syntax == 'pytraj':
1763 # In we do not have pytraj in our environment then we cannot proceed
1764 if not is_imported('pytraj'):
1765 raise InputError('Missing dependency error: pytraj')
1766 pytraj_topology = self.get_pytraj_topology()
1767 pytraj_selection = pytraj_topology[selection_string]
1768 atom_indices = [ atom.index for atom in pytraj_selection.atoms ]
1769 if len(atom_indices) == 0:
1770 return Selection()
1771 return Selection(atom_indices)
1772 if syntax == 'gmx':
1773 # Generate a pdb strucutre to feed gmx
1774 auxiliar_pdb_filepath = '.structure.pdb'
1775 self.generate_pdb_file(auxiliar_pdb_filepath)
1776 auxiliar_pdb_file = File(auxiliar_pdb_filepath)
1777 # Create the index file with the current atom selection
1778 index_filepath = f'.structure.ndx'
1779 index_file = File(index_filepath)
1780 selection_name, selection_exists = make_index(auxiliar_pdb_file, index_file, selection_string)
1781 if not selection_exists:
1782 available_selections = ', '.join(list(index_groups.keys()))
1783 raise InputError(f'Something was wrong with the selection. Available gromacs selections: {available_selections}')
1784 # Read the index file to capture the selection of atoms
1785 index_groups = read_ndx(index_file)
1786 indices = index_groups.get(selection_name, None)
1787 if indices == None: raise RuntimeError('Atom group must exist at this point')
1788 auxiliar_pdb_file.remove()
1789 index_file.remove()
1790 # Return the parsed selection
1791 return Selection(indices)
1793 options = ', '.join(self.SUPPORTED_SELECTION_SYNTAXES)
1794 raise InputError(f'Syntax "{syntax}" is not supported. Choose one of the following: {options}')
1796 def select_atom_indices (self, atom_indices : list[int]) -> 'Selection':
1797 """Set a function to make selections using atom indices."""
1798 # Check atom indices to be in the structure
1799 atom_count = self.atom_count
1800 for atom_index in atom_indices:
1801 if atom_index >= atom_count:
1802 raise InputError(f'Atom index {atom_index} is out of range ({atom_count})')
1803 return Selection(atom_indices)
1805 def select_residue_indices (self, residue_indices : list[int]) -> 'Selection':
1806 """Set a function to make selections using residue indices."""
1807 # WARNING: The following line gets stucked sometimes, idk why
1808 #atom_indices = sum([ self.residues[index].atom_indices for index in residue_indices ], [])
1809 atom_indices = []
1810 for i, index in enumerate(residue_indices):
1811 atom_indices += self.residues[index].atom_indices
1812 return Selection(atom_indices)
1814 def select_all (self) -> 'Selection':
1815 """Get a selection with all atoms."""
1816 return Selection(list(range(self.atom_count)))
1818 def select_by_classification (self, classification : str) -> 'Selection':
1819 """Select atoms according to the classification of its residue."""
1820 atom_indices = []
1821 for residue in self.residues:
1822 if residue.classification == classification:
1823 atom_indices += residue.atom_indices
1824 return Selection(atom_indices)
1826 def select_water (self) -> 'Selection':
1827 """Select water atoms.
1828 WARNING: This logic is a bit guessy and it may fail for non-standard residue named structures
1829 """
1830 return self.select_by_classification('solvent')
1832 def select_ions (self) -> 'Selection':
1833 """Select ions."""
1834 return self.select_by_classification('ion')
1836 def select_counter_ions (self, charge : Optional[str] = None) -> 'Selection':
1837 """Select counter ion atoms.
1838 WARNING: This logic is a bit guessy and it may fail for non-standard atom named structures
1839 """
1840 counter_ion_indices = []
1841 # Set the accepted names accoridng to the charge
1842 if charge == None:
1843 accepted_names = STANDARD_COUNTER_ION_ATOM_NAMES
1844 elif charge == '+':
1845 accepted_names = STANDARD_COUNTER_CATION_ATOM_NAMES
1846 elif charge == '-':
1847 accepted_names = STANDARD_COUNTER_ANION_ATOM_NAMES
1848 else:
1849 raise ValueError('Not supported charge')
1850 # Iterate atoms
1851 for atom in self.atoms:
1852 # If the residue has not one single atom then it is not an ion
1853 if len(atom.residue.atoms) != 1: continue
1854 # Get a simplified version of the atom name
1855 # Set all letters upper and remove non-letter characters (e.g. '+' and '-' symbols)
1856 simple_atom_name = ''.join(filter(str.isalpha, atom.name.upper()))
1857 if simple_atom_name in accepted_names:
1858 counter_ion_indices.append(atom.index)
1859 return Selection(counter_ion_indices)
1861 def select_water_and_counter_ions (self) -> 'Selection':
1862 """Select both water and counter ions."""
1863 return self.select_water() + self.select_counter_ions()
1865 def select_heavy_atoms (self) -> 'Selection':
1866 """Select heavy atoms."""
1867 atom_indices = []
1868 for atom in self.atoms:
1869 # If the atom is not an hydrogen then add it to the list
1870 if atom.element != 'H':
1871 atom_indices.append(atom.index)
1872 return Selection(atom_indices)
1874 def select_protein (self) -> 'Selection':
1875 """Select protein atoms.
1876 WARNING: Note that there is a small difference between VMD protein and our protein.
1877 This function is improved to consider terminal residues as proteins.
1878 VMD considers protein any residue including N, C, CA and O while terminals may have OC1 and OC2 instead of O.
1879 """
1880 return self.select_by_classification('protein')
1882 def select_nucleic (self) -> 'Selection':
1883 """Select nucleic atoms."""
1884 return self.select_by_classification('dna') + self.select_by_classification('rna')
1886 def select_lipids (self) -> 'Selection':
1887 """Select lipids."""
1888 return self.select_by_classification('fatty') + self.select_by_classification('steroid')
1890 def select_carbohydrates (self) -> 'Selection':
1891 """Select carbohydrates."""
1892 return self.select_by_classification('carbohydrate')
1894 def select_pbc_guess (self) -> 'Selection':
1895 """Return a selection of the typical PBC atoms: solvent, counter ions and lipids.
1896 WARNING: This is just a guess."""
1897 return self.select_water() + self.select_counter_ions() + self.select_lipids()
1899 def select_cg (self) -> 'Selection':
1900 """Select coarse grain atoms."""
1901 return Selection([ atom.index for atom in self.atoms if atom.element == CG_ATOM_ELEMENT ])
1903 def select_cartoon (self, include_terminals : bool = False) -> 'Selection':
1904 """Select cartoon representable regions for VMD.
1906 Rules are:
1907 1. Residues must be protein (i.e. must contain C, CA, N and O atoms) or nucleic (P, OP1, OP2, O3', C3', C4', C5', O5')
1908 2. There must be at least 3 covalently bonded residues
1910 It does not matter their chain, numeration or even index order as long as they are bonded.
1911 * Note that we can represent cartoon while we display one residue alone, but it must be connected anyway.
1912 Also, we have the option to include terminals in the cartoon selection although they are not representable.
1913 This is helpful for the screenshot: terminals are better hidden than represented as ligands."""
1914 # Set fragments which are candidate to be cartoon representable
1915 fragments = []
1916 # Get protein fragments according to VMD
1917 protein_selection = self.select_protein() - self.select_cg() if include_terminals else self.select('protein', syntax='vmd')
1918 if protein_selection:
1919 fragments += list(self.find_fragments(protein_selection))
1920 # Get nucleic fragments according to VMD
1921 nucleic_selection = self.select_nucleic() - self.select_cg()
1922 if nucleic_selection:
1923 fragments += list(self.find_fragments(nucleic_selection))
1924 # Set the final selection including all valid fragments
1925 cartoon_selection = Selection()
1926 # Iterate over every fragment
1927 for fragment in fragments:
1928 # Make sure the fragment has at least 3 residues before adding it to the cartoon selection
1929 fragment_residue_indices = self.get_selection_residue_indices(fragment)
1930 if len(fragment_residue_indices) >= 3:
1931 cartoon_selection += fragment
1932 return cartoon_selection
1934 def invert_selection (self, selection : 'Selection') -> 'Selection':
1935 """Invert a selection."""
1936 atom_indices = list(range(self.atom_count))
1937 for atom_index in selection.atom_indices:
1938 atom_indices[atom_index] = None
1939 return Selection([ atom_index for atom_index in atom_indices if atom_index != None ])
1941 def get_selection_residue_indices (self, selection : 'Selection') -> list[int]:
1942 """Given a selection, get a list of residue indices for residues implicated.
1943 Note that if a single atom from the residue is in the selection then the residue index is returned."""
1944 return list(set([ self.atoms[atom_index].residue_index for atom_index in selection.atom_indices ]))
1946 def get_selection_residues (self, selection : 'Selection') -> list['Residue']:
1947 """Given a selection, get a list of residues implicated.
1948 Note that if a single atom from the residue is in the selection then the residue is returned."""
1949 residue_indices = self.get_selection_residue_indices(selection)
1950 return [ self.residues[index] for index in residue_indices ]
1952 def get_selection_chain_indices (self, selection : 'Selection') -> list[int]:
1953 """Given a selection, get a list of chain indices for chains implicated.
1954 Note that if a single atom from the chain is in the selection then the chain index is returned."""
1955 return list(set([ self.atoms[atom_index].chain_index for atom_index in selection.atom_indices ]))
1957 def get_selection_chains (self, selection : 'Selection') -> list['Chain']:
1958 """Given a selection, get a list of chains implicated.
1959 Note that if a single atom from the chain is in the selection then the chain is returned."""
1960 chain_indices = self.get_selection_chain_indices(selection)
1961 return [ self.chains[index] for index in chain_indices ]
1963 def get_selection_classification (self, selection : 'Selection') -> str:
1964 """Get type of the chain."""
1965 # Get selection residues
1966 selection_residue_indices = self.get_selection_residue_indices(selection)
1968 # Inicializar contadores para cada tipo de residuo
1969 residue_counts = {}
1971 # Count the residues of each type
1972 for residue_index in selection_residue_indices:
1973 residue = self.residues[residue_index]
1974 res_class = residue.classification
1975 if res_class in residue_counts:
1976 residue_counts[res_class] += 1
1977 else:
1978 # If the classification is not recognized, count it as "other"
1979 residue_counts[res_class] = 1
1981 # Count the total number of residues in the selection
1982 total_residues = sum(residue_counts.values())
1983 if total_residues == 0: raise ValueError('Should have residues at this point')
1985 # Calculate the proportion of each type of residue
1986 proportions = { k: v / total_residues for k, v in residue_counts.items() }
1988 # If one type of residue dominates, return it
1989 primary_type = max(proportions, key=proportions.get)
1990 # We establish a threshold of 80% to consider a chain as a single type
1991 if proportions[primary_type] >= 0.8:
1992 return primary_type
1994 # Special cases
1995 relevant_threshold = 0.3
1996 if proportions.get("dna", 0) > relevant_threshold and proportions.get("rna", 0) > relevant_threshold:
1997 return "nucleic"
1998 if proportions.get("carbohydrate", 0) > relevant_threshold and proportions.get("protein", 0) > relevant_threshold:
1999 return "glycoprotein"
2000 if proportions.get("fatty", 0) > relevant_threshold and proportions.get("steroid", 0) > relevant_threshold:
2001 return "lipid"
2003 # Any other combinations of different main proportions
2004 main_proportions = { k: v for k, v in proportions.items() if v > relevant_threshold }
2005 # Nothing is above the threshold
2006 if len(main_proportions) == 0:
2007 return "mix"
2008 # There is only one thing above threshold
2009 elif len(main_proportions) == 1:
2010 return f"{primary_type}/mix"
2011 # There are two things above threshold
2012 elif len(main_proportions) == 2:
2013 other_type = next(key for key in main_proportions.keys() if key != primary_type)
2014 return f"{primary_type}/{other_type}"
2015 # There are three things above the threshold (more is not possible)
2016 else:
2017 return "mix"
2019 def filter (self, selection : Union['Selection', str], selection_syntax : str = 'vmd') -> 'Structure':
2020 """Create a new structure from the current using a selection to filter atoms."""
2021 if not selection: raise RuntimeError('No selection was passed')
2022 # In case the selection is not an actual Selection, but a string, parse the string into a Selection
2023 if type(selection) == str:
2024 selection = self.select(selection, selection_syntax)
2025 new_atoms = []
2026 new_residues = []
2027 new_chains = []
2028 # Get the selected atoms
2029 # Generate dictionaries with new indexes as keys and previous indexes as values for atoms, residues and chains
2030 # This is done with this structure for the residues and chains further to find the new indices fast
2031 new_atom_indices = {}
2032 # Collect also original indices to related atom residues and chains
2033 original_atom_residue_indices = []
2034 for new_index, original_index in enumerate(selection.atom_indices):
2035 # Make a copy of the selected atom in order to not modify the original one
2036 original_atom = self.atoms[original_index]
2037 new_atom = Atom(
2038 name=original_atom.name,
2039 element=original_atom.element,
2040 coords=original_atom.coords,
2041 )
2042 new_atoms.append(new_atom)
2043 # Save old and new indices in order to reconfigure all indices later
2044 new_atom_indices[original_index] = new_index
2045 original_atom_residue_indices.append(original_atom.residue_index)
2046 # Find the selected residues
2047 selected_residue_indices = list(set(original_atom_residue_indices))
2048 # Repeat the original/new indices backup we did before
2049 new_residue_indices = {}
2050 original_residue_atom_indices = []
2051 original_residue_chain_indices = []
2052 for new_index, original_index in enumerate(selected_residue_indices):
2053 # Make a copy of the selected residue in order to not modify the original one
2054 original_residue = self.residues[original_index]
2055 new_residue = Residue(
2056 name=original_residue.name,
2057 number=original_residue.number,
2058 icode=original_residue.icode,
2059 )
2060 new_residues.append(new_residue)
2061 # Save old and new indices in order to reconfigure all indices later
2062 new_residue_indices[original_index] = new_index
2063 original_residue_atom_indices.append(original_residue.atom_indices)
2064 original_residue_chain_indices.append(original_residue.chain_index)
2065 # Find the selected chains
2066 selected_chain_indices = list(set(original_residue_chain_indices))
2067 # Repeat the original/new indices backup we did before
2068 new_chain_indices = {}
2069 original_chain_residue_indices = []
2070 for new_index, original_index in enumerate(selected_chain_indices):
2071 # Make a copy of the selected chain in order to not modify the original one
2072 original_chain = self.chains[original_index]
2073 new_chain = Chain(
2074 name=original_chain.name,
2075 )
2076 new_chains.append(new_chain)
2077 # Save old and new indices in order to reconfigure all indices later
2078 new_chain_indices[original_index] = new_index
2079 original_chain_residue_indices.append(original_chain.residue_indices)
2080 # Finally, reset indices in all instances
2081 for a, atom in enumerate(new_atoms):
2082 atom.residue_index = new_residue_indices[ original_atom_residue_indices[a] ]
2083 for r, residue in enumerate(new_residues):
2084 atom_indices = []
2085 for original_index in original_residue_atom_indices[r]:
2086 new_index = new_atom_indices.get(original_index, None)
2087 if new_index != None:
2088 atom_indices.append(new_index)
2089 residue.atom_indices = atom_indices
2090 residue.chain_index = new_chain_indices[ original_residue_chain_indices[r] ]
2091 for c, chain in enumerate(new_chains):
2092 residue_indices = []
2093 for original_index in original_chain_residue_indices[c]:
2094 new_index = new_residue_indices.get(original_index, None)
2095 if new_index != None:
2096 residue_indices.append(new_index)
2097 chain.residue_indices = residue_indices
2098 return Structure(atoms=new_atoms, residues=new_residues, chains=new_chains)
2100 def chainer (self,
2101 selection : Optional['Selection'] = None,
2102 letter : Optional[str] = None,
2103 whole_fragments : bool = False):
2104 """Set chains on demand.
2105 If no selection is passed then the whole structure will be affected.
2106 If no chain is passed then a "chain by fragment" logic will be applied."""
2107 # If there is no selection we consider all atoms
2108 if selection == None:
2109 selection = self.select_all()
2110 # If the selection is empty then there is nothing to do here
2111 if len(selection) == 0: return
2112 # If a letter is specified then the logic is way simpler
2113 if letter and not whole_fragments:
2114 self.set_selection_chain_name(selection, letter)
2115 return
2116 # If a letter is not specified we run the "fragments" logic
2117 fragment_getter = self.find_whole_fragments if whole_fragments else self.find_fragments
2118 for fragment in fragment_getter(selection):
2119 chain_name = letter if letter else self.get_available_chain_name()
2120 self.set_selection_chain_name(fragment, chain_name)
2122 def auto_chainer (self, verbose : bool = False):
2123 """Smart function to set chains automatically.
2124 Original chains will be overwritten."""
2125 if verbose: print('Running auto-chainer')
2126 # Set all chains to X
2127 self.chainer(letter='X')
2128 # Set solvent and ions as a unique chain
2129 ion_selection = self.select_ions()
2130 if verbose: print(f' Ion atoms: {len(ion_selection)}')
2131 solvent_selection = self.select_water()
2132 if verbose: print(f' Solvent atoms: {len(solvent_selection)}')
2133 ion_and_indices_selection = ion_selection + solvent_selection
2134 self.chainer(selection=ion_and_indices_selection, letter='S')
2135 # Set fatty acids and steroids as a unique chain
2136 # RUBEN: con whole_fragments algunos carbohidratos se sobreescriben como glucolípidos
2137 # DANI: Se podrían descartarían residuos que no pertenezcan a la membrana por proximidad
2138 membrane_selection = self.select_lipids()
2139 if verbose: print(f' Membrane atoms: {len(membrane_selection)}')
2140 self.chainer(selection=membrane_selection, letter='M', whole_fragments=True)
2141 # Set carbohydrates as a unique chain as well, just in case we have glycans
2142 # Note that in case glycan atoms are mixed with protein atoms glycan chains will be overwritten
2143 # However this is not a problem. It is indeed the best solution if we don't want ro resort atoms
2144 carbohydrate_selection = self.select_carbohydrates()
2145 if verbose: print(f' Carbohydrate atoms: {len(carbohydrate_selection)}')
2146 self.chainer(selection=carbohydrate_selection, letter='H')
2147 # Add a chain per fragment for both proteins and nucleic acids
2148 protein_selection = self.select_protein()
2149 if verbose: print(f' Protein atoms: {len(protein_selection)}')
2150 nucleic_selection = self.select_nucleic()
2151 if verbose: print(f' Nucleic acid atoms: {len(nucleic_selection)}')
2152 protein_and_nucleic_selection = protein_selection + nucleic_selection
2153 self.chainer(selection=protein_and_nucleic_selection)
2154 # At this point we should have covered most of the molecules in the structure
2155 # However, in case there are more molecules, we have already set them all as a single chain ('X')
2156 # Here we do not apply the chain per fragment logic since it may be dangerous
2157 # Note that we may have a lot of independent residues (and thus a los of small fragments)
2158 # This would make us run out of letters in the alphabet and thus there would be no more chains
2159 # As the last step, fix repeated chains
2160 # RUBEN: sacar esta parte ya que se hace luego en el structure_corrector?
2161 self.check_repeated_chains(fix_chains=True)
2163 def raw_protein_chainer (self):
2164 """This is an alternative system to find protein chains (anything else is chained as 'X').
2165 This system does not depend on VMD.
2166 It totally overrides previous chains since it is expected to be used only when chains are missing."""
2167 current_chain = self.get_available_chain_name()
2168 previous_alpha_carbon = None
2169 for residue in self.residues:
2170 alpha_carbon = next((atom for atom in residue.atoms if atom.name == 'CA'), None)
2171 if not alpha_carbon:
2172 residue.set_chain('X')
2173 continue
2174 # Connected aminoacids have their alpha carbons at a distance of around 3.8 Ångstroms
2175 residues_are_connected = previous_alpha_carbon and calculate_distance(previous_alpha_carbon, alpha_carbon) < 4
2176 if not residues_are_connected:
2177 current_chain = self.get_available_chain_name()
2178 residue.set_chain(current_chain)
2179 previous_alpha_carbon = alpha_carbon
2181 def set_selection_chain_name (self, selection : 'Selection', letter : str):
2182 """
2183 Given an atom selection, set the chain for all these atoms.
2184 Note that the chain is changed in every whole residue, no
2185 matter if only one atom was selected.
2186 """
2187 # Find if the letter belongs to an already existing chain
2188 chain = self.get_chain_by_name(letter)
2189 if not chain:
2190 chain = Chain(name=letter)
2191 self.set_new_chain(chain)
2192 # Get the selection residue indices
2193 selection_residue_indices = self.get_selection_residue_indices(selection)
2194 # Set the chain index of every residue to the new chain
2195 # WARNING:
2196 # This may have to be done atom by atom but we forgot why so we change
2197 # it so see what was the problem. Doing it atom by atom is not efficient
2198 # and was causing a bug where residues with same number could change their name
2199 # to that of the first residue found with that number.
2200 for residue_index in selection_residue_indices:
2201 # WARNING: Chain index has to be read every iteration since it may change. Do not save it
2202 residue = self.residues[residue_index]
2203 residue.set_chain_index(chain.index)
2205 def check_available_chains(self):
2206 """Check if there are more chains than available letters."""
2207 n_chains = len(self.chains)
2208 n_available_letters = len(AVAILABLE_LETTERS)
2209 if n_chains <= n_available_letters: return
2210 raise InputError(f'There are {n_chains} chains in the structure so far. If this is not expected then there may be a problem.\n' +
2211 f' If this is expected then unfortunatelly there is a limit of {n_available_letters} available chain letters.\n' +
2212 ' Please manually set the chains from scratch or merge together some chains to reduce the number.\n' +
2213 ' Customize a PDB file using the command "mwf chainer". Then use the customized PDB as input structure (-stru).')
2215 def get_available_chain_name (self) -> str:
2216 """Get an available chain name.
2217 Find alphabetically the first letter which is not yet used as a chain name.
2218 If all letters in the alphabet are used already then raise an error."""
2219 self.check_available_chains()
2220 current_chain_names = [ chain.name for chain in self.chains ]
2221 next_available_chain_name = next((name for name in AVAILABLE_LETTERS if name not in current_chain_names), None)
2222 return next_available_chain_name
2224 def get_next_available_chain_name (self, anterior : str) -> str:
2225 """
2226 Get the next available chain name.
2228 Args:
2229 anterior (str): The last chain name used, which is expected to be a single letter
2231 Raises:
2232 ValueError: If the anterior is not a letter or if there are more chains than available
2233 """
2234 self.check_available_chains()
2235 current_chain_names = set([ chain.name for chain in self.chains ])
2236 # If anterior is cap then try caps first, if anterior is lower then try lowers first
2237 if anterior.isupper():
2238 first_group, second_group = AVAILABLE_CAPS, AVAILABLE_LOWS
2239 elif anterior.islower():
2240 first_group, second_group = AVAILABLE_LOWS, AVAILABLE_CAPS
2241 else: raise ValueError(f'Is "{anterior}" even a letter?')
2242 # Reorder letters in the first group, so the anterior is the last letter
2243 anterior_position = first_group.index(anterior)
2244 next_position = anterior_position + 1
2245 reordered_group = first_group[next_position:] + first_group[0:next_position]
2246 next_letter = next((letter for letter in reordered_group if letter not in current_chain_names), None)
2247 if next_letter: return next_letter
2248 # If there is not available letters is the first group then return the first available in the second
2249 next_letter = next((letter for letter in second_group if letter not in current_chain_names), None)
2250 return next_letter
2252 def get_chain_by_name (self, name : str) -> Optional['Chain']:
2253 """Get a chain by its name."""
2254 return next((c for c in self.chains if c.name == name), None)
2256 def find_residue (self, chain_name : str, number : int, icode : str = '' ) -> Optional['Residue']:
2257 """Find a residue by its chain, number and insertion code."""
2258 # Get the corresponding chain
2259 target_chain = self.get_chain_by_name(chain_name)
2260 if not target_chain: return None
2261 # Find the residue in the target chain
2262 target_chain.find_residue(number, icode)
2264 def display_summary (self):
2265 """Get a summary of the structure."""
2266 print(f'Atoms: {self.atom_count}')
2267 print(f'Residues: {len(self.residues)}')
2268 print(f'Chains: {len(self.chains)}')
2269 for chain in self.chains:
2270 print(f'Chain {chain.name} ({len(chain.residue_indices)} residues)')
2271 print(' -> ' + chain.get_sequence())
2273 def check_repeated_chains (self, fix_chains : bool = False, display_summary : bool = False) -> bool:
2274 """
2275 There may be chains which are equal in the structure (i.e. same chain name).
2276 This means we have a duplicated/splitted chain.
2277 Repeated chains are usual and they are usually supported but with some problems.
2278 Also, repeated chains usually come with repeated residues, which means more problems (see explanation below).
2280 In the context of this structure class we may have 2 different problems with a different solution each:
2281 1. There is more than one chain with the same letter (repeated chain) -> rename the duplicated chains
2282 2. There is a chain with atom indices which are not consecutive (splitted chain) -> create new chains
2284 Rename repeated chains or create new chains if the fix_chains argument is True.
2286 WARNING: These fixes are possible only if there are less chains than the number of letters in the alphabet.
2287 Although there is no limitation in this code for chain names, setting long chain names is not compatible with pdb format.
2289 Check splitted chains (a chains with non consecutive residues) and try to fix them if requested.
2290 Check repeated chains (two chains with the same name) and return True if there were any repeats.
2291 """
2292 # Order chains according to their names
2293 # Save also those chains which have a previous duplicate
2294 name_chains = {}
2295 repeated_chains = []
2296 for chain in self.chains:
2297 chain_name = chain.name
2298 current_name_chains = name_chains.get(chain_name, None)
2299 if not current_name_chains:
2300 name_chains[chain_name] = [chain]
2301 else:
2302 name_chains[chain_name].append(chain)
2303 repeated_chains.append(chain)
2304 # Display the summary of repeated chains if requested
2305 if display_summary:
2306 if len(repeated_chains) > 0:
2307 warn('There are repeated chains:')
2308 for chain_name, chains in name_chains.items():
2309 chains_count = len(chains)
2310 if chains_count > 1:
2311 print(f'- Chain {chain_name} has {chains_count} repeats')
2312 # Rename repeated chains if requested
2313 if len(repeated_chains) > 0 and fix_chains:
2314 self.check_available_chains()
2315 current_letters = list(name_chains.keys())
2316 for repeated_chain in repeated_chains:
2317 last_chain_letter = repeated_chain.name
2318 while last_chain_letter in current_letters:
2319 last_chain_letter = get_next_letter(last_chain_letter)
2320 repeated_chain.name = last_chain_letter
2321 current_letters.append(last_chain_letter)
2322 # Check if there are splitted chains
2323 # RUBEN: sacar esta parte ya que esta self.check_splitted_chains
2324 for chain in self.chains:
2325 residue_indices = sorted(chain.residue_indices)
2326 # Check if residue indices are consecutive
2327 if residue_indices[-1] - residue_indices[0] + 1 == len(residue_indices):
2328 continue
2329 warn(f'Splitted chain {chain.name}')
2330 # If indices are not consecutive then we must find ranges of consecutive residues and create new chains for them
2331 previous_residue_index = residue_indices[0]
2332 consecutive_residues = [previous_residue_index]
2333 overall_consecutive_residues = []
2334 for residue_index in residue_indices[1:]:
2335 # If next residue is consecutive
2336 if residue_index == previous_residue_index + 1:
2337 consecutive_residues.append(residue_index)
2338 # If next residue is NOT consecutive
2339 else:
2340 overall_consecutive_residues.append(consecutive_residues)
2341 consecutive_residues = [residue_index]
2342 # Update the previous index
2343 previous_residue_index = residue_index
2344 # Add the last split
2345 overall_consecutive_residues.append(consecutive_residues)
2346 # Now create new chains and reasign residues
2347 # Skip the first set of consecutive residues since they will stay in the original chain
2348 for residues_indices in overall_consecutive_residues[1:]:
2349 chain_name = self.get_available_chain_name()
2350 residues_selection = self.select_residue_indices(residues_indices)
2351 self.set_selection_chain_name(residues_selection, chain_name)
2353 # Fix repeated chains if requested
2354 return len(repeated_chains) > 0
2356 def check_splitted_chains (self, fix_chains : bool = False, display_summary : bool = False) -> bool:
2357 """Check if non-consecutive atoms belong to the same chain.
2358 If so, separate pieces of non-consecuite atoms in different chains.
2359 Note that the new chains will be duplicated, so you will need to run check_repeated_chains after.
2361 Args:
2362 fix_chains (bool): If True then the splitted chains will be fixed.
2363 display_summary (bool): If True then a summary of the splitted chains will be displayed.
2365 Returns:
2366 bool:
2367 True if we encountered splitted chains and false otherwise.
2368 """
2369 splitted_fragments: list[tuple[str, list[int]]] = []
2370 # Keep track of already checked chains
2371 checked_chains = set()
2372 last_chain = None
2373 last_fragment_start = None
2374 for atom_index, atom in enumerate(self.atoms):
2375 # Get the chain name
2376 chain_name = atom.chain.name
2377 # Skip the atom if it belong to the previous chain
2378 if chain_name == last_chain: continue
2379 # If we were in a splitted fragment then end it here
2380 if last_fragment_start != None:
2381 new_fragment_indices = list(range(last_fragment_start, atom_index))
2382 new_fragment = (last_chain, new_fragment_indices)
2383 splitted_fragments.append(new_fragment)
2384 last_fragment_start = None
2385 # Check if the chain was already found
2386 if chain_name in checked_chains:
2387 # Start a new fragment
2388 last_fragment_start = atom_index
2389 # Update last chain
2390 last_chain = chain_name
2391 # Add the new chain to the set of already checked chains
2392 checked_chains.add(chain_name)
2394 # Make a summary of the splitted chains if requested
2395 if display_summary and len(splitted_fragments) > 0:
2396 warn(f'Found {len(splitted_fragments)} splitted fragments')
2397 affected_chains = sorted(list(set([ fragment[0] for fragment in splitted_fragments ])))
2398 print(f' We are having splits in chains {", ".join(affected_chains)}')
2400 # Fix chains if requested
2401 if fix_chains:
2402 for chain_name, fragment_atom_indices in splitted_fragments:
2403 # Create a new chain
2404 new_chain = Chain(name=chain_name)
2405 self.set_new_chain(new_chain)
2406 # Ge the selection residue indices for the fragment
2407 fragment_selection = self.select_atom_indices(fragment_atom_indices)
2408 fragment_residue_indices = self.get_selection_residue_indices(fragment_selection)
2409 # Move atoms in the fragment to the new chain
2410 for residue_index in fragment_residue_indices:
2411 residue = self.residues[residue_index]
2412 residue.set_chain_index(new_chain.index)
2413 return len(splitted_fragments) > 0
2415 def sort_residues (self):
2416 """Coherently sort residues according to the indices of the atoms they hold."""
2417 # Set a function to sort atoms and residues by index
2418 def by_first_atom_index (residue):
2419 return min(residue.atom_indices)
2420 # Sort residues according to their first atom index
2421 sorted_residues = sorted(self.residues, key = by_first_atom_index)
2422 # Iterate sorted residues letting them know their new index
2423 for r, residue in enumerate(sorted_residues):
2424 residue.index = r
2425 # Finally update the structure's residues list
2426 self.residues = sorted_residues
2428 def check_merged_residues (self, fix_residues : bool = False, display_summary : bool = False) -> bool:
2429 """
2430 There may be residues which contain unconnected (unbonded) atoms. They are not allowed.
2431 They may come from a wrong parsing and be indeed duplicated residues.
2433 Search for merged residues.
2434 Create new residues for every group of connected atoms if the fix_residues argument is True.
2435 Note that the new residues will be repeated, so you will need to run check_repeated_residues after.
2436 Return True if there were any merged residues.
2437 """
2438 # Get the list of merged residues we encounter
2439 merged_residues = []
2440 # Iterate residues
2441 for residue in self.residues:
2442 residue_selection = residue.get_selection()
2443 residue_fragments = list(self.find_fragments(residue_selection))
2444 if len(residue_fragments) <= 1: continue
2445 # If current residue has more than 1 fragment then it is a merged residue
2446 merged_residues.append(residue)
2447 if not fix_residues: continue
2448 # If the residue is to be fixed then let the first fragment as the current residue
2449 # Then create a new residue for every other fragment
2450 for extra_fragment in residue_fragments[1:]:
2451 # Set a new residue identical to the current one
2452 new_residue = Residue(residue.name, residue.number, residue.icode)
2453 self.set_new_residue(new_residue)
2454 # Add it to the same chain
2455 residue.chain.add_residue(new_residue)
2456 # Add atoms to it
2457 for atom_index in extra_fragment.atom_indices:
2458 atom = self.atoms[atom_index]
2459 new_residue.add_atom(atom)
2460 # Now that we have new resiudes, sort all residues to keep them coherent
2461 self.sort_residues()
2462 # Count how many merged residues we encountered
2463 merged_residues_count = len(merged_residues)
2464 # Log some details if the summary is requested
2465 if display_summary and merged_residues_count > 0:
2466 print(f'Found {merged_residues_count} merged residues')
2467 print(f' e.g. {merged_residues[0]}')
2468 # Return if we found merged residues
2469 return merged_residues_count > 0
2471 def check_repeated_residues (self, fix_residues : bool = False, display_summary : bool = False) -> bool:
2472 """
2473 There may be residues which are equal in the structure (i.e. same chain, number and icode).
2474 In case 2 residues in the structure are equal we must check distance between their atoms.
2475 If atoms are far it means they are different residues with the same notation (duplicated residues).
2476 If atoms are close it means they are indeed the same residue (splitted residue).
2478 Splitted residues are found in some pdbs and they are supported by some tools.
2479 These tools consider all atoms with the same 'record' as the same residue.
2480 However, there are other tools which would consider the splitted residue as two different residues.
2481 This causes inconsistency along different tools besides a long list of problems.
2482 The only possible is fix is changing the order of atoms in the topology.
2483 Note that this is a breaking change for associated trajectories, which must change the order of coordinates.
2484 However here we provide tools to fix associates trajectories as well.
2486 Duplicated residues are usual and they are usually supported but with some problems.
2487 For example, pytraj analysis outputs use to sort results by residues and each residue is tagged.
2488 If there are duplicated residues with the same tag it may be not possible to know which result belongs to each residue.
2489 Another example are NGL selections once in the web client.
2490 If you select residue ':A and 1' and there are multiple residues 1 in chain A all of them will be displayed.
2492 Check residues to search for duplicated and splitted residues.
2493 Renumerate repeated residues if the fix_residues argument is True.
2494 Return True if there were any repeats.
2495 """
2496 # Track if residues have to be changed or not
2497 modified = False
2498 # Group all residues in the structure according to their chain, number and icode
2499 grouped_residues = {}
2500 # Check repeated residues which are one after the other
2501 # Note that these residues MUST have a different name
2502 # Otherwise they would have not been considered different residues
2503 # For these rare cases we use icodes to solve the problem
2504 non_icoded_residues = []
2505 last_residue = None
2506 for residue in self.residues:
2507 # Check residue to be equal than the previous residue
2508 if residue == last_residue:
2509 non_icoded_residues.append(residue)
2510 last_residue = residue
2511 continue
2512 last_residue = residue
2513 # Add residue to the groups of repeated residues
2514 current_residue_repeats = grouped_residues.get(residue, None)
2515 if not current_residue_repeats:
2516 grouped_residues[residue] = [ residue ]
2517 else:
2518 current_residue_repeats.append(residue)
2519 # In case we have non icoded residues
2520 if len(non_icoded_residues) > 0:
2521 if display_summary:
2522 print(f'There are non-icoded residues ({len(non_icoded_residues)})')
2523 # Set new icodes for non icoded residues
2524 if fix_residues:
2525 print(' Non icoded residues will recieve an icode')
2526 for residue in non_icoded_residues:
2527 repeated_residues_group = grouped_residues[residue]
2528 current_icodes = [ residue.icode for residue in repeated_residues_group if residue.icode ]
2529 next_icode = next((cap for cap in AVAILABLE_CAPS if cap not in current_icodes), None)
2530 if not next_icode:
2531 raise ValueError('There are no more icodes available')
2532 # print('Setting icode ' + next_icode + ' to residue ' + str(residue))
2533 residue.icode = next_icode
2534 modified = True
2535 # Grouped residues with more than 1 result are considered as repeated
2536 repeated_residues = [ residues for residues in grouped_residues.values() if len(residues) > 1 ]
2537 if len(repeated_residues) == 0:
2538 return modified
2539 # In case we have repeated residues...
2540 if display_summary:
2541 warn(f'There are {len(repeated_residues)} different groups of repeated residues')
2542 print(f' e.g. {repeated_residues[0][0]}')
2543 if len(repeated_residues) == 9999 or len(repeated_residues) == 10000:
2544 print(' Probably you have more residues than the PDB numeration limit (1-9999)')
2545 # Now for each repeated residue, find out which are splitted and which are duplicated
2546 covalent_bonds = self.bonds
2547 overall_splitted_residues = []
2548 overall_duplicated_residues = []
2549 for residues in repeated_residues:
2550 # Iterate over repeated residues and check if residues are covalently bonded
2551 # If any pair of residues are bonded add them both to the splitted residues list
2552 # At the end, all non-splitted residues will be considered duplicated residues
2553 # WARNING: Using a set here is not possible since repeated residues have the same hash
2554 # WARNING: Also comparing residues themselves is not advisable, so we use indices at this point
2555 splitted_residue_indices = set()
2556 for residue, other_residues in otherwise(residues):
2557 if residue.index in splitted_residue_indices:
2558 continue
2559 # Get atom indices for all atoms connected to the current residue
2560 residue_bonds = sum([ covalent_bonds[index] for index in residue.atom_indices ], [])
2561 for other_residue in other_residues:
2562 # Get all atom indices for each other residue and collate with the current residue bonds
2563 if any( index in residue_bonds for index in other_residue.atom_indices ):
2564 splitted_residue_indices.add(residue.index)
2565 splitted_residue_indices.add(other_residue.index)
2566 # Finally obtain the splitted residues from its indices
2567 splitted_residues = [ self.residues[index] for index in splitted_residue_indices ]
2568 # Repeated residues which are not splitted are thus duplicated
2569 duplicated_residues = [ residue for residue in residues if residue.index not in splitted_residue_indices ]
2570 # Update the overall lists
2571 if len(splitted_residues) > 0:
2572 overall_splitted_residues.append(splitted_residues)
2573 if len(duplicated_residues) > 0:
2574 overall_duplicated_residues.append(duplicated_residues)
2575 # In case we have splitted residues
2576 if len(overall_splitted_residues) > 0:
2577 if display_summary:
2578 print(f' There are splitted residues ({len(overall_splitted_residues)})')
2579 # Fix splitted residues
2580 if fix_residues:
2581 print(' Splitted residues will be merged')
2582 # Set a function to sort atoms and residues by index
2583 by_index = lambda v: v._index
2584 # Merge splitted residues
2585 # WARNING: Merging residues without sorting atoms is possible, but this would be lost after exporting to pdb
2586 for splitted_residues in overall_splitted_residues:
2587 # The first residue (i.e. the residue with the lower index) will be the one which remains
2588 # It will absorb other residue atom indices
2589 splitted_residues.sort(key=by_index) # Residues are not sorted by default, this is mandatory
2590 first_residue = splitted_residues[0]
2591 other_residues = splitted_residues[1:]
2592 for residue in other_residues:
2593 for atom in residue.atoms:
2594 atom.residue = first_residue
2595 print(' Atoms will be sorted to be together by residues')
2596 print(' NEVER FORGET: This will break any associated trajectory if coordinates are not sorted as well')
2597 # Sort atoms to group residue atoms together
2598 # Note that each atom index must be updated
2599 new_atom_indices = sum([ residue.atom_indices for residue in self.residues ], [])
2600 for i, new_atom_index in enumerate(new_atom_indices):
2601 atom = self.atoms[new_atom_index]
2602 atom._index = i
2603 self.atoms.sort(key=by_index)
2604 # Also residue 'atom_indices' must be updated
2605 for residue in self.residues:
2606 residue._atom_indices = [ new_atom_indices.index(atom_index) for atom_index in residue._atom_indices ]
2607 # Bonds must be reset since atom indices have changes
2608 self._bonds = None
2609 # Prepare the trajectory atom sorter which must be returned
2610 # Include atom indices already so the user has to provide only the structure and trajectory filepaths
2611 def trajectory_atom_sorter (
2612 input_structure_file : 'File',
2613 input_trajectory_file : 'File',
2614 output_trajectory_file : 'File'
2615 ):
2616 sort_trajectory_atoms(
2617 input_structure_file,
2618 input_trajectory_file,
2619 output_trajectory_file,
2620 new_atom_indices
2621 )
2622 self.trajectory_atom_sorter = trajectory_atom_sorter
2623 self.new_atom_order = new_atom_indices
2624 modified = True
2625 # In case we have duplicated residues
2626 duplicated_residues_count = len(overall_duplicated_residues)
2627 if duplicated_residues_count > 0:
2628 if display_summary:
2629 warn(f'There are {duplicated_residues_count} different groups of duplicated residues')
2630 print(f' e.g. {overall_duplicated_residues[0][0]}')
2631 # Renumerate duplicated residues if requested
2632 if fix_residues:
2633 # First of all, from each group of repeated residues, discard the first residue
2634 # The first residue will remain as it is and the rest will be renumerated
2635 # Join all residues to be renumerated in a single list
2636 residue_to_renumerate = []
2637 for residues in overall_duplicated_residues:
2638 residue_to_renumerate += residues[1:]
2639 # Now group residues per chain since the renumeration is done by chains
2640 chain_residues = {}
2641 for residue in residue_to_renumerate:
2642 chain = residue.chain
2643 current_chain_residues = chain_residues.get(chain, None)
2644 if current_chain_residues: current_chain_residues.append(residue)
2645 else: chain_residues[chain] = [ residue ]
2646 # Iterate residue chain groups
2647 for chain, residues in chain_residues.items():
2648 # Find the last residue number in the current chain
2649 maximum_chain_number = max([ residue.number for residue in chain.residues ])
2650 # Sort residues by index
2651 residues.sort(key=lambda x: x.index, reverse=True)
2652 for residue in residues:
2653 # Set the number of the residue as the next available number
2654 residue.number = maximum_chain_number + 1
2655 # Update the maximum number
2656 maximum_chain_number = residue.number
2657 modified = True
2658 return modified
2660 # DANI: No recuerdo los problemas que daba tener átomos repetidos
2661 def check_repeated_atoms (self, fix_atoms : bool = False, display_summary : bool = False) -> bool:
2662 """Check atoms to search for repeated atoms.
2663 Atoms with identical chain, residue and name are considered repeated atoms.
2665 Args:
2666 fix_atoms (bool): If True, rename repeated atoms.
2667 display_summary (bool): If True, display a summary of repeated atoms.
2669 Returns:
2670 bool: True if there were any repeated atoms, False otherwise.
2671 """
2672 # Set two trackers for display
2673 repeated_atoms_count = 0
2674 example = None
2675 for residue in self.residues:
2676 # Iterate over the residue atoms counting how many repeated names we have
2677 current_names = []
2678 for atom in residue.atoms:
2679 # We check if the atom name already exists. If not, go to the next atom
2680 name = atom.name
2681 if name not in current_names:
2682 current_names.append(name)
2683 continue
2684 # When atom is repeated
2685 repeated_atoms_count += 1
2686 # If the fix was not requested we stop here
2687 if not fix_atoms:
2688 continue
2689 # We set the name of the atom as the element letter + the count of this element
2690 # If element is missing take the first character of the atom name
2691 initial = atom.element
2692 if not initial or initial == ' ':
2693 initial = name[0]
2694 number = 1
2695 new_name = initial + str(number)
2696 while new_name in current_names:
2697 number += 1
2698 if number > 999:
2699 raise ValueError('There are more than 999 atoms with the same name in the residue')
2700 new_name = initial + str(number)
2701 # Save an example for the logs if there is none yet
2702 if not example:
2703 example = f'{atom.name} renamed as {new_name} in residue {residue}'
2704 atom.name = new_name
2705 current_names.append(new_name)
2706 # Display the summary of repeated atoms if requested
2707 if display_summary:
2708 if repeated_atoms_count > 0:
2709 warn(f'There are repeated atoms ({repeated_atoms_count})')
2710 print(f' e.g. {example}')
2711 return repeated_atoms_count > 0
2713 def is_missing_any_bonds (self) -> bool:
2714 return any(bond == MISSING_BONDS for bond in self.bonds)
2716 def check_incoherent_bonds (self) -> bool:
2717 """ Check bonds to be incoherent i.e. check atoms not to have more or less
2718 bonds than expected according to their element.
2719 Return True if any incoherent bond is found. """
2720 # Find out if there are hydrogens in the structure
2721 # It may happen that we encounter an structure without hydrogens
2722 has_hydrogen = next(( True for atom in self.atoms if atom.element == 'H' ), False)
2723 coherent_bonds = coherent_bonds_with_hydrogen if has_hydrogen else coherent_bonds_without_hydrogen
2724 # Check coherent bonds atom by atom
2725 for atom in self.atoms:
2726 # Do not check this atom bonds if this may be an ion
2727 # Most authors "force" dummy bonds in these atoms to make them stable
2728 if atom.element in SUPPORTED_ION_ELEMENTS: continue
2729 # Ignore dummy atoms as well
2730 if atom.element == DUMMY_ATOM_ELEMENT: continue
2731 # Ignore coarse grain atoms as well
2732 if atom.element == CG_ATOM_ELEMENT: continue
2733 # Get actual number of bonds in the current atom both with and without ion bonds
2734 # LORE: This was a problem since some ions are force-bonded but bonds are actually not real
2735 # LORE: When an ion is forced we may end up with hyrogens with 2 bonds or carbons with 5 bonds
2736 # LORE: When an ions is really bonded we can not discard it or we may end up with orphan carbons (e.g. ligands)
2737 min_nbonds = len(atom.get_bonds(skip_ions = True, skip_dummies = True))
2738 max_nbonds = len(atom.get_bonds(skip_ions = False, skip_dummies = True))
2739 # Get the accepted range of number of bonds for the current atom according to its element
2740 element = atom.element
2741 element_coherent_bonds = coherent_bonds.get(element, None)
2742 # If there are no specficiations for the current atom element then skip it
2743 if not element_coherent_bonds:
2744 continue
2745 # Get the range of accepted number of bonds
2746 min_allowed_bonds = element_coherent_bonds['min']
2747 max_allowed_bonds = element_coherent_bonds['max']
2748 # Check the actual number of bonds is insdie the accepted range
2749 if max_nbonds < min_allowed_bonds or min_nbonds > max_allowed_bonds:
2750 if min_nbonds == max_nbonds:
2751 print(f' An atom with element {element} has {min_nbonds} bonds')
2752 else:
2753 print(f' An atom with element {element} has between {min_nbonds} bonds (withou ions) and {max_nbonds} bonds (with ions)')
2754 if min_allowed_bonds == max_allowed_bonds:
2755 plural_sufix = '' if min_allowed_bonds == 1 else 's'
2756 print(f' It should have {min_allowed_bonds} bond{plural_sufix}')
2757 else:
2758 print(f' It should have between {min_allowed_bonds} and {max_allowed_bonds} bonds')
2759 print(f' -> Atom {atom.label}')
2760 bond_label = ', '.join([ self.atoms[atom].label for atom in atom.get_bonds(skip_ions = False) ])
2761 print(f' -> Bonds {bond_label}')
2762 return True
2763 return False
2765 def get_covalent_bonds (self, selection : Optional['Selection'] = None) -> list[ list[int] ]:
2766 """Get all atomic covalent (strong) bonds.
2767 Bonds are defined as a list of atom indices for each atom in the structure.
2768 Rely on VMD logic to do so."""
2769 # It is important to fix elements before trying to fix bonds, since elements have an impact on bonds
2770 # VMD logic to find bonds relies in the atom element to set the covalent bond distance cutoff
2771 self.fix_atom_elements()
2772 # Generate a pdb strucutre to feed vmd
2773 auxiliar_pdb_filepath = '.structure.pdb'
2774 self.generate_pdb_file(auxiliar_pdb_filepath)
2775 # Get covalent bonds between both residue atoms
2776 covalent_bonds = get_covalent_bonds(auxiliar_pdb_filepath, selection)
2777 # For every atom in CG, replace its bonds with a class which will raise and error when read
2778 # Thus we make sure using these wrong bonds anywhere further will result in failure
2779 for atom_index in self.select_cg().atom_indices:
2780 covalent_bonds[atom_index] = MISSING_BONDS
2781 # Remove the auxiliar pdb file
2782 os.remove(auxiliar_pdb_filepath)
2783 return covalent_bonds
2785 def copy_bonds (self) -> list[list[int]]:
2786 """Make a copy of the bonds list."""
2787 new_bonds = []
2788 for atom_bonds in self.bonds:
2789 # Missing bonds coming from CG atoms are forwarded
2790 if atom_bonds == MISSING_BONDS:
2791 new_bonds.append(MISSING_BONDS)
2792 continue
2793 # Copy also the inner lists to avoid further mutation of the original structure
2794 new_bonds.append([ atom_index for atom_index in atom_bonds ])
2795 return new_bonds
2797 def copy (self) -> 'Structure':
2798 """Make a copy of the current structure."""
2799 atom_copies = [ atom.copy() for atom in self.atoms ]
2800 residue_copies = [ residue.copy() for residue in self.residues ]
2801 chain_copies = [ chain.copy() for chain in self.chains ]
2802 structure_copy = Structure(atom_copies, residue_copies, chain_copies)
2803 structure_copy.bonds = self.copy_bonds()
2804 return structure_copy
2806 # DANI: No lo he testeado en profundidad
2807 def merge (self, other : 'Structure') -> 'Structure':
2808 """Merge current structure with another structure."""
2809 # Copy self atoms, residues and chains
2810 self_atom_copies = [ atom.copy() for atom in self.atoms ]
2811 self_residue_copies = [ residue.copy() for residue in self.residues ]
2812 self_chain_copies = [ chain.copy() for chain in self.chains ]
2813 # Copy other atoms, residues and chains
2814 other_atom_copies = [ atom.copy() for atom in other.atoms ]
2815 other_residue_copies = [ residue.copy() for residue in other.residues ]
2816 other_chain_copies = [ chain.copy() for chain in other.chains ]
2817 # Adapt indices in other atoms, residues and chains
2818 atom_index_offset = len(self_atom_copies)
2819 residue_index_offset = len(self_residue_copies)
2820 chain_index_offset = len(self_chain_copies)
2821 for atom in other_atom_copies:
2822 atom._index += atom_index_offset
2823 atom._residue_index += residue_index_offset
2824 for residue in other_residue_copies:
2825 residue._index += residue_index_offset
2826 residue._atom_indices = [ i + atom_index_offset for i in residue._atom_indices ]
2827 residue._chain_index += chain_index_offset
2828 for chain in other_chain_copies:
2829 chain._index += chain_index_offset
2830 chain._residue_indices = [ i + residue_index_offset for i in chain._residue_indices ]
2831 # Merge self with other atoms, residues and chains and build the new structure
2832 merged_atoms = self_atom_copies + other_atom_copies
2833 merged_residues = self_residue_copies + other_residue_copies
2834 merged_chains = self_chain_copies + other_chain_copies
2835 return Structure(merged_atoms, merged_residues, merged_chains)
2837 def find_rings (self, max_ring_size : int, selection : Optional[Selection] = None) -> list[ list[Atom] ]:
2838 """Find rings with a maximum specific size or less in the structure and yield them as they are found."""
2839 found_rings = []
2840 selected_atom_indices = selection.atom_indices if selection else None
2841 # Find rings recursively
2842 def find_rings_recursive (atom_path : list[Atom]) -> Generator[ tuple[Atom], None, None ]:
2843 # Get the current atom to continue the followup: the last atom
2844 current_atom = atom_path[-1] # Note that the list MUST always have at least 1 atom
2845 # Get bonded atoms to continue the path
2846 followup_atoms = current_atom.get_bonded_atoms()
2847 # Hydrogens are discarded since they have only 1 bond and thus they can only lead to a dead end
2848 followup_atoms = [ atom for atom in followup_atoms if atom.element != 'H' ]
2849 # In case there is a selection, check followup atoms not to be in the selection
2850 if selected_atom_indices:
2851 followup_atoms = [ atom for atom in followup_atoms if atom.index in selected_atom_indices ]
2852 # Remove the previous atom to avoid going back
2853 previous_atom = atom_path[-2] if len(atom_path) > 1 else None
2854 if previous_atom:
2855 # HARDCODE: This is a problem which should be studied and fixed
2856 # DANI: I temporatily bypass the problem to make it on time to a meeting
2857 if previous_atom not in followup_atoms: return
2858 followup_atoms.remove(previous_atom)
2859 # Iterate over the following bonded atoms
2860 for followup_atom in followup_atoms:
2861 # Check if the followup atom is in the path
2862 path_index = next(( index for index, atom in enumerate(atom_path) if atom == followup_atom ), None)
2863 # If so, we have found a ring
2864 if path_index != None:
2865 ring = atom_path[path_index:]
2866 ring.sort(key=lambda x: x.index)
2867 yield tuple(ring)
2868 continue
2869 # If the ring size is reached, we can not continue
2870 if len(atom_path) == max_ring_size: continue
2871 # Otherwise, keep searching
2872 new_path = atom_path + [followup_atom]
2873 for ring in find_rings_recursive(atom_path=new_path):
2874 yield ring
2875 # Find a starting atom and run the recursive logic
2876 candidate_atom_indices = selected_atom_indices if selected_atom_indices else range(self.atom_count)
2877 for candidate_atom_index in candidate_atom_indices:
2878 candidate_atom = self.atoms[candidate_atom_index]
2879 # It must be heavy atom
2880 if len(candidate_atom.bonds) < 2: continue
2881 # It must not be a dead end already
2882 bonded_atoms = candidate_atom.get_bonded_atoms()
2883 bonded_atoms = [ atom for atom in bonded_atoms if len(atom.bonds) >= 2 ]
2884 # In case there is a selection, check followup atoms not to be in the selection
2885 if selected_atom_indices:
2886 bonded_atoms = [ atom for atom in bonded_atoms if atom.index in selected_atom_indices ]
2887 for ring in find_rings_recursive(atom_path=[candidate_atom]):
2888 found_rings.append(ring)
2889 # Get unique rings
2890 unique_rings = set(found_rings)
2891 return list(unique_rings)
2893 def get_selection_outer_bonds (self, selection : Selection) -> list[int]:
2894 """Given an atom selection, get all bonds between these atoms and any other atom in the structure.
2895 Note that inner bonds between atoms in the selection are discarded."""
2896 # Get bonds from all atoms in the selection
2897 bonds = set()
2898 for atom_index in selection.atom_indices:
2899 atom_bonds = set(self.bonds[atom_index])
2900 bonds = bonds.union(atom_bonds)
2901 # Discard selection atoms from the bonds list to discard inner bonds
2902 bonds -= set(selection.atom_indices)
2903 return list(bonds)
2905 # Set the type of PTM according to the classification of the bonded residue
2906 ptm_options = {
2907 'protein': ValueError('A PTM residue must never be protein'),
2908 'dna': 'DNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama
2909 'rna': 'RNA linkage', # DANI: Esto es posible aunque muy raro y no se como se llama
2910 'carbohydrate': 'Glycosilation',
2911 'fatty': 'Lipidation',
2912 'steroid': 'Steroid linkage', # DANI: Esto es posible aunque muy raro y no se como se llama
2913 'ion': Warning('Ion is covalently bonded to protein'), # DANI: esto no es "correcto" pero si habitual
2914 'solvent': Warning('Solvent is covalently bonded to protein'), # DANI: probablemente sea un error
2915 'acetyl': 'Acetylation', # Typical N-capping terminals
2916 'amide': 'Amidation', # Typical C-capping terminals
2917 'other': Warning('Unknow type of PTM'), # Something not supported
2918 }
2920 def find_ptms (self) -> Generator[dict, None, None]:
2921 """Find Post Translational Modifications (PTM) in the structure."""
2922 # Find bonds between protein and non-protein atoms
2923 # First get all protein atoms
2924 protein_selection = self.select_protein()
2925 if not protein_selection:
2926 return
2927 protein_atom_indices = set(protein_selection.atom_indices) # This is used later
2928 protein_outer_bonds = set(self.get_selection_outer_bonds(protein_selection))
2929 non_protein_selection = self.invert_selection(protein_selection)
2930 if not non_protein_selection:
2931 return
2932 non_protein_atom_indices = set(non_protein_selection.atom_indices)
2933 non_protein_atom_indices_bonded_to_protein = protein_outer_bonds.intersection(non_protein_atom_indices)
2934 # Get each residue bonded to the protein and based on its 'classification' set the name of the PTM
2935 for atom_index in non_protein_atom_indices_bonded_to_protein:
2936 atom = self.atoms[atom_index]
2937 residue = atom.residue
2938 residue_classification = residue.get_classification()
2939 ptm_classification = self.ptm_options[residue_classification]
2940 # If we found something impossible then raise the error
2941 if type(ptm_classification) == ValueError:
2942 print(f'Problematic residue: {residue}')
2943 raise ptm_classification
2944 # If we do not know what it is then do tag it as a PTM but print a warning
2945 if type(ptm_classification) == Warning:
2946 warn(f'{ptm_classification} -> {residue}')
2947 continue
2948 # At this point we have found a valid PTM
2949 # Find the protein residue linked to this PTM
2950 atom_bonds = self.bonds[atom_index]
2951 protein_bond = next((bond for bond in atom_bonds if bond in protein_atom_indices), None)
2952 if protein_bond == None:
2953 raise ValueError('There must be at least one protein bond to the current atom')
2954 protein_residue_index = self.atoms[protein_bond].residue_index
2955 # Set the PTM
2956 yield { 'name': ptm_classification, 'residue_index': protein_residue_index }
2958 def has_cg (self) -> bool:
2959 """Ask if the structure has at least one coarse grain atom/residue."""
2960 return any(atom.element == CG_ATOM_ELEMENT for atom in self.atoms)
2963# =========================
2964# Related functions
2965# =========================
2967def calculate_distance (atom_1 : Atom, atom_2 : Atom) -> float:
2968 """Calculate the distance between two atoms."""
2969 squared_distances_sum = 0
2970 for i in { 'x': 0, 'y': 1, 'z': 2 }.values():
2971 squared_distances_sum += (atom_1.coords[i] - atom_2.coords[i])**2
2972 return math.sqrt(squared_distances_sum)
2974def calculate_angle (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom) -> float:
2975 """Calculate the angle between 3 atoms."""
2976 # From: https://stackoverflow.com/questions/35176451/python-code-to-calculate-angle-between-three-point-using-their-3d-coordinates
2977 # Get coordinates in numpy arrays, which allows to easily calculate the difference
2978 a = np.array(atom_1.coords)
2979 b = np.array(atom_2.coords)
2980 c = np.array(atom_3.coords)
2981 # Set the two vectors which make the angle
2982 ba = a - b
2983 bc = c - b
2984 # Calculate the angle between these 2 vectors
2985 cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
2986 angle = np.arccos(cosine_angle)
2987 return np.degrees(angle)
2989def calculate_torsion (atom_1 : Atom, atom_2 : Atom, atom_3 : Atom, atom_4 : Atom) -> float:
2990 """Calculate the torsion between 4 atoms."""
2991 # From: https://stackoverflow.com/questions/20305272/dihedral-torsion-angle-from-four-points-in-cartesian-coordinates-in-python
2992 p0 = np.array(atom_1.coords)
2993 p1 = np.array(atom_2.coords)
2994 p2 = np.array(atom_3.coords)
2995 p3 = np.array(atom_4.coords)
2996 # Get some vectors
2997 b0 = -1.0*(p1 - p0)
2998 b1 = p2 - p1
2999 b2 = p3 - p2
3000 # normalize b1 so that it does not influence magnitude of vector
3001 # rejections that come next
3002 b1 /= np.linalg.norm(b1)
3003 # vector rejections
3004 # v = projection of b0 onto plane perpendicular to b1
3005 # = b0 minus component that aligns with b1
3006 # w = projection of b2 onto plane perpendicular to b1
3007 # = b2 minus component that aligns with b1
3008 v = b0 - np.dot(b0, b1)*b1
3009 w = b2 - np.dot(b2, b1)*b1
3010 # angle between v and w in a plane is the torsion angle
3011 # v and w may not be normalized but that's fine since tan is y/x
3012 x = np.dot(v, w)
3013 y = np.dot(np.cross(b1, v), w)
3014 return float(np.degrees(np.arctan2(y, x)))
3016# =========================
3017# Auxiliar functions
3018# =========================
3020def get_next_letter (letter : str) -> str:
3021 """Set a function to get the next letter from an input letter in alphabetic order."""
3022 if not letter:
3023 return 'A'
3024 if letter == 'z' or letter == 'Z':
3025 raise InputError("Limit of chain letters has been reached")
3026 next_letter = chr(ord(letter) + 1)
3027 return next_letter
3029def first_cap_only (name : str) -> str:
3030 """Given a string with 1 or 2 characters, return a new string with
3031 the first letter cap and the second letter not cap (if any)"""
3032 if len(name) == 1:
3033 return name.upper()
3034 first_character = name[0].upper()
3035 second_character = name[1].lower()
3036 return first_character + second_character
3038lower_numbers = {
3039 '0': '₀',
3040 '1': '₁',
3041 '2': '₂',
3042 '3': '₃',
3043 '4': '₄',
3044 '5': '₅',
3045 '6': '₆',
3046 '7': '₇',
3047 '8': '₈',
3048 '9': '₉',
3049}
3050def get_lower_numbers (numbers_text : str) -> str:
3051 """Convert numbers to lower text characters (chr 8020-8029)."""
3052 return ''.join([ lower_numbers[c] for c in numbers_text ])
3054def filter_model (pdb_content : str, model : int) -> str:
3055 """Set a function to filter lines in PDB content for a specific model."""
3056 # Make sure the PDB content has multiple models
3057 # If not, then return the whole PDB content ignoring the flag
3058 generic_model_header_regex = r'\nMODEL\s*[0-9]*\s*\n'
3059 if not re.search(generic_model_header_regex, pdb_content):
3060 print(f'PDB content has no models at all so it will be loaded as is (ignored model "{model}").')
3061 return pdb_content
3062 # If a model was passed then it means we must filter the PDB content
3063 filtered_pdb_content = ''
3064 # Search PDB lines until we find our model's header
3065 model_header_regex = rf'^MODEL\s*{model}'
3066 pdb_lines = iter(pdb_content.split('\n'))
3067 for line in pdb_lines:
3068 if not re.match(model_header_regex, line): continue
3069 # If we just found the header
3070 filtered_pdb_content = line
3071 break
3072 # If we did not find the header then stop here
3073 if not filtered_pdb_content: raise RuntimeError(f'Could not find model "{model}" header')
3074 # Add every line to the filtered content until we find the tail
3075 model_footer_regex = r'^ENDMDL'
3076 for line in pdb_lines:
3077 filtered_pdb_content += '\n' + line
3078 if re.match(model_footer_regex, line): return filtered_pdb_content
3079 # If we did not find the footer then stop here
3080 raise RuntimeError(f'Could not find model "{model}" footer')