Coverage for model_workflow/tools/get_inchi_keys.py: 67%
127 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
1import MDAnalysis
2from rdkit import Chem
3from model_workflow.utils.structures import Structure
4from model_workflow.utils.type_hints import *
5from model_workflow.utils.warnings import warn
6from functools import lru_cache
7import requests
10def get_connected_residues(
11 residue: 'MDAnalysis.Residue',
12 visited=None):
13 """
14 Recursively finds all residues that are connected through bonds to the given residue.
15 Parameters
16 ----------
17 residue : MDAnalysis.core.groups.Residue
18 The residue to start the search from
19 visited : set, optional
20 Set of already visited residue IDs to prevent infinite recursion.
21 Default is None, which initializes an empty set.
22 Returns
23 -------
24 list
25 A set containing the residue IDs of all residues that are connected
26 through bonds to the input residue, either directly or indirectly.
27 Notes
28 -----
29 This function traverses the molecular structure recursively, following bonds
30 between residues. It keeps track of visited residues to avoid cycles in the
31 traversal. Both direct bonds between residues and indirect connections through
32 other residues are included in the result.
33 """
35 if visited is None:
36 visited = set()
38 # Add current residue to visited set
39 visited.add(residue.resindex)
41 # Get direct external bonds
42 external_bonds = set([residue.resindex])
43 for bond in residue.atoms.bonds:
44 external_bonds.update(bond.atoms.resindices)
46 # Recursively check external bonds
47 all_external_bonds = external_bonds.copy()
48 u = residue.universe
49 for ext_resid in external_bonds:
50 if ext_resid not in visited:
51 # Get external bonds for the connected residue
52 recursive_bonds = get_connected_residues(u.residues[ext_resid], visited)
53 all_external_bonds.update(recursive_bonds)
55 return list(all_external_bonds)
58def residue_to_inchi(res_atoms: 'MDAnalysis.AtomGroup',
59 resindex : int) -> Tuple[str, str, int]:
60 """
61 Process a single residue to get its InChI key and related information.
62 """
63 # Convert to RDKIT and get InChI data
64 res_RD = res_atoms.convert_to("RDKIT")
65 # Calculate InChI key and string
66 inchikey = Chem.MolToInchiKey(res_RD) # slow step, 50% of time
67 # rdinchi.MolToInchi so it doesnt print the warnings
68 inchi, retcode, message, logs, aux = Chem.rdinchi.MolToInchi(res_RD)
69 return (inchikey, inchi, resindex)
72def get_inchi_keys (
73 u : 'MDAnalysis.Universe',
74 structure : 'Structure'
75) -> dict:
76 """
77 Generate a dictionary mapping InChI keys to residue information for non-standard residues.
79 This function uses MDAnalysis to parse the input structure and topology files and identifies
80 residues that are not classified as 'ion', 'solvent', 'nucleic', or 'protein'. For each
81 identified residue, it converts the structure to RDKit format to obtain the InChI key
82 and InChI string. The resulting data is stored in dictionaries to map InChI keys to residue
83 details and residue names to InChI keys. PDB coordinates are necesary to distinguish stereoisomers.
85 Args:
86 input_structure_file (File): The input structure file.
87 input_topology_file (Optional[File]): The input topology file.
88 structure (Structure): The Structure object containing residues.
90 Returns:
91 dict: A dictionary where keys are InChI keys and values are dictionaries containing:
92 - 'inchi' (str): The InChI string for the residue
93 - 'resindices' (list): A list of all residue indices with this InChI key
94 - 'resgroups' (list): Lists of residue indices that are connected as a single group
95 - 'resname' (set): Set of residue names associated with this InChI key
96 - 'classification' (set): Set of residue classifications for this InChI key
97 Notes:
98 The function also performs consistency checks, warning if multiple residue names
99 map to the same InChI key or if multiple InChI keys map to the same residue name,
100 which can indicate mismatched residue definitions or stereoisomers.
101 """
102 # 1) Prepare residue data for parallel processing (not yet implemented)
103 # First group residues that are bonded together
104 results = []
105 residues: List[Residue] = structure.residues
106 visited = set()
107 for resindex in range(len(residues)):
108 # Skip residues that have already been visited
109 if resindex in visited:
110 continue
111 # Residues grouped by bonded atoms
112 res_grp_idx = get_connected_residues(u.residues[resindex], visited)
113 classes = set([residues[grp_idx].classification for grp_idx in res_grp_idx])
114 # Skip residues that are aminoacids, nucleics, or too small
115 # We also skips residues connected to them: glicoprotein, lipid-anchored protein...
116 if any(cls in ['ion', 'solvent', 'dna', 'rna', 'protein'] for cls in classes):
117 continue
118 # Select residues atoms with MDAnalysis
119 res_atoms = u.residues[res_grp_idx].atoms
120 # Convert to RDKit and get InChI data
121 results.append(residue_to_inchi(res_atoms,res_grp_idx))
123 # 2) Process results and build dictionaries
124 key_2_name = {} # To see if different name for same residue
125 name_2_key = {} # To see if different residues under same name
126 for result in results:
127 inchikey, inchi, indexes = result
129 # If key don't existe we create the default entry with info that is only put once
130 if inchikey not in key_2_name:
131 key_2_name[inchikey] = {'inchi': inchi,
132 'resindices': [],
133 'resgroups': [], # For glucolipids
134 'resname': set(),
135 'classification': set()
136 }
137 # Add residue index to the list
138 key_2_name[inchikey]['resindices'].extend(indexes)
139 key_2_name[inchikey]['resgroups'].append(indexes)
140 # Add residue name to the list. For multi residues we join the names
141 resname = '-'.join(sorted([residues[index].name for index in indexes]))
142 key_2_name[inchikey]['resname'].add(resname)
143 # Add residue class to the list
144 if len(indexes) > 1:
145 classes = tuple(set([residues[index].classification for index in indexes]))
146 key_2_name[inchikey]['classification'].add(classes)
147 else:
148 key_2_name[inchikey]['classification'].add(residues[indexes[0]].classification)
150 # Incorrect residue name, estereoisomers, lose of atoms...
151 if resname not in name_2_key:
152 name_2_key[resname] = []
153 name_2_key[resname].append(inchikey)
155 # 3) Check data coherence
156 # Check if there are multiple names for the same InChI key
157 for inchikey, data in key_2_name.items():
158 if len(data['resname']) > 1:
159 warn('Same residue with different names:\n'
160 f'{inchikey} -> {str(data["resname"])}')
161 if len(data['classification']) > 1:
162 warn('Same residue with different classifications:\n'
163 f'{inchikey} + -> {str(data["classification"])} for names {str(data["resname"])}')
165 # Check if there are multiple InChI keys for the same name
166 for name, inchikeys in name_2_key.items():
167 inchikeys = set(inchikeys)
168 if len(inchikeys) > 1:
169 key_counts = '\n'.join([f'{key}: {len(key_2_name[key]["resindices"]): >4}. '
170 f'{key_2_name[key]["inchi"]}. '
171 f'Sample index: {key_2_name[key]["resindices"][0]}'
172 for key in inchikeys])
173 warn(f'The residue {name} has more than one InChi key:\n'
174 f'{key_counts}')
175 return key_2_name
178@lru_cache(maxsize=None)
179def is_in_LIPID_MAPS(inchikey, only_first_layer=False) -> dict:
180 """Search the InChi keys in LIPID MAPS"""
181 headers = {'accept': 'json'}
182 # https://www.lipidmaps.org/resources/rest
183 # Output item = physchem, is the only one that returns data for the inchi key
184 # for only the two first layers (main and atom connection)
185 # To see InChiKey layers: https://www.inchi-trust.org/
186 key = inchikey[:14] if only_first_layer else inchikey
187 url = f"https://www.lipidmaps.org/rest/compound/inchi_key/{key}/all"
188 response = requests.get(url, headers=headers)
189 if response.status_code == 200:
190 js = response.json()
191 if js != []:
192 return js
193 else:
194 return False
195 else:
196 print(f"Error for {inchikey}: {response.status_code}")
199# @lru_cache(maxsize=None)
200def is_in_swiss_lipids(inchikey, only_first_layer=False) -> dict:
201 """Search the InChi keys in LIPID MAPS"""
202 key = inchikey[:14] if only_first_layer else inchikey
203 headers = {'accept': 'json'}
204 url = f"https://www.swisslipids.org/api/index.php/advancedSearch?InChIkey={key}"
205 response = requests.get(url, headers=headers)
206 if response.status_code == 200:
207 return response.json()[0]
208 else:
209 return False
211def get_atoms_info(atom):
212 """Get the RDkit atom information."""
213 info = {
214 'índice': atom.GetIdx(),
215 'símbolo': atom.GetSymbol(),
216 'carga_formal': atom.GetFormalCharge(),
217 'carga_parcial': atom.GetDoubleProp('_GasteigerCharge') if atom.HasProp('_GasteigerCharge') else 'N/A',
218 'valencia': atom.GetTotalValence(),
219 'grado': atom.GetDegree(),
220 'hidrógenos_explícitos': atom.GetNumExplicitHs(),
221 'hidrógenos_implícitos': atom.GetNumImplicitHs(),
222 'aromático': atom.GetIsAromatic()
223 }
224 return info
227def inchi_2_mol(inchi, withHs=False, neutralize=False):
228 """Converts an InChI string to a RDKIT. Usefull to display them."""
229 mol = Chem.MolFromInchi(inchi)
231 if neutralize:
232 # Crear una copia editable de la molecula
233 rwmol = Chem.RWMol(mol)
234 for atom in mol.GetAtoms():
235 charge = atom.GetFormalCharge()
236 if charge:
237 idx = atom.GetIdx()
238 Hs = atom.GetNumExplicitHs()
240 rw_atom = rwmol.GetAtomWithIdx(idx)
241 rw_atom.SetFormalCharge(0)
242 rw_atom.SetNumExplicitHs(Hs-charge)
243 mol = rwmol
245 if withHs:
246 return Chem.AddHs(mol)
247 else:
248 return mol
251def Residue_2_Mol(res: 'Residue'):
252 "WiP"
253 mol = Chem.RWMol()
254 # map index in universe to index in mol
255 atom_mapper = {}
256 bonds = []
257 non_res_bonds = res.get_bonded_atom_indices()
258 for i, atom in enumerate(res.atoms):
259 # create atom
260 rdatom = Chem.Atom(atom.element)
261 # enable/disable adding implicit H to the molecule
262 rdatom.SetNoImplicit(True)
263 for bond_to in atom.bonds:
264 if bond_to in non_res_bonds:
265 continue
266 bond = sorted([atom.index, bond_to])
267 if bond not in bonds:
268 bonds.append(bond)
269 # add atom
270 index = mol.AddAtom(rdatom)
271 atom_mapper[atom.index] = index
272 for bond in bonds:
273 mol.AddBond(bond[0], atom_mapper[bond[1]], Chem.BondType.SINGLE)
275 # sanitize if possible
276 err = Chem.SanitizeMol(mol, catchErrors=True)
277 # infer bond orders and formal charges
278 MDAnalysis.converters.RDKit._infer_bo_and_charges(mol)
279 return mol