Coverage for mddb_workflow/tools/get_inchi_keys.py: 54%
120 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-29 15:48 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-29 15:48 +0000
1import MDAnalysis
2import multiprocessing
3from rdkit import Chem
4from mddb_workflow.utils.structures import Structure
5from mddb_workflow.utils.auxiliar import warn
6from mddb_workflow.utils.type_hints import *
9def get_inchikeys (
10 universe : 'MDAnalysis.Universe',
11 structure : 'Structure',
12 skip_class = {'ion', 'solvent'},
13) -> dict:
14 """
15 Generate a dictionary mapping InChI keys to residue information for non-standard residues.
17 This function uses MDAnalysis to parse the input structure and topology files and identifies
18 residues that are not classified as 'ion', 'solvent', 'nucleic', or 'protein'. For each
19 identified residue, it converts the structure to RDKit format to obtain the InChI key
20 and InChI string. The resulting data is stored in dictionaries to map InChI keys to residue
21 details and residue names to InChI keys. PDB coordinates are necesary to distinguish stereoisomers.
23 Args:
24 universe (Universe): The MDAnalysis Universe object containing the structure and topology.
25 structure (Structure): The Structure object containing residues.
26 skip_class (set): A set of residue classifications to skip. Defaults to {'ion', 'solvent'}.
28 Returns:
29 key_2_name (dict): A dictionary where keys are InChI keys and values are dictionaries containing:
30 - 'inchi' (str): The InChI string for the residue
31 - 'resindices' (list): A list of all residue indices with this InChI key
32 - 'fragments' (list[list]): Lists of residue indices that are connected as a single group
33 - 'frag_len' (int): Length of the fragments. 1 if no fragments are present.
34 - 'resname' (set): Set of residue names associated with this InChI key
35 - 'classification' (set): Set of residue classifications for this InChI key
36 residx_2_key (dict): A dictionary mapping residue indices to their corresponding InChI keys.
38 Notes:
39 The function also performs consistency checks, warning if multiple residue names
40 map to the same InChI key or if multiple InChI keys map to the same residue name,
41 which can indicate mismatched residue definitions or stereoisomers.
42 """
43 try:
44 universe.universe.atoms.charges
45 except:
46 warn('Topology file does not have charges, InChI keys may be unreliable.')
48 # 1) Prepare residue data for parallel processing
49 # First group residues that are bonded together
50 tasks = []
51 residues = structure.residues
53 # Fragment = residues that are bonded together
54 fragments = universe.atoms.fragments
55 for i, fragment in enumerate(fragments):
56 resindices = fragment.residues.resindices.tolist()
58 # Continue to the next fragment if any of its
59 # residues are of a disallowed classification
60 classes = {residues[resindex].classification for resindex in resindices}
61 if (classes.intersection(skip_class) or
62 (classes.intersection({'dna', 'rna', 'protein'}) and len(resindices) > 1)):
63 continue
65 # Select residues atoms with MDAnalysis
66 resatoms = universe.residues[resindices].atoms
67 if 'Cg' in resatoms.types:
68 # Skip coarse grain residues
69 continue
70 # If you pass a residue selection to a parallel worker, you a passing a whole MDAnalysis
71 # universe, slowing the process down because you have to pickle the object
72 # To avoid this we create
73 resatoms = MDAnalysis.Merge(resatoms).universe.atoms
74 # Convert to RDKit and get InChI data
75 tasks.append((resatoms, resindices))
77 results = []
78 # Execute tasks in parallel
79 with multiprocessing.Pool() as pool:
80 results = pool.map(residue_to_inchi, tasks)
82 # 2) Process results and build dictionaries
83 key_2_name = {} # To see if different name for same residue
84 name_2_key = {} # To see if different residues under same name
85 residx_2_key = {} # A mapping from residue index to InChI key
86 for (inchikey, inchi, resindices) in results:
88 # If key don't existe we create the default entry with info that is only put once
89 if inchikey not in key_2_name:
90 key_2_name[inchikey] = {'inchi': inchi,
91 'resindices': [],
92 'fragments': [], # For glucolipids
93 'resname': set(),
94 'classification': set()
95 }
96 # Add residue index to the list
97 key_2_name[inchikey]['resindices'].extend(resindices)
98 # Add residue name to the list. For multi residues we join the names
99 resname = '-'.join(sorted([residues[index].name for index in resindices]))
100 key_2_name[inchikey]['resname'].add(resname)
101 # Add residue class to the list
102 if len(resindices) > 1:
103 classes = tuple(set([residues[index].classification for index in resindices]))
104 key_2_name[inchikey]['classification'].add(classes)
105 # Glucolipids saved the groups of residues the form a 'fragment' to solve a
106 # problem with FATSLiM later (ex: A01IR, A01J5)
107 key_2_name[inchikey]['fragments'].append(list(map(int, resindices)))
108 else:
109 key_2_name[inchikey]['classification'].add(residues[resindices[0]].classification)
111 # Incorrect residue name, estereoisomers, lose of atoms...
112 if resname not in name_2_key:
113 name_2_key[resname] = []
114 name_2_key[resname].append(inchikey)
115 # Add the InChI key to the residue index mapping
116 for resindex in resindices:
117 residx_2_key[resindex] = inchikey
119 # 3) Check data coherence
120 for inchikey, data in key_2_name.items():
121 # Check if there are multiple names for the same InChI key
122 if len(data['resname']) > 1:
123 warn('Same residue with different names:\n'
124 f'{inchikey} -> {str(data["resname"])}')
125 # Check if there are multiple classifications for the same InChI key
126 if len(data['classification']) > 1:
127 warn('Same residue with different classifications:\n'
128 f'{inchikey} + -> {str(data["classification"])} for names {str(data["resname"])}')
129 # Check if there are multiple fragments length for the same InChI key
130 if len(data['fragments']) == 0:
131 data['frag_len'] = 1
132 else:
133 frag_len = len(set([len(fragment) for fragment in data['fragments']]))
134 assert frag_len == 1, \
135 f'Fragments of different lengths for InChI key {inchikey}: {str(frag_len)}'
136 data['frag_len'] = frag_len
138 # Check if there are multiple InChI keys for the same name
139 for name, inchikeys in name_2_key.items():
140 inchikeys = list(set(inchikeys))
141 if len(inchikeys) < 2: continue
142 counts = {}
143 # Count the number of fragments
144 for key in inchikeys:
145 # If there are not fragments, we use the number of residues
146 counts[key] = (key_2_name[key]["frag_len"]
147 if key_2_name[key]["frag_len"] > 1
148 else len(key_2_name[key]["resindices"]))
149 # Format the counts for printing
150 key_counts = '\n'.join([f'\t{key}: {counts[key]: >4}' for key in inchikeys])
151 warn(f'The fragment {name} has more than one InChi key:\n'
152 f'{key_counts}')
154 # Convert sets to lists for JSON serialization
155 for inchikey in key_2_name:
156 key_2_name[inchikey]['resname'] = list(key_2_name[inchikey]['resname'])
157 key_2_name[inchikey]['classification'] = list(key_2_name[inchikey]['classification'])
159 return { 'key_2_name': key_2_name, 'residx_2_key': residx_2_key }
161def residue_to_inchi(task: tuple['MDAnalysis.AtomGroup', int]) -> tuple[str, str, int]:
162 """
163 Process a single residue to get its InChI key and related information.
164 """
165 resatoms , resindices = task
166 # Convert to RDKIT and get InChI data
167 res_RD = resatoms.convert_to("RDKIT")
168 # Calculate InChI key and string
169 inchikey = Chem.MolToInchiKey(res_RD) # slow step, 50% of time
170 # rdinchi.MolToInchi so it doesnt print the warnings
171 inchi, retcode, message, logs, aux = Chem.rdinchi.MolToInchi(res_RD)
172 return (inchikey, inchi, resindices)
174def obtain_inchikey_from_pdb(pdb_file : str) -> Optional[str]:
175 mol = Chem.MolFromPDBFile(pdb_file, sanitize=True)
176 if mol is None:
177 print("❌ InchiKey couldn't be constructed from PDB file.")
178 return None
179 return Chem.inchi.MolToInchiKey(mol)
181def get_atoms_info(atom):
182 """Get the RDkit atom information."""
183 info = {
184 'índice': atom.GetIdx(),
185 'símbolo': atom.GetSymbol(),
186 'carga_formal': atom.GetFormalCharge(),
187 'carga_parcial': atom.GetDoubleProp('_GasteigerCharge') if atom.HasProp('_GasteigerCharge') else 'N/A',
188 'valencia': atom.GetTotalValence(),
189 'grado': atom.GetDegree(),
190 'hidrógenos_explícitos': atom.GetNumExplicitHs(),
191 'hidrógenos_implícitos': atom.GetNumImplicitHs(),
192 'aromático': atom.GetIsAromatic()
193 }
194 return info
196def inchi_2_mol(inchi, withHs=False, neutralize=False):
197 """Converts an InChI string to a RDKIT. Usefull to display them."""
198 mol = Chem.MolFromInchi(inchi)
200 if neutralize:
201 # Crear una copia editable de la molecula
202 rwmol = Chem.RWMol(mol)
203 for atom in mol.GetAtoms():
204 charge = atom.GetFormalCharge()
205 if charge:
206 idx = atom.GetIdx()
207 Hs = atom.GetNumExplicitHs()
208 rw_atom = rwmol.GetAtomWithIdx(idx)
209 rw_atom.SetFormalCharge(0)
210 rw_atom.SetNumExplicitHs(Hs-charge)
211 mol = rwmol
213 if withHs:
214 return Chem.AddHs(mol)
215 else:
216 return mol
218def Residue_2_Mol(res: 'Residue'):
219 "WiP"
220 mol = Chem.RWMol()
221 # map index in universe to index in mol
222 atom_mapper = {}
223 bonds = []
224 non_res_bonds = res.get_bonded_atom_indices()
225 for i, atom in enumerate(res.atoms):
226 # create atom
227 rdatom = Chem.Atom(atom.element)
228 # enable/disable adding implicit H to the molecule
229 rdatom.SetNoImplicit(True)
230 for bond_to in atom.bonds:
231 if bond_to in non_res_bonds:
232 continue
233 bond = sorted([atom.index, bond_to])
234 if bond not in bonds:
235 bonds.append(bond)
236 # add atom
237 index = mol.AddAtom(rdatom)
238 atom_mapper[atom.index] = index
239 for bond in bonds:
240 mol.AddBond(bond[0], atom_mapper[bond[1]], Chem.BondType.SINGLE)
242 # sanitize if possible
243 err = Chem.SanitizeMol(mol, catchErrors=True)
244 # infer bond orders and formal charges
245 MDAnalysis.converters.RDKit._infer_bo_and_charges(mol)
246 return mol