Coverage for mddb_workflow/tools/get_inchi_keys.py: 54%

120 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-29 15:48 +0000

1import MDAnalysis 

2import multiprocessing 

3from rdkit import Chem 

4from mddb_workflow.utils.structures import Structure 

5from mddb_workflow.utils.auxiliar import warn 

6from mddb_workflow.utils.type_hints import * 

7 

8 

9def get_inchikeys ( 

10 universe : 'MDAnalysis.Universe', 

11 structure : 'Structure', 

12 skip_class = {'ion', 'solvent'}, 

13) -> dict: 

14 """ 

15 Generate a dictionary mapping InChI keys to residue information for non-standard residues. 

16 

17 This function uses MDAnalysis to parse the input structure and topology files and identifies 

18 residues that are not classified as 'ion', 'solvent', 'nucleic', or 'protein'. For each 

19 identified residue, it converts the structure to RDKit format to obtain the InChI key 

20 and InChI string. The resulting data is stored in dictionaries to map InChI keys to residue 

21 details and residue names to InChI keys. PDB coordinates are necesary to distinguish stereoisomers. 

22 

23 Args: 

24 universe (Universe): The MDAnalysis Universe object containing the structure and topology. 

25 structure (Structure): The Structure object containing residues. 

26 skip_class (set): A set of residue classifications to skip. Defaults to {'ion', 'solvent'}. 

27 

28 Returns: 

29 key_2_name (dict): A dictionary where keys are InChI keys and values are dictionaries containing: 

30 - 'inchi' (str): The InChI string for the residue 

31 - 'resindices' (list): A list of all residue indices with this InChI key 

32 - 'fragments' (list[list]): Lists of residue indices that are connected as a single group 

33 - 'frag_len' (int): Length of the fragments. 1 if no fragments are present. 

34 - 'resname' (set): Set of residue names associated with this InChI key 

35 - 'classification' (set): Set of residue classifications for this InChI key 

36 residx_2_key (dict): A dictionary mapping residue indices to their corresponding InChI keys. 

37  

38 Notes: 

39 The function also performs consistency checks, warning if multiple residue names  

40 map to the same InChI key or if multiple InChI keys map to the same residue name, 

41 which can indicate mismatched residue definitions or stereoisomers. 

42 """ 

43 try: 

44 universe.universe.atoms.charges 

45 except: 

46 warn('Topology file does not have charges, InChI keys may be unreliable.') 

47 

48 # 1) Prepare residue data for parallel processing 

49 # First group residues that are bonded together 

50 tasks = [] 

51 residues = structure.residues 

52 

53 # Fragment = residues that are bonded together 

54 fragments = universe.atoms.fragments 

55 for i, fragment in enumerate(fragments): 

56 resindices = fragment.residues.resindices.tolist() 

57 

58 # Continue to the next fragment if any of its 

59 # residues are of a disallowed classification 

60 classes = {residues[resindex].classification for resindex in resindices} 

61 if (classes.intersection(skip_class) or 

62 (classes.intersection({'dna', 'rna', 'protein'}) and len(resindices) > 1)): 

63 continue 

64 

65 # Select residues atoms with MDAnalysis 

66 resatoms = universe.residues[resindices].atoms 

67 if 'Cg' in resatoms.types: 

68 # Skip coarse grain residues 

69 continue 

70 # If you pass a residue selection to a parallel worker, you a passing a whole MDAnalysis 

71 # universe, slowing the process down because you have to pickle the object 

72 # To avoid this we create 

73 resatoms = MDAnalysis.Merge(resatoms).universe.atoms 

74 # Convert to RDKit and get InChI data 

75 tasks.append((resatoms, resindices)) 

76 

77 results = [] 

78 # Execute tasks in parallel 

79 with multiprocessing.Pool() as pool: 

80 results = pool.map(residue_to_inchi, tasks) 

81 

82 # 2) Process results and build dictionaries 

83 key_2_name = {} # To see if different name for same residue 

84 name_2_key = {} # To see if different residues under same name 

85 residx_2_key = {} # A mapping from residue index to InChI key 

86 for (inchikey, inchi, resindices) in results: 

87 

88 # If key don't existe we create the default entry with info that is only put once 

89 if inchikey not in key_2_name: 

90 key_2_name[inchikey] = {'inchi': inchi, 

91 'resindices': [], 

92 'fragments': [], # For glucolipids 

93 'resname': set(), 

94 'classification': set() 

95 } 

96 # Add residue index to the list 

97 key_2_name[inchikey]['resindices'].extend(resindices) 

98 # Add residue name to the list. For multi residues we join the names 

99 resname = '-'.join(sorted([residues[index].name for index in resindices])) 

100 key_2_name[inchikey]['resname'].add(resname) 

101 # Add residue class to the list 

102 if len(resindices) > 1: 

103 classes = tuple(set([residues[index].classification for index in resindices])) 

104 key_2_name[inchikey]['classification'].add(classes) 

105 # Glucolipids saved the groups of residues the form a 'fragment' to solve a  

106 # problem with FATSLiM later (ex: A01IR, A01J5) 

107 key_2_name[inchikey]['fragments'].append(list(map(int, resindices))) 

108 else: 

109 key_2_name[inchikey]['classification'].add(residues[resindices[0]].classification) 

110 

111 # Incorrect residue name, estereoisomers, lose of atoms... 

112 if resname not in name_2_key: 

113 name_2_key[resname] = [] 

114 name_2_key[resname].append(inchikey) 

115 # Add the InChI key to the residue index mapping 

116 for resindex in resindices: 

117 residx_2_key[resindex] = inchikey 

118 

119 # 3) Check data coherence 

120 for inchikey, data in key_2_name.items(): 

121 # Check if there are multiple names for the same InChI key  

122 if len(data['resname']) > 1: 

123 warn('Same residue with different names:\n' 

124 f'{inchikey} -> {str(data["resname"])}') 

125 # Check if there are multiple classifications for the same InChI key 

126 if len(data['classification']) > 1: 

127 warn('Same residue with different classifications:\n' 

128 f'{inchikey} + -> {str(data["classification"])} for names {str(data["resname"])}') 

129 # Check if there are multiple fragments length for the same InChI key 

130 if len(data['fragments']) == 0: 

131 data['frag_len'] = 1 

132 else: 

133 frag_len = len(set([len(fragment) for fragment in data['fragments']])) 

134 assert frag_len == 1, \ 

135 f'Fragments of different lengths for InChI key {inchikey}: {str(frag_len)}' 

136 data['frag_len'] = frag_len 

137 

138 # Check if there are multiple InChI keys for the same name 

139 for name, inchikeys in name_2_key.items(): 

140 inchikeys = list(set(inchikeys)) 

141 if len(inchikeys) < 2: continue 

142 counts = {} 

143 # Count the number of fragments  

144 for key in inchikeys: 

145 # If there are not fragments, we use the number of residues 

146 counts[key] = (key_2_name[key]["frag_len"] 

147 if key_2_name[key]["frag_len"] > 1 

148 else len(key_2_name[key]["resindices"])) 

149 # Format the counts for printing 

150 key_counts = '\n'.join([f'\t{key}: {counts[key]: >4}' for key in inchikeys]) 

151 warn(f'The fragment {name} has more than one InChi key:\n' 

152 f'{key_counts}') 

153 

154 # Convert sets to lists for JSON serialization 

155 for inchikey in key_2_name: 

156 key_2_name[inchikey]['resname'] = list(key_2_name[inchikey]['resname']) 

157 key_2_name[inchikey]['classification'] = list(key_2_name[inchikey]['classification']) 

158 

159 return { 'key_2_name': key_2_name, 'residx_2_key': residx_2_key } 

160 

161def residue_to_inchi(task: tuple['MDAnalysis.AtomGroup', int]) -> tuple[str, str, int]: 

162 """ 

163 Process a single residue to get its InChI key and related information. 

164 """ 

165 resatoms , resindices = task 

166 # Convert to RDKIT and get InChI data 

167 res_RD = resatoms.convert_to("RDKIT") 

168 # Calculate InChI key and string 

169 inchikey = Chem.MolToInchiKey(res_RD) # slow step, 50% of time  

170 # rdinchi.MolToInchi so it doesnt print the warnings 

171 inchi, retcode, message, logs, aux = Chem.rdinchi.MolToInchi(res_RD) 

172 return (inchikey, inchi, resindices) 

173 

174def obtain_inchikey_from_pdb(pdb_file : str) -> Optional[str]: 

175 mol = Chem.MolFromPDBFile(pdb_file, sanitize=True) 

176 if mol is None: 

177 print("❌ InchiKey couldn't be constructed from PDB file.") 

178 return None 

179 return Chem.inchi.MolToInchiKey(mol) 

180 

181def get_atoms_info(atom): 

182 """Get the RDkit atom information.""" 

183 info = { 

184 'índice': atom.GetIdx(), 

185 'símbolo': atom.GetSymbol(), 

186 'carga_formal': atom.GetFormalCharge(), 

187 'carga_parcial': atom.GetDoubleProp('_GasteigerCharge') if atom.HasProp('_GasteigerCharge') else 'N/A', 

188 'valencia': atom.GetTotalValence(), 

189 'grado': atom.GetDegree(), 

190 'hidrógenos_explícitos': atom.GetNumExplicitHs(), 

191 'hidrógenos_implícitos': atom.GetNumImplicitHs(), 

192 'aromático': atom.GetIsAromatic() 

193 } 

194 return info 

195 

196def inchi_2_mol(inchi, withHs=False, neutralize=False): 

197 """Converts an InChI string to a RDKIT. Usefull to display them.""" 

198 mol = Chem.MolFromInchi(inchi) 

199 

200 if neutralize: 

201 # Crear una copia editable de la molecula 

202 rwmol = Chem.RWMol(mol) 

203 for atom in mol.GetAtoms(): 

204 charge = atom.GetFormalCharge() 

205 if charge: 

206 idx = atom.GetIdx() 

207 Hs = atom.GetNumExplicitHs() 

208 rw_atom = rwmol.GetAtomWithIdx(idx) 

209 rw_atom.SetFormalCharge(0) 

210 rw_atom.SetNumExplicitHs(Hs-charge) 

211 mol = rwmol 

212 

213 if withHs: 

214 return Chem.AddHs(mol) 

215 else: 

216 return mol 

217 

218def Residue_2_Mol(res: 'Residue'): 

219 "WiP" 

220 mol = Chem.RWMol() 

221 # map index in universe to index in mol 

222 atom_mapper = {} 

223 bonds = [] 

224 non_res_bonds = res.get_bonded_atom_indices() 

225 for i, atom in enumerate(res.atoms): 

226 # create atom 

227 rdatom = Chem.Atom(atom.element) 

228 # enable/disable adding implicit H to the molecule 

229 rdatom.SetNoImplicit(True) 

230 for bond_to in atom.bonds: 

231 if bond_to in non_res_bonds: 

232 continue 

233 bond = sorted([atom.index, bond_to]) 

234 if bond not in bonds: 

235 bonds.append(bond) 

236 # add atom 

237 index = mol.AddAtom(rdatom) 

238 atom_mapper[atom.index] = index 

239 for bond in bonds: 

240 mol.AddBond(bond[0], atom_mapper[bond[1]], Chem.BondType.SINGLE) 

241 

242 # sanitize if possible 

243 err = Chem.SanitizeMol(mol, catchErrors=True) 

244 # infer bond orders and formal charges 

245 MDAnalysis.converters.RDKit._infer_bo_and_charges(mol) 

246 return mol