Coverage for model_workflow/tools/get_inchi_keys.py: 67%

127 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-23 10:54 +0000

1import MDAnalysis 

2from rdkit import Chem 

3from model_workflow.utils.structures import Structure 

4from model_workflow.utils.type_hints import * 

5from model_workflow.utils.warnings import warn 

6from functools import lru_cache 

7import requests 

8 

9 

10def get_connected_residues( 

11 residue: 'MDAnalysis.Residue', 

12 visited=None): 

13 """ 

14 Recursively finds all residues that are connected through bonds to the given residue. 

15 Parameters 

16 ---------- 

17 residue : MDAnalysis.core.groups.Residue 

18 The residue to start the search from 

19 visited : set, optional 

20 Set of already visited residue IDs to prevent infinite recursion.  

21 Default is None, which initializes an empty set. 

22 Returns 

23 ------- 

24 list 

25 A set containing the residue IDs of all residues that are connected  

26 through bonds to the input residue, either directly or indirectly. 

27 Notes 

28 ----- 

29 This function traverses the molecular structure recursively, following bonds  

30 between residues. It keeps track of visited residues to avoid cycles in the 

31 traversal. Both direct bonds between residues and indirect connections through 

32 other residues are included in the result. 

33 """ 

34 

35 if visited is None: 

36 visited = set() 

37 

38 # Add current residue to visited set 

39 visited.add(residue.resindex) 

40 

41 # Get direct external bonds 

42 external_bonds = set([residue.resindex]) 

43 for bond in residue.atoms.bonds: 

44 external_bonds.update(bond.atoms.resindices) 

45 

46 # Recursively check external bonds 

47 all_external_bonds = external_bonds.copy() 

48 u = residue.universe 

49 for ext_resid in external_bonds: 

50 if ext_resid not in visited: 

51 # Get external bonds for the connected residue 

52 recursive_bonds = get_connected_residues(u.residues[ext_resid], visited) 

53 all_external_bonds.update(recursive_bonds) 

54 

55 return list(all_external_bonds) 

56 

57 

58def residue_to_inchi(res_atoms: 'MDAnalysis.AtomGroup', 

59 resindex : int) -> Tuple[str, str, int]: 

60 """ 

61 Process a single residue to get its InChI key and related information. 

62 """ 

63 # Convert to RDKIT and get InChI data 

64 res_RD = res_atoms.convert_to("RDKIT") 

65 # Calculate InChI key and string 

66 inchikey = Chem.MolToInchiKey(res_RD) # slow step, 50% of time  

67 # rdinchi.MolToInchi so it doesnt print the warnings 

68 inchi, retcode, message, logs, aux = Chem.rdinchi.MolToInchi(res_RD) 

69 return (inchikey, inchi, resindex) 

70 

71 

72def get_inchi_keys ( 

73 u : 'MDAnalysis.Universe', 

74 structure : 'Structure' 

75) -> dict: 

76 """ 

77 Generate a dictionary mapping InChI keys to residue information for non-standard residues. 

78 

79 This function uses MDAnalysis to parse the input structure and topology files and identifies 

80 residues that are not classified as 'ion', 'solvent', 'nucleic', or 'protein'. For each 

81 identified residue, it converts the structure to RDKit format to obtain the InChI key 

82 and InChI string. The resulting data is stored in dictionaries to map InChI keys to residue 

83 details and residue names to InChI keys. PDB coordinates are necesary to distinguish stereoisomers. 

84 

85 Args: 

86 input_structure_file (File): The input structure file. 

87 input_topology_file (Optional[File]): The input topology file. 

88 structure (Structure): The Structure object containing residues. 

89 

90 Returns: 

91 dict: A dictionary where keys are InChI keys and values are dictionaries containing: 

92 - 'inchi' (str): The InChI string for the residue 

93 - 'resindices' (list): A list of all residue indices with this InChI key 

94 - 'resgroups' (list): Lists of residue indices that are connected as a single group 

95 - 'resname' (set): Set of residue names associated with this InChI key 

96 - 'classification' (set): Set of residue classifications for this InChI key 

97 Notes: 

98 The function also performs consistency checks, warning if multiple residue names  

99 map to the same InChI key or if multiple InChI keys map to the same residue name, 

100 which can indicate mismatched residue definitions or stereoisomers. 

101 """ 

102 # 1) Prepare residue data for parallel processing (not yet implemented) 

103 # First group residues that are bonded together 

104 results = [] 

105 residues: List[Residue] = structure.residues 

106 visited = set() 

107 for resindex in range(len(residues)): 

108 # Skip residues that have already been visited 

109 if resindex in visited: 

110 continue 

111 # Residues grouped by bonded atoms 

112 res_grp_idx = get_connected_residues(u.residues[resindex], visited) 

113 classes = set([residues[grp_idx].classification for grp_idx in res_grp_idx]) 

114 # Skip residues that are aminoacids, nucleics, or too small 

115 # We also skips residues connected to them: glicoprotein, lipid-anchored protein... 

116 if any(cls in ['ion', 'solvent', 'dna', 'rna', 'protein'] for cls in classes): 

117 continue 

118 # Select residues atoms with MDAnalysis 

119 res_atoms = u.residues[res_grp_idx].atoms 

120 # Convert to RDKit and get InChI data 

121 results.append(residue_to_inchi(res_atoms,res_grp_idx)) 

122 

123 # 2) Process results and build dictionaries 

124 key_2_name = {} # To see if different name for same residue 

125 name_2_key = {} # To see if different residues under same name 

126 for result in results: 

127 inchikey, inchi, indexes = result 

128 

129 # If key don't existe we create the default entry with info that is only put once 

130 if inchikey not in key_2_name: 

131 key_2_name[inchikey] = {'inchi': inchi, 

132 'resindices': [], 

133 'resgroups': [], # For glucolipids 

134 'resname': set(), 

135 'classification': set() 

136 } 

137 # Add residue index to the list 

138 key_2_name[inchikey]['resindices'].extend(indexes) 

139 key_2_name[inchikey]['resgroups'].append(indexes) 

140 # Add residue name to the list. For multi residues we join the names 

141 resname = '-'.join(sorted([residues[index].name for index in indexes])) 

142 key_2_name[inchikey]['resname'].add(resname) 

143 # Add residue class to the list 

144 if len(indexes) > 1: 

145 classes = tuple(set([residues[index].classification for index in indexes])) 

146 key_2_name[inchikey]['classification'].add(classes) 

147 else: 

148 key_2_name[inchikey]['classification'].add(residues[indexes[0]].classification) 

149 

150 # Incorrect residue name, estereoisomers, lose of atoms... 

151 if resname not in name_2_key: 

152 name_2_key[resname] = [] 

153 name_2_key[resname].append(inchikey) 

154 

155 # 3) Check data coherence 

156 # Check if there are multiple names for the same InChI key 

157 for inchikey, data in key_2_name.items(): 

158 if len(data['resname']) > 1: 

159 warn('Same residue with different names:\n' 

160 f'{inchikey} -> {str(data["resname"])}') 

161 if len(data['classification']) > 1: 

162 warn('Same residue with different classifications:\n' 

163 f'{inchikey} + -> {str(data["classification"])} for names {str(data["resname"])}') 

164 

165 # Check if there are multiple InChI keys for the same name 

166 for name, inchikeys in name_2_key.items(): 

167 inchikeys = set(inchikeys) 

168 if len(inchikeys) > 1: 

169 key_counts = '\n'.join([f'{key}: {len(key_2_name[key]["resindices"]): >4}. ' 

170 f'{key_2_name[key]["inchi"]}. ' 

171 f'Sample index: {key_2_name[key]["resindices"][0]}' 

172 for key in inchikeys]) 

173 warn(f'The residue {name} has more than one InChi key:\n' 

174 f'{key_counts}') 

175 return key_2_name 

176 

177 

178@lru_cache(maxsize=None) 

179def is_in_LIPID_MAPS(inchikey, only_first_layer=False) -> dict: 

180 """Search the InChi keys in LIPID MAPS""" 

181 headers = {'accept': 'json'} 

182 # https://www.lipidmaps.org/resources/rest 

183 # Output item = physchem, is the only one that returns data for the inchi key 

184 # for only the two first layers (main and atom connection) 

185 # To see InChiKey layers: https://www.inchi-trust.org/ 

186 key = inchikey[:14] if only_first_layer else inchikey 

187 url = f"https://www.lipidmaps.org/rest/compound/inchi_key/{key}/all" 

188 response = requests.get(url, headers=headers) 

189 if response.status_code == 200: 

190 js = response.json() 

191 if js != []: 

192 return js 

193 else: 

194 return False 

195 else: 

196 print(f"Error for {inchikey}: {response.status_code}") 

197 

198 

199# @lru_cache(maxsize=None) 

200def is_in_swiss_lipids(inchikey, only_first_layer=False) -> dict: 

201 """Search the InChi keys in LIPID MAPS""" 

202 key = inchikey[:14] if only_first_layer else inchikey 

203 headers = {'accept': 'json'} 

204 url = f"https://www.swisslipids.org/api/index.php/advancedSearch?InChIkey={key}" 

205 response = requests.get(url, headers=headers) 

206 if response.status_code == 200: 

207 return response.json()[0] 

208 else: 

209 return False 

210 

211def get_atoms_info(atom): 

212 """Get the RDkit atom information.""" 

213 info = { 

214 'índice': atom.GetIdx(), 

215 'símbolo': atom.GetSymbol(), 

216 'carga_formal': atom.GetFormalCharge(), 

217 'carga_parcial': atom.GetDoubleProp('_GasteigerCharge') if atom.HasProp('_GasteigerCharge') else 'N/A', 

218 'valencia': atom.GetTotalValence(), 

219 'grado': atom.GetDegree(), 

220 'hidrógenos_explícitos': atom.GetNumExplicitHs(), 

221 'hidrógenos_implícitos': atom.GetNumImplicitHs(), 

222 'aromático': atom.GetIsAromatic() 

223 } 

224 return info 

225 

226 

227def inchi_2_mol(inchi, withHs=False, neutralize=False): 

228 """Converts an InChI string to a RDKIT. Usefull to display them.""" 

229 mol = Chem.MolFromInchi(inchi) 

230 

231 if neutralize: 

232 # Crear una copia editable de la molecula 

233 rwmol = Chem.RWMol(mol) 

234 for atom in mol.GetAtoms(): 

235 charge = atom.GetFormalCharge() 

236 if charge: 

237 idx = atom.GetIdx() 

238 Hs = atom.GetNumExplicitHs() 

239 

240 rw_atom = rwmol.GetAtomWithIdx(idx) 

241 rw_atom.SetFormalCharge(0) 

242 rw_atom.SetNumExplicitHs(Hs-charge) 

243 mol = rwmol 

244 

245 if withHs: 

246 return Chem.AddHs(mol) 

247 else: 

248 return mol 

249 

250 

251def Residue_2_Mol(res: 'Residue'): 

252 "WiP" 

253 mol = Chem.RWMol() 

254 # map index in universe to index in mol 

255 atom_mapper = {} 

256 bonds = [] 

257 non_res_bonds = res.get_bonded_atom_indices() 

258 for i, atom in enumerate(res.atoms): 

259 # create atom 

260 rdatom = Chem.Atom(atom.element) 

261 # enable/disable adding implicit H to the molecule 

262 rdatom.SetNoImplicit(True) 

263 for bond_to in atom.bonds: 

264 if bond_to in non_res_bonds: 

265 continue 

266 bond = sorted([atom.index, bond_to]) 

267 if bond not in bonds: 

268 bonds.append(bond) 

269 # add atom 

270 index = mol.AddAtom(rdatom) 

271 atom_mapper[atom.index] = index 

272 for bond in bonds: 

273 mol.AddBond(bond[0], atom_mapper[bond[1]], Chem.BondType.SINGLE) 

274 

275 # sanitize if possible 

276 err = Chem.SanitizeMol(mol, catchErrors=True) 

277 # infer bond orders and formal charges 

278 MDAnalysis.converters.RDKit._infer_bo_and_charges(mol) 

279 return mol