Coverage for model_workflow/tools/get_inchi

1import MDAnalysis

2from rdkit import Chem

3from model_workflow.utils.structures import Structure

4from model_workflow.utils.type_hints import *

5from model_workflow.utils.warnings import warn

6from functools import lru_cache

7import requests

10def get_connected_residues(

11 residue: 'MDAnalysis.Residue',

12 visited=None):

13 """

14 Recursively finds all residues that are connected through bonds to the given residue.

15 Parameters

16 ----------

17 residue : MDAnalysis.core.groups.Residue

18 The residue to start the search from

19 visited : set, optional

20 Set of already visited residue IDs to prevent infinite recursion.

21 Default is None, which initializes an empty set.

22 Returns

23 -------

24 list

25 A set containing the residue IDs of all residues that are connected

26 through bonds to the input residue, either directly or indirectly.

27 Notes

28 -----

29 This function traverses the molecular structure recursively, following bonds

30 between residues. It keeps track of visited residues to avoid cycles in the

31 traversal. Both direct bonds between residues and indirect connections through

32 other residues are included in the result.

33 """

35 if visited is None:

36 visited = set()

38 # Add current residue to visited set

39 visited.add(residue.resindex)

41 # Get direct external bonds

42 external_bonds = set([residue.resindex])

43 for bond in residue.atoms.bonds:

44 external_bonds.update(bond.atoms.resindices)

46 # Recursively check external bonds

47 all_external_bonds = external_bonds.copy()

48 u = residue.universe

49 for ext_resid in external_bonds:

50 if ext_resid not in visited:

51 # Get external bonds for the connected residue

52 recursive_bonds = get_connected_residues(u.residues[ext_resid], visited)

53 all_external_bonds.update(recursive_bonds)

55 return list(all_external_bonds)

58def residue_to_inchi(res_atoms: 'MDAnalysis.AtomGroup',

59 resindex : int) -> Tuple[str, str, int]:

60 """

61 Process a single residue to get its InChI key and related information.

62 """

63 # Convert to RDKIT and get InChI data

64 res_RD = res_atoms.convert_to("RDKIT")

65 # Calculate InChI key and string

66 inchikey = Chem.MolToInchiKey(res_RD) # slow step, 50% of time

67 # rdinchi.MolToInchi so it doesnt print the warnings

68 inchi, retcode, message, logs, aux = Chem.rdinchi.MolToInchi(res_RD)

69 return (inchikey, inchi, resindex)

72def get_inchi_keys (

73 u : 'MDAnalysis.Universe',

74 structure : 'Structure'

75) -> dict:

76 """

77 Generate a dictionary mapping InChI keys to residue information for non-standard residues.

79 This function uses MDAnalysis to parse the input structure and topology files and identifies

80 residues that are not classified as 'ion', 'solvent', 'nucleic', or 'protein'. For each

81 identified residue, it converts the structure to RDKit format to obtain the InChI key

82 and InChI string. The resulting data is stored in dictionaries to map InChI keys to residue

83 details and residue names to InChI keys. PDB coordinates are necesary to distinguish stereoisomers.

85 Args:

86 input_structure_file (File): The input structure file.

87 input_topology_file (Optional[File]): The input topology file.

88 structure (Structure): The Structure object containing residues.

90 Returns:

91 dict: A dictionary where keys are InChI keys and values are dictionaries containing:

92 - 'inchi' (str): The InChI string for the residue

93 - 'resindices' (list): A list of all residue indices with this InChI key

94 - 'resgroups' (list): Lists of residue indices that are connected as a single group

95 - 'resname' (set): Set of residue names associated with this InChI key

96 - 'classification' (set): Set of residue classifications for this InChI key

97 Notes:

98 The function also performs consistency checks, warning if multiple residue names

99 map to the same InChI key or if multiple InChI keys map to the same residue name,

100 which can indicate mismatched residue definitions or stereoisomers.

101 """

102 # 1) Prepare residue data for parallel processing (not yet implemented)

103 # First group residues that are bonded together

104 results = []

105 residues: List[Residue] = structure.residues

106 visited = set()

107 for resindex in range(len(residues)):

108 # Skip residues that have already been visited

109 if resindex in visited:

110 continue

111 # Residues grouped by bonded atoms

112 res_grp_idx = get_connected_residues(u.residues[resindex], visited)

113 classes = set([residues[grp_idx].classification for grp_idx in res_grp_idx])

114 # Skip residues that are aminoacids, nucleics, or too small

115 # We also skips residues connected to them: glicoprotein, lipid-anchored protein...

116 if any(cls in ['ion', 'solvent', 'dna', 'rna', 'protein'] for cls in classes):

117 continue

118 # Select residues atoms with MDAnalysis

119 res_atoms = u.residues[res_grp_idx].atoms

120 # Convert to RDKit and get InChI data

121 results.append(residue_to_inchi(res_atoms,res_grp_idx))

122

123 # 2) Process results and build dictionaries

124 key_2_name = {} # To see if different name for same residue

125 name_2_key = {} # To see if different residues under same name

126 for result in results:

127 inchikey, inchi, indexes = result

128

129 # If key don't existe we create the default entry with info that is only put once

130 if inchikey not in key_2_name:

131 key_2_name[inchikey] = {'inchi': inchi,

132 'resindices': [],

133 'resgroups': [], # For glucolipids

134 'resname': set(),

135 'classification': set()

136 }

137 # Add residue index to the list

138 key_2_name[inchikey]['resindices'].extend(indexes)

139 key_2_name[inchikey]['resgroups'].append(indexes)

140 # Add residue name to the list. For multi residues we join the names

141 resname = '-'.join(sorted([residues[index].name for index in indexes]))

142 key_2_name[inchikey]['resname'].add(resname)

143 # Add residue class to the list

144 if len(indexes) > 1:

145 classes = tuple(set([residues[index].classification for index in indexes]))

146 key_2_name[inchikey]['classification'].add(classes)

147 else:

148 key_2_name[inchikey]['classification'].add(residues[indexes[0]].classification)

149

150 # Incorrect residue name, estereoisomers, lose of atoms...

151 if resname not in name_2_key:

152 name_2_key[resname] = []

153 name_2_key[resname].append(inchikey)

154

155 # 3) Check data coherence

156 # Check if there are multiple names for the same InChI key

157 for inchikey, data in key_2_name.items():

158 if len(data['resname']) > 1:

159 warn('Same residue with different names:\n'

160 f'{inchikey} -> {str(data["resname"])}')

161 if len(data['classification']) > 1:

162 warn('Same residue with different classifications:\n'

163 f'{inchikey} + -> {str(data["classification"])} for names {str(data["resname"])}')

164

165 # Check if there are multiple InChI keys for the same name

166 for name, inchikeys in name_2_key.items():

167 inchikeys = set(inchikeys)

168 if len(inchikeys) > 1:

169 key_counts = '\n'.join([f'{key}: {len(key_2_name[key]["resindices"]): >4}. '

170 f'{key_2_name[key]["inchi"]}. '

171 f'Sample index: {key_2_name[key]["resindices"][0]}'

172 for key in inchikeys])

173 warn(f'The residue {name} has more than one InChi key:\n'

174 f'{key_counts}')

175 return key_2_name

176

177

178@lru_cache(maxsize=None)

179def is_in_LIPID_MAPS(inchikey, only_first_layer=False) -> dict:

180 """Search the InChi keys in LIPID MAPS"""

181 headers = {'accept': 'json'}

182 # https://www.lipidmaps.org/resources/rest

183 # Output item = physchem, is the only one that returns data for the inchi key

184 # for only the two first layers (main and atom connection)

185 # To see InChiKey layers: https://www.inchi-trust.org/

186 key = inchikey[:14] if only_first_layer else inchikey

187 url = f"https://www.lipidmaps.org/rest/compound/inchi_key/{key}/all"

188 response = requests.get(url, headers=headers)

189 if response.status_code == 200:

190 js = response.json()

191 if js != []:

192 return js

193 else:

194 return False

195 else:

196 print(f"Error for {inchikey}: {response.status_code}")

197

198

199# @lru_cache(maxsize=None)

200def is_in_swiss_lipids(inchikey, only_first_layer=False) -> dict:

201 """Search the InChi keys in LIPID MAPS"""

202 key = inchikey[:14] if only_first_layer else inchikey

203 headers = {'accept': 'json'}

204 url = f"https://www.swisslipids.org/api/index.php/advancedSearch?InChIkey={key}"

205 response = requests.get(url, headers=headers)

206 if response.status_code == 200:

207 return response.json()[0]

208 else:

209 return False

210

211def get_atoms_info(atom):

212 """Get the RDkit atom information."""

213 info = {

214 'índice': atom.GetIdx(),

215 'símbolo': atom.GetSymbol(),

216 'carga_formal': atom.GetFormalCharge(),

217 'carga_parcial': atom.GetDoubleProp('_GasteigerCharge') if atom.HasProp('_GasteigerCharge') else 'N/A',

218 'valencia': atom.GetTotalValence(),

219 'grado': atom.GetDegree(),

220 'hidrógenos_explícitos': atom.GetNumExplicitHs(),

221 'hidrógenos_implícitos': atom.GetNumImplicitHs(),

222 'aromático': atom.GetIsAromatic()

223 }

224 return info

225

226

227def inchi_2_mol(inchi, withHs=False, neutralize=False):

228 """Converts an InChI string to a RDKIT. Usefull to display them."""

229 mol = Chem.MolFromInchi(inchi)

230

231 if neutralize:

232 # Crear una copia editable de la molecula

233 rwmol = Chem.RWMol(mol)

234 for atom in mol.GetAtoms():

235 charge = atom.GetFormalCharge()

236 if charge:

237 idx = atom.GetIdx()

238 Hs = atom.GetNumExplicitHs()

239

240 rw_atom = rwmol.GetAtomWithIdx(idx)

241 rw_atom.SetFormalCharge(0)

242 rw_atom.SetNumExplicitHs(Hs-charge)

243 mol = rwmol

244

245 if withHs:

246 return Chem.AddHs(mol)

247 else:

248 return mol

249

250

251def Residue_2_Mol(res: 'Residue'):

252 "WiP"

253 mol = Chem.RWMol()

254 # map index in universe to index in mol

255 atom_mapper = {}

256 bonds = []

257 non_res_bonds = res.get_bonded_atom_indices()

258 for i, atom in enumerate(res.atoms):

259 # create atom

260 rdatom = Chem.Atom(atom.element)

261 # enable/disable adding implicit H to the molecule

262 rdatom.SetNoImplicit(True)

263 for bond_to in atom.bonds:

264 if bond_to in non_res_bonds:

265 continue

266 bond = sorted([atom.index, bond_to])

267 if bond not in bonds:

268 bonds.append(bond)

269 # add atom

270 index = mol.AddAtom(rdatom)

271 atom_mapper[atom.index] = index

272 for bond in bonds:

273 mol.AddBond(bond[0], atom_mapper[bond[1]], Chem.BondType.SINGLE)

274

275 # sanitize if possible

276 err = Chem.SanitizeMol(mol, catchErrors=True)

277 # infer bond orders and formal charges

278 MDAnalysis.converters.RDKit._infer_bo_and_charges(mol)

279 return mol

Coverage for model_workflow/tools/get_inchi_keys.py: 67%

127 statements