Coverage for mddb_workflow/tools/get_inchi

1import MDAnalysis

2import multiprocessing

3from rdkit import Chem

4from mddb_workflow.utils.structures import Structure

5from mddb_workflow.utils.auxiliar import warn

6from mddb_workflow.utils.type_hints import *

9def get_inchikeys (

10 universe : 'MDAnalysis.Universe',

11 structure : 'Structure',

12 skip_class = {'ion', 'solvent'},

13) -> dict:

14 """

15 Generate a dictionary mapping InChI keys to residue information for non-standard residues.

17 This function uses MDAnalysis to parse the input structure and topology files and identifies

18 residues that are not classified as 'ion', 'solvent', 'nucleic', or 'protein'. For each

19 identified residue, it converts the structure to RDKit format to obtain the InChI key

20 and InChI string. The resulting data is stored in dictionaries to map InChI keys to residue

21 details and residue names to InChI keys. PDB coordinates are necesary to distinguish stereoisomers.

23 Args:

24 universe (Universe): The MDAnalysis Universe object containing the structure and topology.

25 structure (Structure): The Structure object containing residues.

26 skip_class (set): A set of residue classifications to skip. Defaults to {'ion', 'solvent'}.

28 Returns:

29 key_2_name (dict): A dictionary where keys are InChI keys and values are dictionaries containing:

30 - 'inchi' (str): The InChI string for the residue

31 - 'resindices' (list): A list of all residue indices with this InChI key

32 - 'fragments' (list[list]): Lists of residue indices that are connected as a single group

33 - 'frag_len' (int): Length of the fragments. 1 if no fragments are present.

34 - 'resname' (set): Set of residue names associated with this InChI key

35 - 'classification' (set): Set of residue classifications for this InChI key

36 residx_2_key (dict): A dictionary mapping residue indices to their corresponding InChI keys.

38 Notes:

39 The function also performs consistency checks, warning if multiple residue names

40 map to the same InChI key or if multiple InChI keys map to the same residue name,

41 which can indicate mismatched residue definitions or stereoisomers.

42 """

43 try:

44 universe.universe.atoms.charges

45 except:

46 warn('Topology file does not have charges, InChI keys may be unreliable.')

48 # 1) Prepare residue data for parallel processing

49 # First group residues that are bonded together

50 tasks = []

51 residues = structure.residues

53 # Fragment = residues that are bonded together

54 fragments = universe.atoms.fragments

55 for i, fragment in enumerate(fragments):

56 resindices = fragment.residues.resindices.tolist()

58 # Continue to the next fragment if any of its

59 # residues are of a disallowed classification

60 classes = {residues[resindex].classification for resindex in resindices}

61 if (classes.intersection(skip_class) or

62 (classes.intersection({'dna', 'rna', 'protein'}) and len(resindices) > 1)):

63 continue

65 # Select residues atoms with MDAnalysis

66 resatoms = universe.residues[resindices].atoms

67 if 'Cg' in resatoms.types:

68 # Skip coarse grain residues

69 continue

70 # If you pass a residue selection to a parallel worker, you a passing a whole MDAnalysis

71 # universe, slowing the process down because you have to pickle the object

72 # To avoid this we create

73 resatoms = MDAnalysis.Merge(resatoms).universe.atoms

74 # Convert to RDKit and get InChI data

75 tasks.append((resatoms, resindices))

77 results = []

78 # Execute tasks in parallel

79 with multiprocessing.Pool() as pool:

80 results = pool.map(residue_to_inchi, tasks)

82 # 2) Process results and build dictionaries

83 key_2_name = {} # To see if different name for same residue

84 name_2_key = {} # To see if different residues under same name

85 residx_2_key = {} # A mapping from residue index to InChI key

86 for (inchikey, inchi, resindices) in results:

88 # If key don't existe we create the default entry with info that is only put once

89 if inchikey not in key_2_name:

90 key_2_name[inchikey] = {'inchi': inchi,

91 'resindices': [],

92 'fragments': [], # For glucolipids

93 'resname': set(),

94 'classification': set()

95 }

96 # Add residue index to the list

97 key_2_name[inchikey]['resindices'].extend(resindices)

98 # Add residue name to the list. For multi residues we join the names

99 resname = '-'.join(sorted([residues[index].name for index in resindices]))

100 key_2_name[inchikey]['resname'].add(resname)

101 # Add residue class to the list

102 if len(resindices) > 1:

103 classes = tuple(set([residues[index].classification for index in resindices]))

104 key_2_name[inchikey]['classification'].add(classes)

105 # Glucolipids saved the groups of residues the form a 'fragment' to solve a

106 # problem with FATSLiM later (ex: A01IR, A01J5)

107 key_2_name[inchikey]['fragments'].append(list(map(int, resindices)))

108 else:

109 key_2_name[inchikey]['classification'].add(residues[resindices[0]].classification)

110

111 # Incorrect residue name, estereoisomers, lose of atoms...

112 if resname not in name_2_key:

113 name_2_key[resname] = []

114 name_2_key[resname].append(inchikey)

115 # Add the InChI key to the residue index mapping

116 for resindex in resindices:

117 residx_2_key[resindex] = inchikey

118

119 # 3) Check data coherence

120 for inchikey, data in key_2_name.items():

121 # Check if there are multiple names for the same InChI key

122 if len(data['resname']) > 1:

123 warn('Same residue with different names:\n'

124 f'{inchikey} -> {str(data["resname"])}')

125 # Check if there are multiple classifications for the same InChI key

126 if len(data['classification']) > 1:

127 warn('Same residue with different classifications:\n'

128 f'{inchikey} + -> {str(data["classification"])} for names {str(data["resname"])}')

129 # Check if there are multiple fragments length for the same InChI key

130 if len(data['fragments']) == 0:

131 data['frag_len'] = 1

132 else:

133 frag_len = len(set([len(fragment) for fragment in data['fragments']]))

134 assert frag_len == 1, \

135 f'Fragments of different lengths for InChI key {inchikey}: {str(frag_len)}'

136 data['frag_len'] = frag_len

137

138 # Check if there are multiple InChI keys for the same name

139 for name, inchikeys in name_2_key.items():

140 inchikeys = list(set(inchikeys))

141 if len(inchikeys) < 2: continue

142 counts = {}

143 # Count the number of fragments

144 for key in inchikeys:

145 # If there are not fragments, we use the number of residues

146 counts[key] = (key_2_name[key]["frag_len"]

147 if key_2_name[key]["frag_len"] > 1

148 else len(key_2_name[key]["resindices"]))

149 # Format the counts for printing

150 key_counts = '\n'.join([f'\t{key}: {counts[key]: >4}' for key in inchikeys])

151 warn(f'The fragment {name} has more than one InChi key:\n'

152 f'{key_counts}')

153

154 # Convert sets to lists for JSON serialization

155 for inchikey in key_2_name:

156 key_2_name[inchikey]['resname'] = list(key_2_name[inchikey]['resname'])

157 key_2_name[inchikey]['classification'] = list(key_2_name[inchikey]['classification'])

158

159 return { 'key_2_name': key_2_name, 'residx_2_key': residx_2_key }

160

161def residue_to_inchi(task: tuple['MDAnalysis.AtomGroup', int]) -> tuple[str, str, int]:

162 """

163 Process a single residue to get its InChI key and related information.

164 """

165 resatoms , resindices = task

166 # Convert to RDKIT and get InChI data

167 res_RD = resatoms.convert_to("RDKIT")

168 # Calculate InChI key and string

169 inchikey = Chem.MolToInchiKey(res_RD) # slow step, 50% of time

170 # rdinchi.MolToInchi so it doesnt print the warnings

171 inchi, retcode, message, logs, aux = Chem.rdinchi.MolToInchi(res_RD)

172 return (inchikey, inchi, resindices)

173

174def obtain_inchikey_from_pdb(pdb_file : str) -> Optional[str]:

175 mol = Chem.MolFromPDBFile(pdb_file, sanitize=True)

176 if mol is None:

177 print("❌ InchiKey couldn't be constructed from PDB file.")

178 return None

179 return Chem.inchi.MolToInchiKey(mol)

180

181def get_atoms_info(atom):

182 """Get the RDkit atom information."""

183 info = {

184 'índice': atom.GetIdx(),

185 'símbolo': atom.GetSymbol(),

186 'carga_formal': atom.GetFormalCharge(),

187 'carga_parcial': atom.GetDoubleProp('_GasteigerCharge') if atom.HasProp('_GasteigerCharge') else 'N/A',

188 'valencia': atom.GetTotalValence(),

189 'grado': atom.GetDegree(),

190 'hidrógenos_explícitos': atom.GetNumExplicitHs(),

191 'hidrógenos_implícitos': atom.GetNumImplicitHs(),

192 'aromático': atom.GetIsAromatic()

193 }

194 return info

195

196def inchi_2_mol(inchi, withHs=False, neutralize=False):

197 """Converts an InChI string to a RDKIT. Usefull to display them."""

198 mol = Chem.MolFromInchi(inchi)

199

200 if neutralize:

201 # Crear una copia editable de la molecula

202 rwmol = Chem.RWMol(mol)

203 for atom in mol.GetAtoms():

204 charge = atom.GetFormalCharge()

205 if charge:

206 idx = atom.GetIdx()

207 Hs = atom.GetNumExplicitHs()

208 rw_atom = rwmol.GetAtomWithIdx(idx)

209 rw_atom.SetFormalCharge(0)

210 rw_atom.SetNumExplicitHs(Hs-charge)

211 mol = rwmol

212

213 if withHs:

214 return Chem.AddHs(mol)

215 else:

216 return mol

217

218def Residue_2_Mol(res: 'Residue'):

219 "WiP"

220 mol = Chem.RWMol()

221 # map index in universe to index in mol

222 atom_mapper = {}

223 bonds = []

224 non_res_bonds = res.get_bonded_atom_indices()

225 for i, atom in enumerate(res.atoms):

226 # create atom

227 rdatom = Chem.Atom(atom.element)

228 # enable/disable adding implicit H to the molecule

229 rdatom.SetNoImplicit(True)

230 for bond_to in atom.bonds:

231 if bond_to in non_res_bonds:

232 continue

233 bond = sorted([atom.index, bond_to])

234 if bond not in bonds:

235 bonds.append(bond)

236 # add atom

237 index = mol.AddAtom(rdatom)

238 atom_mapper[atom.index] = index

239 for bond in bonds:

240 mol.AddBond(bond[0], atom_mapper[bond[1]], Chem.BondType.SINGLE)

241

242 # sanitize if possible

243 err = Chem.SanitizeMol(mol, catchErrors=True)

244 # infer bond orders and formal charges

245 MDAnalysis.converters.RDKit._infer_bo_and_charges(mol)

246 return mol

Coverage for mddb_workflow/tools/get_inchi_keys.py: 54%

120 statements