Coverage for mddb_workflow / tools / get_ligands.py: 68%

434 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-03 18:45 +0000

1import re 

2import json 

3from rdkit import Chem 

4from rdkit.Chem.MolStandardize import rdMolStandardize 

5from mordred import Calculator, descriptors 

6from mddb_workflow.utils.constants import LIGANDS_MATCH_FLAG, PDB_TO_PUBCHEM 

7from mddb_workflow.utils.auxiliar import InputError, request_pdb_data, warn 

8from mddb_workflow.utils.type_hints import * 

9from urllib.request import urlopen 

10from urllib.error import HTTPError, URLError 

11from functools import lru_cache 

12import requests 

13from scipy.optimize import linear_sum_assignment 

14import numpy as np 

15 

16# Set the expected ligand data fields 

17LIGAND_DATA_FIELDS = set(['name', 'pubchem', 'drugbank', 'chembl', 'smiles', 'formula', 'morgan', 'mordred', 'pdbid', 'inchikey']) 

18MINIMUM_TANIMOTO_THRESHOLD = 0.3 

19 

20 

21def record_pubchem_match( 

22 inchikey: str, 

23 pubchem_id: str, 

24 match_mol: Chem.Mol, 

25 reference_fg: dict, 

26 ligand_references: dict, 

27 reason: str, 

28 threshold: float = MINIMUM_TANIMOTO_THRESHOLD, 

29) -> None: 

30 """Set the pubchem id for the inchikey and compute/print Tanimoto vs descriptor_data if match_mol provided.""" 

31 matched_fg = obtain_mordred_morgan_descriptors(match_mol) 

32 tc = tanimoto_similarity(reference_fg['morgan_fp_bit_array'], matched_fg['morgan_fp_bit_array']) 

33 if tc >= threshold: 

34 ligand_references[inchikey]['pubchem'] = str(pubchem_id) 

35 ligand_references[inchikey]['tc'] = tc 

36 print(f'Found CID {pubchem_id} after {reason}. Tanimoto coef. (morgan fingerprint): {tc:.3f}') 

37 return tc 

38 

39 

40def generate_ligand_references( 

41 structure: 'Structure', 

42 cache: 'Cache', 

43 input_ligands: Optional[list[dict]], 

44 pdb_ids: list[str], 

45 inchikeys: dict[str, 'InChIKeyData'], 

46 lipid_references: dict[str, dict], 

47 membrane_map: dict, 

48 mercy: list[str] = [], 

49 ) -> dict[str, dict]: 

50 """Generate a map of residues associated to ligands. 

51 

52 This function identifies and maps ligands in the molecular structure through a 

53 multi-step matching process: 

54 

55 1. **Direct InChIKey matching**: Extracts InChIKeys from structure fragments 

56 (excluding lipids and membrane components) and attempts direct matching with 

57 PubChem database. 

58 

59 2. **Chemical similarity matching**: If direct matching fails, progressively 

60 modifies the molecular structure and calculates Tanimoto coefficient (TC) 

61 for similarity assessment: 

62 1. Neutralize charges 

63 2. Remove stereochemistry information 

64 3. Apply PubChem standardization (tautomer, protonation, etc.) 

65 4. Match against PDB-derived ligands (TC threshold ≥ 0.9) 

66 5. Perform similarity search in PubChem/ChEMBL (TC threshold ≥ 0.9) 

67 

68 3. **Fallback handling**: Unmatched ligands are saved as-is with warnings. 

69 

70 4. **User-forced selections**: Respects user-specified ligand selections from 

71 inputs.yaml, with warnings if TC compared to original fragment is insufficient. 

72 """ 

73 ligand_references = {} 

74 # Check input ligands format validity 

75 input_ligands = input_ligands or [] 

76 for i, ligand in enumerate(input_ligands): 

77 if type(ligand) is int or (type(ligand) is str and ligand.isnumeric()): 

78 print(f'A ligand number ID has been identified {ligand}, assuming that is a PubChem ID...') 

79 input_ligands[i] = {'pubchem': str(ligand)} 

80 elif type(ligand) is str: 

81 raise InputError(f'A name of ligand has been identified: {ligand}. Anyway, provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.') 

82 # Get non-lipid inchikeys 

83 ligand_keys = {key: {} for key in inchikeys if key not in lipid_references.keys()} 

84 residx_2_inchikey = {resid: inchikey 

85 for inchikey, data in inchikeys.items() 

86 for resid in data.resindices} 

87 # Add non-membrane lipid inchikeys 

88 for resid in membrane_map['no_mem_lipid']: 

89 # Some residues may not have InChIKeys (e.g. CG) 

90 inchikey = residx_2_inchikey.get(resid, None) 

91 if inchikey and inchikey not in ligand_keys: 

92 ligand_keys[inchikey][resid] = {} 

93 automatic_references = {key: {} for key in ligand_keys} 

94 pubchem_ids_from_pdb = [] 

95 pdb_ids = pdb_ids or [] 

96 pubchem_ids_from_pdb = pdbs_2_pubchems(pdb_ids, cache) 

97 for inchikey in ligand_keys: 

98 # 1. Direct InChIKey match 

99 pubchem_id = inchikey_2_pubchem(inchikey) 

100 inchi = inchikeys[inchikey].inchi 

101 mol = Chem.MolFromInchi(inchi) 

102 # Original descriptors to calculate similarity later 

103 descriptor_data = obtain_mordred_morgan_descriptors(mol) 

104 automatic_references[inchikey].update(descriptor_data) 

105 if pubchem_id: 

106 automatic_references[inchikey]['pubchem'] = pubchem_id 

107 print(f'Found CID {pubchem_id} by direct InChIKey match.') 

108 continue 

109 # 2.1. Neutralize charges 

110 neutral_mol = rdMolStandardize.ChargeParent(mol) 

111 neutral_inchikey = Chem.MolToInchiKey(neutral_mol) 

112 if pubchem_id := inchikey_2_pubchem(neutral_inchikey): 

113 record_pubchem_match(inchikey, pubchem_id, neutral_mol, descriptor_data, 

114 automatic_references, 'neutralization') 

115 continue 

116 # 2.2. Remove stereochemistry 

117 snon_inchikey = Chem.MolToInchiKey(mol, options='-SNon') 

118 if pubchem_id := inchikey_2_pubchem(snon_inchikey): 

119 Chem.RemoveStereochemistry(mol) 

120 record_pubchem_match(inchikey, pubchem_id, mol, descriptor_data, 

121 automatic_references, 'stereochemistry removal') 

122 continue 

123 # Neutralize charges and remove stereochemistry 

124 snon_inchikey = Chem.MolToInchiKey(neutral_mol, options='-SNon') 

125 if pubchem_id := inchikey_2_pubchem(snon_inchikey): 

126 Chem.RemoveStereochemistry(neutral_mol) 

127 record_pubchem_match(inchikey, pubchem_id, mol, descriptor_data, 

128 automatic_references, 'neutralization + stereochemistry removal') 

129 continue 

130 # 2.3. Standardize molecule 

131 standar_id = pubchem_standardization(inchi) 

132 if standar_id and len(standar_id) == 1: 

133 standar_pubchem = standar_id[0]['pubchem'] 

134 standar_mol = Chem.MolFromInchi(standar_id[0]['inchi']) 

135 record_pubchem_match(inchikey, standar_pubchem, standar_mol, descriptor_data, 

136 automatic_references, 'standardization') 

137 continue 

138 # 2.4. Match against PDB-derived ligands 

139 if pubchem_ids_from_pdb: 

140 for pdb_ligand in pubchem_ids_from_pdb: 

141 if not pdb_ligand.get('inchi', None): 

142 ligand_data = obtain_pubchem_data_from_input(pdb_ligand) 

143 pdb_ligand.update(ligand_data) 

144 

145 pdb_mol = Chem.MolFromInchi(pdb_ligand['inchi']) 

146 tc = record_pubchem_match(inchikey, pdb_ligand['pubchem'], pdb_mol, descriptor_data, 

147 automatic_references, 'matching against PDB-derived ligands') 

148 print(f'Tanimoto coefficient against PDB ligand {pdb_ligand["pubchem"]}: {tc:.3f}') 

149 if tc >= MINIMUM_TANIMOTO_THRESHOLD: 

150 break 

151 # RUBEN: disabled as it does not work very well and it is slow 

152 if False: 

153 # 2.5. Similarity search in PubChem using neutral molecule 

154 neutral_inchi = Chem.MolToInchi(neutral_mol) 

155 for threshold in [95, 90]: 

156 similarity_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastsimilarity_2d/inchi/JSON?Threshold={threshold}&MaxRecords=5" 

157 payload = {'inchi': neutral_inchi} 

158 r = requests.post(similarity_url, data=payload) 

159 if r.status_code != 200: 

160 continue 

161 data = r.json() 

162 tc = 0 

163 for compound in data['PC_Compounds']: 

164 if compound['id']['id']['cid']: 

165 cid = str(compound['id']['id']['cid']) 

166 for prop in compound['props']: 

167 if prop['urn']['label'] == 'InChI': 

168 standard_inchi = prop['value']['sval'] 

169 if not compound or not standard_inchi: 

170 assert False, "PubChem similarity search did not return valid compound data" 

171 simil_mol = Chem.MolFromInchi(standard_inchi) 

172 tc = record_pubchem_match(inchikey, cid, simil_mol, descriptor_data, 

173 ligand_references, 'similarity search in PubChem') 

174 if tc >= MINIMUM_TANIMOTO_THRESHOLD: break 

175 if tc >= MINIMUM_TANIMOTO_THRESHOLD: break 

176 

177 for inchikey, ligand_data in automatic_references.items(): 

178 if ligand_data.get('pubchem', None): 

179 automatic_references[inchikey].update(obtain_pubchem_data_from_input(ligand_data)) 

180 

181 # 4. User-forced selections 

182 input_ligands_data = [] 

183 for input_ligand in input_ligands: 

184 ligand_data = obtain_pubchem_data_from_input(input_ligand) 

185 # If the user defined a ligand name, it will be respected and added to the metadata 

186 if forced_name := input_ligand.get('name', None): 

187 ligand_data['forced_name'] = forced_name 

188 forced_selection = input_ligand.get('selection', None) 

189 if forced_selection: 

190 # Could be a single residue or a list of residues 

191 selection_atoms = structure.select(forced_selection) 

192 residue_indices = structure.get_selection_residue_indices(selection_atoms) 

193 print(f'User forced selection of {len(residue_indices)} residue/s for ligand {input_ligand}.') 

194 if len(residue_indices) == 0 and LIGANDS_MATCH_FLAG not in mercy: 

195 raise InputError(f'Ligand with PubChem ID {pubchem_id} did not map with any residue') 

196 ligand_data['resindices'] = residue_indices 

197 # Search for this pubchem ID in automatic_references 

198 matched_inchikey = None 

199 for auto_inchikey, auto_data in automatic_references.items(): 

200 if auto_data.get('pubchem') == ligand_data['pubchem']: 

201 matched_inchikey = auto_inchikey 

202 break 

203 # If found, move from automatic_references to ligand_references 

204 if matched_inchikey: 

205 ligand_references[matched_inchikey] = automatic_references.pop(matched_inchikey) 

206 if forced_selection: 

207 ligand_references[matched_inchikey]['resindices'] = residue_indices 

208 elif forced_selection: 

209 # If not found, create a new entry in ligand_references 

210 ligand_references[ligand_data['inchikey']] = ligand_data 

211 else: 

212 # Add to input ligands for similarity search 

213 input_fg = obtain_mordred_morgan_descriptors(Chem.MolFromInchi(ligand_data['inchi'])) 

214 input_ligands_data.append({**input_ligand, **ligand_data, **input_fg}) 

215 # Create similarity matrix (N x M) 

216 auto_inchikeys = list(automatic_references.keys()) 

217 similarity_matrix = np.zeros((len(input_ligands_data), len(auto_inchikeys))) 

218 for i, input_fg in enumerate(input_ligands_data): 

219 for j, auto_inchikey in enumerate(auto_inchikeys): 

220 tc = tanimoto_similarity( 

221 input_fg['morgan_fp_bit_array'], 

222 automatic_references[auto_inchikey]['morgan_fp_bit_array']) 

223 similarity_matrix[i, j] = tc 

224 # Use Hungarian algorithm to find optimal assignment (maximize similarity) 

225 # Convert to minimization problem by negating similarities 

226 row_ind, col_ind = linear_sum_assignment(-similarity_matrix) 

227 # Process matched pairs 

228 for input_idx, ref_idx in zip(row_ind, col_ind): 

229 input_data = input_ligands_data[input_idx] 

230 matched_inchikey = auto_inchikeys[ref_idx] 

231 tc = similarity_matrix[input_idx, ref_idx] 

232 

233 if tc < 0.6: 

234 warn(f'Ligand "{input_data.get("name", input_data["inchikey"])}" input selection has Tanimoto coefficient {tc:.3f} ' 

235 f'which is below the threshold of 0.6 when compared to the matched fragment (InChIKey: {matched_inchikey}).') 

236 

237 # Update ligand reference with input data 

238 ligand_references[matched_inchikey] = input_data 

239 automatic_references.pop(matched_inchikey) 

240 print(f'Matched input ligand {input_data["pubchem"]} to reference {matched_inchikey} with TC={tc:.3f}') 

241 

242 # Merge remaining automatic references into ligand_references 

243 ligand_references.update(automatic_references) 

244 not_matched_ligands = [(inchikeys[k].molname, inchikeys[k].resindices) 

245 for k, v in ligand_references.items() 

246 if 'pubchem' not in v] 

247 if not_matched_ligands: 

248 for k, v in not_matched_ligands: 

249 warn(f'Ligand {inchikeys[k].molname} could not be matched to any PubChem ID. Residues: {v}') 

250 if LIGANDS_MATCH_FLAG not in mercy: 

251 raise InputError('Provide PubChem IDs for all ligands, a PDB code where it is ' 

252 f'present or use the flag -m {LIGANDS_MATCH_FLAG} to bypass this check.') 

253 

254 if not ligand_references: 

255 print('No ligands were matched') 

256 

257 return ligand_references 

258 

259 

260def pdb_ligand_2_prd(pdb_id: str) -> Optional[str]: 

261 """Given a PDB ID, get its PRD ligand code.""" 

262 query = '''query structure($id: String!) { 

263 entry(entry_id: $id) { 

264 pdbx_molecule_features { 

265 prd_id 

266 } 

267 } 

268 }''' 

269 parsed_response = request_pdb_data(pdb_id, query) 

270 pdbx_id = parsed_response.get('pdbx_molecule_features', None) 

271 if pdbx_id is None: 

272 print(f'WARNING: Cannot find PRD ligand code for PDB ID {pdb_id}') 

273 return None 

274 prd_id = pdbx_id[0].get('prd_id', None) # AGUS: podría haber casos donde haya más de uno? 

275 if prd_id is None: 

276 print(f'WARNING: Cannot find PRD ligand code for PDB ID {pdb_id}') 

277 return None 

278 # If the PRD ID is not empty then return it 

279 return prd_id 

280 

281 

282def get_pdb_ligand_codes(pdb_id: str) -> list[str]: 

283 """Given a PDB ID, get all its ligand codes. 

284 e.g. 2I3I -> 618, BTB, ZN, EDO, LI. 

285 """ 

286 # Set the request query 

287 query = '''query structure($id: String!) { 

288 entry(entry_id: $id) { 

289 nonpolymer_entities { nonpolymer_comp { chem_comp { id } } } 

290 } 

291 }''' 

292 # Request PDB data 

293 parsed_response = request_pdb_data(pdb_id, query) 

294 # Mine data for nonpolymer entities 

295 nonpolymers = parsed_response['nonpolymer_entities'] 

296 # If there are no nonpolymer entities, another type of entitie could be used 

297 # AGUS: esto es un caso muy concreto que me encontré con el PDB 1N3W 

298 # AGUS: el ligando en este caso es una 'Biologically Interesting Molecules' y se muestran como 'PRD_' 

299 prd_code = None 

300 if nonpolymers is None: 

301 # Get the prd ligand code 

302 prd_code = pdb_ligand_2_prd(pdb_id) 

303 if prd_code is not None: 

304 return [prd_code] 

305 

306 if nonpolymers is None and prd_code is None: return [] 

307 # Iterate nonpolymer entities to mine each PDB code 

308 ligand_codes = [] 

309 for nonpolymer in nonpolymers: 

310 ligand_code = nonpolymer['nonpolymer_comp']['chem_comp']['id'] 

311 ligand_codes.append(ligand_code) 

312 print(f' Ligand codes for PDB ID {pdb_id}: ' + ', '.join(ligand_codes)) 

313 return ligand_codes 

314 

315 

316def pdb_ligand_2_pubchem(pdb_ligand_id: str) -> Optional[str]: 

317 """Given a PDB ligand code, get its PubChem ID.""" 

318 # Set the request query 

319 query = '''query molecule($id:String!){ 

320 chem_comp(comp_id:$id) { rcsb_chem_comp_related{ resource_name resource_accession_code } } 

321 }''' 

322 # Request PDB data 

323 parsed_response = request_pdb_data(pdb_ligand_id, query) 

324 related_resources = parsed_response['rcsb_chem_comp_related'] 

325 # It may happend that a ligand code has no related resources at all 

326 # e.g. ZN 

327 if not related_resources: return None 

328 pubchem_resource = next((resource for resource in related_resources if resource['resource_name'] == 'PubChem'), None) 

329 if not pubchem_resource: return None 

330 return pubchem_resource['resource_accession_code'] 

331 

332 

333def pdb_ligand_2_pubchem_RAW(pdb_ligand_id: str) -> Optional[str]: 

334 """Given a PDB ligand code, get its PubChem ID. 

335 Use a web crawler to avoid having to use the PDB API. 

336 """ 

337 # Set the request URL 

338 request_url = f'https://www.rcsb.org/ligand/{pdb_ligand_id}' 

339 # Run the query 

340 parsed_response = None 

341 try: 

342 with urlopen(request_url) as response: 

343 parsed_response = response.read().decode("utf-8") 

344 # If the accession is not found in the PDB then we can stop here 

345 except HTTPError as error: 

346 if error.code == 404: 

347 print(f' PDB ligand {pdb_ligand_id} not found') 

348 return None 

349 else: 

350 print(error.msg) 

351 raise ValueError('Something went wrong with the PDB ligand request: ' + request_url) 

352 # Mine the PubChem ID out of the whole response 

353 pattern = re.compile('pubchem.ncbi.nlm.nih.gov\/compound\/([0-9]*)\"') 

354 match = re.search(pattern, parsed_response) 

355 # If there is no PubChem ID then return none 

356 # This is normal for some ligands such as counter ions (e.g. LI) 

357 if not match: 

358 return None 

359 pubchem_id = match[1] 

360 return pubchem_id 

361 

362 

363# DANI: No se ha provado a fondo 

364def pdb_ligand_2_pubchem_RAW_RAW(pdb_ligand_id: str) -> Optional[str]: 

365 """Given a PDB ligand code, get its PubChem ID. 

366 Ask to PubChem if the ligand exists and hope there is only one result. 

367 """ 

368 # Set the request URL 

369 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{pdb_ligand_id}/json' 

370 # Run the query 

371 parsed_response = None 

372 try: 

373 with urlopen(request_url) as response: 

374 parsed_response = json.loads(response.read().decode("utf-8")) 

375 # If the accession is not found in the PDB then we can stop here 

376 except HTTPError as error: 

377 # This may happen for weird things such as UNX (unknown atom or ion) 

378 if error.code == 404: 

379 return None 

380 print(error.msg) 

381 raise RuntimeError('Something went wrong with the PDB ligand request in PubChem: ' + request_url) 

382 # Mine the PubChem ID 

383 compounds = parsed_response['PC_Compounds'] 

384 if len(compounds) != 1: 

385 # There could be more than one result 

386 # For example, the case HEM: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/HEM/json 

387 # In this case we picked the first result 

388 compound = compounds[0] 

389 # raise RuntimeError('We are not having 1 and only 1 result from PubChem: ' + request_url) 

390 compound = compounds[0] 

391 id1 = compound['id'] 

392 id2 = id1['id'] 

393 pubchem_id = str(id2['cid']) 

394 return pubchem_id 

395 

396 

397def pdb_2_pubchem(pdb_id: str) -> list[str]: 

398 """Given a PDB ID, get its PubChem IDs.""" 

399 print(f'Searching PubChem IDs for PDB ID {pdb_id}') 

400 pubchem_ids = [] 

401 # Iterate over pdb ligand codes 

402 ligand_codes = get_pdb_ligand_codes(pdb_id) 

403 for ligand_code in ligand_codes: 

404 # Ask the PDB API for the ligand 

405 pubchem_id = pdb_ligand_2_pubchem(ligand_code) 

406 # If this did not work then try mining the PDB client with a web crawler 

407 if not pubchem_id: 

408 pubchem_id = pdb_ligand_2_pubchem_RAW(ligand_code) 

409 # If this did not work then try it from PubChem 

410 if not pubchem_id: 

411 pubchem_id = pdb_ligand_2_pubchem_RAW_RAW(ligand_code) 

412 # Otherwise we surrender 

413 if not pubchem_id: 

414 print(f' {ligand_code} -> No PubChem ID') 

415 continue 

416 print(f' {ligand_code} -> {pubchem_id}') 

417 pubchem_ids.append(pubchem_id) 

418 

419 return pubchem_ids 

420 

421 

422def pdbs_2_pubchems(pdb_ids: list[str], cache: 'Cache') -> list[dict]: 

423 """Given a list of PDB IDs, get their PubChem IDs and add them to the ligands list.""" 

424 pdb_ligands = [] 

425 # Check we have cached pdb 2 PubChem values 

426 pdb_2_pubchem_cache = cache.retrieve(PDB_TO_PUBCHEM, {}) 

427 new_data_to_cache = False 

428 # Get input ligands from the pdb IDs, if any 

429 for pdb_id in pdb_ids: 

430 # Check we have cached this specific pdb 

431 pubchem_ids_from_pdb = pdb_2_pubchem_cache.get(pdb_id, None) 

432 if pubchem_ids_from_pdb is not None: 

433 print(f' Retriving from cache PubChem IDs for PDB ID {pdb_id}: ') 

434 if len(pubchem_ids_from_pdb) > 0: 

435 print(' PubChem IDs: ' + ', '.join(pubchem_ids_from_pdb)) 

436 else: 

437 print(' This PDB ID has no PubChem IDs') 

438 

439 # If we had no cached pdb 2 PubChem then ask for them 

440 if pubchem_ids_from_pdb is None: 

441 pubchem_ids_from_pdb = pdb_2_pubchem(pdb_id) 

442 # Save the result in the cache object so it is saved to cache later 

443 pdb_2_pubchem_cache[pdb_id] = pubchem_ids_from_pdb 

444 new_data_to_cache = True 

445 for pubchem_id in pubchem_ids_from_pdb: 

446 # Ligands in the structure (PDB) and the 'inputs.json' could be the same so it's not necessary to do it twice 

447 if not any('pubchem' in ligand and ligand['pubchem'] == pubchem_id for ligand in pdb_ligands): 

448 pdb_ligands.append({'pubchem': pubchem_id, 'pdb': True}) 

449 # Save all pdb to PubChem results in cache, in case there is anything new 

450 if new_data_to_cache: cache.update(PDB_TO_PUBCHEM, pdb_2_pubchem_cache) 

451 

452 return pdb_ligands 

453 

454 

455def handle_http_request(request_url, error_context="request"): 

456 """Make an HTTP request handler with consistent error handling.""" 

457 try: 

458 with urlopen(request_url) as response: 

459 return response.read().decode("utf-8", errors='ignore') 

460 except HTTPError as error: 

461 if error.code == 404: 

462 return None 

463 raise ValueError(f'Something went wrong with the {error_context} (error {error.code})') 

464 except URLError as error: 

465 raise ValueError(f'Something went wrong with the {error_context}: {error.reason}') 

466 

467 

468def get_pubchem_data(pubchem_id: str) -> Optional[dict]: 

469 """Given a PubChem ID, use the uniprot API to request its data and then mine what is needed for the database.""" 

470 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{pubchem_id}/JSON/' 

471 parsed_response = handle_http_request(request_url, "PubChem request") 

472 if parsed_response is None: 

473 print('WARNING: Cannot find PubChem entry for accession ' + pubchem_id) 

474 return None 

475 parsed_response = json.loads(parsed_response) 

476 # Mine target data: SMILES 

477 record = parsed_response.get('Record', None) 

478 if record is None: 

479 raise RuntimeError('Wrong PubChem data structure: no record: ' + request_url) 

480 sections = record.get('Section', None) 

481 if sections is None: 

482 raise RuntimeError('Wrong PubChem data structure: no sections: ' + request_url) 

483 names_and_ids_section = next((section for section in sections if section.get('TOCHeading', None) == 'Names and Identifiers'), None) 

484 if names_and_ids_section is None: 

485 raise RuntimeError('Wrong PubChem data structure: no name and IDs section: ' + request_url) 

486 names_and_ids_subsections = names_and_ids_section.get('Section', None) 

487 if names_and_ids_subsections is None: 

488 raise RuntimeError('Wrong PubChem data structure: no name and IDs subsections: ' + request_url) 

489 

490 # Mine the name 

491 synonims = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Synonyms'), None) 

492 if synonims is None: 

493 descriptors = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None) 

494 descriptors_subsections = descriptors.get('Section', None) 

495 if descriptors_subsections is None: 

496 raise RuntimeError('Wrong PubChem data structure: no name and IDs subsections: ' + request_url) 

497 depositor_supplied_descriptors = next((s for s in descriptors_subsections if s.get('TOCHeading', None) == 'IUPAC Name'), None) 

498 name_substance = depositor_supplied_descriptors.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

499 else: 

500 synonims_subsections = synonims.get('Section', None) 

501 if synonims_subsections is None: 

502 raise RuntimeError('Wrong PubChem data structure: no name and IDs subsections: ' + request_url) 

503 depositor_supplied_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Depositor-Supplied Synonyms'), None) 

504 if depositor_supplied_synonims is None: 

505 removed_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Removed Synonyms'), None) 

506 name_substance = removed_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

507 else: 

508 name_substance = depositor_supplied_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

509 

510 # Mine the SMILES 

511 computed_descriptors_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None) 

512 if computed_descriptors_subsection is None: 

513 raise RuntimeError('Wrong PubChem data structure: no computeed descriptors: ' + request_url) 

514 canonical_smiles_section = computed_descriptors_subsection.get('Section', None) 

515 if canonical_smiles_section is None: 

516 raise RuntimeError('Wrong PubChem data structure: no canonical SMILES section: ' + request_url) 

517 canonical_smiles = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'Canonical SMILES'), None) 

518 if canonical_smiles is None: 

519 # In some cases there is no canonical SMILES but a non-canonical one could exists 

520 non_canonical_smiles_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'SMILES'), None) 

521 if non_canonical_smiles_section is None: 

522 raise RuntimeError('Wrong PubChem data structure: no canonical SMILES: ' + request_url) 

523 

524 if canonical_smiles: 

525 smiles = canonical_smiles.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

526 if non_canonical_smiles_section: 

527 smiles = non_canonical_smiles_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

528 

529 if smiles is None: 

530 raise RuntimeError('Wrong PubChem data structure: no SMILES: ' + request_url) 

531 

532 # Mine target data: MOLECULAR FORMULA 

533 molecular_formula_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Molecular Formula'), None) 

534 if molecular_formula_subsection is None: 

535 raise RuntimeError('Wrong PubChem data structure: no molecular formula section: ' + request_url) 

536 molecular_formula = molecular_formula_subsection.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

537 if molecular_formula is None: 

538 raise RuntimeError('Wrong PubChem data structure: no molecular formula: ' + request_url) 

539 

540 # Mine target data: PDB ID 

541 pdb_id = None 

542 pdb_id_subsection = next((s for s in sections if s.get('TOCHeading', None) == 'Interactions and Pathways'), None) 

543 # If this section is missing then it means this PubChem compound has no PDB ID 

544 if pdb_id_subsection: 

545 pdb_id_subsections = pdb_id_subsection.get('Section', None) 

546 if pdb_id_subsections is None: 

547 raise RuntimeError('Wrong PubChem data structure: no name and IDs subsections: ' + request_url) 

548 bond_structures = next((s for s in pdb_id_subsections if s.get('TOCHeading', None) == 'Protein Bound 3D Structures'), None) 

549 if bond_structures: 

550 bond_structures_section = bond_structures.get('Section', None) 

551 # If this section is missing then it means this PubChem compound has no PDB ID 

552 if bond_structures_section: 

553 ligands_structure = next((s for s in bond_structures_section if s.get('TOCHeading', None) == 'Ligands from Protein Bound 3D Structures'), None) 

554 if ligands_structure is None: 

555 raise RuntimeError('Wrong PubChem data structure: no Ligands from Protein Bound 3D Structures section: ' + request_url) 

556 ligands_structure_subsections = ligands_structure.get('Section', None) 

557 if ligands_structure_subsections is None: 

558 raise RuntimeError('Wrong PubChem data structure: no Ligands from Protein Bound 3D Structures subsections: ' + request_url) 

559 ligands_pdb = next((s for s in ligands_structure_subsections if s.get('TOCHeading', None) == 'PDBe Ligand Code'), None) 

560 if ligands_pdb is None: 

561 raise RuntimeError('Wrong PubChem data structure: no PDBe Ligand Code: ' + request_url) 

562 pdb_id = ligands_pdb.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

563 

564 # Mine de INCHI and INCHIKEY 

565 inchi_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChI'), None) 

566 if inchi_section is None: 

567 raise RuntimeError('Wrong PubChem data structure: no InChI: ' + request_url) 

568 if inchi_section: 

569 inchi = inchi_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

570 

571 inchikey_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChIKey'), None) 

572 if inchikey_section is None: 

573 raise RuntimeError('Wrong PubChem data structure: no InChIKey: ' + request_url) 

574 if inchikey_section: 

575 inchikey = inchikey_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

576 # Prepare the PubChem data to be returned 

577 return {'name': name_substance, 'smiles': smiles, 'formula': molecular_formula, 'pdbid': pdb_id, 'inchi': inchi, 'inchikey': inchikey} 

578 

579 

580def drugbank_2_pubchem(drugbank_id) -> Optional[str]: 

581 """Given a DrugBank ID, request its PubChem compound ID.""" 

582 url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{drugbank_id}/JSON' 

583 r = requests.get(url) 

584 if r.ok: 

585 data = r.json() 

586 return str(data['PC_Compounds'][0]['id']['id']['cid']) 

587 else: 

588 raise ValueError("Something went wrong with the DrugBank to PubChem request (error " + str(r.status_code) + ")") 

589 

590 

591def chembl_2_pubchem(chembl_id) -> Optional[str]: 

592 """Given a ChemBL ID, use the uniprot API to request its PubChem compound ID.""" 

593 url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/xref/RegistryID/{chembl_id}/cids/JSON' 

594 r = requests.get(url) 

595 if r.ok: 

596 data = r.json() 

597 return str(data['IdentifierList']['CID'][0]) 

598 else: 

599 raise ValueError("Something went wrong with the ChEMBL to PubChem request (error " + str(r.status_code) + ")") 

600 

601 

602def obtain_mordred_morgan_descriptors(mol: Chem.Mol) -> dict: 

603 """Calculate the Morgan fingerprint and the Mordred descriptors from a molecule.""" 

604 Chem.AllChem.Compute2DCoords(mol) 

605 mol_block = Chem.MolToMolBlock(mol) 

606 # We can select the different submodules of mordred descriptors, avaible in: 'https://mordred-descriptor.github.io/documentation/master/' 

607 calc = Calculator([ 

608 descriptors.ABCIndex, # Índice de ramificación 

609 descriptors.AcidBase.AcidicGroupCount, # Grupos ácidos 

610 descriptors.AcidBase.BasicGroupCount, # Grupos básicos 

611 descriptors.RingCount, # Conteo de anillos 

612 descriptors.Constitutional, # Propiedades generales como número de átomos, peso molecular 

613 descriptors.TopologicalCharge, # Índices topológicos, Cargas parciales, polaridad 

614 descriptors.HydrogenBond, # Donantes y aceptores de enlaces de hidrógeno 

615 descriptors.Lipinski, # Reglas de Lipinski (drug-likeness) 

616 descriptors.FragmentComplexity, # Identificación de subestructuras frecuentes 

617 descriptors.PathCount, # Conteo de caminos moleculares 

618 ], ignore_3D=True) 

619 

620 # Calculate Mordred results 

621 mordred_results = calc(mol).drop_missing().asdict() 

622 

623 # MORGAN FINGERPRINT 

624 morgan_fp_gen = Chem.rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048) 

625 ao = Chem.rdFingerprintGenerator.AdditionalOutput() 

626 ao.AllocateAtomCounts() 

627 ao.AllocateAtomToBits() 

628 ao.AllocateBitInfoMap() 

629 fp = morgan_fp_gen.GetFingerprint(mol, additionalOutput=ao) 

630 morgan_fp_bit_array = list(fp) 

631 morgan_highlight_atoms = {} 

632 for bit, atoms in ao.GetBitInfoMap().items(): 

633 morgan_highlight_atoms[bit] = list(set(atom for atom, radius in atoms)) 

634 data = {'mordred': mordred_results, 

635 'morgan_fp_bit_array': morgan_fp_bit_array, 

636 'morgan_highlight_atoms': morgan_highlight_atoms, 

637 'mol_block': mol_block} 

638 return data 

639 

640 

641def tanimoto_similarity(fp1: list[int], fp2: list[int]) -> float: 

642 """Calculate the Tanimoto similarity between two fingerprints.""" 

643 if len(fp1) != len(fp2): 

644 raise ValueError('Fingerprints must be of the same length') 

645 a = sum(1 for i in range(len(fp1)) if fp1[i] == 1 and fp2[i] == 1) 

646 b = sum(1 for i in range(len(fp1)) if fp1[i] == 1 and fp2[i] == 0) 

647 c = sum(1 for i in range(len(fp1)) if fp1[i] == 0 and fp2[i] == 1) 

648 if (a + b + c) == 0: 

649 return 0.0 

650 return a / (a + b + c) 

651 

652 

653def obtain_pubchem_data_from_input(ligand: dict) -> dict[str, Optional[str]]: 

654 """Given an input ligand, obtain all necessary data.""" 

655 # Save in a dictionary all ligand data including its name and IDs 

656 # The ID can be of the databases: 'drugbank' , 'pubchem' , 'chembl' 

657 # Define the needed variables to check if the ligand has a database ID or it is None 

658 ligand_data = {field: None for field in LIGAND_DATA_FIELDS} 

659 # Set ligand data PubChem ID, even if the input ID is not from pubhcme (e.g. drugbank, chembl) 

660 if 'pubchem' in ligand: 

661 ligand_data['pubchem'] = str(ligand.get('pubchem')) 

662 elif 'drugbank' in ligand: 

663 ligand_data['drugbank'] = ligand.get('drugbank') 

664 ligand_data['pubchem'] = str(drugbank_2_pubchem(ligand_data['drugbank'])) 

665 elif 'chembl' in ligand: 

666 ligand_data['chembl'] = ligand.get('chembl') 

667 ligand_data['pubchem'] = str(chembl_2_pubchem(ligand_data['chembl'])) 

668 else: 

669 raise InputError('None of the ligand IDs are defined. Please provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.') 

670 

671 # Request ligand data from pubchem 

672 pubchem_data = get_pubchem_data(ligand_data['pubchem']) 

673 if not pubchem_data: 

674 raise RuntimeError('No PubChem data avilable') 

675 

676 # Add PubChem data to ligand data 

677 ligand_data = {**ligand_data, **pubchem_data} 

678 return ligand_data 

679 

680 

681@lru_cache(maxsize=None) 

682def inchikey_2_pubchem(inchikey: str, conectivity_only=False) -> Optional[str]: 

683 """Given an InChIKey, get the PubChem CID.""" 

684 inchikey = inchikey.split('-')[0] if conectivity_only else inchikey 

685 url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/cids/JSON" 

686 r = requests.get(url) 

687 if r.ok: 

688 data = r.json() 

689 if conectivity_only: 

690 return [str(cid) for cid in data['IdentifierList']['CID']] 

691 else: 

692 return str(data['IdentifierList']['CID'][0]) 

693 return None 

694 

695 

696@lru_cache(maxsize=None) 

697def pubchem_standardization(inchi: str) -> Optional[list[dict]]: 

698 """Try PubChem standardization service to write things like the bonds from aromatic rings in the same order. 

699 

700 You can see examples on the paper https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0293-8 

701 """ 

702 url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/standardize/inchi/JSON" 

703 payload = {'inchi': inchi} 

704 r = requests.post(url, data=payload) 

705 if r.ok: 

706 data = r.json() 

707 results = [] 

708 for compound in data['PC_Compounds']: 

709 if compound['id']['id']['cid']: 

710 cid = str(compound['id']['id']['cid']) 

711 else: 

712 continue 

713 standard_inchi = None 

714 for prop in compound['props']: 

715 if prop['urn']['label'] == 'InChI': 

716 standard_inchi = prop['value']['sval'] 

717 results.append({'pubchem': cid, 'inchi': standard_inchi}) 

718 return results 

719 return None