Coverage for mddb_workflow/tools/generate_ligands_desc.py: 60%

578 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-29 15:48 +0000

1import re 

2import os 

3import json 

4import requests 

5from rdkit import Chem 

6from functools import lru_cache 

7from mordred import Calculator, descriptors 

8from mddb_workflow.utils.constants import LIGANDS_MATCH_FLAG, PDB_TO_PUBCHEM, NOT_MATCHED_LIGANDS 

9from mddb_workflow.utils.auxiliar import InputError, load_json, save_json, request_pdb_data 

10from mddb_workflow.utils.type_hints import * 

11from mddb_workflow.utils.structures import Structure 

12from mddb_workflow.tools.get_inchi_keys import obtain_inchikey_from_pdb 

13from urllib.request import Request, urlopen 

14from urllib.parse import urlencode 

15from urllib.error import HTTPError, URLError 

16 

17def get_drugbank_smiles (id_drugbank : str) -> Optional[str]: 

18 # Request Drugbank 

19 request_url = Request( 

20 url= f'https://go.drugbank.com/structures/small_molecule_drugs/{id_drugbank}.smiles', 

21 headers={'User-Agent': 'Mozilla/5.0'} 

22 ) 

23 try: 

24 with urlopen(request_url) as response: 

25 smiles = response.read() 

26 # If the accession is not found in the database then we stop here 

27 except HTTPError as error: 

28 # If the drugbank ID is not yet in the Drugbank references then return None 

29 if error.code == 404: 

30 return None 

31 else: 

32 print('Error when requesting ' + request_url) 

33 raise ValueError('Something went wrong with the Drugbank request (error ' + str(error.code) + ')') 

34 # This error may occur if there is no internet connection 

35 except URLError as error: 

36 print('Error when requesting ' + request_url) 

37 raise ValueError('Something went wrong with the MDposit request') 

38 

39 return smiles 

40 

41# Given a ChemBL ID, use the uniprot API to request its data and then mine what is needed for the database 

42def get_chembl_smiles (id_chembl : str) -> Optional[str]: 

43 # Request ChemBL 

44 parsed_response = None 

45 request_url = Request( 

46 url= f'https://www.ebi.ac.uk/chembl/interface_api/es_proxy/es_data/get_es_document/chembl_molecule/{id_chembl}', 

47 headers={'User-Agent': 'Mozilla/5.0'} 

48 ) 

49 try: 

50 with urlopen(request_url) as response: 

51 parsed_response = json.loads(response.read().decode("utf-8")) 

52 smiles = parsed_response['_source']['molecule_structures']['canonical_smiles'] 

53 pubchem_id = parsed_response['_source']['_metadata']['unichem'][8]['id'] 

54 # If the accession is not found in ChemBL then the id is not valid 

55 except HTTPError as error: 

56 if error.code == 404: 

57 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl) 

58 return None 

59 print('Error when requesting ' + request_url) 

60 raise ValueError('Something went wrong with the ChemBL request (error ' + str(error.code) + ')') 

61 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607) 

62 if parsed_response == None: 

63 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl) 

64 return None 

65 return smiles 

66 

67# Given a PubChem ID, use the uniprot API to request its data and then mine what is needed for the database 

68def get_pubchem_data (id_pubchem : str) -> Optional[dict]: 

69 # Request PubChem 

70 parsed_response = None 

71 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{id_pubchem}/JSON/' 

72 try: 

73 with urlopen(request_url) as response: 

74 #parsed_response = json.loads(response.read().decode("windows-1252")) 

75 parsed_response = json.loads(response.read().decode("utf-8", errors='ignore')) 

76 # If the accession is not found in PubChem then the id is not valid 

77 # This may happen with pubchem ids of non-discrete compounds (e.g. 483927498) 

78 except HTTPError as error: 

79 if error.code == 404: 

80 print('WARNING: Cannot find PubChem entry for accession ', id_pubchem) 

81 return None 

82 print('Error when requesting ', request_url) 

83 raise ValueError('Something went wrong with the PubChem request (error ', str(error.code), ')') 

84 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607) 

85 if parsed_response == None: 

86 print('WARNING: Cannot find PubChem entry for accession ' + id_pubchem) 

87 return None 

88 # Mine target data: SMILES 

89 record = parsed_response.get('Record', None) 

90 if record == None: 

91 raise RuntimeError('Wrong Pubchem data structure: no record: ' + request_url) 

92 sections = record.get('Section', None) 

93 if sections == None: 

94 raise RuntimeError('Wrong Pubchem data structure: no sections: ' + request_url) 

95 names_and_ids_section = next((section for section in sections if section.get('TOCHeading', None) == 'Names and Identifiers'), None) 

96 if names_and_ids_section == None: 

97 raise RuntimeError('Wrong Pubchem data structure: no name and ids section: ' + request_url) 

98 names_and_ids_subsections = names_and_ids_section.get('Section', None) 

99 if names_and_ids_subsections == None: 

100 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url) 

101 

102 # Mine the name 

103 synonims = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Synonyms'), None) 

104 if synonims == None: 

105 descriptors = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None) 

106 descriptors_subsections = descriptors.get('Section', None) 

107 if descriptors_subsections == None: 

108 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url) 

109 depositor_supplied_descriptors = next((s for s in descriptors_subsections if s.get('TOCHeading', None) == 'IUPAC Name'), None) 

110 name_substance = depositor_supplied_descriptors.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

111 else: 

112 synonims_subsections = synonims.get('Section', None) 

113 if synonims_subsections == None: 

114 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url) 

115 depositor_supplied_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Depositor-Supplied Synonyms'), None) 

116 if depositor_supplied_synonims == None: 

117 removed_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Removed Synonyms'), None) 

118 name_substance = removed_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

119 else: 

120 name_substance = depositor_supplied_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

121 

122 # Mine the SMILES 

123 computed_descriptors_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None) 

124 if computed_descriptors_subsection == None: 

125 raise RuntimeError('Wrong Pubchem data structure: no computeed descriptors: ' + request_url) 

126 canonical_smiles_section = computed_descriptors_subsection.get('Section', None) 

127 if canonical_smiles_section == None: 

128 raise RuntimeError('Wrong Pubchem data structure: no canonical SMILES section: ' + request_url) 

129 canonical_smiles = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'Canonical SMILES'), None) 

130 if canonical_smiles == None: 

131 # In some cases there is no canonical SMILES but a non-canonical one could exists 

132 non_canonical_smiles_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'SMILES'), None) 

133 if non_canonical_smiles_section == None: 

134 raise RuntimeError('Wrong Pubchem data structure: no canonical SMILES: ' + request_url) 

135 

136 if canonical_smiles: 

137 smiles = canonical_smiles.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

138 if non_canonical_smiles_section: 

139 smiles = non_canonical_smiles_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

140 

141 if smiles == None: 

142 raise RuntimeError('Wrong Pubchem data structure: no SMILES: ' + request_url) 

143 

144 # Mine target data: MOLECULAR FORMULA 

145 molecular_formula_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Molecular Formula'), None) 

146 if molecular_formula_subsection == None: 

147 raise RuntimeError('Wrong Pubchem data structure: no molecular formula section: ' + request_url) 

148 molecular_formula = molecular_formula_subsection.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

149 if molecular_formula == None: 

150 raise RuntimeError('Wrong Pubchem data structure: no molecular formula: ' + request_url) 

151 

152 # Mine target data: PDB ID 

153 pdb_id = None 

154 pdb_id_subsection = next((s for s in sections if s.get('TOCHeading', None) == 'Interactions and Pathways'), None) 

155 # If this section is missing then it means this PubChem compound has no PDB id 

156 if pdb_id_subsection: 

157 pdb_id_subsections = pdb_id_subsection.get('Section', None) 

158 if pdb_id_subsections == None: 

159 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url) 

160 bond_structures = next((s for s in pdb_id_subsections if s.get('TOCHeading', None) == 'Protein Bound 3D Structures'), None) 

161 if bond_structures: 

162 bond_structures_section = bond_structures.get('Section', None) 

163 # If this section is missing then it means this PubChem compound has no PDB id 

164 if bond_structures_section: 

165 ligands_structure = next((s for s in bond_structures_section if s.get('TOCHeading', None) == 'Ligands from Protein Bound 3D Structures'), None) 

166 if ligands_structure == None: 

167 raise RuntimeError('Wrong Pubchem data structure: no Ligands from Protein Bound 3D Structures section: ' + request_url) 

168 ligands_structure_subsections = ligands_structure.get('Section', None) 

169 if ligands_structure_subsections == None: 

170 raise RuntimeError('Wrong Pubchem data structure: no Ligands from Protein Bound 3D Structures subsections: ' + request_url) 

171 ligands_pdb = next((s for s in ligands_structure_subsections if s.get('TOCHeading', None) == 'PDBe Ligand Code'), None) 

172 if ligands_pdb == None: 

173 raise RuntimeError('Wrong Pubchem data structure: no PDBe Ligand Code: ' + request_url) 

174 pdb_id = ligands_pdb.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

175 

176 # Mine de INCHI and INCHIKEY 

177 inchi_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChI'), None) 

178 if inchi_section == None: 

179 raise RuntimeError('Wrong Pubchem data structure: no InChI: ' + request_url) 

180 if inchi_section: 

181 inchi = inchi_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

182 

183 inchikey_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChIKey'), None) 

184 if inchikey_section == None: 

185 raise RuntimeError('Wrong Pubchem data structure: no InChIKey: ' + request_url) 

186 if inchikey_section: 

187 inchikey = inchikey_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

188 # Prepare the pubchem data to be returned 

189 return { 'name': name_substance, 'smiles': smiles, 'formula': molecular_formula , 'pdbid': pdb_id, 'inchi': inchi, 'inchikey': inchikey } 

190 

191def find_drugbank_pubchem (drugbank_id): 

192 # Request Drugbank 

193 request_url = Request( 

194 url=f'https://go.drugbank.com/drugs/{drugbank_id}', 

195 headers={ 

196 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 

197 'Accept-Language': 'en-US,en;q=0.9', 

198 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' 

199 } 

200) 

201 pubchem_id = None 

202 try: 

203 with urlopen(request_url) as response: 

204 content = response.read().decode("utf-8") 

205 pattern = re.compile("http\:\/\/pubchem.ncbi.nlm.nih.gov\/summary\/summary.cgi\?cid\=([0-9]*)") 

206 match = re.search(pattern, str(content)) 

207 if not match: 

208 raise ValueError("No se encontró información sobre el Pubchem compound en esta página.") 

209 pubchem_id = match[1] 

210 if not pubchem_id: 

211 raise ValueError("No se encontró información sobre el Pubchem compound en esta página.") 

212 # If the accession is not found in the database then we stop here 

213 except HTTPError as error: 

214 # If the drugbank ID is not yet in the Drugbank references then return None 

215 raise ValueError(f'Wrong request. Code: {error.code}') 

216 # This error may occur if there is no internet connection 

217 except URLError as error: 

218 print('Error when requesting ' + request_url) 

219 raise ValueError('Something went wrong with the DrugBank request') 

220 

221 return pubchem_id 

222 

223def find_chembl_pubchem (id_chembl): 

224 # Request ChemBL 

225 parsed_response = None 

226 request_url = Request( 

227 url= f'https://www.ebi.ac.uk/chembl/interface_api/es_proxy/es_data/get_es_document/chembl_molecule/{id_chembl}', 

228 headers={'User-Agent': 'Mozilla/5.0'} 

229 ) 

230 try: 

231 with urlopen(request_url) as response: 

232 parsed_response = json.loads(response.read().decode("utf-8")) 

233 unichem = parsed_response['_source']['_metadata']['unichem'] 

234 if unichem == None: 

235 raise RuntimeError('Wrong Pubchem data structure: no unichem section') 

236 pubchem_section = next((section for section in unichem if section.get('src_url', None) == "http://pubchem.ncbi.nlm.nih.gov"), None) 

237 if pubchem_section == None: 

238 raise RuntimeError('Wrong Pubchem data structure: no pubchem section') 

239 pubchem_id = pubchem_section.get('id', None) 

240 if pubchem_id == None: 

241 raise RuntimeError('Wrong Pubchem data structure: no pubchem ID') 

242 # If the accession is not found in ChemBL then the id is not valid 

243 except HTTPError as error: 

244 if error.code == 404: 

245 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl) 

246 return None 

247 print('Error when requesting ' + request_url) 

248 raise ValueError('Something went wrong with the ChemBL request (error ' + str(error.code) + ')') 

249 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607) 

250 if parsed_response == None: 

251 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl) 

252 return None 

253 

254 return pubchem_id 

255 

256# Calculate the Morgan fingerprint and the Mordred descriptors from a ligand SMILES 

257def obtain_mordred_morgan_descriptors (smiles : str) -> tuple: 

258 mol = Chem.MolFromSmiles(smiles) 

259 if mol is None: 

260 print(f'WARNING: Cannot generate a molecule from SMILES {smiles}') 

261 return None 

262 

263 Chem.AllChem.Compute2DCoords(mol) 

264 mol_block = Chem.MolToMolBlock(mol) 

265 # We can select the different submodules of mordred descriptors, avaible in: 'https://mordred-descriptor.github.io/documentation/master/' 

266 calc = Calculator([ 

267 descriptors.ABCIndex, # Índice de ramificación 

268 descriptors.AcidBase.AcidicGroupCount, # Grupos ácidos 

269 descriptors.AcidBase.BasicGroupCount, # Grupos básicos 

270 descriptors.RingCount, # Conteo de anillos 

271 descriptors.Constitutional, # Propiedades generales como número de átomos, peso molecular 

272 descriptors.TopologicalCharge, # Índices topológicos, Cargas parciales, polaridad 

273 descriptors.HydrogenBond, # Donantes y aceptores de enlaces de hidrógeno 

274 descriptors.Lipinski, # Reglas de Lipinski (drug-likeness) 

275 descriptors.FragmentComplexity, # Identificación de subestructuras frecuentes 

276 descriptors.PathCount, # Conteo de caminos moleculares 

277 ], ignore_3D=True) 

278 

279 

280 # Calculate Mordred results 

281 mordred_results = calc(mol).drop_missing().asdict() 

282 

283 ######## MORGAN FINGERPRINT ########### 

284 morgan_fp_gen = Chem.rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048) 

285 ao = Chem.rdFingerprintGenerator.AdditionalOutput() 

286 ao.AllocateAtomCounts() 

287 ao.AllocateAtomToBits() 

288 ao.AllocateBitInfoMap() 

289 fp = morgan_fp_gen.GetFingerprint(mol,additionalOutput=ao) 

290 morgan_fp_bit_array = list(fp) 

291 morgan_highlight_atoms = {} 

292 for bit, atoms in ao.GetBitInfoMap().items(): 

293 morgan_highlight_atoms[bit] = list(set(atom for atom, radius in atoms)) 

294 

295 return mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block 

296 

297# Check the cache to see if the ligand has already been matched or not, if not the ligand is skipped 

298def check_matched_ligand (ligand: dict, ligand_data: dict, cache: 'Cache') -> bool: 

299 # The pubchem id could be in the ligands data or the ligand input 

300 pubchem_id_1 = ligand.get('pubchem', None) 

301 if ligand_data != None: 

302 pubchem_id_2 = ligand_data.get('pubchem', None) 

303 else: 

304 pubchem_id_2 = None 

305 # If we have no pubchem id then we cannot check if the ligand matched 

306 if not pubchem_id_1 and not pubchem_id_2: 

307 return False 

308 # If we have a pubchem id then check if it is already in the cache 

309 cache_not_matched = cache.retrieve(NOT_MATCHED_LIGANDS, []) 

310 for pubchem in cache_not_matched: 

311 if pubchem == pubchem_id_1 or pubchem == pubchem_id_2: 

312 print(f" Ligand with PubChem id {pubchem} didn't match before, skipping it") 

313 return True 

314 

315 return False 

316 

317def generate_ligand_mapping ( 

318 structure : 'Structure', 

319 cache : 'Cache', 

320 input_ligands : Optional[list[dict]], 

321 pdb_ids : list[str], 

322 output_filepath : str, 

323 inchikeys: dict, 

324 mercy : list[str] = [], 

325 ) -> list[dict]: 

326 """Generate a map of residues associated to ligands.""" 

327 # Merge input ligands and pdb ligands 

328 ligands = input_ligands or [] 

329 # Check we have cached pdb 2 pubchem values 

330 pdb_to_pubchem_cache = cache.retrieve(PDB_TO_PUBCHEM, {}) 

331 new_data_to_cache = False 

332 # Get input ligands from the pdb ids, if any 

333 for pdb_id in pdb_ids: 

334 # Check we have cached this specific pdb 

335 pubchem_ids_from_pdb = pdb_to_pubchem_cache.get(pdb_id, None) 

336 if pubchem_ids_from_pdb != None: 

337 print(f' Retriving from cache PubChem ids for PDB id {pdb_id}: ') 

338 if len(pubchem_ids_from_pdb) > 0: 

339 print(' PubChem ids: ' + ', '.join(pubchem_ids_from_pdb)) 

340 else: 

341 print(' This PDB id has no PubChem ids') 

342 

343 # If we had no cached pdb 2 pubchem then ask for them 

344 if pubchem_ids_from_pdb == None: 

345 pubchem_ids_from_pdb = pdb_to_pubchem(pdb_id) 

346 # Save the result in the cache object so it is saved to cache later 

347 pdb_to_pubchem_cache[pdb_id] = pubchem_ids_from_pdb 

348 new_data_to_cache = True 

349 for pubchem_id in pubchem_ids_from_pdb: 

350 # Ligands in the structure (PDB) and the 'inputs.json' could be the same so it's not necessary to do it twice 

351 if not any('pubchem' in ligand and ligand['pubchem'] == pubchem_id for ligand in ligands): 

352 ligands.append({ 'pubchem': pubchem_id, 'pdb': True }) 

353 # Save all pdb to pubchem results in cache, in case there is anything new 

354 if new_data_to_cache: cache.update(PDB_TO_PUBCHEM, pdb_to_pubchem_cache) 

355 

356 # If no input ligands are passed then stop here 

357 # if len(ligands) == 0: 

358 # return [] 

359 

360 # Save data from all ligands to be saved in a file 

361 json_ligands_data = [] 

362 ligands_data = [] 

363 

364 # Load the ligands file if exists already 

365 if os.path.exists(output_filepath): 

366 json_ligands_data += import_ligands(output_filepath) 

367 # If json ligands exists and it is empty means that ligands analysis has been already done but no ligands were matched 

368 # so the file will contain an empty list [] 

369 if len(json_ligands_data) == 0: 

370 print('No ligands have been matched yet.\nIf you want to force a ligand to be matched, please provide the field "residues" in the inputs.json file.') 

371 return [] 

372 

373 # Visited formulas 

374 visited_formulas = [] 

375 # Save the maps of every ligand 

376 ligand_maps = [] 

377 # Get cached ligand data 

378 # AGUS: esto lo creamos por alguna razón para que funcione sin internet (en el cluster) pero realmente 

379 # AGUS: llena el cache con datos que no son necesarios porque toda esa info está en el ligand_references.json 

380 #ligand_data_cache = cache.retrieve(LIGANDS_DATA, {}) 

381 # Iterate input ligands 

382 for ligand in ligands: 

383 # Set the pubchem id which may be assigned in different steps 

384 pubchem_id = None 

385 # If input ligand is not a dict but a single int/string then handle it 

386 if type(ligand) == int: 

387 print(f'A ligand number ID has been identified {ligand}, assuming that is a PubChem ID...') 

388 ligand = { 'pubchem': str(ligand) } 

389 elif type(ligand) == str: 

390 raise InputError(f'A name of ligand has been identified: {ligand}. Anyway, provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.') 

391 # Check if we already have this ligand data 

392 ligand_data = obtain_ligand_data_from_file(json_ligands_data, ligand) 

393 # Check if the ligand didn't match before 

394 if check_matched_ligand(ligand, ligand_data, cache): continue 

395 # If we do not have its data try to get from the cache 

396 # if not ligand_data: 

397 # # If this is a ligand not in ligans.json but in cache it means it comes from PDB, never form user inputs 

398 # # For this reason, this ligand will always have a PubChem id 

399 # pubchem_id = ligand.get('pubchem', None) 

400 # if pubchem_id: 

401 # ligand_data = ligand_data_cache.get(pubchem_id, None) 

402 # # If we still have no ligand data then request it to PubChem 

403 # if not ligand_data: 

404 # ligand_data = obtain_ligand_data_from_pubchem(ligand) 

405 # # Save data mined from PubChem in the cache 

406 # pubchem_id = ligand_data['pubchem'] 

407 # ligand_data_cache[pubchem_id] = { **ligand_data } 

408 # cache.update(LIGANDS_DATA, ligand_data_cache) 

409 # Add current ligand data to the general list 

410 if not ligand_data: 

411 ligand_data = obtain_ligand_data_from_pubchem(ligand) 

412 ligands_data.append(ligand_data) 

413 # Get pubchem id 

414 pubchem_id = ligand_data.get('pubchem', None) 

415 # If we already visited a different ligand but with identical formula then we skip this ligand  

416 # Note that the mapping will be identical thus overwritting the previous map 

417 # However, ligands forced by the user are processed before so we keep them as priority 

418 formula = ligand_data.get('formula', None) 

419 if formula in visited_formulas: 

420 print(f'WARNING: Ligand with PubChem Id {pubchem_id} has a formula which has been already matched') 

421 # AGUS: en este punto si el usuario ha definido el mismo ligando con diferente selección quiere decir que está repetido 

422 # AGUS: y que deberíamos mantener el ligando para diferentes residuos matcheados 

423 if forced_selection: 

424 visited_formulas.pop() 

425 ligands_data.pop() 

426 print(f'WARNING: Ligand with PubChem Id {pubchem_id} has been forced by the user and its repeated') 

427 else: 

428 ligands_data.pop() 

429 continue 

430 # Add it to the list of visited formulas 

431 visited_formulas.append(formula) 

432 # If the user defined a ligand name, it will be respectedand added to the metadata 

433 # if there isn't a defined name, it will be mined from PubChem 

434 user_forced_ligand_name = ligand.get('name', None) 

435 # Map structure residues with the current ligand 

436 # If the user forced the residues then use them 

437 forced_selection = ligand.get('selection', None) 

438 if forced_selection: 

439 # Could be a single residue or a list of residues 

440 selection_atoms = structure.select(forced_selection) 

441 residue_indices = structure.get_selection_residue_indices(selection_atoms) 

442 ligand_map = { 'name': pubchem_id, 'residue_indices': residue_indices, 'match': { 'ref': { 'pubchem': pubchem_id } } } 

443 if user_forced_ligand_name: ligand_map['forced_name'] = user_forced_ligand_name 

444 # If the user did not force the residues then map them 

445 else: 

446 ligand_map = map_ligand_residues(structure, ligand_data) 

447 # Add current ligand map to the general list 

448 ligand_maps.append(ligand_map) 

449 # If the ligand did not map then discard it 

450 did_not_match = len(ligand_map['residue_indices']) == 0 

451 if did_not_match: 

452 is_forced_by_user = not bool(ligand.get('pdb', False)) 

453 if is_forced_by_user: 

454 # At this point we should have macthed all sequences 

455 # If not, kill the process unless mercy was given 

456 must_be_killed = LIGANDS_MATCH_FLAG not in mercy 

457 if must_be_killed: 

458 raise InputError(f'Ligand with PubChem Id {pubchem_id} did not map with any residue') 

459 # Otherwise simply remove it from the final ligand references file and the forwarded maps  

460 ligands_data.pop() 

461 ligand_maps.pop() 

462 # Update the cache with the pubchem id of the ligands that didn't match 

463 not_matched_pubchems = cache.retrieve(NOT_MATCHED_LIGANDS, []) 

464 not_matched_pubchems.append(ligand_map['name']) 

465 cache.update(NOT_MATCHED_LIGANDS, not_matched_pubchems) 

466 # If the ligand matched then calculate some additional data 

467 # Get morgan and mordred descriptors, the SMILES is needed for this part 

468 smiles = ligand_data['smiles'] 

469 mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block = obtain_mordred_morgan_descriptors(smiles) 

470 ligand_data['mordred'] = mordred_results 

471 ligand_data['morgan_fp_bit_array'] = morgan_fp_bit_array 

472 ligand_data['morgan_highlight_atoms'] = morgan_highlight_atoms 

473 ligand_data['mol_block'] = mol_block 

474 

475 # At this point maybe some ligands in the structure may not match because they have not been included by either the author or the user. Alternatively, the authors may have added them 

476 # So those "ligands" are residues that are not matched to any ligand in the structure and the analyses will not run 

477 # LORE AGUS: en el dataset de MDBind (cineca) hay muchas simulaciones con ligandos modificados (incluso solo un átomo) y no matchean  

478 # LORE AGUS: con los que están en el PDB, por lo que no se hace ningún análisis sobre estos (y se debería) 

479 # LORE AGUS: ahora se intenta buscar su pubchem ID para extraer la información a partir del inchikey (da menos problemas que el smiles) y se hacen los análisis pertinentes.  

480 # AGUS: si no se puede extraer información de estos porque el inchikey no funciona, se muestra la información más básica posible y se genera una referencia vacía (será problemático) 

481 

482 # Obtain a list of the possible residues in the structure that are not either protein/nucleic/water/lipid or matched ligands 

483 for residue in structure.residues: 

484 # Skip solvent, lipid or ions 

485 if residue.classification in ['solvent', 'fatty', 'steroid', 'ion']: continue 

486 # Make sure the residue is not bonded to other things 

487 # If we are missing bonds then we can not rely in this rule 

488 if residue.is_missing_any_bonds(): continue 

489 if len(residue.get_bonded_residue_indices()) > 0: continue 

490 # Check if the residue is already matched to a ligand 

491 matched = False 

492 for ligand_map in ligand_maps: 

493 if residue.index in ligand_map['residue_indices']: 

494 matched = True 

495 break 

496 # If the residue is not matched then add it to the ligands data 

497 if matched: continue 

498 # Create a new ligand data with the residue name and its atoms count 

499 ligand_data = { field: None for field in LIGAND_DATA_FIELDS } 

500 ligand_data['name'] = residue.name 

501 # Add the residue as a ligand map 

502 ligand_map = { 

503 'name': residue.name, 

504 'residue_indices': [residue.index], 

505 'match': { 'ref': { 'pubchem': None } } 

506 } 

507 

508 inchikey = inchikeys['residx_2_key'].get(residue.index, None) 

509 # If the InChIKey is not None then try to obtain the PubChem CID 

510 if inchikey: 

511 ligand_data['inchikey'] = inchikey 

512 if cid := search_cid_by_inchikey(inchikey): 

513 ligand_data = obtain_ligand_data_from_pubchem( {'pubchem': cid} ) 

514 smiles = ligand_data['smiles'] 

515 mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block = obtain_mordred_morgan_descriptors(smiles) 

516 ligand_data['mordred'] = mordred_results 

517 ligand_data['morgan_fp_bit_array'] = morgan_fp_bit_array 

518 ligand_data['morgan_highlight_atoms'] = morgan_highlight_atoms 

519 ligand_data['mol_block'] = mol_block 

520 ligand_data['pubchem'] = cid 

521 ligand_map['match']['ref']['pubchem'] = str(cid) 

522 # Add the ligand data to the list of ligands data 

523 ligands_data.append(ligand_data) 

524 # Add the ligand map to the list of ligand maps 

525 ligand_maps.append(ligand_map) 

526 

527 # Export ligands to a file 

528 save_json(ligands_data, output_filepath) 

529 

530 # Log matched ligands 

531 if not ligand_maps: 

532 print('No ligands were matched') 

533 else: 

534 print('Matched ligands:') 

535 for ligand_map in ligand_maps: 

536 residue_indices = ligand_map["residue_indices"] 

537 residue_count = len(residue_indices) 

538 plural_suffix = '' if residue_count == 1 else 's' 

539 print(f' - {ligand_map["name"]}: {residue_count} residue{plural_suffix}') 

540 

541 return ligand_maps 

542 

543# Set the expected ligand data fields 

544LIGAND_DATA_FIELDS = set(['name', 'pubchem', 'drugbank', 'chembl', 'smiles', 'formula', 'morgan', 'mordred', 'pdbid']) 

545 

546# Import ligands json file so we do not have to rerun this process 

547def import_ligands (output_filepath : str) -> dict: 

548 # Read the file 

549 imported_ligands = load_json(output_filepath) 

550 # Format data as the process expects to find it 

551 for imported_ligand in imported_ligands: 

552 for expected_field in LIGAND_DATA_FIELDS: 

553 if expected_field not in imported_ligand: 

554 imported_ligand[expected_field] = None 

555 return imported_ligands 

556 

557# Check if the current input ligand is already among the ligands we already have data for 

558def obtain_ligand_data_from_file ( ligands_data : list[dict], ligand: dict ) -> Optional[dict]: 

559 for ligand_data in ligands_data: 

560 if (ligand.get('pubchem') and ligand['pubchem'] == ligand_data.get('pubchem')) or \ 

561 (ligand.get('drugbank') and ligand['drugbank'] == ligand_data.get('drugbank')) or \ 

562 (ligand.get('chembl') and ligand['chembl'] == ligand_data.get('chembl')): 

563 return ligand_data 

564 return None 

565 

566# Given an input ligand, obtain all necessary data 

567def obtain_ligand_data_from_pubchem (ligand : dict) -> dict: 

568 # Save in a dictionary all ligand data including its name and ids 

569 # The ID can be of the databases: 'drugbank' , 'pubchem' , 'chembl' 

570 # Define the needed variables to check if the ligand has a database ID or it is None 

571 ligand_data = { 

572 'name': None, 

573 'pubchem': None, 

574 'drugbank': None, 

575 'chembl': None, 

576 'smiles': None, 

577 'formula': None, 

578 'pdbid': None, 

579 } 

580 # Set ligand data pubchem id, even if the input id is not from pubhcme (e.g. drugbank, chembl) 

581 if 'pubchem' in ligand: 

582 ligand_data['pubchem'] = str(ligand.get('pubchem')) 

583 elif 'drugbank' in ligand: 

584 ligand_data['drugbank'] = ligand.get('drugbank') 

585 ligand_data['pubchem'] = str(find_drugbank_pubchem(ligand_data['drugbank'])) 

586 elif 'chembl' in ligand: 

587 ligand_data['chembl'] = ligand.get('chembl') 

588 ligand_data['pubchem'] = str(find_chembl_pubchem(ligand_data['chembl'])) 

589 else: 

590 raise InputError('None of the ligand IDs are defined. Please provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.') 

591 

592 # Request ligand data from pubchem 

593 pubchem_data = get_pubchem_data(ligand_data['pubchem']) 

594 if not pubchem_data: 

595 raise RuntimeError('No PubChem data avilable') 

596 

597 # Add pubchem data to ligand data 

598 ligand_data = { **ligand_data, **pubchem_data } 

599 return ligand_data 

600 

601# RUBEN: mover a Structure.count_atom_elements_per_residue 

602def count_atom_elements_per_residue ( structure : 'Structure' ) -> dict: 

603 """For each residue, count the number of atom elements.""" 

604 # Obtain a list of residues of ligand candidates 

605 residue_element_count = {} 

606 # Iterate over the residue(s) 

607 for residue in structure.residues: 

608 # Obtain the list of elements for each residue 

609 atom_elements = [ atom.element for atom in residue.atoms ] 

610 # Generate a dict with elements (keys) and their counting (values) 

611 atoms_count = {} 

612 for atom in atom_elements: 

613 if atom in atoms_count: 

614 atoms_count[atom] += 1 

615 else: 

616 atoms_count[atom] = 1 

617 # Define the key of the dictionary as the residue name and save it in a list with the all the residues 

618 residue_element_count[residue] = atoms_count 

619 

620 return residue_element_count 

621 

622def nest_brackets(tokens, i = 0): 

623 l = [] 

624 while i < len(tokens): 

625 if tokens[i] == ')': 

626 return i,l 

627 elif tokens[i] == '(': 

628 i,subl = nest_brackets(tokens, i+1) 

629 l.append(subl) 

630 else: 

631 l.append(tokens[i]) 

632 i += 1 

633 return i,l 

634 

635def parse_compound(formula : str) -> list[str]: 

636 tokens = [''.join(t) for t in split_when(formula, lambda a,b: b.isupper() or b in '()' or (b.isdigit() and not a.isdigit()))] 

637 tokens = [(int(t) if t.isdigit() else t) for t in tokens] 

638 i, l = nest_brackets(tokens) 

639 assert(i == len(tokens)) # crash if unmatched ')' 

640 return l 

641 

642# Split a string using a function 

643def split_when(string : str, func : Callable) -> list[str]: 

644 splits = [] 

645 last_split = string[0] 

646 for s in string[1:]: 

647 if func(last_split, s): 

648 splits.append(last_split) 

649 last_split = s 

650 else: 

651 last_split += s 

652 splits.append(last_split) 

653 return splits 

654 

655# Given a chemical formula, get the count of atoms per element 

656def count_atom_elements (molecular_formula : str) -> dict: 

657 # Clean the formula from charges 

658 # These charges include numbers which are not atom counts (e.g. 49867169 -> C18H16NO5PS-2) 

659 parsed_molecular_formula = re.sub('[+-][0-9]*', '', molecular_formula) 

660 l = parse_compound(parsed_molecular_formula) 

661 c = parse_splits(l) 

662 return c 

663 

664# This function associates elements in a list 

665# If a string is followed by a number then they go together 

666# If a string has no number then the number is 1 for this specific string 

667def parse_splits (splits : list[str]) -> dict: 

668 parsed = {} 

669 previous_key = None 

670 for split in splits: 

671 if type(split) == str: 

672 if previous_key: 

673 parsed[previous_key] = 1 

674 previous_key = split 

675 elif type(split) == int: 

676 parsed[previous_key] = split 

677 previous_key = None 

678 else: 

679 raise ValueError('Not supported type ' + type(split)) 

680 if previous_key: 

681 parsed[previous_key] = 1 

682 return parsed 

683 

684def match_ligandsID_to_res (ligand_atom_element_count : dict, residue_atom_element_count : dict) -> bool: 

685 # Remove hydrogens since we only pay attention to heavy atoms 

686 # This is to avoid having mismatched_residues due to different protonation states 

687 if 'H' in ligand_atom_element_count: 

688 del ligand_atom_element_count['H'] 

689 if 'H' in residue_atom_element_count: 

690 del residue_atom_element_count['H'] 

691 #shared_items = {k: ligand_atom_element_count[k] for k in ligand_atom_element_count if k in residue_atom_element_count and ligand_atom_element_count[k] == residue_atom_element_count[k]} 

692 ligand_elements = set(ligand_atom_element_count.keys()) 

693 residue_elements = set(residue_atom_element_count.keys()) 

694 

695 # Check if there are different elements 

696 # If so, we are done 

697 different_keys = ligand_elements.symmetric_difference(residue_elements) 

698 if len(different_keys) > 0: 

699 # print("Elements that are not matching:") 

700 # for atom in different_keys: 

701 # if atom in ligand_atom_element_count: 

702 # print(f"In Pubchem {list(ligand_atom_element_count.keys())[0]} : {atom}: {ligand_atom_element_count[atom]}") 

703 # else: 

704 # print(f"In PDB residue {list(residue_atom_element_count.keys())[0]}: {atom}: {residue_atom_element_count[atom]}") 

705 return False 

706 

707 # Different values 

708 different_values = [] 

709 for element in ligand_elements: 

710 if ligand_atom_element_count[element] != residue_atom_element_count[element]: 

711 different_values.append(element) 

712 

713 # If the count of elements does not match then stop here 

714 if len(different_values) > 0: 

715 # Print the differences found between the elements 

716 # print("Number of atoms that are different:") 

717 # for atom in different_values: 

718 # print(f"In Pubchem {list(ligand_atom_element_count.keys())[0]} {atom}: {ligand_atom_element_count[atom]}, In PDB residue {list(pdb_dict.keys())[0]}: {atom}: {residue_atom_element_count[atom]}\n") 

719 return False 

720 

721 return True 

722 

723def map_ligand_residues (structure : 'Structure', ligand_data : dict) -> dict: 

724 # Load the structure where the atoms will be minresidue.indexed and obtain a residues dict 

725 atom_elements_per_residue = count_atom_elements_per_residue(structure) 

726 # Get the ligand formula 

727 ligand_formula = ligand_data['formula'] 

728 if not ligand_formula: 

729 raise RuntimeError(f'Ligand with PubChem id {ligand_data["pubchem"]} is missing formula') 

730 # From pubchem mine the atoms of the ligands and save it in a dict 

731 ligand_atom_element_count = count_atom_elements(ligand_data['formula']) 

732 matched_residues = [] 

733 # Get ligand pubchem id 

734 pubchem_id = ligand_data['pubchem'] 

735 for residue, residue_atom_element_count in atom_elements_per_residue.items(): 

736 if match_ligandsID_to_res(ligand_atom_element_count, residue_atom_element_count): 

737 matched_residues.append(residue.index) 

738 # Format the output as we expect it 

739 ligand_map = { 'name': pubchem_id, 'residue_indices': matched_residues, 'match': { 'ref': { 'pubchem': pubchem_id } } } 

740 return ligand_map 

741 

742# Given a smiles, get the pubchem id 

743# e.g. CC1=C(C=NC=C1)NC(=O)CC2=CC(=CC(=C2)Cl)OC -> 154876006 

744# e.g. O=C4N3C(C(=O)Nc2cc(nn2c1ccccc1)C)C(SC3CC=CC4NC(=O)C(NC)C)(C)C -> None 

745def smiles_to_pubchem_id (smiles : str) -> Optional[str]: 

746 # Set the request URL 

747 request_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON' 

748 # Set the POST data 

749 data = urlencode({ 'smiles': smiles }).encode() 

750 try: 

751 with urlopen(request_url, data=data) as response: 

752 parsed_response = json.loads(response.read().decode("utf-8")) 

753 # If the smiles is not found in pubchem then we can stop here 

754 except HTTPError as error: 

755 if error.code == 404: 

756 print(f' Smiles {smiles} not found') 

757 return None 

758 else: 

759 raise ValueError('Something went wrong with the PubChem request: ' + request_url) 

760 # Get the PubChem id 

761 compounds = parsed_response.get('PC_Compounds', None) 

762 if not compounds: 

763 raise RuntimeError(f'Something went wrong when mining pubchem data for SMILES {smiles}') 

764 # Make sure there is only 1 compound 

765 # DANI: Esto algún día podría no ser así, ya lo gestionaremos 

766 if len(compounds) != 1: 

767 raise RuntimeError(f'There should be one and only one compound for SMILES {smiles}') 

768 # Keep mining 

769 compound = compounds[0] 

770 first_id = compound.get('id', None) 

771 # If there is no first id then it means there is no direct match with pubchem 

772 if not first_id: 

773 return None 

774 second_id = first_id.get('id', None) 

775 if not second_id: 

776 raise RuntimeError(f'Missing second id when mining pubchem data for SMILES {smiles}') 

777 pubchem_id = second_id.get('cid', None) 

778 if not pubchem_id: 

779 raise RuntimeError(f'Missing pubchem id when mining pubchem data for SMILES {smiles}') 

780 return str(pubchem_id) 

781 

782# Given a PDB ligand code, get its pubchem 

783def pdb_ligand_to_pubchem (pdb_ligand_id : str) -> Optional[str]: 

784 # Set the request query 

785 query = '''query molecule($id:String!){ 

786 chem_comp(comp_id:$id) { rcsb_chem_comp_related{ resource_name resource_accession_code } } 

787 }''' 

788 # Request PDB data 

789 parsed_response = request_pdb_data(pdb_ligand_id, query) 

790 related_resources = parsed_response['rcsb_chem_comp_related'] 

791 # It may happend that a ligand code has no related resources at all 

792 # e.g. ZN 

793 if not related_resources: return None 

794 pubchem_resource = next((resource for resource in related_resources if resource['resource_name'] == 'PubChem'), None) 

795 if not pubchem_resource: return None 

796 return pubchem_resource['resource_accession_code'] 

797 

798# Given a PDB ligand code, get its pubchem 

799# Use a web crawler to avoid having to use the PDB API 

800def pdb_ligand_to_pubchem_RAW (pdb_ligand_id : str) -> Optional[str]: 

801 # Set the request URL 

802 request_url = f'https://www.rcsb.org/ligand/{pdb_ligand_id}' 

803 # Run the query 

804 parsed_response = None 

805 try: 

806 with urlopen(request_url) as response: 

807 parsed_response = response.read().decode("utf-8") 

808 # If the accession is not found in the PDB then we can stop here 

809 except HTTPError as error: 

810 if error.code == 404: 

811 print(f' PDB ligand {pdb_ligand_id} not found') 

812 return None 

813 else: 

814 print(error.msg) 

815 raise ValueError('Something went wrong with the PDB ligand request: ' + request_url) 

816 # Mine the pubchem id out of the whole response 

817 pattern = re.compile('pubchem.ncbi.nlm.nih.gov\/compound\/([0-9]*)\"') 

818 match = re.search(pattern, parsed_response) 

819 # If there is no pubchem id then return none 

820 # This is normal for some ligands such as counter ions (e.g. LI) 

821 if not match: 

822 return None 

823 pubchem_id = match[1] 

824 return pubchem_id 

825 

826# Given a PDB ligand code, get its pubchem 

827# Ask to pubchem if the ligand exists and hope there is only one result 

828# DANI: No se ha provado a fondo 

829def pdb_ligand_to_pubchem_RAW_RAW (pdb_ligand_id : str) -> Optional[str]: 

830 # Set the request URL 

831 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{pdb_ligand_id}/json' 

832 # Run the query 

833 parsed_response = None 

834 try: 

835 with urlopen(request_url) as response: 

836 parsed_response = json.loads(response.read().decode("utf-8")) 

837 # If the accession is not found in the PDB then we can stop here 

838 except HTTPError as error: 

839 # This may happen for weird things such as UNX (unknown atom or ion) 

840 if error.code == 404: 

841 return None 

842 print(error.msg) 

843 raise RuntimeError('Something went wrong with the PDB ligand request in PubChem: ' + request_url) 

844 # Mine the pubchem id 

845 compounds = parsed_response['PC_Compounds'] 

846 if len(compounds) != 1: 

847 # There could be more than one result 

848 # For example, the case HEM: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/HEM/json  

849 # In this case we picked the first result 

850 compound = compounds[0] 

851 #raise RuntimeError('We are not having 1 and only 1 result from PubChem: ' + request_url) 

852 compound = compounds[0] 

853 id1 = compound['id'] 

854 id2 = id1['id'] 

855 pubchem_id = str(id2['cid']) 

856 return pubchem_id 

857 

858# Given a PDB id, get all its ligand codes 

859# e.g. 2I3I -> 618, BTB, ZN, EDO, LI 

860def get_pdb_ligand_codes (pdb_id : str) -> list[str]: 

861 # Set the request query 

862 query = '''query structure($id: String!) { 

863 entry(entry_id: $id) { 

864 nonpolymer_entities { nonpolymer_comp { chem_comp { id } } } 

865 } 

866 }''' 

867 # Request PDB data 

868 parsed_response = request_pdb_data(pdb_id, query) 

869 # Mine data for nonpolymer entities 

870 nonpolymers = parsed_response['nonpolymer_entities'] 

871 # If there are no nonpolymer entities, another type of entitie could be used 

872 # AGUS: esto es un caso muy concreto que me encontré con el PDB 1N3W 

873 # AGUS: el ligando en este caso es una 'Biologically Interesting Molecules' y se muestran como 'PRD_' 

874 prd_code = None 

875 if nonpolymers == None: 

876 # Get the prd ligand code 

877 prd_code = get_prd_ligand_code(pdb_id) 

878 if prd_code != None: 

879 return [prd_code] 

880 

881 if nonpolymers == None and prd_code == None: return [] 

882 # Iterate nonpolymer entities to mine each PDB code 

883 ligand_codes = [] 

884 for nonpolymer in nonpolymers: 

885 ligand_code = nonpolymer['nonpolymer_comp']['chem_comp']['id'] 

886 ligand_codes.append(ligand_code) 

887 print(f' Ligand codes for PDB id {pdb_id}: ' + ', '.join(ligand_codes)) 

888 return ligand_codes 

889 

890# Given a PDB id, get its PRD ligand code 

891def get_prd_ligand_code (pdb_id : str) -> Optional[str]: 

892 query = '''query structure($id: String!) { 

893 entry(entry_id: $id) { 

894 pdbx_molecule_features { 

895 prd_id 

896 } 

897 } 

898 }''' 

899 parsed_response = request_pdb_data(pdb_id, query) 

900 pdbx_id = parsed_response.get('pdbx_molecule_features', None) 

901 if pdbx_id is None: 

902 print(f'WARNING: Cannot find PRD ligand code for PDB id {pdb_id}') 

903 return None 

904 prd_id = pdbx_id[0].get('prd_id', None) # AGUS: podría haber casos donde haya más de uno?  

905 if prd_id is None: 

906 print(f'WARNING: Cannot find PRD ligand code for PDB id {pdb_id}') 

907 return None 

908 # If the PRD id is not empty then return it 

909 return prd_id 

910 

911# Given a pdb id, get its pubchem ids 

912# DANI: De momento no usamos las SMILES que alguna vez me han dado problemas (e.g. 2I3I) 

913# e.g. 4YDF ->  

914def pdb_to_pubchem (pdb_id : str) -> list[str]: 

915 print(f'Searching PubChem ids for PDB {pdb_id}') 

916 pubchem_ids = [] 

917 # Iterate over pdb ligand codes 

918 ligand_codes = get_pdb_ligand_codes(pdb_id) 

919 for ligand_code in ligand_codes: 

920 # Ask the PDB API for the ligand 

921 pubchem_id = pdb_ligand_to_pubchem(ligand_code) 

922 # If this did not work then try mining the PDB client with a web crawler 

923 if not pubchem_id: 

924 pubchem_id = pdb_ligand_to_pubchem_RAW(ligand_code) 

925 # If this did not work then try it from PubChem 

926 if not pubchem_id: 

927 pubchem_id = pdb_ligand_to_pubchem_RAW_RAW(ligand_code) 

928 # Otherwise we surrender 

929 if not pubchem_id: 

930 print(f' {ligand_code} -> No PubChem id') 

931 continue 

932 print(f' {ligand_code} -> {pubchem_id}') 

933 pubchem_ids.append(pubchem_id) 

934 

935 return pubchem_ids 

936 

937@lru_cache(maxsize=None) 

938def search_cid_by_inchikey(inchikey : str) -> Optional[str]: 

939 url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/cids/JSON" 

940 r = requests.get(url) 

941 if r.ok: 

942 data = r.json() 

943 return data['IdentifierList']['CID'][0] 

944 else: 

945 return None