Coverage for model_workflow/tools/generate_ligands_desc.py: 62%

547 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-23 10:54 +0000

1import os 

2import json 

3from rdkit import Chem 

4from rdkit.Chem import AllChem 

5from rdkit.Chem import rdFingerprintGenerator 

6from mordred import Calculator, descriptors 

7from model_workflow.utils.constants import LIGANDS_MATCH_FLAG, PDB_TO_PUBCHEM, NOT_MATCHED_LIGANDS 

8from model_workflow.utils.auxiliar import InputError, load_json, save_json, request_pdb_data 

9from model_workflow.utils.type_hints import * 

10from model_workflow.utils.structures import Structure 

11from urllib.request import Request, urlopen 

12from urllib.parse import urlencode 

13from urllib.error import HTTPError, URLError 

14import re 

15 

16def get_drugbank_smiles (id_drugbank : str) -> Optional[str]: 

17 # Request Drugbank 

18 request_url = Request( 

19 url= f'https://go.drugbank.com/structures/small_molecule_drugs/{id_drugbank}.smiles', 

20 headers={'User-Agent': 'Mozilla/5.0'} 

21 ) 

22 try: 

23 with urlopen(request_url) as response: 

24 smiles = response.read() 

25 # If the accession is not found in the database then we stop here 

26 except HTTPError as error: 

27 # If the drugbank ID is not yet in the Drugbank references then return None 

28 if error.code == 404: 

29 return None 

30 else: 

31 print('Error when requesting ' + request_url) 

32 raise ValueError('Something went wrong with the Drugbank request (error ' + str(error.code) + ')') 

33 # This error may occur if there is no internet connection 

34 except URLError as error: 

35 print('Error when requesting ' + request_url) 

36 raise ValueError('Something went wrong with the MDposit request') 

37 

38 return smiles 

39 

40 

41 

42# Given a ChemBL ID, use the uniprot API to request its data and then mine what is needed for the database 

43def get_chembl_smiles (id_chembl : str) -> Optional[str]: 

44 # Request ChemBL 

45 parsed_response = None 

46 request_url = Request( 

47 url= f'https://www.ebi.ac.uk/chembl/interface_api/es_proxy/es_data/get_es_document/chembl_molecule/{id_chembl}', 

48 headers={'User-Agent': 'Mozilla/5.0'} 

49 ) 

50 try: 

51 with urlopen(request_url) as response: 

52 parsed_response = json.loads(response.read().decode("utf-8")) 

53 smiles = parsed_response['_source']['molecule_structures']['canonical_smiles'] 

54 pubchem_id = parsed_response['_source']['_metadata']['unichem'][8]['id'] 

55 # If the accession is not found in ChemBL then the id is not valid 

56 except HTTPError as error: 

57 if error.code == 404: 

58 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl) 

59 return None 

60 print('Error when requesting ' + request_url) 

61 raise ValueError('Something went wrong with the ChemBL request (error ' + str(error.code) + ')') 

62 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607) 

63 if parsed_response == None: 

64 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl) 

65 return None 

66 return smiles 

67 

68# Given a PubChem ID, use the uniprot API to request its data and then mine what is needed for the database 

69def get_pubchem_data (id_pubchem : str) -> Optional[dict]: 

70 # Request PubChem 

71 parsed_response = None 

72 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{id_pubchem}/JSON/' 

73 try: 

74 with urlopen(request_url) as response: 

75 #parsed_response = json.loads(response.read().decode("windows-1252")) 

76 parsed_response = json.loads(response.read().decode("utf-8", errors='ignore')) 

77 # If the accession is not found in PubChem then the id is not valid 

78 # This may happen with pubchem ids of non-discrete compounds (e.g. 483927498) 

79 except HTTPError as error: 

80 if error.code == 404: 

81 print('WARNING: Cannot find PubChem entry for accession ', id_pubchem) 

82 return None 

83 print('Error when requesting ', request_url) 

84 raise ValueError('Something went wrong with the PubChem request (error ', str(error.code), ')') 

85 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607) 

86 if parsed_response == None: 

87 print('WARNING: Cannot find PubChem entry for accession ' + id_pubchem) 

88 return None 

89 # Mine target data: SMILES 

90 record = parsed_response.get('Record', None) 

91 if record == None: 

92 raise RuntimeError('Wrong Pubchem data structure: no record: ' + request_url) 

93 sections = record.get('Section', None) 

94 if sections == None: 

95 raise RuntimeError('Wrong Pubchem data structure: no sections: ' + request_url) 

96 names_and_ids_section = next((section for section in sections if section.get('TOCHeading', None) == 'Names and Identifiers'), None) 

97 if names_and_ids_section == None: 

98 raise RuntimeError('Wrong Pubchem data structure: no name and ids section: ' + request_url) 

99 names_and_ids_subsections = names_and_ids_section.get('Section', None) 

100 if names_and_ids_subsections == None: 

101 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url) 

102 

103 # Mine the name 

104 synonims = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Synonyms'), None) 

105 if synonims == None: 

106 descriptors = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None) 

107 descriptors_subsections = descriptors.get('Section', None) 

108 if descriptors_subsections == None: 

109 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url) 

110 depositor_supplied_descriptors = next((s for s in descriptors_subsections if s.get('TOCHeading', None) == 'IUPAC Name'), None) 

111 name_substance = depositor_supplied_descriptors.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

112 else: 

113 synonims_subsections = synonims.get('Section', None) 

114 if synonims_subsections == None: 

115 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url) 

116 depositor_supplied_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Depositor-Supplied Synonyms'), None) 

117 if depositor_supplied_synonims == None: 

118 removed_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Removed Synonyms'), None) 

119 name_substance = removed_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

120 else: 

121 name_substance = depositor_supplied_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

122 

123 # Mine the SMILES 

124 computed_descriptors_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None) 

125 if computed_descriptors_subsection == None: 

126 raise RuntimeError('Wrong Pubchem data structure: no computeed descriptors: ' + request_url) 

127 canonical_smiles_section = computed_descriptors_subsection.get('Section', None) 

128 if canonical_smiles_section == None: 

129 raise RuntimeError('Wrong Pubchem data structure: no canonical SMILES section: ' + request_url) 

130 canonical_smiles = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'Canonical SMILES'), None) 

131 if canonical_smiles == None: 

132 # In some cases there is no canonical SMILES but a non-canonical one could exists 

133 non_canonical_smiles_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'SMILES'), None) 

134 if non_canonical_smiles_section == None: 

135 raise RuntimeError('Wrong Pubchem data structure: no canonical SMILES: ' + request_url) 

136 

137 if canonical_smiles: 

138 smiles = canonical_smiles.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

139 if non_canonical_smiles_section: 

140 smiles = non_canonical_smiles_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

141 

142 if smiles == None: 

143 raise RuntimeError('Wrong Pubchem data structure: no SMILES: ' + request_url) 

144 

145 # Mine target data: MOLECULAR FORMULA 

146 molecular_formula_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Molecular Formula'), None) 

147 if molecular_formula_subsection == None: 

148 raise RuntimeError('Wrong Pubchem data structure: no molecular formula section: ' + request_url) 

149 molecular_formula = molecular_formula_subsection.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

150 if molecular_formula == None: 

151 raise RuntimeError('Wrong Pubchem data structure: no molecular formula: ' + request_url) 

152 

153 # Mine target data: PDB ID 

154 pdb_id = None 

155 pdb_id_subsection = next((s for s in sections if s.get('TOCHeading', None) == 'Interactions and Pathways'), None) 

156 # If this section is missing then it means this PubChem compound has no PDB id 

157 if pdb_id_subsection: 

158 pdb_id_subsections = pdb_id_subsection.get('Section', None) 

159 if pdb_id_subsections == None: 

160 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url) 

161 bond_structures = next((s for s in pdb_id_subsections if s.get('TOCHeading', None) == 'Protein Bound 3D Structures'), None) 

162 if bond_structures: 

163 bond_structures_section = bond_structures.get('Section', None) 

164 # If this section is missing then it means this PubChem compound has no PDB id 

165 if bond_structures_section: 

166 ligands_structure = next((s for s in bond_structures_section if s.get('TOCHeading', None) == 'Ligands from Protein Bound 3D Structures'), None) 

167 if ligands_structure == None: 

168 raise RuntimeError('Wrong Pubchem data structure: no Ligands from Protein Bound 3D Structures section: ' + request_url) 

169 ligands_structure_subsections = ligands_structure.get('Section', None) 

170 if ligands_structure_subsections == None: 

171 raise RuntimeError('Wrong Pubchem data structure: no Ligands from Protein Bound 3D Structures subsections: ' + request_url) 

172 ligands_pdb = next((s for s in ligands_structure_subsections if s.get('TOCHeading', None) == 'PDBe Ligand Code'), None) 

173 if ligands_pdb == None: 

174 raise RuntimeError('Wrong Pubchem data structure: no PDBe Ligand Code: ' + request_url) 

175 pdb_id = ligands_pdb.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

176 

177 # Mine de INCHI and INCHIKEY 

178 inchi_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChI'), None) 

179 if inchi_section == None: 

180 raise RuntimeError('Wrong Pubchem data structure: no InChI: ' + request_url) 

181 if inchi_section: 

182 inchi = inchi_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

183 

184 inchikey_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChIKey'), None) 

185 if inchikey_section == None: 

186 raise RuntimeError('Wrong Pubchem data structure: no InChIKey: ' + request_url) 

187 if inchikey_section: 

188 inchikey = inchikey_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None) 

189 # Prepare the pubchem data to be returned 

190 return { 'name': name_substance, 'smiles': smiles, 'formula': molecular_formula , 'pdbid': pdb_id, 'inchi': inchi, 'inchikey': inchikey } 

191 

192 

193def find_drugbank_pubchem (drugbank_id): 

194 # Request Drugbank 

195 request_url = Request( 

196 url=f'https://go.drugbank.com/drugs/{drugbank_id}', 

197 headers={ 

198 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 

199 'Accept-Language': 'en-US,en;q=0.9', 

200 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' 

201 } 

202) 

203 pubchem_id = None 

204 try: 

205 with urlopen(request_url) as response: 

206 content = response.read().decode("utf-8") 

207 pattern = re.compile("http\:\/\/pubchem.ncbi.nlm.nih.gov\/summary\/summary.cgi\?cid\=([0-9]*)") 

208 match = re.search(pattern, str(content)) 

209 if not match: 

210 raise ValueError("No se encontró información sobre el Pubchem compound en esta página.") 

211 pubchem_id = match[1] 

212 if not pubchem_id: 

213 raise ValueError("No se encontró información sobre el Pubchem compound en esta página.") 

214 # If the accession is not found in the database then we stop here 

215 except HTTPError as error: 

216 # If the drugbank ID is not yet in the Drugbank references then return None 

217 raise ValueError(f'Wrong request. Code: {error.code}') 

218 # This error may occur if there is no internet connection 

219 except URLError as error: 

220 print('Error when requesting ' + request_url) 

221 raise ValueError('Something went wrong with the DrugBank request') 

222 

223 return pubchem_id 

224 

225 

226def find_chembl_pubchem (id_chembl): 

227 # Request ChemBL 

228 parsed_response = None 

229 request_url = Request( 

230 url= f'https://www.ebi.ac.uk/chembl/interface_api/es_proxy/es_data/get_es_document/chembl_molecule/{id_chembl}', 

231 headers={'User-Agent': 'Mozilla/5.0'} 

232 ) 

233 try: 

234 with urlopen(request_url) as response: 

235 parsed_response = json.loads(response.read().decode("utf-8")) 

236 unichem = parsed_response['_source']['_metadata']['unichem'] 

237 if unichem == None: 

238 raise RuntimeError('Wrong Pubchem data structure: no unichem section') 

239 pubchem_section = next((section for section in unichem if section.get('src_url', None) == "http://pubchem.ncbi.nlm.nih.gov"), None) 

240 if pubchem_section == None: 

241 raise RuntimeError('Wrong Pubchem data structure: no pubchem section') 

242 pubchem_id = pubchem_section.get('id', None) 

243 if pubchem_id == None: 

244 raise RuntimeError('Wrong Pubchem data structure: no pubchem ID') 

245 # If the accession is not found in ChemBL then the id is not valid 

246 except HTTPError as error: 

247 if error.code == 404: 

248 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl) 

249 return None 

250 print('Error when requesting ' + request_url) 

251 raise ValueError('Something went wrong with the ChemBL request (error ' + str(error.code) + ')') 

252 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607) 

253 if parsed_response == None: 

254 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl) 

255 return None 

256 

257 return pubchem_id 

258 

259# Calculate the Morgan fingerprint and the Mordred descriptors from a ligand SMILES 

260def obtain_mordred_morgan_descriptors (smiles : str) -> Tuple: 

261 mol = Chem.MolFromSmiles(smiles) 

262 if mol is None: 

263 print(f'WARNING: Cannot generate a molecule from SMILES {smiles}') 

264 return None 

265 

266 AllChem.Compute2DCoords(mol) 

267 mol_block = Chem.MolToMolBlock(mol) 

268 # We can select the different submodules of mordred descriptors, avaible in: 'https://mordred-descriptor.github.io/documentation/master/' 

269 calc = Calculator([ 

270 descriptors.ABCIndex, # Índice de ramificación 

271 descriptors.AcidBase.AcidicGroupCount, # Grupos ácidos 

272 descriptors.AcidBase.BasicGroupCount, # Grupos básicos 

273 descriptors.RingCount, # Conteo de anillos 

274 descriptors.Constitutional, # Propiedades generales como número de átomos, peso molecular 

275 descriptors.TopologicalCharge, # Índices topológicos, Cargas parciales, polaridad 

276 descriptors.HydrogenBond, # Donantes y aceptores de enlaces de hidrógeno 

277 descriptors.Lipinski, # Reglas de Lipinski (drug-likeness) 

278 descriptors.FragmentComplexity, # Identificación de subestructuras frecuentes 

279 descriptors.PathCount, # Conteo de caminos moleculares 

280 ], ignore_3D=True) 

281 

282 

283 # Calculate Mordred results 

284 mordred_results = calc(mol).drop_missing().asdict() 

285 

286 ######## MORGAN FINGERPRINT ########### 

287 morgan_fp_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048) 

288 ao = rdFingerprintGenerator.AdditionalOutput() 

289 ao.AllocateAtomCounts() 

290 ao.AllocateAtomToBits() 

291 ao.AllocateBitInfoMap() 

292 fp = morgan_fp_gen.GetFingerprint(mol,additionalOutput=ao) 

293 morgan_fp_bit_array = list(fp) 

294 morgan_highlight_atoms = {} 

295 for bit, atoms in ao.GetBitInfoMap().items(): 

296 morgan_highlight_atoms[bit] = list(set(atom for atom, radius in atoms)) 

297 

298 return mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block 

299 

300# Check the cache to see if the ligand has already been matched or not, if not the ligand is skipped 

301def check_matched_ligand (ligand: dict, ligand_data: dict, cache: 'Cache') -> bool: 

302 # The pubchem id could be in the ligands data or the ligand input 

303 pubchem_id_1 = ligand.get('pubchem', None) 

304 if ligand_data != None: 

305 pubchem_id_2 = ligand_data.get('pubchem', None) 

306 else: 

307 pubchem_id_2 = None 

308 # If we have no pubchem id then we cannot check if the ligand matched 

309 if not pubchem_id_1 and not pubchem_id_2: 

310 return False 

311 # If we have a pubchem id then check if it is already in the cache 

312 cache_not_matched = cache.retrieve(NOT_MATCHED_LIGANDS, []) 

313 for pubchem in cache_not_matched: 

314 if pubchem == pubchem_id_1 or pubchem == pubchem_id_2: 

315 print(f" Ligand with PubChem id {pubchem} didn't match before, skipping it") 

316 return True 

317 

318 return False 

319 

320def generate_ligand_mapping ( 

321 structure : 'Structure', 

322 cache : 'Cache', 

323 input_ligands : Optional[List[dict]], 

324 pdb_ids : List[str], 

325 output_filepath : str, 

326 mercy : List[str] = [], 

327 ) -> List[dict]: 

328 

329 """Generate a map of residues associated to ligands.""" 

330 # Merge input ligands and pdb ligands 

331 ligands = [] 

332 if input_ligands: 

333 ligands += input_ligands 

334 # Check we have cached pdb 2 pubchem values 

335 pdb_to_pubchem_cache = cache.retrieve(PDB_TO_PUBCHEM, {}) 

336 new_data_to_cache = False 

337 # Get input ligands from the pdb ids, if any 

338 for pdb_id in pdb_ids: 

339 # Check we have cached this specific pdb 

340 pubchem_ids_from_pdb = pdb_to_pubchem_cache.get(pdb_id, None) 

341 if pubchem_ids_from_pdb != None: 

342 print(f' Retriving from cache PubChem ids for PDB id {pdb_id}: ') 

343 if len(pubchem_ids_from_pdb) > 0: 

344 print(' PubChem ids: ' + ', '.join(pubchem_ids_from_pdb)) 

345 else: 

346 print(' This PDB id has no PubChem ids') 

347 

348 # If we had no cached pdb 2 pubchem then ask for them 

349 if pubchem_ids_from_pdb == None: 

350 pubchem_ids_from_pdb = pdb_to_pubchem(pdb_id) 

351 # Save the result in the cache object so it is saved to cache later 

352 pdb_to_pubchem_cache[pdb_id] = pubchem_ids_from_pdb 

353 new_data_to_cache = True 

354 for pubchem_id in pubchem_ids_from_pdb: 

355 # Ligands in the structure (PDB) and the 'inputs.json' could be the same so it's not necessary to do it twice 

356 if not any('pubchem' in ligand and ligand['pubchem'] == pubchem_id for ligand in ligands): 

357 ligands.append({ 'pubchem': pubchem_id, 'pdb': True }) 

358 # Save all pdb to pubchem results in cache, in case there is anything new 

359 if new_data_to_cache: cache.update(PDB_TO_PUBCHEM, pdb_to_pubchem_cache) 

360 

361 # If no input ligands are passed then stop here 

362 if len(ligands) == 0: 

363 return [] 

364 

365 # Save data from all ligands to be saved in a file 

366 json_ligands_data = [] 

367 ligands_data = [] 

368 

369 # Load the ligands file if exists already 

370 if os.path.exists(output_filepath): 

371 json_ligands_data += import_ligands(output_filepath) 

372 # If json ligands exists and it is empty means that ligands analysis has been already done but no ligands were matched 

373 # so the file will contain an empty list [] 

374 if len(json_ligands_data) == 0: 

375 print('No ligands have been matched yet.\nIf you want to force a ligand to be matched, please provide the field "residues" in the inputs.json file.') 

376 return [] 

377 

378 # Visited formulas 

379 visited_formulas = [] 

380 # Save the maps of every ligand 

381 ligand_maps = [] 

382 # Get cached ligand data 

383 # AGUS: esto lo creamos por alguna razón para que funcione sin internet (en el cluster) pero realmente 

384 # AGUS: llena el cache con datos que no son necesarios porque toda esa info está en el ligand_references.json 

385 #ligand_data_cache = cache.retrieve(LIGANDS_DATA, {}) 

386 # Iterate input ligands 

387 for ligand in ligands: 

388 # Set the pubchem id which may be assigned in different steps 

389 pubchem_id = None 

390 # If input ligand is not a dict but a single int/string then handle it 

391 if type(ligand) == int: 

392 print(f'A ligand number ID has been identified {ligand}, assuming that is a PubChem ID...') 

393 ligand = { 'pubchem': str(ligand) } 

394 elif type(ligand) == str: 

395 raise InputError(f'A name of ligand has been identified: {ligand}. Anyway, provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.') 

396 # Check if we already have this ligand data 

397 ligand_data = obtain_ligand_data_from_file(json_ligands_data, ligand) 

398 # Check if the ligand didn't match before 

399 if check_matched_ligand(ligand, ligand_data, cache): continue 

400 # If we do not have its data try to get from the cache 

401 # if not ligand_data: 

402 # # If this is a ligand not in ligans.json but in cache it means it comes from PDB, never form user inputs 

403 # # For this reason, this ligand will always have a PubChem id 

404 # pubchem_id = ligand.get('pubchem', None) 

405 # if pubchem_id: 

406 # ligand_data = ligand_data_cache.get(pubchem_id, None) 

407 # # If we still have no ligand data then request it to PubChem 

408 # if not ligand_data: 

409 # ligand_data = obtain_ligand_data_from_pubchem(ligand) 

410 # # Save data mined from PubChem in the cache 

411 # pubchem_id = ligand_data['pubchem'] 

412 # ligand_data_cache[pubchem_id] = { **ligand_data } 

413 # cache.update(LIGANDS_DATA, ligand_data_cache) 

414 # Add current ligand data to the general list 

415 if not ligand_data: 

416 ligand_data = obtain_ligand_data_from_pubchem(ligand) 

417 ligands_data.append(ligand_data) 

418 # Get pubchem id 

419 pubchem_id = ligand_data.get('pubchem', None) 

420 # If we already visited a different ligand but with identical formula then we skip this ligand  

421 # Note that the mapping will be identical thus overwritting the previous map 

422 # However, ligands forced by the user are processed before so we keep them as priority 

423 formula = ligand_data.get('formula', None) 

424 if formula in visited_formulas: 

425 print(f'WARNING: Ligand with PubChem Id {pubchem_id} has a formula which has been already matched') 

426 # AGUS: en este punto si el usuario ha definido el mismo ligando con diferente selección quiere decir que está repetido 

427 # AGUS: y que deberíamos mantener el ligando para diferentes residuos matcheados 

428 if forced_selection: 

429 visited_formulas.pop() 

430 ligands_data.pop() 

431 print(f'WARNING: Ligand with PubChem Id {pubchem_id} has been forced by the user and its repeated') 

432 else: 

433 ligands_data.pop() 

434 continue 

435 # Add it to the list of visited formulas 

436 visited_formulas.append(formula) 

437 # If the user defined a ligand name, it will be respectedand added to the metadata 

438 # if there isn't a defined name, it will be mined from PubChem 

439 user_forced_ligand_name = ligand.get('name', None) 

440 # Map structure residues with the current ligand 

441 # If the user forced the residues then use them 

442 forced_selection = ligand.get('selection', None) 

443 if forced_selection: 

444 # Could be a single residue or a list of residues 

445 selection_atoms = structure.select(forced_selection) 

446 residue_indices = structure.get_selection_residue_indices(selection_atoms) 

447 ligand_map = { 'name': pubchem_id, 'residue_indices': residue_indices, 'match': { 'ref': { 'pubchem': pubchem_id } } } 

448 if user_forced_ligand_name: ligand_map['forced_name'] = user_forced_ligand_name 

449 # If the user did not force the residues then map them 

450 else: 

451 ligand_map = map_ligand_residues(structure, ligand_data) 

452 # Add current ligand map to the general list 

453 ligand_maps.append(ligand_map) 

454 # If the ligand did not map then discard it 

455 did_not_match = len(ligand_map['residue_indices']) == 0 

456 if did_not_match: 

457 is_forced_by_user = not bool(ligand.get('pdb', False)) 

458 if is_forced_by_user: 

459 # At this point we should have macthed all sequences 

460 # If not, kill the process unless mercy was given 

461 must_be_killed = LIGANDS_MATCH_FLAG not in mercy 

462 if must_be_killed: 

463 raise InputError(f'Ligand with PubChem Id {pubchem_id} did not map with any residue') 

464 # Otherwise simply remove it from the final ligand references file and the forwarded maps  

465 ligands_data.pop() 

466 ligand_maps.pop() 

467 # Update the cache with the pubchem id of the ligands that didn't match 

468 not_matched_pubchems = cache.retrieve(NOT_MATCHED_LIGANDS, []) 

469 not_matched_pubchems.append(ligand_map['name']) 

470 cache.update(NOT_MATCHED_LIGANDS, not_matched_pubchems) 

471 # If the ligand matched then calculate some additional data 

472 # Get morgan and mordred descriptors, the SMILES is needed for this part 

473 smiles = ligand_data['smiles'] 

474 mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block = obtain_mordred_morgan_descriptors(smiles) 

475 ligand_data['mordred'] = mordred_results 

476 ligand_data['morgan_fp_bit_array'] = morgan_fp_bit_array 

477 ligand_data['morgan_highlight_atoms'] = morgan_highlight_atoms 

478 ligand_data['mol_block'] = mol_block 

479 # Export ligands to a file 

480 save_json(ligands_data, output_filepath) 

481 

482 # Log matched ligands 

483 if not ligand_maps: 

484 print('No ligands were matched') 

485 else: 

486 print('Matched ligands:') 

487 for ligand_map in ligand_maps: 

488 residue_indices = ligand_map["residue_indices"] 

489 residue_count = len(residue_indices) 

490 plural_suffix = '' if residue_count == 1 else 's' 

491 print(f' - {ligand_map["name"]}: {residue_count} residue{plural_suffix}') 

492 

493 return ligand_maps 

494 

495# Set the expected ligand data fields 

496LIGAND_DATA_FIELDS = set(['name', 'pubchem', 'drugbank', 'chembl', 'smiles', 'formula', 'morgan', 'mordred', 'pdbid']) 

497 

498# Import ligands json file so we do not have to rerun this process 

499def import_ligands (output_filepath : str) -> dict: 

500 # Read the file 

501 imported_ligands = load_json(output_filepath) 

502 # Format data as the process expects to find it 

503 for imported_ligand in imported_ligands: 

504 for expected_field in LIGAND_DATA_FIELDS: 

505 if expected_field not in imported_ligand: 

506 imported_ligand[expected_field] = None 

507 return imported_ligands 

508 

509# Check if the current input ligand is already among the ligands we already have data for 

510def obtain_ligand_data_from_file ( ligands_data : List[dict], ligand: dict ) -> Optional[dict]: 

511 for ligand_data in ligands_data: 

512 if (ligand.get('pubchem') and ligand['pubchem'] == ligand_data.get('pubchem')) or \ 

513 (ligand.get('drugbank') and ligand['drugbank'] == ligand_data.get('drugbank')) or \ 

514 (ligand.get('chembl') and ligand['chembl'] == ligand_data.get('chembl')): 

515 return ligand_data 

516 return None 

517 

518# Given an input ligand, obtain all necessary data 

519def obtain_ligand_data_from_pubchem (ligand : dict) -> dict: 

520 # Save in a dictionary all ligand data including its name and ids 

521 # The ID can be of the databases: 'drugbank' , 'pubchem' , 'chembl' 

522 # Define the needed variables to check if the ligand has a database ID or it is None 

523 ligand_data = { 

524 'name': None, 

525 'pubchem': None, 

526 'drugbank': None, 

527 'chembl': None, 

528 'smiles': None, 

529 'formula': None, 

530 'pdbid': None, 

531 } 

532 # Set ligand data pubchem id, even if the input id is not from pubhcme (e.g. drugbank, chembl) 

533 if 'pubchem' in ligand: 

534 ligand_data['pubchem'] = str(ligand.get('pubchem')) 

535 elif 'drugbank' in ligand: 

536 ligand_data['drugbank'] = ligand.get('drugbank') 

537 ligand_data['pubchem'] = str(find_drugbank_pubchem(ligand_data['drugbank'])) 

538 elif 'chembl' in ligand: 

539 ligand_data['chembl'] = ligand.get('chembl') 

540 ligand_data['pubchem'] = str(find_chembl_pubchem(ligand_data['chembl'])) 

541 else: 

542 raise InputError('None of the ligand IDs are defined. Please provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.') 

543 

544 # Request ligand data from pubchem 

545 pubchem_data = get_pubchem_data(ligand_data['pubchem']) 

546 if not pubchem_data: 

547 raise RuntimeError('No PubChem data avilable') 

548 

549 # Add pubchem data to ligand data 

550 ligand_data = { **ligand_data, **pubchem_data } 

551 return ligand_data 

552 

553 

554# For each residue, count the number of atom elements 

555def count_atom_elements_per_residue ( structure : 'Structure' ) -> dict: 

556 # Obtain a list of residues of ligand candidates 

557 residue_element_count = {} 

558 # Iterate over the residue(s) 

559 for residue in structure.residues: 

560 # Skip protein residues 

561 if residue.classification == 'protein': 

562 continue 

563 # Obtain the list of elements for each residue 

564 atom_elements = [ atom.element for atom in residue.atoms ] 

565 # Generate a dict with elements (keys) and their counting (values) 

566 atoms_count = {} 

567 for atom in atom_elements: 

568 if atom in atoms_count: 

569 atoms_count[atom] += 1 

570 else: 

571 atoms_count[atom] = 1 

572 # Define the key of the dictionary as the residue name and save it in a list with the all the residues 

573 residue_element_count[residue] = atoms_count 

574 

575 return residue_element_count 

576 

577 

578def nest_brackets(tokens, i = 0): 

579 l = [] 

580 while i < len(tokens): 

581 if tokens[i] == ')': 

582 return i,l 

583 elif tokens[i] == '(': 

584 i,subl = nest_brackets(tokens, i+1) 

585 l.append(subl) 

586 else: 

587 l.append(tokens[i]) 

588 i += 1 

589 return i,l 

590 

591def parse_compound(formula : str) -> List[str]: 

592 tokens = [''.join(t) for t in split_when(formula, lambda a,b: b.isupper() or b in '()' or (b.isdigit() and not a.isdigit()))] 

593 tokens = [(int(t) if t.isdigit() else t) for t in tokens] 

594 i, l = nest_brackets(tokens) 

595 assert(i == len(tokens)) # crash if unmatched ')' 

596 return l 

597 

598# Split a string using a function 

599def split_when(string : str, func : Callable) -> List[str]: 

600 splits = [] 

601 last_split = string[0] 

602 for s in string[1:]: 

603 if func(last_split, s): 

604 splits.append(last_split) 

605 last_split = s 

606 else: 

607 last_split += s 

608 splits.append(last_split) 

609 return splits 

610 

611# Given a chemical formula, get the count of atoms per element 

612def count_atom_elements (molecular_formula : str) -> dict: 

613 # Clean the formula from charges 

614 # These charges include numbers which are not atom counts (e.g. 49867169 -> C18H16NO5PS-2) 

615 parsed_molecular_formula = re.sub('[+-][0-9]*', '', molecular_formula) 

616 l = parse_compound(parsed_molecular_formula) 

617 c = parse_splits(l) 

618 return c 

619 

620# This function associates elements in a list 

621# If a string is followed by a number then they go together 

622# If a string has no number then the number is 1 for this specific string 

623def parse_splits (splits : List[str]) -> dict: 

624 parsed = {} 

625 previous_key = None 

626 for split in splits: 

627 if type(split) == str: 

628 if previous_key: 

629 parsed[previous_key] = 1 

630 previous_key = split 

631 elif type(split) == int: 

632 parsed[previous_key] = split 

633 previous_key = None 

634 else: 

635 raise ValueError('Not supported type ' + type(split)) 

636 if previous_key: 

637 parsed[previous_key] = 1 

638 return parsed 

639 

640 

641def match_ligandsID_to_res (ligand_atom_element_count : dict, residue_atom_element_count : dict) -> bool: 

642 # Remove hydrogens since we only pay attention to heavy atoms 

643 # This is to avoid having mismatched_residues due to different protonation states 

644 if 'H' in ligand_atom_element_count: 

645 del ligand_atom_element_count['H'] 

646 if 'H' in residue_atom_element_count: 

647 del residue_atom_element_count['H'] 

648 #shared_items = {k: ligand_atom_element_count[k] for k in ligand_atom_element_count if k in residue_atom_element_count and ligand_atom_element_count[k] == residue_atom_element_count[k]} 

649 ligand_elements = set(ligand_atom_element_count.keys()) 

650 residue_elements = set(residue_atom_element_count.keys()) 

651 

652 # Check if there are different elements 

653 # If so, we are done 

654 different_keys = ligand_elements.symmetric_difference(residue_elements) 

655 if len(different_keys) > 0: 

656 # print("Elements that are not matching:") 

657 # for atom in different_keys: 

658 # if atom in ligand_atom_element_count: 

659 # print(f"In Pubchem {list(ligand_atom_element_count.keys())[0]} : {atom}: {ligand_atom_element_count[atom]}") 

660 # else: 

661 # print(f"In PDB residue {list(residue_atom_element_count.keys())[0]}: {atom}: {residue_atom_element_count[atom]}") 

662 return False 

663 

664 # Different values 

665 different_values = [] 

666 for element in ligand_elements: 

667 if ligand_atom_element_count[element] != residue_atom_element_count[element]: 

668 different_values.append(element) 

669 

670 # If the count of elements does not match then stop here 

671 if len(different_values) > 0: 

672 # Print the differences found between the elements 

673 # print("Number of atoms that are different:") 

674 # for atom in different_values: 

675 # print(f"In Pubchem {list(ligand_atom_element_count.keys())[0]} {atom}: {ligand_atom_element_count[atom]}, In PDB residue {list(pdb_dict.keys())[0]}: {atom}: {residue_atom_element_count[atom]}\n") 

676 return False 

677 

678 return True 

679 

680def map_ligand_residues (structure : 'Structure', ligand_data : dict) -> dict: 

681 # Load the structure where the atoms will be minresidue.indexed and obtain a residues dict 

682 atom_elements_per_residue = count_atom_elements_per_residue(structure) 

683 # Get the ligand formula 

684 ligand_formula = ligand_data['formula'] 

685 if not ligand_formula: 

686 raise RuntimeError(f'Ligand with PubChem id {ligand_data["pubchem"]} is missing formula') 

687 # From pubchem mine the atoms of the ligands and save it in a dict 

688 ligand_atom_element_count = count_atom_elements(ligand_data['formula']) 

689 matched_residues = [] 

690 # Get ligand pubchem id 

691 pubchem_id = ligand_data['pubchem'] 

692 for residue, residue_atom_element_count in atom_elements_per_residue.items(): 

693 if match_ligandsID_to_res(ligand_atom_element_count, residue_atom_element_count): 

694 matched_residues.append(residue.index) 

695 # Format the output as we expect it 

696 ligand_map = { 'name': pubchem_id, 'residue_indices': matched_residues, 'match': { 'ref': { 'pubchem': pubchem_id } } } 

697 return ligand_map 

698 

699# Given a smiles, get the pubchem id 

700# e.g. CC1=C(C=NC=C1)NC(=O)CC2=CC(=CC(=C2)Cl)OC -> 154876006 

701# e.g. O=C4N3C(C(=O)Nc2cc(nn2c1ccccc1)C)C(SC3CC=CC4NC(=O)C(NC)C)(C)C -> None 

702def smiles_to_pubchem_id (smiles : str) -> Optional[str]: 

703 # Set the request URL 

704 request_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON' 

705 # Set the POST data 

706 data = urlencode({ 'smiles': smiles }).encode() 

707 try: 

708 with urlopen(request_url, data=data) as response: 

709 parsed_response = json.loads(response.read().decode("utf-8")) 

710 # If the smiles is not found in pubchem then we can stop here 

711 except HTTPError as error: 

712 if error.code == 404: 

713 print(f' Smiles {smiles} not found') 

714 return None 

715 else: 

716 raise ValueError('Something went wrong with the PubChem request: ' + request_url) 

717 # Get the PubChem id 

718 compounds = parsed_response.get('PC_Compounds', None) 

719 if not compounds: 

720 raise RuntimeError(f'Something went wrong when mining pubchem data for SMILES {smiles}') 

721 # Make sure there is only 1 compound 

722 # DANI: Esto algún día podría no ser así, ya lo gestionaremos 

723 if len(compounds) != 1: 

724 raise RuntimeError(f'There should be one and only one compound for SMILES {smiles}') 

725 # Keep mining 

726 compound = compounds[0] 

727 first_id = compound.get('id', None) 

728 # If there is no first id then it means there is no direct match with pubchem 

729 if not first_id: 

730 return None 

731 second_id = first_id.get('id', None) 

732 if not second_id: 

733 raise RuntimeError(f'Missing second id when mining pubchem data for SMILES {smiles}') 

734 pubchem_id = second_id.get('cid', None) 

735 if not pubchem_id: 

736 raise RuntimeError(f'Missing pubchem id when mining pubchem data for SMILES {smiles}') 

737 return str(pubchem_id) 

738 

739# Given a PDB ligand code, get its pubchem 

740def pdb_ligand_to_pubchem (pdb_ligand_id : str) -> Optional[str]: 

741 # Set the request query 

742 query = '''query molecule($id:String!){ 

743 chem_comp(comp_id:$id) { rcsb_chem_comp_related{ resource_name resource_accession_code } } 

744 }''' 

745 # Request PDB data 

746 parsed_response = request_pdb_data(pdb_ligand_id, query) 

747 related_resources = parsed_response['rcsb_chem_comp_related'] 

748 # It may happend that a ligand code has no related resources at all 

749 # e.g. ZN 

750 if not related_resources: return None 

751 pubchem_resource = next((resource for resource in related_resources if resource['resource_name'] == 'PubChem'), None) 

752 if not pubchem_resource: return None 

753 return pubchem_resource['resource_accession_code'] 

754 

755# Given a PDB ligand code, get its pubchem 

756# Use a web crawler to avoid having to use the PDB API 

757def pdb_ligand_to_pubchem_RAW (pdb_ligand_id : str) -> Optional[str]: 

758 # Set the request URL 

759 request_url = f'https://www.rcsb.org/ligand/{pdb_ligand_id}' 

760 # Run the query 

761 parsed_response = None 

762 try: 

763 with urlopen(request_url) as response: 

764 parsed_response = response.read().decode("utf-8") 

765 # If the accession is not found in the PDB then we can stop here 

766 except HTTPError as error: 

767 if error.code == 404: 

768 print(f' PDB ligand {pdb_ligand_id} not found') 

769 return None 

770 else: 

771 print(error.msg) 

772 raise ValueError('Something went wrong with the PDB ligand request: ' + request_url) 

773 # Mine the pubchem id out of the whole response 

774 pattern = re.compile('pubchem.ncbi.nlm.nih.gov\/compound\/([0-9]*)\"') 

775 match = re.search(pattern, parsed_response) 

776 # If there is no pubchem id then return none 

777 # This is normal for some ligands such as counter ions (e.g. LI) 

778 if not match: 

779 return None 

780 pubchem_id = match[1] 

781 return pubchem_id 

782 

783# Given a PDB ligand code, get its pubchem 

784# Ask to pubchem if the ligand exists and hope there is only one result 

785# DANI: No se ha provado a fondo 

786def pdb_ligand_to_pubchem_RAW_RAW (pdb_ligand_id : str) -> Optional[str]: 

787 # Set the request URL 

788 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{pdb_ligand_id}/json' 

789 # Run the query 

790 parsed_response = None 

791 try: 

792 with urlopen(request_url) as response: 

793 parsed_response = json.loads(response.read().decode("utf-8")) 

794 # If the accession is not found in the PDB then we can stop here 

795 except HTTPError as error: 

796 # This may happen for weird things such as UNX (unknown atom or ion) 

797 if error.code == 404: 

798 return None 

799 print(error.msg) 

800 raise RuntimeError('Something went wrong with the PDB ligand request in PubChem: ' + request_url) 

801 # Mine the pubchem id 

802 compounds = parsed_response['PC_Compounds'] 

803 if len(compounds) != 1: 

804 # There could be more than one result 

805 # For example, the case HEM: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/HEM/json  

806 # In this case we picked the first result 

807 compound = compounds[0] 

808 #raise RuntimeError('We are not having 1 and only 1 result from PubChem: ' + request_url) 

809 compound = compounds[0] 

810 id1 = compound['id'] 

811 id2 = id1['id'] 

812 pubchem_id = str(id2['cid']) 

813 return pubchem_id 

814 

815# Given a PDB id, get all its ligand codes 

816# e.g. 2I3I -> 618, BTB, ZN, EDO, LI 

817def get_pdb_ligand_codes (pdb_id : str) -> List[str]: 

818 # Set the request query 

819 query = '''query structure($id: String!) { 

820 entry(entry_id: $id) { 

821 nonpolymer_entities { nonpolymer_comp { chem_comp { id } } } 

822 } 

823 }''' 

824 # Request PDB data 

825 parsed_response = request_pdb_data(pdb_id, query) 

826 # Mine data for nonpolymer entities 

827 nonpolymers = parsed_response['nonpolymer_entities'] 

828 # If there are no nonpolymer entities, another type of entitie could be used 

829 # AGUS: esto es un caso muy concreto que me encontré con el PDB 1N3W 

830 # AGUS: el ligando en este caso es una 'Biologically Interesting Molecules' y se muestran como 'PRD_' 

831 prd_code = None 

832 if nonpolymers == None: 

833 # Get the prd ligand code 

834 prd_code = get_prd_ligand_code(pdb_id) 

835 if prd_code != None: 

836 return [prd_code] 

837 

838 if nonpolymers == None and prd_code == None: return [] 

839 # Iterate nonpolymer entities to mine each PDB code 

840 ligand_codes = [] 

841 for nonpolymer in nonpolymers: 

842 ligand_code = nonpolymer['nonpolymer_comp']['chem_comp']['id'] 

843 ligand_codes.append(ligand_code) 

844 print(f' Ligand codes for PDB id {pdb_id}: ' + ', '.join(ligand_codes)) 

845 return ligand_codes 

846 

847# Given a PDB id, get its PRD ligand code 

848def get_prd_ligand_code (pdb_id : str) -> Optional[str]: 

849 query = '''query structure($id: String!) { 

850 entry(entry_id: $id) { 

851 pdbx_molecule_features { 

852 prd_id 

853 } 

854 } 

855 }''' 

856 parsed_response = request_pdb_data(pdb_id, query) 

857 pdbx_id = parsed_response.get('pdbx_molecule_features', None) 

858 if pdbx_id is None: 

859 print(f'WARNING: Cannot find PRD ligand code for PDB id {pdb_id}') 

860 return None 

861 prd_id = pdbx_id[0].get('prd_id', None) # AGUS: podría haber casos donde haya más de uno?  

862 if prd_id is None: 

863 print(f'WARNING: Cannot find PRD ligand code for PDB id {pdb_id}') 

864 return None 

865 # If the PRD id is not empty then return it 

866 return prd_id 

867 

868# Given a pdb id, get its pubchem ids 

869# DANI: De momento no usamos las SMILES que alguna vez me han dado problemas (e.g. 2I3I) 

870# e.g. 4YDF ->  

871def pdb_to_pubchem (pdb_id : str) -> List[str]: 

872 print(f'Searching PubChem ids for PDB {pdb_id}') 

873 pubchem_ids = [] 

874 # Iterate over pdb ligand codes 

875 ligand_codes = get_pdb_ligand_codes(pdb_id) 

876 for ligand_code in ligand_codes: 

877 # Ask the PDB API for the ligand 

878 pubchem_id = pdb_ligand_to_pubchem(ligand_code) 

879 # If this did not work then try mining the PDB client with a web crawler 

880 if not pubchem_id: 

881 pubchem_id = pdb_ligand_to_pubchem_RAW(ligand_code) 

882 # If this did not work then try it from PubChem 

883 if not pubchem_id: 

884 pubchem_id = pdb_ligand_to_pubchem_RAW_RAW(ligand_code) 

885 # Otherwise we surrender 

886 if not pubchem_id: 

887 print(f' {ligand_code} -> No PubChem id') 

888 continue 

889 print(f' {ligand_code} -> {pubchem_id}') 

890 pubchem_ids.append(pubchem_id) 

891 

892 return pubchem_ids