Coverage for mddb_workflow/tools/generate_ligands

1import re

2import os

3import json

4import requests

5from rdkit import Chem

6from functools import lru_cache

7from mordred import Calculator, descriptors

8from mddb_workflow.utils.constants import LIGANDS_MATCH_FLAG, PDB_TO_PUBCHEM, NOT_MATCHED_LIGANDS

9from mddb_workflow.utils.auxiliar import InputError, load_json, save_json, request_pdb_data

10from mddb_workflow.utils.type_hints import *

11from mddb_workflow.utils.structures import Structure

12from mddb_workflow.tools.get_inchi_keys import obtain_inchikey_from_pdb

13from urllib.request import Request, urlopen

14from urllib.parse import urlencode

15from urllib.error import HTTPError, URLError

17def get_drugbank_smiles (id_drugbank : str) -> Optional[str]:

18 # Request Drugbank

19 request_url = Request(

20 url= f'https://go.drugbank.com/structures/small_molecule_drugs/{id_drugbank}.smiles',

21 headers={'User-Agent': 'Mozilla/5.0'}

22 )

23 try:

24 with urlopen(request_url) as response:

25 smiles = response.read()

26 # If the accession is not found in the database then we stop here

27 except HTTPError as error:

28 # If the drugbank ID is not yet in the Drugbank references then return None

29 if error.code == 404:

30 return None

31 else:

32 print('Error when requesting ' + request_url)

33 raise ValueError('Something went wrong with the Drugbank request (error ' + str(error.code) + ')')

34 # This error may occur if there is no internet connection

35 except URLError as error:

36 print('Error when requesting ' + request_url)

37 raise ValueError('Something went wrong with the MDposit request')

39 return smiles

41# Given a ChemBL ID, use the uniprot API to request its data and then mine what is needed for the database

42def get_chembl_smiles (id_chembl : str) -> Optional[str]:

43 # Request ChemBL

44 parsed_response = None

45 request_url = Request(

46 url= f'https://www.ebi.ac.uk/chembl/interface_api/es_proxy/es_data/get_es_document/chembl_molecule/{id_chembl}',

47 headers={'User-Agent': 'Mozilla/5.0'}

48 )

49 try:

50 with urlopen(request_url) as response:

51 parsed_response = json.loads(response.read().decode("utf-8"))

52 smiles = parsed_response['_source']['molecule_structures']['canonical_smiles']

53 pubchem_id = parsed_response['_source']['_metadata']['unichem'][8]['id']

54 # If the accession is not found in ChemBL then the id is not valid

55 except HTTPError as error:

56 if error.code == 404:

57 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)

58 return None

59 print('Error when requesting ' + request_url)

60 raise ValueError('Something went wrong with the ChemBL request (error ' + str(error.code) + ')')

61 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607)

62 if parsed_response == None:

63 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)

64 return None

65 return smiles

67# Given a PubChem ID, use the uniprot API to request its data and then mine what is needed for the database

68def get_pubchem_data (id_pubchem : str) -> Optional[dict]:

69 # Request PubChem

70 parsed_response = None

71 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{id_pubchem}/JSON/'

72 try:

73 with urlopen(request_url) as response:

74 #parsed_response = json.loads(response.read().decode("windows-1252"))

75 parsed_response = json.loads(response.read().decode("utf-8", errors='ignore'))

76 # If the accession is not found in PubChem then the id is not valid

77 # This may happen with pubchem ids of non-discrete compounds (e.g. 483927498)

78 except HTTPError as error:

79 if error.code == 404:

80 print('WARNING: Cannot find PubChem entry for accession ', id_pubchem)

81 return None

82 print('Error when requesting ', request_url)

83 raise ValueError('Something went wrong with the PubChem request (error ', str(error.code), ')')

84 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607)

85 if parsed_response == None:

86 print('WARNING: Cannot find PubChem entry for accession ' + id_pubchem)

87 return None

88 # Mine target data: SMILES

89 record = parsed_response.get('Record', None)

90 if record == None:

91 raise RuntimeError('Wrong Pubchem data structure: no record: ' + request_url)

92 sections = record.get('Section', None)

93 if sections == None:

94 raise RuntimeError('Wrong Pubchem data structure: no sections: ' + request_url)

95 names_and_ids_section = next((section for section in sections if section.get('TOCHeading', None) == 'Names and Identifiers'), None)

96 if names_and_ids_section == None:

97 raise RuntimeError('Wrong Pubchem data structure: no name and ids section: ' + request_url)

98 names_and_ids_subsections = names_and_ids_section.get('Section', None)

99 if names_and_ids_subsections == None:

100 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)

101

102 # Mine the name

103 synonims = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Synonyms'), None)

104 if synonims == None:

105 descriptors = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None)

106 descriptors_subsections = descriptors.get('Section', None)

107 if descriptors_subsections == None:

108 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)

109 depositor_supplied_descriptors = next((s for s in descriptors_subsections if s.get('TOCHeading', None) == 'IUPAC Name'), None)

110 name_substance = depositor_supplied_descriptors.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

111 else:

112 synonims_subsections = synonims.get('Section', None)

113 if synonims_subsections == None:

114 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)

115 depositor_supplied_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Depositor-Supplied Synonyms'), None)

116 if depositor_supplied_synonims == None:

117 removed_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Removed Synonyms'), None)

118 name_substance = removed_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

119 else:

120 name_substance = depositor_supplied_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

121

122 # Mine the SMILES

123 computed_descriptors_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None)

124 if computed_descriptors_subsection == None:

125 raise RuntimeError('Wrong Pubchem data structure: no computeed descriptors: ' + request_url)

126 canonical_smiles_section = computed_descriptors_subsection.get('Section', None)

127 if canonical_smiles_section == None:

128 raise RuntimeError('Wrong Pubchem data structure: no canonical SMILES section: ' + request_url)

129 canonical_smiles = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'Canonical SMILES'), None)

130 if canonical_smiles == None:

131 # In some cases there is no canonical SMILES but a non-canonical one could exists

132 non_canonical_smiles_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'SMILES'), None)

133 if non_canonical_smiles_section == None:

134 raise RuntimeError('Wrong Pubchem data structure: no canonical SMILES: ' + request_url)

135

136 if canonical_smiles:

137 smiles = canonical_smiles.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

138 if non_canonical_smiles_section:

139 smiles = non_canonical_smiles_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

140

141 if smiles == None:

142 raise RuntimeError('Wrong Pubchem data structure: no SMILES: ' + request_url)

143

144 # Mine target data: MOLECULAR FORMULA

145 molecular_formula_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Molecular Formula'), None)

146 if molecular_formula_subsection == None:

147 raise RuntimeError('Wrong Pubchem data structure: no molecular formula section: ' + request_url)

148 molecular_formula = molecular_formula_subsection.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

149 if molecular_formula == None:

150 raise RuntimeError('Wrong Pubchem data structure: no molecular formula: ' + request_url)

151

152 # Mine target data: PDB ID

153 pdb_id = None

154 pdb_id_subsection = next((s for s in sections if s.get('TOCHeading', None) == 'Interactions and Pathways'), None)

155 # If this section is missing then it means this PubChem compound has no PDB id

156 if pdb_id_subsection:

157 pdb_id_subsections = pdb_id_subsection.get('Section', None)

158 if pdb_id_subsections == None:

159 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)

160 bond_structures = next((s for s in pdb_id_subsections if s.get('TOCHeading', None) == 'Protein Bound 3D Structures'), None)

161 if bond_structures:

162 bond_structures_section = bond_structures.get('Section', None)

163 # If this section is missing then it means this PubChem compound has no PDB id

164 if bond_structures_section:

165 ligands_structure = next((s for s in bond_structures_section if s.get('TOCHeading', None) == 'Ligands from Protein Bound 3D Structures'), None)

166 if ligands_structure == None:

167 raise RuntimeError('Wrong Pubchem data structure: no Ligands from Protein Bound 3D Structures section: ' + request_url)

168 ligands_structure_subsections = ligands_structure.get('Section', None)

169 if ligands_structure_subsections == None:

170 raise RuntimeError('Wrong Pubchem data structure: no Ligands from Protein Bound 3D Structures subsections: ' + request_url)

171 ligands_pdb = next((s for s in ligands_structure_subsections if s.get('TOCHeading', None) == 'PDBe Ligand Code'), None)

172 if ligands_pdb == None:

173 raise RuntimeError('Wrong Pubchem data structure: no PDBe Ligand Code: ' + request_url)

174 pdb_id = ligands_pdb.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

175

176 # Mine de INCHI and INCHIKEY

177 inchi_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChI'), None)

178 if inchi_section == None:

179 raise RuntimeError('Wrong Pubchem data structure: no InChI: ' + request_url)

180 if inchi_section:

181 inchi = inchi_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

182

183 inchikey_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChIKey'), None)

184 if inchikey_section == None:

185 raise RuntimeError('Wrong Pubchem data structure: no InChIKey: ' + request_url)

186 if inchikey_section:

187 inchikey = inchikey_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

188 # Prepare the pubchem data to be returned

189 return { 'name': name_substance, 'smiles': smiles, 'formula': molecular_formula , 'pdbid': pdb_id, 'inchi': inchi, 'inchikey': inchikey }

190

191def find_drugbank_pubchem (drugbank_id):

192 # Request Drugbank

193 request_url = Request(

194 url=f'https://go.drugbank.com/drugs/{drugbank_id}',

195 headers={

196 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',

197 'Accept-Language': 'en-US,en;q=0.9',

198 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'

199 }

200)

201 pubchem_id = None

202 try:

203 with urlopen(request_url) as response:

204 content = response.read().decode("utf-8")

205 pattern = re.compile("http\:\/\/pubchem.ncbi.nlm.nih.gov\/summary\/summary.cgi\?cid\=([0-9]*)")

206 match = re.search(pattern, str(content))

207 if not match:

208 raise ValueError("No se encontró información sobre el Pubchem compound en esta página.")

209 pubchem_id = match[1]

210 if not pubchem_id:

211 raise ValueError("No se encontró información sobre el Pubchem compound en esta página.")

212 # If the accession is not found in the database then we stop here

213 except HTTPError as error:

214 # If the drugbank ID is not yet in the Drugbank references then return None

215 raise ValueError(f'Wrong request. Code: {error.code}')

216 # This error may occur if there is no internet connection

217 except URLError as error:

218 print('Error when requesting ' + request_url)

219 raise ValueError('Something went wrong with the DrugBank request')

220

221 return pubchem_id

222

223def find_chembl_pubchem (id_chembl):

224 # Request ChemBL

225 parsed_response = None

226 request_url = Request(

227 url= f'https://www.ebi.ac.uk/chembl/interface_api/es_proxy/es_data/get_es_document/chembl_molecule/{id_chembl}',

228 headers={'User-Agent': 'Mozilla/5.0'}

229 )

230 try:

231 with urlopen(request_url) as response:

232 parsed_response = json.loads(response.read().decode("utf-8"))

233 unichem = parsed_response['_source']['_metadata']['unichem']

234 if unichem == None:

235 raise RuntimeError('Wrong Pubchem data structure: no unichem section')

236 pubchem_section = next((section for section in unichem if section.get('src_url', None) == "http://pubchem.ncbi.nlm.nih.gov"), None)

237 if pubchem_section == None:

238 raise RuntimeError('Wrong Pubchem data structure: no pubchem section')

239 pubchem_id = pubchem_section.get('id', None)

240 if pubchem_id == None:

241 raise RuntimeError('Wrong Pubchem data structure: no pubchem ID')

242 # If the accession is not found in ChemBL then the id is not valid

243 except HTTPError as error:

244 if error.code == 404:

245 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)

246 return None

247 print('Error when requesting ' + request_url)

248 raise ValueError('Something went wrong with the ChemBL request (error ' + str(error.code) + ')')

249 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607)

250 if parsed_response == None:

251 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)

252 return None

253

254 return pubchem_id

255

256# Calculate the Morgan fingerprint and the Mordred descriptors from a ligand SMILES

257def obtain_mordred_morgan_descriptors (smiles : str) -> tuple:

258 mol = Chem.MolFromSmiles(smiles)

259 if mol is None:

260 print(f'WARNING: Cannot generate a molecule from SMILES {smiles}')

261 return None

262

263 Chem.AllChem.Compute2DCoords(mol)

264 mol_block = Chem.MolToMolBlock(mol)

265 # We can select the different submodules of mordred descriptors, avaible in: 'https://mordred-descriptor.github.io/documentation/master/'

266 calc = Calculator([

267 descriptors.ABCIndex, # Índice de ramificación

268 descriptors.AcidBase.AcidicGroupCount, # Grupos ácidos

269 descriptors.AcidBase.BasicGroupCount, # Grupos básicos

270 descriptors.RingCount, # Conteo de anillos

271 descriptors.Constitutional, # Propiedades generales como número de átomos, peso molecular

272 descriptors.TopologicalCharge, # Índices topológicos, Cargas parciales, polaridad

273 descriptors.HydrogenBond, # Donantes y aceptores de enlaces de hidrógeno

274 descriptors.Lipinski, # Reglas de Lipinski (drug-likeness)

275 descriptors.FragmentComplexity, # Identificación de subestructuras frecuentes

276 descriptors.PathCount, # Conteo de caminos moleculares

277 ], ignore_3D=True)

278

279

280 # Calculate Mordred results

281 mordred_results = calc(mol).drop_missing().asdict()

282

283 ######## MORGAN FINGERPRINT ###########

284 morgan_fp_gen = Chem.rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

285 ao = Chem.rdFingerprintGenerator.AdditionalOutput()

286 ao.AllocateAtomCounts()

287 ao.AllocateAtomToBits()

288 ao.AllocateBitInfoMap()

289 fp = morgan_fp_gen.GetFingerprint(mol,additionalOutput=ao)

290 morgan_fp_bit_array = list(fp)

291 morgan_highlight_atoms = {}

292 for bit, atoms in ao.GetBitInfoMap().items():

293 morgan_highlight_atoms[bit] = list(set(atom for atom, radius in atoms))

294

295 return mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block

296

297# Check the cache to see if the ligand has already been matched or not, if not the ligand is skipped

298def check_matched_ligand (ligand: dict, ligand_data: dict, cache: 'Cache') -> bool:

299 # The pubchem id could be in the ligands data or the ligand input

300 pubchem_id_1 = ligand.get('pubchem', None)

301 if ligand_data != None:

302 pubchem_id_2 = ligand_data.get('pubchem', None)

303 else:

304 pubchem_id_2 = None

305 # If we have no pubchem id then we cannot check if the ligand matched

306 if not pubchem_id_1 and not pubchem_id_2:

307 return False

308 # If we have a pubchem id then check if it is already in the cache

309 cache_not_matched = cache.retrieve(NOT_MATCHED_LIGANDS, [])

310 for pubchem in cache_not_matched:

311 if pubchem == pubchem_id_1 or pubchem == pubchem_id_2:

312 print(f" Ligand with PubChem id {pubchem} didn't match before, skipping it")

313 return True

314

315 return False

316

317def generate_ligand_mapping (

318 structure : 'Structure',

319 cache : 'Cache',

320 input_ligands : Optional[list[dict]],

321 pdb_ids : list[str],

322 output_filepath : str,

323 inchikeys: dict,

324 mercy : list[str] = [],

325 ) -> list[dict]:

326 """Generate a map of residues associated to ligands."""

327 # Merge input ligands and pdb ligands

328 ligands = input_ligands or []

329 # Check we have cached pdb 2 pubchem values

330 pdb_to_pubchem_cache = cache.retrieve(PDB_TO_PUBCHEM, {})

331 new_data_to_cache = False

332 # Get input ligands from the pdb ids, if any

333 for pdb_id in pdb_ids:

334 # Check we have cached this specific pdb

335 pubchem_ids_from_pdb = pdb_to_pubchem_cache.get(pdb_id, None)

336 if pubchem_ids_from_pdb != None:

337 print(f' Retriving from cache PubChem ids for PDB id {pdb_id}: ')

338 if len(pubchem_ids_from_pdb) > 0:

339 print(' PubChem ids: ' + ', '.join(pubchem_ids_from_pdb))

340 else:

341 print(' This PDB id has no PubChem ids')

342

343 # If we had no cached pdb 2 pubchem then ask for them

344 if pubchem_ids_from_pdb == None:

345 pubchem_ids_from_pdb = pdb_to_pubchem(pdb_id)

346 # Save the result in the cache object so it is saved to cache later

347 pdb_to_pubchem_cache[pdb_id] = pubchem_ids_from_pdb

348 new_data_to_cache = True

349 for pubchem_id in pubchem_ids_from_pdb:

350 # Ligands in the structure (PDB) and the 'inputs.json' could be the same so it's not necessary to do it twice

351 if not any('pubchem' in ligand and ligand['pubchem'] == pubchem_id for ligand in ligands):

352 ligands.append({ 'pubchem': pubchem_id, 'pdb': True })

353 # Save all pdb to pubchem results in cache, in case there is anything new

354 if new_data_to_cache: cache.update(PDB_TO_PUBCHEM, pdb_to_pubchem_cache)

355

356 # If no input ligands are passed then stop here

357 # if len(ligands) == 0:

358 # return []

359

360 # Save data from all ligands to be saved in a file

361 json_ligands_data = []

362 ligands_data = []

363

364 # Load the ligands file if exists already

365 if os.path.exists(output_filepath):

366 json_ligands_data += import_ligands(output_filepath)

367 # If json ligands exists and it is empty means that ligands analysis has been already done but no ligands were matched

368 # so the file will contain an empty list []

369 if len(json_ligands_data) == 0:

370 print('No ligands have been matched yet.\nIf you want to force a ligand to be matched, please provide the field "residues" in the inputs.json file.')

371 return []

372

373 # Visited formulas

374 visited_formulas = []

375 # Save the maps of every ligand

376 ligand_maps = []

377 # Get cached ligand data

378 # AGUS: esto lo creamos por alguna razón para que funcione sin internet (en el cluster) pero realmente

379 # AGUS: llena el cache con datos que no son necesarios porque toda esa info está en el ligand_references.json

380 #ligand_data_cache = cache.retrieve(LIGANDS_DATA, {})

381 # Iterate input ligands

382 for ligand in ligands:

383 # Set the pubchem id which may be assigned in different steps

384 pubchem_id = None

385 # If input ligand is not a dict but a single int/string then handle it

386 if type(ligand) == int:

387 print(f'A ligand number ID has been identified {ligand}, assuming that is a PubChem ID...')

388 ligand = { 'pubchem': str(ligand) }

389 elif type(ligand) == str:

390 raise InputError(f'A name of ligand has been identified: {ligand}. Anyway, provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.')

391 # Check if we already have this ligand data

392 ligand_data = obtain_ligand_data_from_file(json_ligands_data, ligand)

393 # Check if the ligand didn't match before

394 if check_matched_ligand(ligand, ligand_data, cache): continue

395 # If we do not have its data try to get from the cache

396 # if not ligand_data:

397 # # If this is a ligand not in ligans.json but in cache it means it comes from PDB, never form user inputs

398 # # For this reason, this ligand will always have a PubChem id

399 # pubchem_id = ligand.get('pubchem', None)

400 # if pubchem_id:

401 # ligand_data = ligand_data_cache.get(pubchem_id, None)

402 # # If we still have no ligand data then request it to PubChem

403 # if not ligand_data:

404 # ligand_data = obtain_ligand_data_from_pubchem(ligand)

405 # # Save data mined from PubChem in the cache

406 # pubchem_id = ligand_data['pubchem']

407 # ligand_data_cache[pubchem_id] = { **ligand_data }

408 # cache.update(LIGANDS_DATA, ligand_data_cache)

409 # Add current ligand data to the general list

410 if not ligand_data:

411 ligand_data = obtain_ligand_data_from_pubchem(ligand)

412 ligands_data.append(ligand_data)

413 # Get pubchem id

414 pubchem_id = ligand_data.get('pubchem', None)

415 # If we already visited a different ligand but with identical formula then we skip this ligand

416 # Note that the mapping will be identical thus overwritting the previous map

417 # However, ligands forced by the user are processed before so we keep them as priority

418 formula = ligand_data.get('formula', None)

419 if formula in visited_formulas:

420 print(f'WARNING: Ligand with PubChem Id {pubchem_id} has a formula which has been already matched')

421 # AGUS: en este punto si el usuario ha definido el mismo ligando con diferente selección quiere decir que está repetido

422 # AGUS: y que deberíamos mantener el ligando para diferentes residuos matcheados

423 if forced_selection:

424 visited_formulas.pop()

425 ligands_data.pop()

426 print(f'WARNING: Ligand with PubChem Id {pubchem_id} has been forced by the user and its repeated')

427 else:

428 ligands_data.pop()

429 continue

430 # Add it to the list of visited formulas

431 visited_formulas.append(formula)

432 # If the user defined a ligand name, it will be respectedand added to the metadata

433 # if there isn't a defined name, it will be mined from PubChem

434 user_forced_ligand_name = ligand.get('name', None)

435 # Map structure residues with the current ligand

436 # If the user forced the residues then use them

437 forced_selection = ligand.get('selection', None)

438 if forced_selection:

439 # Could be a single residue or a list of residues

440 selection_atoms = structure.select(forced_selection)

441 residue_indices = structure.get_selection_residue_indices(selection_atoms)

442 ligand_map = { 'name': pubchem_id, 'residue_indices': residue_indices, 'match': { 'ref': { 'pubchem': pubchem_id } } }

443 if user_forced_ligand_name: ligand_map['forced_name'] = user_forced_ligand_name

444 # If the user did not force the residues then map them

445 else:

446 ligand_map = map_ligand_residues(structure, ligand_data)

447 # Add current ligand map to the general list

448 ligand_maps.append(ligand_map)

449 # If the ligand did not map then discard it

450 did_not_match = len(ligand_map['residue_indices']) == 0

451 if did_not_match:

452 is_forced_by_user = not bool(ligand.get('pdb', False))

453 if is_forced_by_user:

454 # At this point we should have macthed all sequences

455 # If not, kill the process unless mercy was given

456 must_be_killed = LIGANDS_MATCH_FLAG not in mercy

457 if must_be_killed:

458 raise InputError(f'Ligand with PubChem Id {pubchem_id} did not map with any residue')

459 # Otherwise simply remove it from the final ligand references file and the forwarded maps

460 ligands_data.pop()

461 ligand_maps.pop()

462 # Update the cache with the pubchem id of the ligands that didn't match

463 not_matched_pubchems = cache.retrieve(NOT_MATCHED_LIGANDS, [])

464 not_matched_pubchems.append(ligand_map['name'])

465 cache.update(NOT_MATCHED_LIGANDS, not_matched_pubchems)

466 # If the ligand matched then calculate some additional data

467 # Get morgan and mordred descriptors, the SMILES is needed for this part

468 smiles = ligand_data['smiles']

469 mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block = obtain_mordred_morgan_descriptors(smiles)

470 ligand_data['mordred'] = mordred_results

471 ligand_data['morgan_fp_bit_array'] = morgan_fp_bit_array

472 ligand_data['morgan_highlight_atoms'] = morgan_highlight_atoms

473 ligand_data['mol_block'] = mol_block

474

475 # At this point maybe some ligands in the structure may not match because they have not been included by either the author or the user. Alternatively, the authors may have added them

476 # So those "ligands" are residues that are not matched to any ligand in the structure and the analyses will not run

477 # LORE AGUS: en el dataset de MDBind (cineca) hay muchas simulaciones con ligandos modificados (incluso solo un átomo) y no matchean

478 # LORE AGUS: con los que están en el PDB, por lo que no se hace ningún análisis sobre estos (y se debería)

479 # LORE AGUS: ahora se intenta buscar su pubchem ID para extraer la información a partir del inchikey (da menos problemas que el smiles) y se hacen los análisis pertinentes.

480 # AGUS: si no se puede extraer información de estos porque el inchikey no funciona, se muestra la información más básica posible y se genera una referencia vacía (será problemático)

481

482 # Obtain a list of the possible residues in the structure that are not either protein/nucleic/water/lipid or matched ligands

483 for residue in structure.residues:

484 # Skip solvent, lipid or ions

485 if residue.classification in ['solvent', 'fatty', 'steroid', 'ion']: continue

486 # Make sure the residue is not bonded to other things

487 # If we are missing bonds then we can not rely in this rule

488 if residue.is_missing_any_bonds(): continue

489 if len(residue.get_bonded_residue_indices()) > 0: continue

490 # Check if the residue is already matched to a ligand

491 matched = False

492 for ligand_map in ligand_maps:

493 if residue.index in ligand_map['residue_indices']:

494 matched = True

495 break

496 # If the residue is not matched then add it to the ligands data

497 if matched: continue

498 # Create a new ligand data with the residue name and its atoms count

499 ligand_data = { field: None for field in LIGAND_DATA_FIELDS }

500 ligand_data['name'] = residue.name

501 # Add the residue as a ligand map

502 ligand_map = {

503 'name': residue.name,

504 'residue_indices': [residue.index],

505 'match': { 'ref': { 'pubchem': None } }

506 }

507

508 inchikey = inchikeys['residx_2_key'].get(residue.index, None)

509 # If the InChIKey is not None then try to obtain the PubChem CID

510 if inchikey:

511 ligand_data['inchikey'] = inchikey

512 if cid := search_cid_by_inchikey(inchikey):

513 ligand_data = obtain_ligand_data_from_pubchem( {'pubchem': cid} )

514 smiles = ligand_data['smiles']

515 mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block = obtain_mordred_morgan_descriptors(smiles)

516 ligand_data['mordred'] = mordred_results

517 ligand_data['morgan_fp_bit_array'] = morgan_fp_bit_array

518 ligand_data['morgan_highlight_atoms'] = morgan_highlight_atoms

519 ligand_data['mol_block'] = mol_block

520 ligand_data['pubchem'] = cid

521 ligand_map['match']['ref']['pubchem'] = str(cid)

522 # Add the ligand data to the list of ligands data

523 ligands_data.append(ligand_data)

524 # Add the ligand map to the list of ligand maps

525 ligand_maps.append(ligand_map)

526

527 # Export ligands to a file

528 save_json(ligands_data, output_filepath)

529

530 # Log matched ligands

531 if not ligand_maps:

532 print('No ligands were matched')

533 else:

534 print('Matched ligands:')

535 for ligand_map in ligand_maps:

536 residue_indices = ligand_map["residue_indices"]

537 residue_count = len(residue_indices)

538 plural_suffix = '' if residue_count == 1 else 's'

539 print(f' - {ligand_map["name"]}: {residue_count} residue{plural_suffix}')

540

541 return ligand_maps

542

543# Set the expected ligand data fields

544LIGAND_DATA_FIELDS = set(['name', 'pubchem', 'drugbank', 'chembl', 'smiles', 'formula', 'morgan', 'mordred', 'pdbid'])

545

546# Import ligands json file so we do not have to rerun this process

547def import_ligands (output_filepath : str) -> dict:

548 # Read the file

549 imported_ligands = load_json(output_filepath)

550 # Format data as the process expects to find it

551 for imported_ligand in imported_ligands:

552 for expected_field in LIGAND_DATA_FIELDS:

553 if expected_field not in imported_ligand:

554 imported_ligand[expected_field] = None

555 return imported_ligands

556

557# Check if the current input ligand is already among the ligands we already have data for

558def obtain_ligand_data_from_file ( ligands_data : list[dict], ligand: dict ) -> Optional[dict]:

559 for ligand_data in ligands_data:

560 if (ligand.get('pubchem') and ligand['pubchem'] == ligand_data.get('pubchem')) or \

561 (ligand.get('drugbank') and ligand['drugbank'] == ligand_data.get('drugbank')) or \

562 (ligand.get('chembl') and ligand['chembl'] == ligand_data.get('chembl')):

563 return ligand_data

564 return None

565

566# Given an input ligand, obtain all necessary data

567def obtain_ligand_data_from_pubchem (ligand : dict) -> dict:

568 # Save in a dictionary all ligand data including its name and ids

569 # The ID can be of the databases: 'drugbank' , 'pubchem' , 'chembl'

570 # Define the needed variables to check if the ligand has a database ID or it is None

571 ligand_data = {

572 'name': None,

573 'pubchem': None,

574 'drugbank': None,

575 'chembl': None,

576 'smiles': None,

577 'formula': None,

578 'pdbid': None,

579 }

580 # Set ligand data pubchem id, even if the input id is not from pubhcme (e.g. drugbank, chembl)

581 if 'pubchem' in ligand:

582 ligand_data['pubchem'] = str(ligand.get('pubchem'))

583 elif 'drugbank' in ligand:

584 ligand_data['drugbank'] = ligand.get('drugbank')

585 ligand_data['pubchem'] = str(find_drugbank_pubchem(ligand_data['drugbank']))

586 elif 'chembl' in ligand:

587 ligand_data['chembl'] = ligand.get('chembl')

588 ligand_data['pubchem'] = str(find_chembl_pubchem(ligand_data['chembl']))

589 else:

590 raise InputError('None of the ligand IDs are defined. Please provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.')

591

592 # Request ligand data from pubchem

593 pubchem_data = get_pubchem_data(ligand_data['pubchem'])

594 if not pubchem_data:

595 raise RuntimeError('No PubChem data avilable')

596

597 # Add pubchem data to ligand data

598 ligand_data = { **ligand_data, **pubchem_data }

599 return ligand_data

600

601# RUBEN: mover a Structure.count_atom_elements_per_residue

602def count_atom_elements_per_residue ( structure : 'Structure' ) -> dict:

603 """For each residue, count the number of atom elements."""

604 # Obtain a list of residues of ligand candidates

605 residue_element_count = {}

606 # Iterate over the residue(s)

607 for residue in structure.residues:

608 # Obtain the list of elements for each residue

609 atom_elements = [ atom.element for atom in residue.atoms ]

610 # Generate a dict with elements (keys) and their counting (values)

611 atoms_count = {}

612 for atom in atom_elements:

613 if atom in atoms_count:

614 atoms_count[atom] += 1

615 else:

616 atoms_count[atom] = 1

617 # Define the key of the dictionary as the residue name and save it in a list with the all the residues

618 residue_element_count[residue] = atoms_count

619

620 return residue_element_count

621

622def nest_brackets(tokens, i = 0):

623 l = []

624 while i < len(tokens):

625 if tokens[i] == ')':

626 return i,l

627 elif tokens[i] == '(':

628 i,subl = nest_brackets(tokens, i+1)

629 l.append(subl)

630 else:

631 l.append(tokens[i])

632 i += 1

633 return i,l

634

635def parse_compound(formula : str) -> list[str]:

636 tokens = [''.join(t) for t in split_when(formula, lambda a,b: b.isupper() or b in '()' or (b.isdigit() and not a.isdigit()))]

637 tokens = [(int(t) if t.isdigit() else t) for t in tokens]

638 i, l = nest_brackets(tokens)

639 assert(i == len(tokens)) # crash if unmatched ')'

640 return l

641

642# Split a string using a function

643def split_when(string : str, func : Callable) -> list[str]:

644 splits = []

645 last_split = string[0]

646 for s in string[1:]:

647 if func(last_split, s):

648 splits.append(last_split)

649 last_split = s

650 else:

651 last_split += s

652 splits.append(last_split)

653 return splits

654

655# Given a chemical formula, get the count of atoms per element

656def count_atom_elements (molecular_formula : str) -> dict:

657 # Clean the formula from charges

658 # These charges include numbers which are not atom counts (e.g. 49867169 -> C18H16NO5PS-2)

659 parsed_molecular_formula = re.sub('[+-][0-9]*', '', molecular_formula)

660 l = parse_compound(parsed_molecular_formula)

661 c = parse_splits(l)

662 return c

663

664# This function associates elements in a list

665# If a string is followed by a number then they go together

666# If a string has no number then the number is 1 for this specific string

667def parse_splits (splits : list[str]) -> dict:

668 parsed = {}

669 previous_key = None

670 for split in splits:

671 if type(split) == str:

672 if previous_key:

673 parsed[previous_key] = 1

674 previous_key = split

675 elif type(split) == int:

676 parsed[previous_key] = split

677 previous_key = None

678 else:

679 raise ValueError('Not supported type ' + type(split))

680 if previous_key:

681 parsed[previous_key] = 1

682 return parsed

683

684def match_ligandsID_to_res (ligand_atom_element_count : dict, residue_atom_element_count : dict) -> bool:

685 # Remove hydrogens since we only pay attention to heavy atoms

686 # This is to avoid having mismatched_residues due to different protonation states

687 if 'H' in ligand_atom_element_count:

688 del ligand_atom_element_count['H']

689 if 'H' in residue_atom_element_count:

690 del residue_atom_element_count['H']

691 #shared_items = {k: ligand_atom_element_count[k] for k in ligand_atom_element_count if k in residue_atom_element_count and ligand_atom_element_count[k] == residue_atom_element_count[k]}

692 ligand_elements = set(ligand_atom_element_count.keys())

693 residue_elements = set(residue_atom_element_count.keys())

694

695 # Check if there are different elements

696 # If so, we are done

697 different_keys = ligand_elements.symmetric_difference(residue_elements)

698 if len(different_keys) > 0:

699 # print("Elements that are not matching:")

700 # for atom in different_keys:

701 # if atom in ligand_atom_element_count:

702 # print(f"In Pubchem {list(ligand_atom_element_count.keys())[0]} : {atom}: {ligand_atom_element_count[atom]}")

703 # else:

704 # print(f"In PDB residue {list(residue_atom_element_count.keys())[0]}: {atom}: {residue_atom_element_count[atom]}")

705 return False

706

707 # Different values

708 different_values = []

709 for element in ligand_elements:

710 if ligand_atom_element_count[element] != residue_atom_element_count[element]:

711 different_values.append(element)

712

713 # If the count of elements does not match then stop here

714 if len(different_values) > 0:

715 # Print the differences found between the elements

716 # print("Number of atoms that are different:")

717 # for atom in different_values:

718 # print(f"In Pubchem {list(ligand_atom_element_count.keys())[0]} {atom}: {ligand_atom_element_count[atom]}, In PDB residue {list(pdb_dict.keys())[0]}: {atom}: {residue_atom_element_count[atom]}\n")

719 return False

720

721 return True

722

723def map_ligand_residues (structure : 'Structure', ligand_data : dict) -> dict:

724 # Load the structure where the atoms will be minresidue.indexed and obtain a residues dict

725 atom_elements_per_residue = count_atom_elements_per_residue(structure)

726 # Get the ligand formula

727 ligand_formula = ligand_data['formula']

728 if not ligand_formula:

729 raise RuntimeError(f'Ligand with PubChem id {ligand_data["pubchem"]} is missing formula')

730 # From pubchem mine the atoms of the ligands and save it in a dict

731 ligand_atom_element_count = count_atom_elements(ligand_data['formula'])

732 matched_residues = []

733 # Get ligand pubchem id

734 pubchem_id = ligand_data['pubchem']

735 for residue, residue_atom_element_count in atom_elements_per_residue.items():

736 if match_ligandsID_to_res(ligand_atom_element_count, residue_atom_element_count):

737 matched_residues.append(residue.index)

738 # Format the output as we expect it

739 ligand_map = { 'name': pubchem_id, 'residue_indices': matched_residues, 'match': { 'ref': { 'pubchem': pubchem_id } } }

740 return ligand_map

741

742# Given a smiles, get the pubchem id

743# e.g. CC1=C(C=NC=C1)NC(=O)CC2=CC(=CC(=C2)Cl)OC -> 154876006

744# e.g. O=C4N3C(C(=O)Nc2cc(nn2c1ccccc1)C)C(SC3CC=CC4NC(=O)C(NC)C)(C)C -> None

745def smiles_to_pubchem_id (smiles : str) -> Optional[str]:

746 # Set the request URL

747 request_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON'

748 # Set the POST data

749 data = urlencode({ 'smiles': smiles }).encode()

750 try:

751 with urlopen(request_url, data=data) as response:

752 parsed_response = json.loads(response.read().decode("utf-8"))

753 # If the smiles is not found in pubchem then we can stop here

754 except HTTPError as error:

755 if error.code == 404:

756 print(f' Smiles {smiles} not found')

757 return None

758 else:

759 raise ValueError('Something went wrong with the PubChem request: ' + request_url)

760 # Get the PubChem id

761 compounds = parsed_response.get('PC_Compounds', None)

762 if not compounds:

763 raise RuntimeError(f'Something went wrong when mining pubchem data for SMILES {smiles}')

764 # Make sure there is only 1 compound

765 # DANI: Esto algún día podría no ser así, ya lo gestionaremos

766 if len(compounds) != 1:

767 raise RuntimeError(f'There should be one and only one compound for SMILES {smiles}')

768 # Keep mining

769 compound = compounds[0]

770 first_id = compound.get('id', None)

771 # If there is no first id then it means there is no direct match with pubchem

772 if not first_id:

773 return None

774 second_id = first_id.get('id', None)

775 if not second_id:

776 raise RuntimeError(f'Missing second id when mining pubchem data for SMILES {smiles}')

777 pubchem_id = second_id.get('cid', None)

778 if not pubchem_id:

779 raise RuntimeError(f'Missing pubchem id when mining pubchem data for SMILES {smiles}')

780 return str(pubchem_id)

781

782# Given a PDB ligand code, get its pubchem

783def pdb_ligand_to_pubchem (pdb_ligand_id : str) -> Optional[str]:

784 # Set the request query

785 query = '''query molecule($id:String!){

786 chem_comp(comp_id:$id) { rcsb_chem_comp_related{ resource_name resource_accession_code } }

787 }'''

788 # Request PDB data

789 parsed_response = request_pdb_data(pdb_ligand_id, query)

790 related_resources = parsed_response['rcsb_chem_comp_related']

791 # It may happend that a ligand code has no related resources at all

792 # e.g. ZN

793 if not related_resources: return None

794 pubchem_resource = next((resource for resource in related_resources if resource['resource_name'] == 'PubChem'), None)

795 if not pubchem_resource: return None

796 return pubchem_resource['resource_accession_code']

797

798# Given a PDB ligand code, get its pubchem

799# Use a web crawler to avoid having to use the PDB API

800def pdb_ligand_to_pubchem_RAW (pdb_ligand_id : str) -> Optional[str]:

801 # Set the request URL

802 request_url = f'https://www.rcsb.org/ligand/{pdb_ligand_id}'

803 # Run the query

804 parsed_response = None

805 try:

806 with urlopen(request_url) as response:

807 parsed_response = response.read().decode("utf-8")

808 # If the accession is not found in the PDB then we can stop here

809 except HTTPError as error:

810 if error.code == 404:

811 print(f' PDB ligand {pdb_ligand_id} not found')

812 return None

813 else:

814 print(error.msg)

815 raise ValueError('Something went wrong with the PDB ligand request: ' + request_url)

816 # Mine the pubchem id out of the whole response

817 pattern = re.compile('pubchem.ncbi.nlm.nih.gov\/compound\/([0-9]*)\"')

818 match = re.search(pattern, parsed_response)

819 # If there is no pubchem id then return none

820 # This is normal for some ligands such as counter ions (e.g. LI)

821 if not match:

822 return None

823 pubchem_id = match[1]

824 return pubchem_id

825

826# Given a PDB ligand code, get its pubchem

827# Ask to pubchem if the ligand exists and hope there is only one result

828# DANI: No se ha provado a fondo

829def pdb_ligand_to_pubchem_RAW_RAW (pdb_ligand_id : str) -> Optional[str]:

830 # Set the request URL

831 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{pdb_ligand_id}/json'

832 # Run the query

833 parsed_response = None

834 try:

835 with urlopen(request_url) as response:

836 parsed_response = json.loads(response.read().decode("utf-8"))

837 # If the accession is not found in the PDB then we can stop here

838 except HTTPError as error:

839 # This may happen for weird things such as UNX (unknown atom or ion)

840 if error.code == 404:

841 return None

842 print(error.msg)

843 raise RuntimeError('Something went wrong with the PDB ligand request in PubChem: ' + request_url)

844 # Mine the pubchem id

845 compounds = parsed_response['PC_Compounds']

846 if len(compounds) != 1:

847 # There could be more than one result

848 # For example, the case HEM: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/HEM/json

849 # In this case we picked the first result

850 compound = compounds[0]

851 #raise RuntimeError('We are not having 1 and only 1 result from PubChem: ' + request_url)

852 compound = compounds[0]

853 id1 = compound['id']

854 id2 = id1['id']

855 pubchem_id = str(id2['cid'])

856 return pubchem_id

857

858# Given a PDB id, get all its ligand codes

859# e.g. 2I3I -> 618, BTB, ZN, EDO, LI

860def get_pdb_ligand_codes (pdb_id : str) -> list[str]:

861 # Set the request query

862 query = '''query structure($id: String!) {

863 entry(entry_id: $id) {

864 nonpolymer_entities { nonpolymer_comp { chem_comp { id } } }

865 }

866 }'''

867 # Request PDB data

868 parsed_response = request_pdb_data(pdb_id, query)

869 # Mine data for nonpolymer entities

870 nonpolymers = parsed_response['nonpolymer_entities']

871 # If there are no nonpolymer entities, another type of entitie could be used

872 # AGUS: esto es un caso muy concreto que me encontré con el PDB 1N3W

873 # AGUS: el ligando en este caso es una 'Biologically Interesting Molecules' y se muestran como 'PRD_'

874 prd_code = None

875 if nonpolymers == None:

876 # Get the prd ligand code

877 prd_code = get_prd_ligand_code(pdb_id)

878 if prd_code != None:

879 return [prd_code]

880

881 if nonpolymers == None and prd_code == None: return []

882 # Iterate nonpolymer entities to mine each PDB code

883 ligand_codes = []

884 for nonpolymer in nonpolymers:

885 ligand_code = nonpolymer['nonpolymer_comp']['chem_comp']['id']

886 ligand_codes.append(ligand_code)

887 print(f' Ligand codes for PDB id {pdb_id}: ' + ', '.join(ligand_codes))

888 return ligand_codes

889

890# Given a PDB id, get its PRD ligand code

891def get_prd_ligand_code (pdb_id : str) -> Optional[str]:

892 query = '''query structure($id: String!) {

893 entry(entry_id: $id) {

894 pdbx_molecule_features {

895 prd_id

896 }

897 }

898 }'''

899 parsed_response = request_pdb_data(pdb_id, query)

900 pdbx_id = parsed_response.get('pdbx_molecule_features', None)

901 if pdbx_id is None:

902 print(f'WARNING: Cannot find PRD ligand code for PDB id {pdb_id}')

903 return None

904 prd_id = pdbx_id[0].get('prd_id', None) # AGUS: podría haber casos donde haya más de uno?

905 if prd_id is None:

906 print(f'WARNING: Cannot find PRD ligand code for PDB id {pdb_id}')

907 return None

908 # If the PRD id is not empty then return it

909 return prd_id

910

911# Given a pdb id, get its pubchem ids

912# DANI: De momento no usamos las SMILES que alguna vez me han dado problemas (e.g. 2I3I)

913# e.g. 4YDF ->

914def pdb_to_pubchem (pdb_id : str) -> list[str]:

915 print(f'Searching PubChem ids for PDB {pdb_id}')

916 pubchem_ids = []

917 # Iterate over pdb ligand codes

918 ligand_codes = get_pdb_ligand_codes(pdb_id)

919 for ligand_code in ligand_codes:

920 # Ask the PDB API for the ligand

921 pubchem_id = pdb_ligand_to_pubchem(ligand_code)

922 # If this did not work then try mining the PDB client with a web crawler

923 if not pubchem_id:

924 pubchem_id = pdb_ligand_to_pubchem_RAW(ligand_code)

925 # If this did not work then try it from PubChem

926 if not pubchem_id:

927 pubchem_id = pdb_ligand_to_pubchem_RAW_RAW(ligand_code)

928 # Otherwise we surrender

929 if not pubchem_id:

930 print(f' {ligand_code} -> No PubChem id')

931 continue

932 print(f' {ligand_code} -> {pubchem_id}')

933 pubchem_ids.append(pubchem_id)

934

935 return pubchem_ids

936

937@lru_cache(maxsize=None)

938def search_cid_by_inchikey(inchikey : str) -> Optional[str]:

939 url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/cids/JSON"

940 r = requests.get(url)

941 if r.ok:

942 data = r.json()

943 return data['IdentifierList']['CID'][0]

944 else:

945 return None

Coverage for mddb_workflow/tools/generate_ligands_desc.py: 60%

578 statements