Coverage for model_workflow/tools/generate_ligands

1import os

2import json

3from rdkit import Chem

4from rdkit.Chem import AllChem

5from rdkit.Chem import rdFingerprintGenerator

6from mordred import Calculator, descriptors

7from model_workflow.utils.constants import LIGANDS_MATCH_FLAG, PDB_TO_PUBCHEM, NOT_MATCHED_LIGANDS

8from model_workflow.utils.auxiliar import InputError, load_json, save_json, request_pdb_data

9from model_workflow.utils.type_hints import *

10from model_workflow.utils.structures import Structure

11from urllib.request import Request, urlopen

12from urllib.parse import urlencode

13from urllib.error import HTTPError, URLError

14import re

16def get_drugbank_smiles (id_drugbank : str) -> Optional[str]:

17 # Request Drugbank

18 request_url = Request(

19 url= f'https://go.drugbank.com/structures/small_molecule_drugs/{id_drugbank}.smiles',

20 headers={'User-Agent': 'Mozilla/5.0'}

21 )

22 try:

23 with urlopen(request_url) as response:

24 smiles = response.read()

25 # If the accession is not found in the database then we stop here

26 except HTTPError as error:

27 # If the drugbank ID is not yet in the Drugbank references then return None

28 if error.code == 404:

29 return None

30 else:

31 print('Error when requesting ' + request_url)

32 raise ValueError('Something went wrong with the Drugbank request (error ' + str(error.code) + ')')

33 # This error may occur if there is no internet connection

34 except URLError as error:

35 print('Error when requesting ' + request_url)

36 raise ValueError('Something went wrong with the MDposit request')

38 return smiles

42# Given a ChemBL ID, use the uniprot API to request its data and then mine what is needed for the database

43def get_chembl_smiles (id_chembl : str) -> Optional[str]:

44 # Request ChemBL

45 parsed_response = None

46 request_url = Request(

47 url= f'https://www.ebi.ac.uk/chembl/interface_api/es_proxy/es_data/get_es_document/chembl_molecule/{id_chembl}',

48 headers={'User-Agent': 'Mozilla/5.0'}

49 )

50 try:

51 with urlopen(request_url) as response:

52 parsed_response = json.loads(response.read().decode("utf-8"))

53 smiles = parsed_response['_source']['molecule_structures']['canonical_smiles']

54 pubchem_id = parsed_response['_source']['_metadata']['unichem'][8]['id']

55 # If the accession is not found in ChemBL then the id is not valid

56 except HTTPError as error:

57 if error.code == 404:

58 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)

59 return None

60 print('Error when requesting ' + request_url)

61 raise ValueError('Something went wrong with the ChemBL request (error ' + str(error.code) + ')')

62 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607)

63 if parsed_response == None:

64 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)

65 return None

66 return smiles

68# Given a PubChem ID, use the uniprot API to request its data and then mine what is needed for the database

69def get_pubchem_data (id_pubchem : str) -> Optional[dict]:

70 # Request PubChem

71 parsed_response = None

72 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{id_pubchem}/JSON/'

73 try:

74 with urlopen(request_url) as response:

75 #parsed_response = json.loads(response.read().decode("windows-1252"))

76 parsed_response = json.loads(response.read().decode("utf-8", errors='ignore'))

77 # If the accession is not found in PubChem then the id is not valid

78 # This may happen with pubchem ids of non-discrete compounds (e.g. 483927498)

79 except HTTPError as error:

80 if error.code == 404:

81 print('WARNING: Cannot find PubChem entry for accession ', id_pubchem)

82 return None

83 print('Error when requesting ', request_url)

84 raise ValueError('Something went wrong with the PubChem request (error ', str(error.code), ')')

85 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607)

86 if parsed_response == None:

87 print('WARNING: Cannot find PubChem entry for accession ' + id_pubchem)

88 return None

89 # Mine target data: SMILES

90 record = parsed_response.get('Record', None)

91 if record == None:

92 raise RuntimeError('Wrong Pubchem data structure: no record: ' + request_url)

93 sections = record.get('Section', None)

94 if sections == None:

95 raise RuntimeError('Wrong Pubchem data structure: no sections: ' + request_url)

96 names_and_ids_section = next((section for section in sections if section.get('TOCHeading', None) == 'Names and Identifiers'), None)

97 if names_and_ids_section == None:

98 raise RuntimeError('Wrong Pubchem data structure: no name and ids section: ' + request_url)

99 names_and_ids_subsections = names_and_ids_section.get('Section', None)

100 if names_and_ids_subsections == None:

101 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)

102

103 # Mine the name

104 synonims = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Synonyms'), None)

105 if synonims == None:

106 descriptors = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None)

107 descriptors_subsections = descriptors.get('Section', None)

108 if descriptors_subsections == None:

109 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)

110 depositor_supplied_descriptors = next((s for s in descriptors_subsections if s.get('TOCHeading', None) == 'IUPAC Name'), None)

111 name_substance = depositor_supplied_descriptors.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

112 else:

113 synonims_subsections = synonims.get('Section', None)

114 if synonims_subsections == None:

115 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)

116 depositor_supplied_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Depositor-Supplied Synonyms'), None)

117 if depositor_supplied_synonims == None:

118 removed_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Removed Synonyms'), None)

119 name_substance = removed_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

120 else:

121 name_substance = depositor_supplied_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

122

123 # Mine the SMILES

124 computed_descriptors_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None)

125 if computed_descriptors_subsection == None:

126 raise RuntimeError('Wrong Pubchem data structure: no computeed descriptors: ' + request_url)

127 canonical_smiles_section = computed_descriptors_subsection.get('Section', None)

128 if canonical_smiles_section == None:

129 raise RuntimeError('Wrong Pubchem data structure: no canonical SMILES section: ' + request_url)

130 canonical_smiles = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'Canonical SMILES'), None)

131 if canonical_smiles == None:

132 # In some cases there is no canonical SMILES but a non-canonical one could exists

133 non_canonical_smiles_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'SMILES'), None)

134 if non_canonical_smiles_section == None:

135 raise RuntimeError('Wrong Pubchem data structure: no canonical SMILES: ' + request_url)

136

137 if canonical_smiles:

138 smiles = canonical_smiles.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

139 if non_canonical_smiles_section:

140 smiles = non_canonical_smiles_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

141

142 if smiles == None:

143 raise RuntimeError('Wrong Pubchem data structure: no SMILES: ' + request_url)

144

145 # Mine target data: MOLECULAR FORMULA

146 molecular_formula_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Molecular Formula'), None)

147 if molecular_formula_subsection == None:

148 raise RuntimeError('Wrong Pubchem data structure: no molecular formula section: ' + request_url)

149 molecular_formula = molecular_formula_subsection.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

150 if molecular_formula == None:

151 raise RuntimeError('Wrong Pubchem data structure: no molecular formula: ' + request_url)

152

153 # Mine target data: PDB ID

154 pdb_id = None

155 pdb_id_subsection = next((s for s in sections if s.get('TOCHeading', None) == 'Interactions and Pathways'), None)

156 # If this section is missing then it means this PubChem compound has no PDB id

157 if pdb_id_subsection:

158 pdb_id_subsections = pdb_id_subsection.get('Section', None)

159 if pdb_id_subsections == None:

160 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)

161 bond_structures = next((s for s in pdb_id_subsections if s.get('TOCHeading', None) == 'Protein Bound 3D Structures'), None)

162 if bond_structures:

163 bond_structures_section = bond_structures.get('Section', None)

164 # If this section is missing then it means this PubChem compound has no PDB id

165 if bond_structures_section:

166 ligands_structure = next((s for s in bond_structures_section if s.get('TOCHeading', None) == 'Ligands from Protein Bound 3D Structures'), None)

167 if ligands_structure == None:

168 raise RuntimeError('Wrong Pubchem data structure: no Ligands from Protein Bound 3D Structures section: ' + request_url)

169 ligands_structure_subsections = ligands_structure.get('Section', None)

170 if ligands_structure_subsections == None:

171 raise RuntimeError('Wrong Pubchem data structure: no Ligands from Protein Bound 3D Structures subsections: ' + request_url)

172 ligands_pdb = next((s for s in ligands_structure_subsections if s.get('TOCHeading', None) == 'PDBe Ligand Code'), None)

173 if ligands_pdb == None:

174 raise RuntimeError('Wrong Pubchem data structure: no PDBe Ligand Code: ' + request_url)

175 pdb_id = ligands_pdb.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

176

177 # Mine de INCHI and INCHIKEY

178 inchi_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChI'), None)

179 if inchi_section == None:

180 raise RuntimeError('Wrong Pubchem data structure: no InChI: ' + request_url)

181 if inchi_section:

182 inchi = inchi_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

183

184 inchikey_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChIKey'), None)

185 if inchikey_section == None:

186 raise RuntimeError('Wrong Pubchem data structure: no InChIKey: ' + request_url)

187 if inchikey_section:

188 inchikey = inchikey_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

189 # Prepare the pubchem data to be returned

190 return { 'name': name_substance, 'smiles': smiles, 'formula': molecular_formula , 'pdbid': pdb_id, 'inchi': inchi, 'inchikey': inchikey }

191

192

193def find_drugbank_pubchem (drugbank_id):

194 # Request Drugbank

195 request_url = Request(

196 url=f'https://go.drugbank.com/drugs/{drugbank_id}',

197 headers={

198 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',

199 'Accept-Language': 'en-US,en;q=0.9',

200 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'

201 }

202)

203 pubchem_id = None

204 try:

205 with urlopen(request_url) as response:

206 content = response.read().decode("utf-8")

207 pattern = re.compile("http\:\/\/pubchem.ncbi.nlm.nih.gov\/summary\/summary.cgi\?cid\=([0-9]*)")

208 match = re.search(pattern, str(content))

209 if not match:

210 raise ValueError("No se encontró información sobre el Pubchem compound en esta página.")

211 pubchem_id = match[1]

212 if not pubchem_id:

213 raise ValueError("No se encontró información sobre el Pubchem compound en esta página.")

214 # If the accession is not found in the database then we stop here

215 except HTTPError as error:

216 # If the drugbank ID is not yet in the Drugbank references then return None

217 raise ValueError(f'Wrong request. Code: {error.code}')

218 # This error may occur if there is no internet connection

219 except URLError as error:

220 print('Error when requesting ' + request_url)

221 raise ValueError('Something went wrong with the DrugBank request')

222

223 return pubchem_id

224

225

226def find_chembl_pubchem (id_chembl):

227 # Request ChemBL

228 parsed_response = None

229 request_url = Request(

230 url= f'https://www.ebi.ac.uk/chembl/interface_api/es_proxy/es_data/get_es_document/chembl_molecule/{id_chembl}',

231 headers={'User-Agent': 'Mozilla/5.0'}

232 )

233 try:

234 with urlopen(request_url) as response:

235 parsed_response = json.loads(response.read().decode("utf-8"))

236 unichem = parsed_response['_source']['_metadata']['unichem']

237 if unichem == None:

238 raise RuntimeError('Wrong Pubchem data structure: no unichem section')

239 pubchem_section = next((section for section in unichem if section.get('src_url', None) == "http://pubchem.ncbi.nlm.nih.gov"), None)

240 if pubchem_section == None:

241 raise RuntimeError('Wrong Pubchem data structure: no pubchem section')

242 pubchem_id = pubchem_section.get('id', None)

243 if pubchem_id == None:

244 raise RuntimeError('Wrong Pubchem data structure: no pubchem ID')

245 # If the accession is not found in ChemBL then the id is not valid

246 except HTTPError as error:

247 if error.code == 404:

248 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)

249 return None

250 print('Error when requesting ' + request_url)

251 raise ValueError('Something went wrong with the ChemBL request (error ' + str(error.code) + ')')

252 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607)

253 if parsed_response == None:

254 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)

255 return None

256

257 return pubchem_id

258

259# Calculate the Morgan fingerprint and the Mordred descriptors from a ligand SMILES

260def obtain_mordred_morgan_descriptors (smiles : str) -> Tuple:

261 mol = Chem.MolFromSmiles(smiles)

262 if mol is None:

263 print(f'WARNING: Cannot generate a molecule from SMILES {smiles}')

264 return None

265

266 AllChem.Compute2DCoords(mol)

267 mol_block = Chem.MolToMolBlock(mol)

268 # We can select the different submodules of mordred descriptors, avaible in: 'https://mordred-descriptor.github.io/documentation/master/'

269 calc = Calculator([

270 descriptors.ABCIndex, # Índice de ramificación

271 descriptors.AcidBase.AcidicGroupCount, # Grupos ácidos

272 descriptors.AcidBase.BasicGroupCount, # Grupos básicos

273 descriptors.RingCount, # Conteo de anillos

274 descriptors.Constitutional, # Propiedades generales como número de átomos, peso molecular

275 descriptors.TopologicalCharge, # Índices topológicos, Cargas parciales, polaridad

276 descriptors.HydrogenBond, # Donantes y aceptores de enlaces de hidrógeno

277 descriptors.Lipinski, # Reglas de Lipinski (drug-likeness)

278 descriptors.FragmentComplexity, # Identificación de subestructuras frecuentes

279 descriptors.PathCount, # Conteo de caminos moleculares

280 ], ignore_3D=True)

281

282

283 # Calculate Mordred results

284 mordred_results = calc(mol).drop_missing().asdict()

285

286 ######## MORGAN FINGERPRINT ###########

287 morgan_fp_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

288 ao = rdFingerprintGenerator.AdditionalOutput()

289 ao.AllocateAtomCounts()

290 ao.AllocateAtomToBits()

291 ao.AllocateBitInfoMap()

292 fp = morgan_fp_gen.GetFingerprint(mol,additionalOutput=ao)

293 morgan_fp_bit_array = list(fp)

294 morgan_highlight_atoms = {}

295 for bit, atoms in ao.GetBitInfoMap().items():

296 morgan_highlight_atoms[bit] = list(set(atom for atom, radius in atoms))

297

298 return mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block

299

300# Check the cache to see if the ligand has already been matched or not, if not the ligand is skipped

301def check_matched_ligand (ligand: dict, ligand_data: dict, cache: 'Cache') -> bool:

302 # The pubchem id could be in the ligands data or the ligand input

303 pubchem_id_1 = ligand.get('pubchem', None)

304 if ligand_data != None:

305 pubchem_id_2 = ligand_data.get('pubchem', None)

306 else:

307 pubchem_id_2 = None

308 # If we have no pubchem id then we cannot check if the ligand matched

309 if not pubchem_id_1 and not pubchem_id_2:

310 return False

311 # If we have a pubchem id then check if it is already in the cache

312 cache_not_matched = cache.retrieve(NOT_MATCHED_LIGANDS, [])

313 for pubchem in cache_not_matched:

314 if pubchem == pubchem_id_1 or pubchem == pubchem_id_2:

315 print(f" Ligand with PubChem id {pubchem} didn't match before, skipping it")

316 return True

317

318 return False

319

320def generate_ligand_mapping (

321 structure : 'Structure',

322 cache : 'Cache',

323 input_ligands : Optional[List[dict]],

324 pdb_ids : List[str],

325 output_filepath : str,

326 mercy : List[str] = [],

327 ) -> List[dict]:

328

329 """Generate a map of residues associated to ligands."""

330 # Merge input ligands and pdb ligands

331 ligands = []

332 if input_ligands:

333 ligands += input_ligands

334 # Check we have cached pdb 2 pubchem values

335 pdb_to_pubchem_cache = cache.retrieve(PDB_TO_PUBCHEM, {})

336 new_data_to_cache = False

337 # Get input ligands from the pdb ids, if any

338 for pdb_id in pdb_ids:

339 # Check we have cached this specific pdb

340 pubchem_ids_from_pdb = pdb_to_pubchem_cache.get(pdb_id, None)

341 if pubchem_ids_from_pdb != None:

342 print(f' Retriving from cache PubChem ids for PDB id {pdb_id}: ')

343 if len(pubchem_ids_from_pdb) > 0:

344 print(' PubChem ids: ' + ', '.join(pubchem_ids_from_pdb))

345 else:

346 print(' This PDB id has no PubChem ids')

347

348 # If we had no cached pdb 2 pubchem then ask for them

349 if pubchem_ids_from_pdb == None:

350 pubchem_ids_from_pdb = pdb_to_pubchem(pdb_id)

351 # Save the result in the cache object so it is saved to cache later

352 pdb_to_pubchem_cache[pdb_id] = pubchem_ids_from_pdb

353 new_data_to_cache = True

354 for pubchem_id in pubchem_ids_from_pdb:

355 # Ligands in the structure (PDB) and the 'inputs.json' could be the same so it's not necessary to do it twice

356 if not any('pubchem' in ligand and ligand['pubchem'] == pubchem_id for ligand in ligands):

357 ligands.append({ 'pubchem': pubchem_id, 'pdb': True })

358 # Save all pdb to pubchem results in cache, in case there is anything new

359 if new_data_to_cache: cache.update(PDB_TO_PUBCHEM, pdb_to_pubchem_cache)

360

361 # If no input ligands are passed then stop here

362 if len(ligands) == 0:

363 return []

364

365 # Save data from all ligands to be saved in a file

366 json_ligands_data = []

367 ligands_data = []

368

369 # Load the ligands file if exists already

370 if os.path.exists(output_filepath):

371 json_ligands_data += import_ligands(output_filepath)

372 # If json ligands exists and it is empty means that ligands analysis has been already done but no ligands were matched

373 # so the file will contain an empty list []

374 if len(json_ligands_data) == 0:

375 print('No ligands have been matched yet.\nIf you want to force a ligand to be matched, please provide the field "residues" in the inputs.json file.')

376 return []

377

378 # Visited formulas

379 visited_formulas = []

380 # Save the maps of every ligand

381 ligand_maps = []

382 # Get cached ligand data

383 # AGUS: esto lo creamos por alguna razón para que funcione sin internet (en el cluster) pero realmente

384 # AGUS: llena el cache con datos que no son necesarios porque toda esa info está en el ligand_references.json

385 #ligand_data_cache = cache.retrieve(LIGANDS_DATA, {})

386 # Iterate input ligands

387 for ligand in ligands:

388 # Set the pubchem id which may be assigned in different steps

389 pubchem_id = None

390 # If input ligand is not a dict but a single int/string then handle it

391 if type(ligand) == int:

392 print(f'A ligand number ID has been identified {ligand}, assuming that is a PubChem ID...')

393 ligand = { 'pubchem': str(ligand) }

394 elif type(ligand) == str:

395 raise InputError(f'A name of ligand has been identified: {ligand}. Anyway, provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.')

396 # Check if we already have this ligand data

397 ligand_data = obtain_ligand_data_from_file(json_ligands_data, ligand)

398 # Check if the ligand didn't match before

399 if check_matched_ligand(ligand, ligand_data, cache): continue

400 # If we do not have its data try to get from the cache

401 # if not ligand_data:

402 # # If this is a ligand not in ligans.json but in cache it means it comes from PDB, never form user inputs

403 # # For this reason, this ligand will always have a PubChem id

404 # pubchem_id = ligand.get('pubchem', None)

405 # if pubchem_id:

406 # ligand_data = ligand_data_cache.get(pubchem_id, None)

407 # # If we still have no ligand data then request it to PubChem

408 # if not ligand_data:

409 # ligand_data = obtain_ligand_data_from_pubchem(ligand)

410 # # Save data mined from PubChem in the cache

411 # pubchem_id = ligand_data['pubchem']

412 # ligand_data_cache[pubchem_id] = { **ligand_data }

413 # cache.update(LIGANDS_DATA, ligand_data_cache)

414 # Add current ligand data to the general list

415 if not ligand_data:

416 ligand_data = obtain_ligand_data_from_pubchem(ligand)

417 ligands_data.append(ligand_data)

418 # Get pubchem id

419 pubchem_id = ligand_data.get('pubchem', None)

420 # If we already visited a different ligand but with identical formula then we skip this ligand

421 # Note that the mapping will be identical thus overwritting the previous map

422 # However, ligands forced by the user are processed before so we keep them as priority

423 formula = ligand_data.get('formula', None)

424 if formula in visited_formulas:

425 print(f'WARNING: Ligand with PubChem Id {pubchem_id} has a formula which has been already matched')

426 # AGUS: en este punto si el usuario ha definido el mismo ligando con diferente selección quiere decir que está repetido

427 # AGUS: y que deberíamos mantener el ligando para diferentes residuos matcheados

428 if forced_selection:

429 visited_formulas.pop()

430 ligands_data.pop()

431 print(f'WARNING: Ligand with PubChem Id {pubchem_id} has been forced by the user and its repeated')

432 else:

433 ligands_data.pop()

434 continue

435 # Add it to the list of visited formulas

436 visited_formulas.append(formula)

437 # If the user defined a ligand name, it will be respectedand added to the metadata

438 # if there isn't a defined name, it will be mined from PubChem

439 user_forced_ligand_name = ligand.get('name', None)

440 # Map structure residues with the current ligand

441 # If the user forced the residues then use them

442 forced_selection = ligand.get('selection', None)

443 if forced_selection:

444 # Could be a single residue or a list of residues

445 selection_atoms = structure.select(forced_selection)

446 residue_indices = structure.get_selection_residue_indices(selection_atoms)

447 ligand_map = { 'name': pubchem_id, 'residue_indices': residue_indices, 'match': { 'ref': { 'pubchem': pubchem_id } } }

448 if user_forced_ligand_name: ligand_map['forced_name'] = user_forced_ligand_name

449 # If the user did not force the residues then map them

450 else:

451 ligand_map = map_ligand_residues(structure, ligand_data)

452 # Add current ligand map to the general list

453 ligand_maps.append(ligand_map)

454 # If the ligand did not map then discard it

455 did_not_match = len(ligand_map['residue_indices']) == 0

456 if did_not_match:

457 is_forced_by_user = not bool(ligand.get('pdb', False))

458 if is_forced_by_user:

459 # At this point we should have macthed all sequences

460 # If not, kill the process unless mercy was given

461 must_be_killed = LIGANDS_MATCH_FLAG not in mercy

462 if must_be_killed:

463 raise InputError(f'Ligand with PubChem Id {pubchem_id} did not map with any residue')

464 # Otherwise simply remove it from the final ligand references file and the forwarded maps

465 ligands_data.pop()

466 ligand_maps.pop()

467 # Update the cache with the pubchem id of the ligands that didn't match

468 not_matched_pubchems = cache.retrieve(NOT_MATCHED_LIGANDS, [])

469 not_matched_pubchems.append(ligand_map['name'])

470 cache.update(NOT_MATCHED_LIGANDS, not_matched_pubchems)

471 # If the ligand matched then calculate some additional data

472 # Get morgan and mordred descriptors, the SMILES is needed for this part

473 smiles = ligand_data['smiles']

474 mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block = obtain_mordred_morgan_descriptors(smiles)

475 ligand_data['mordred'] = mordred_results

476 ligand_data['morgan_fp_bit_array'] = morgan_fp_bit_array

477 ligand_data['morgan_highlight_atoms'] = morgan_highlight_atoms

478 ligand_data['mol_block'] = mol_block

479 # Export ligands to a file

480 save_json(ligands_data, output_filepath)

481

482 # Log matched ligands

483 if not ligand_maps:

484 print('No ligands were matched')

485 else:

486 print('Matched ligands:')

487 for ligand_map in ligand_maps:

488 residue_indices = ligand_map["residue_indices"]

489 residue_count = len(residue_indices)

490 plural_suffix = '' if residue_count == 1 else 's'

491 print(f' - {ligand_map["name"]}: {residue_count} residue{plural_suffix}')

492

493 return ligand_maps

494

495# Set the expected ligand data fields

496LIGAND_DATA_FIELDS = set(['name', 'pubchem', 'drugbank', 'chembl', 'smiles', 'formula', 'morgan', 'mordred', 'pdbid'])

497

498# Import ligands json file so we do not have to rerun this process

499def import_ligands (output_filepath : str) -> dict:

500 # Read the file

501 imported_ligands = load_json(output_filepath)

502 # Format data as the process expects to find it

503 for imported_ligand in imported_ligands:

504 for expected_field in LIGAND_DATA_FIELDS:

505 if expected_field not in imported_ligand:

506 imported_ligand[expected_field] = None

507 return imported_ligands

508

509# Check if the current input ligand is already among the ligands we already have data for

510def obtain_ligand_data_from_file ( ligands_data : List[dict], ligand: dict ) -> Optional[dict]:

511 for ligand_data in ligands_data:

512 if (ligand.get('pubchem') and ligand['pubchem'] == ligand_data.get('pubchem')) or \

513 (ligand.get('drugbank') and ligand['drugbank'] == ligand_data.get('drugbank')) or \

514 (ligand.get('chembl') and ligand['chembl'] == ligand_data.get('chembl')):

515 return ligand_data

516 return None

517

518# Given an input ligand, obtain all necessary data

519def obtain_ligand_data_from_pubchem (ligand : dict) -> dict:

520 # Save in a dictionary all ligand data including its name and ids

521 # The ID can be of the databases: 'drugbank' , 'pubchem' , 'chembl'

522 # Define the needed variables to check if the ligand has a database ID or it is None

523 ligand_data = {

524 'name': None,

525 'pubchem': None,

526 'drugbank': None,

527 'chembl': None,

528 'smiles': None,

529 'formula': None,

530 'pdbid': None,

531 }

532 # Set ligand data pubchem id, even if the input id is not from pubhcme (e.g. drugbank, chembl)

533 if 'pubchem' in ligand:

534 ligand_data['pubchem'] = str(ligand.get('pubchem'))

535 elif 'drugbank' in ligand:

536 ligand_data['drugbank'] = ligand.get('drugbank')

537 ligand_data['pubchem'] = str(find_drugbank_pubchem(ligand_data['drugbank']))

538 elif 'chembl' in ligand:

539 ligand_data['chembl'] = ligand.get('chembl')

540 ligand_data['pubchem'] = str(find_chembl_pubchem(ligand_data['chembl']))

541 else:

542 raise InputError('None of the ligand IDs are defined. Please provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.')

543

544 # Request ligand data from pubchem

545 pubchem_data = get_pubchem_data(ligand_data['pubchem'])

546 if not pubchem_data:

547 raise RuntimeError('No PubChem data avilable')

548

549 # Add pubchem data to ligand data

550 ligand_data = { **ligand_data, **pubchem_data }

551 return ligand_data

552

553

554# For each residue, count the number of atom elements

555def count_atom_elements_per_residue ( structure : 'Structure' ) -> dict:

556 # Obtain a list of residues of ligand candidates

557 residue_element_count = {}

558 # Iterate over the residue(s)

559 for residue in structure.residues:

560 # Skip protein residues

561 if residue.classification == 'protein':

562 continue

563 # Obtain the list of elements for each residue

564 atom_elements = [ atom.element for atom in residue.atoms ]

565 # Generate a dict with elements (keys) and their counting (values)

566 atoms_count = {}

567 for atom in atom_elements:

568 if atom in atoms_count:

569 atoms_count[atom] += 1

570 else:

571 atoms_count[atom] = 1

572 # Define the key of the dictionary as the residue name and save it in a list with the all the residues

573 residue_element_count[residue] = atoms_count

574

575 return residue_element_count

576

577

578def nest_brackets(tokens, i = 0):

579 l = []

580 while i < len(tokens):

581 if tokens[i] == ')':

582 return i,l

583 elif tokens[i] == '(':

584 i,subl = nest_brackets(tokens, i+1)

585 l.append(subl)

586 else:

587 l.append(tokens[i])

588 i += 1

589 return i,l

590

591def parse_compound(formula : str) -> List[str]:

592 tokens = [''.join(t) for t in split_when(formula, lambda a,b: b.isupper() or b in '()' or (b.isdigit() and not a.isdigit()))]

593 tokens = [(int(t) if t.isdigit() else t) for t in tokens]

594 i, l = nest_brackets(tokens)

595 assert(i == len(tokens)) # crash if unmatched ')'

596 return l

597

598# Split a string using a function

599def split_when(string : str, func : Callable) -> List[str]:

600 splits = []

601 last_split = string[0]

602 for s in string[1:]:

603 if func(last_split, s):

604 splits.append(last_split)

605 last_split = s

606 else:

607 last_split += s

608 splits.append(last_split)

609 return splits

610

611# Given a chemical formula, get the count of atoms per element

612def count_atom_elements (molecular_formula : str) -> dict:

613 # Clean the formula from charges

614 # These charges include numbers which are not atom counts (e.g. 49867169 -> C18H16NO5PS-2)

615 parsed_molecular_formula = re.sub('[+-][0-9]*', '', molecular_formula)

616 l = parse_compound(parsed_molecular_formula)

617 c = parse_splits(l)

618 return c

619

620# This function associates elements in a list

621# If a string is followed by a number then they go together

622# If a string has no number then the number is 1 for this specific string

623def parse_splits (splits : List[str]) -> dict:

624 parsed = {}

625 previous_key = None

626 for split in splits:

627 if type(split) == str:

628 if previous_key:

629 parsed[previous_key] = 1

630 previous_key = split

631 elif type(split) == int:

632 parsed[previous_key] = split

633 previous_key = None

634 else:

635 raise ValueError('Not supported type ' + type(split))

636 if previous_key:

637 parsed[previous_key] = 1

638 return parsed

639

640

641def match_ligandsID_to_res (ligand_atom_element_count : dict, residue_atom_element_count : dict) -> bool:

642 # Remove hydrogens since we only pay attention to heavy atoms

643 # This is to avoid having mismatched_residues due to different protonation states

644 if 'H' in ligand_atom_element_count:

645 del ligand_atom_element_count['H']

646 if 'H' in residue_atom_element_count:

647 del residue_atom_element_count['H']

648 #shared_items = {k: ligand_atom_element_count[k] for k in ligand_atom_element_count if k in residue_atom_element_count and ligand_atom_element_count[k] == residue_atom_element_count[k]}

649 ligand_elements = set(ligand_atom_element_count.keys())

650 residue_elements = set(residue_atom_element_count.keys())

651

652 # Check if there are different elements

653 # If so, we are done

654 different_keys = ligand_elements.symmetric_difference(residue_elements)

655 if len(different_keys) > 0:

656 # print("Elements that are not matching:")

657 # for atom in different_keys:

658 # if atom in ligand_atom_element_count:

659 # print(f"In Pubchem {list(ligand_atom_element_count.keys())[0]} : {atom}: {ligand_atom_element_count[atom]}")

660 # else:

661 # print(f"In PDB residue {list(residue_atom_element_count.keys())[0]}: {atom}: {residue_atom_element_count[atom]}")

662 return False

663

664 # Different values

665 different_values = []

666 for element in ligand_elements:

667 if ligand_atom_element_count[element] != residue_atom_element_count[element]:

668 different_values.append(element)

669

670 # If the count of elements does not match then stop here

671 if len(different_values) > 0:

672 # Print the differences found between the elements

673 # print("Number of atoms that are different:")

674 # for atom in different_values:

675 # print(f"In Pubchem {list(ligand_atom_element_count.keys())[0]} {atom}: {ligand_atom_element_count[atom]}, In PDB residue {list(pdb_dict.keys())[0]}: {atom}: {residue_atom_element_count[atom]}\n")

676 return False

677

678 return True

679

680def map_ligand_residues (structure : 'Structure', ligand_data : dict) -> dict:

681 # Load the structure where the atoms will be minresidue.indexed and obtain a residues dict

682 atom_elements_per_residue = count_atom_elements_per_residue(structure)

683 # Get the ligand formula

684 ligand_formula = ligand_data['formula']

685 if not ligand_formula:

686 raise RuntimeError(f'Ligand with PubChem id {ligand_data["pubchem"]} is missing formula')

687 # From pubchem mine the atoms of the ligands and save it in a dict

688 ligand_atom_element_count = count_atom_elements(ligand_data['formula'])

689 matched_residues = []

690 # Get ligand pubchem id

691 pubchem_id = ligand_data['pubchem']

692 for residue, residue_atom_element_count in atom_elements_per_residue.items():

693 if match_ligandsID_to_res(ligand_atom_element_count, residue_atom_element_count):

694 matched_residues.append(residue.index)

695 # Format the output as we expect it

696 ligand_map = { 'name': pubchem_id, 'residue_indices': matched_residues, 'match': { 'ref': { 'pubchem': pubchem_id } } }

697 return ligand_map

698

699# Given a smiles, get the pubchem id

700# e.g. CC1=C(C=NC=C1)NC(=O)CC2=CC(=CC(=C2)Cl)OC -> 154876006

701# e.g. O=C4N3C(C(=O)Nc2cc(nn2c1ccccc1)C)C(SC3CC=CC4NC(=O)C(NC)C)(C)C -> None

702def smiles_to_pubchem_id (smiles : str) -> Optional[str]:

703 # Set the request URL

704 request_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON'

705 # Set the POST data

706 data = urlencode({ 'smiles': smiles }).encode()

707 try:

708 with urlopen(request_url, data=data) as response:

709 parsed_response = json.loads(response.read().decode("utf-8"))

710 # If the smiles is not found in pubchem then we can stop here

711 except HTTPError as error:

712 if error.code == 404:

713 print(f' Smiles {smiles} not found')

714 return None

715 else:

716 raise ValueError('Something went wrong with the PubChem request: ' + request_url)

717 # Get the PubChem id

718 compounds = parsed_response.get('PC_Compounds', None)

719 if not compounds:

720 raise RuntimeError(f'Something went wrong when mining pubchem data for SMILES {smiles}')

721 # Make sure there is only 1 compound

722 # DANI: Esto algún día podría no ser así, ya lo gestionaremos

723 if len(compounds) != 1:

724 raise RuntimeError(f'There should be one and only one compound for SMILES {smiles}')

725 # Keep mining

726 compound = compounds[0]

727 first_id = compound.get('id', None)

728 # If there is no first id then it means there is no direct match with pubchem

729 if not first_id:

730 return None

731 second_id = first_id.get('id', None)

732 if not second_id:

733 raise RuntimeError(f'Missing second id when mining pubchem data for SMILES {smiles}')

734 pubchem_id = second_id.get('cid', None)

735 if not pubchem_id:

736 raise RuntimeError(f'Missing pubchem id when mining pubchem data for SMILES {smiles}')

737 return str(pubchem_id)

738

739# Given a PDB ligand code, get its pubchem

740def pdb_ligand_to_pubchem (pdb_ligand_id : str) -> Optional[str]:

741 # Set the request query

742 query = '''query molecule($id:String!){

743 chem_comp(comp_id:$id) { rcsb_chem_comp_related{ resource_name resource_accession_code } }

744 }'''

745 # Request PDB data

746 parsed_response = request_pdb_data(pdb_ligand_id, query)

747 related_resources = parsed_response['rcsb_chem_comp_related']

748 # It may happend that a ligand code has no related resources at all

749 # e.g. ZN

750 if not related_resources: return None

751 pubchem_resource = next((resource for resource in related_resources if resource['resource_name'] == 'PubChem'), None)

752 if not pubchem_resource: return None

753 return pubchem_resource['resource_accession_code']

754

755# Given a PDB ligand code, get its pubchem

756# Use a web crawler to avoid having to use the PDB API

757def pdb_ligand_to_pubchem_RAW (pdb_ligand_id : str) -> Optional[str]:

758 # Set the request URL

759 request_url = f'https://www.rcsb.org/ligand/{pdb_ligand_id}'

760 # Run the query

761 parsed_response = None

762 try:

763 with urlopen(request_url) as response:

764 parsed_response = response.read().decode("utf-8")

765 # If the accession is not found in the PDB then we can stop here

766 except HTTPError as error:

767 if error.code == 404:

768 print(f' PDB ligand {pdb_ligand_id} not found')

769 return None

770 else:

771 print(error.msg)

772 raise ValueError('Something went wrong with the PDB ligand request: ' + request_url)

773 # Mine the pubchem id out of the whole response

774 pattern = re.compile('pubchem.ncbi.nlm.nih.gov\/compound\/([0-9]*)\"')

775 match = re.search(pattern, parsed_response)

776 # If there is no pubchem id then return none

777 # This is normal for some ligands such as counter ions (e.g. LI)

778 if not match:

779 return None

780 pubchem_id = match[1]

781 return pubchem_id

782

783# Given a PDB ligand code, get its pubchem

784# Ask to pubchem if the ligand exists and hope there is only one result

785# DANI: No se ha provado a fondo

786def pdb_ligand_to_pubchem_RAW_RAW (pdb_ligand_id : str) -> Optional[str]:

787 # Set the request URL

788 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{pdb_ligand_id}/json'

789 # Run the query

790 parsed_response = None

791 try:

792 with urlopen(request_url) as response:

793 parsed_response = json.loads(response.read().decode("utf-8"))

794 # If the accession is not found in the PDB then we can stop here

795 except HTTPError as error:

796 # This may happen for weird things such as UNX (unknown atom or ion)

797 if error.code == 404:

798 return None

799 print(error.msg)

800 raise RuntimeError('Something went wrong with the PDB ligand request in PubChem: ' + request_url)

801 # Mine the pubchem id

802 compounds = parsed_response['PC_Compounds']

803 if len(compounds) != 1:

804 # There could be more than one result

805 # For example, the case HEM: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/HEM/json

806 # In this case we picked the first result

807 compound = compounds[0]

808 #raise RuntimeError('We are not having 1 and only 1 result from PubChem: ' + request_url)

809 compound = compounds[0]

810 id1 = compound['id']

811 id2 = id1['id']

812 pubchem_id = str(id2['cid'])

813 return pubchem_id

814

815# Given a PDB id, get all its ligand codes

816# e.g. 2I3I -> 618, BTB, ZN, EDO, LI

817def get_pdb_ligand_codes (pdb_id : str) -> List[str]:

818 # Set the request query

819 query = '''query structure($id: String!) {

820 entry(entry_id: $id) {

821 nonpolymer_entities { nonpolymer_comp { chem_comp { id } } }

822 }

823 }'''

824 # Request PDB data

825 parsed_response = request_pdb_data(pdb_id, query)

826 # Mine data for nonpolymer entities

827 nonpolymers = parsed_response['nonpolymer_entities']

828 # If there are no nonpolymer entities, another type of entitie could be used

829 # AGUS: esto es un caso muy concreto que me encontré con el PDB 1N3W

830 # AGUS: el ligando en este caso es una 'Biologically Interesting Molecules' y se muestran como 'PRD_'

831 prd_code = None

832 if nonpolymers == None:

833 # Get the prd ligand code

834 prd_code = get_prd_ligand_code(pdb_id)

835 if prd_code != None:

836 return [prd_code]

837

838 if nonpolymers == None and prd_code == None: return []

839 # Iterate nonpolymer entities to mine each PDB code

840 ligand_codes = []

841 for nonpolymer in nonpolymers:

842 ligand_code = nonpolymer['nonpolymer_comp']['chem_comp']['id']

843 ligand_codes.append(ligand_code)

844 print(f' Ligand codes for PDB id {pdb_id}: ' + ', '.join(ligand_codes))

845 return ligand_codes

846

847# Given a PDB id, get its PRD ligand code

848def get_prd_ligand_code (pdb_id : str) -> Optional[str]:

849 query = '''query structure($id: String!) {

850 entry(entry_id: $id) {

851 pdbx_molecule_features {

852 prd_id

853 }

854 }

855 }'''

856 parsed_response = request_pdb_data(pdb_id, query)

857 pdbx_id = parsed_response.get('pdbx_molecule_features', None)

858 if pdbx_id is None:

859 print(f'WARNING: Cannot find PRD ligand code for PDB id {pdb_id}')

860 return None

861 prd_id = pdbx_id[0].get('prd_id', None) # AGUS: podría haber casos donde haya más de uno?

862 if prd_id is None:

863 print(f'WARNING: Cannot find PRD ligand code for PDB id {pdb_id}')

864 return None

865 # If the PRD id is not empty then return it

866 return prd_id

867

868# Given a pdb id, get its pubchem ids

869# DANI: De momento no usamos las SMILES que alguna vez me han dado problemas (e.g. 2I3I)

870# e.g. 4YDF ->

871def pdb_to_pubchem (pdb_id : str) -> List[str]:

872 print(f'Searching PubChem ids for PDB {pdb_id}')

873 pubchem_ids = []

874 # Iterate over pdb ligand codes

875 ligand_codes = get_pdb_ligand_codes(pdb_id)

876 for ligand_code in ligand_codes:

877 # Ask the PDB API for the ligand

878 pubchem_id = pdb_ligand_to_pubchem(ligand_code)

879 # If this did not work then try mining the PDB client with a web crawler

880 if not pubchem_id:

881 pubchem_id = pdb_ligand_to_pubchem_RAW(ligand_code)

882 # If this did not work then try it from PubChem

883 if not pubchem_id:

884 pubchem_id = pdb_ligand_to_pubchem_RAW_RAW(ligand_code)

885 # Otherwise we surrender

886 if not pubchem_id:

887 print(f' {ligand_code} -> No PubChem id')

888 continue

889 print(f' {ligand_code} -> {pubchem_id}')

890 pubchem_ids.append(pubchem_id)

891

892 return pubchem_ids

Coverage for model_workflow/tools/generate_ligands_desc.py: 62%

547 statements