Coverage for mddb_workflow/tools/get

1import re

2import json

3from rdkit import Chem

4from rdkit.Chem.MolStandardize import rdMolStandardize

5from mordred import Calculator, descriptors

6from mddb_workflow.utils.constants import LIGANDS_MATCH_FLAG, PDB_TO_PUBCHEM

7from mddb_workflow.utils.auxiliar import InputError, request_pdb_data, warn

8from mddb_workflow.utils.type_hints import *

9from urllib.request import urlopen

10from urllib.error import HTTPError, URLError

11from functools import lru_cache

12import requests

13from scipy.optimize import linear_sum_assignment

14import numpy as np

16# Set the expected ligand data fields

17LIGAND_DATA_FIELDS = set(['name', 'pubchem', 'drugbank', 'chembl', 'smiles', 'formula', 'morgan', 'mordred', 'pdbid', 'inchikey'])

18MINIMUM_TANIMOTO_THRESHOLD = 0.3

21def record_pubchem_match(

22 inchikey: str,

23 pubchem_id: str,

24 match_mol: Chem.Mol,

25 reference_fg: dict,

26 ligand_references: dict,

27 reason: str,

28 threshold: float = MINIMUM_TANIMOTO_THRESHOLD,

29) -> None:

30 """Set the pubchem id for the inchikey and compute/print Tanimoto vs descriptor_data if match_mol provided."""

31 matched_fg = obtain_mordred_morgan_descriptors(match_mol)

32 tc = tanimoto_similarity(reference_fg['morgan_fp_bit_array'], matched_fg['morgan_fp_bit_array'])

33 if tc >= threshold:

34 ligand_references[inchikey]['pubchem'] = str(pubchem_id)

35 ligand_references[inchikey]['tc'] = tc

36 print(f'Found CID {pubchem_id} after {reason}. Tanimoto coef. (morgan fingerprint): {tc:.3f}')

37 return tc

40def generate_ligand_references(

41 structure: 'Structure',

42 cache: 'Cache',

43 input_ligands: Optional[list[dict]],

44 pdb_ids: list[str],

45 inchikeys: dict[str, 'InChIKeyData'],

46 lipid_references: dict[str, dict],

47 membrane_map: dict,

48 mercy: list[str] = [],

49 ) -> dict[str, dict]:

50 """Generate a map of residues associated to ligands.

52 This function identifies and maps ligands in the molecular structure through a

53 multi-step matching process:

55 1. **Direct InChIKey matching**: Extracts InChIKeys from structure fragments

56 (excluding lipids and membrane components) and attempts direct matching with

57 PubChem database.

59 2. **Chemical similarity matching**: If direct matching fails, progressively

60 modifies the molecular structure and calculates Tanimoto coefficient (TC)

61 for similarity assessment:

62 1. Neutralize charges

63 2. Remove stereochemistry information

64 3. Apply PubChem standardization (tautomer, protonation, etc.)

65 4. Match against PDB-derived ligands (TC threshold ≥ 0.9)

66 5. Perform similarity search in PubChem/ChEMBL (TC threshold ≥ 0.9)

68 3. **Fallback handling**: Unmatched ligands are saved as-is with warnings.

70 4. **User-forced selections**: Respects user-specified ligand selections from

71 inputs.yaml, with warnings if TC compared to original fragment is insufficient.

72 """

73 ligand_references = {}

74 # Check input ligands format validity

75 input_ligands = input_ligands or []

76 for i, ligand in enumerate(input_ligands):

77 if type(ligand) is int or (type(ligand) is str and ligand.isnumeric()):

78 print(f'A ligand number ID has been identified {ligand}, assuming that is a PubChem ID...')

79 input_ligands[i] = {'pubchem': str(ligand)}

80 elif type(ligand) is str:

81 raise InputError(f'A name of ligand has been identified: {ligand}. Anyway, provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.')

82 # Get non-lipid inchikeys

83 ligand_keys = {key: {} for key in inchikeys if key not in lipid_references.keys()}

84 residx_2_inchikey = {resid: inchikey

85 for inchikey, data in inchikeys.items()

86 for resid in data.resindices}

87 # Add non-membrane lipid inchikeys

88 for resid in membrane_map['no_mem_lipid']:

89 # Some residues may not have InChIKeys (e.g. CG)

90 inchikey = residx_2_inchikey.get(resid, None)

91 if inchikey and inchikey not in ligand_keys:

92 ligand_keys[inchikey][resid] = {}

93 automatic_references = {key: {} for key in ligand_keys}

94 pubchem_ids_from_pdb = []

95 pdb_ids = pdb_ids or []

96 pubchem_ids_from_pdb = pdbs_2_pubchems(pdb_ids, cache)

97 for inchikey in ligand_keys:

98 # 1. Direct InChIKey match

99 pubchem_id = inchikey_2_pubchem(inchikey)

100 inchi = inchikeys[inchikey].inchi

101 mol = Chem.MolFromInchi(inchi)

102 # Original descriptors to calculate similarity later

103 descriptor_data = obtain_mordred_morgan_descriptors(mol)

104 automatic_references[inchikey].update(descriptor_data)

105 if pubchem_id:

106 automatic_references[inchikey]['pubchem'] = pubchem_id

107 print(f'Found CID {pubchem_id} by direct InChIKey match.')

108 continue

109 # 2.1. Neutralize charges

110 neutral_mol = rdMolStandardize.ChargeParent(mol)

111 neutral_inchikey = Chem.MolToInchiKey(neutral_mol)

112 if pubchem_id := inchikey_2_pubchem(neutral_inchikey):

113 record_pubchem_match(inchikey, pubchem_id, neutral_mol, descriptor_data,

114 automatic_references, 'neutralization')

115 continue

116 # 2.2. Remove stereochemistry

117 snon_inchikey = Chem.MolToInchiKey(mol, options='-SNon')

118 if pubchem_id := inchikey_2_pubchem(snon_inchikey):

119 Chem.RemoveStereochemistry(mol)

120 record_pubchem_match(inchikey, pubchem_id, mol, descriptor_data,

121 automatic_references, 'stereochemistry removal')

122 continue

123 # Neutralize charges and remove stereochemistry

124 snon_inchikey = Chem.MolToInchiKey(neutral_mol, options='-SNon')

125 if pubchem_id := inchikey_2_pubchem(snon_inchikey):

126 Chem.RemoveStereochemistry(neutral_mol)

127 record_pubchem_match(inchikey, pubchem_id, mol, descriptor_data,

128 automatic_references, 'neutralization + stereochemistry removal')

129 continue

130 # 2.3. Standardize molecule

131 standar_id = pubchem_standardization(inchi)

132 if standar_id and len(standar_id) == 1:

133 standar_pubchem = standar_id[0]['pubchem']

134 standar_mol = Chem.MolFromInchi(standar_id[0]['inchi'])

135 record_pubchem_match(inchikey, standar_pubchem, standar_mol, descriptor_data,

136 automatic_references, 'standardization')

137 continue

138 # 2.4. Match against PDB-derived ligands

139 if pubchem_ids_from_pdb:

140 for pdb_ligand in pubchem_ids_from_pdb:

141 if not pdb_ligand.get('inchi', None):

142 ligand_data = obtain_pubchem_data_from_input(pdb_ligand)

143 pdb_ligand.update(ligand_data)

144

145 pdb_mol = Chem.MolFromInchi(pdb_ligand['inchi'])

146 tc = record_pubchem_match(inchikey, pdb_ligand['pubchem'], pdb_mol, descriptor_data,

147 automatic_references, 'matching against PDB-derived ligands')

148 print(f'Tanimoto coefficient against PDB ligand {pdb_ligand["pubchem"]}: {tc:.3f}')

149 if tc >= MINIMUM_TANIMOTO_THRESHOLD:

150 break

151 # RUBEN: disabled as it does not work very well and it is slow

152 if False:

153 # 2.5. Similarity search in PubChem using neutral molecule

154 neutral_inchi = Chem.MolToInchi(neutral_mol)

155 for threshold in [95, 90]:

156 similarity_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastsimilarity_2d/inchi/JSON?Threshold={threshold}&MaxRecords=5"

157 payload = {'inchi': neutral_inchi}

158 r = requests.post(similarity_url, data=payload)

159 if r.status_code != 200:

160 continue

161 data = r.json()

162 tc = 0

163 for compound in data['PC_Compounds']:

164 if compound['id']['id']['cid']:

165 cid = str(compound['id']['id']['cid'])

166 for prop in compound['props']:

167 if prop['urn']['label'] == 'InChI':

168 standard_inchi = prop['value']['sval']

169 if not compound or not standard_inchi:

170 assert False, "PubChem similarity search did not return valid compound data"

171 simil_mol = Chem.MolFromInchi(standard_inchi)

172 tc = record_pubchem_match(inchikey, cid, simil_mol, descriptor_data,

173 ligand_references, 'similarity search in PubChem')

174 if tc >= MINIMUM_TANIMOTO_THRESHOLD: break

175 if tc >= MINIMUM_TANIMOTO_THRESHOLD: break

176

177 for inchikey, ligand_data in automatic_references.items():

178 if ligand_data.get('pubchem', None):

179 automatic_references[inchikey].update(obtain_pubchem_data_from_input(ligand_data))

180

181 # 4. User-forced selections

182 input_ligands_data = []

183 for input_ligand in input_ligands:

184 ligand_data = obtain_pubchem_data_from_input(input_ligand)

185 # If the user defined a ligand name, it will be respected and added to the metadata

186 if forced_name := input_ligand.get('name', None):

187 ligand_data['forced_name'] = forced_name

188 forced_selection = input_ligand.get('selection', None)

189 if forced_selection:

190 # Could be a single residue or a list of residues

191 selection_atoms = structure.select(forced_selection)

192 residue_indices = structure.get_selection_residue_indices(selection_atoms)

193 print(f'User forced selection of {len(residue_indices)} residue/s for ligand {input_ligand}.')

194 if len(residue_indices) == 0 and LIGANDS_MATCH_FLAG not in mercy:

195 raise InputError(f'Ligand with PubChem ID {pubchem_id} did not map with any residue')

196 ligand_data['resindices'] = residue_indices

197 # Search for this pubchem ID in automatic_references

198 matched_inchikey = None

199 for auto_inchikey, auto_data in automatic_references.items():

200 if auto_data.get('pubchem') == ligand_data['pubchem']:

201 matched_inchikey = auto_inchikey

202 break

203 # If found, move from automatic_references to ligand_references

204 if matched_inchikey:

205 ligand_references[matched_inchikey] = automatic_references.pop(matched_inchikey)

206 if forced_selection:

207 ligand_references[matched_inchikey]['resindices'] = residue_indices

208 elif forced_selection:

209 # If not found, create a new entry in ligand_references

210 ligand_references[ligand_data['inchikey']] = ligand_data

211 else:

212 # Add to input ligands for similarity search

213 input_fg = obtain_mordred_morgan_descriptors(Chem.MolFromInchi(ligand_data['inchi']))

214 input_ligands_data.append({**input_ligand, **ligand_data, **input_fg})

215 # Create similarity matrix (N x M)

216 auto_inchikeys = list(automatic_references.keys())

217 similarity_matrix = np.zeros((len(input_ligands_data), len(auto_inchikeys)))

218 for i, input_fg in enumerate(input_ligands_data):

219 for j, auto_inchikey in enumerate(auto_inchikeys):

220 tc = tanimoto_similarity(

221 input_fg['morgan_fp_bit_array'],

222 automatic_references[auto_inchikey]['morgan_fp_bit_array'])

223 similarity_matrix[i, j] = tc

224 # Use Hungarian algorithm to find optimal assignment (maximize similarity)

225 # Convert to minimization problem by negating similarities

226 row_ind, col_ind = linear_sum_assignment(-similarity_matrix)

227 # Process matched pairs

228 for input_idx, ref_idx in zip(row_ind, col_ind):

229 input_data = input_ligands_data[input_idx]

230 matched_inchikey = auto_inchikeys[ref_idx]

231 tc = similarity_matrix[input_idx, ref_idx]

232

233 if tc < 0.6:

234 warn(f'Ligand "{input_data.get("name", input_data["inchikey"])}" input selection has Tanimoto coefficient {tc:.3f} '

235 f'which is below the threshold of 0.6 when compared to the matched fragment (InChIKey: {matched_inchikey}).')

236

237 # Update ligand reference with input data

238 ligand_references[matched_inchikey] = input_data

239 automatic_references.pop(matched_inchikey)

240 print(f'Matched input ligand {input_data["pubchem"]} to reference {matched_inchikey} with TC={tc:.3f}')

241

242 # Merge remaining automatic references into ligand_references

243 ligand_references.update(automatic_references)

244 not_matched_ligands = [(inchikeys[k].molname, inchikeys[k].resindices)

245 for k, v in ligand_references.items()

246 if 'pubchem' not in v]

247 if not_matched_ligands:

248 for k, v in not_matched_ligands:

249 warn(f'Ligand {inchikeys[k].molname} could not be matched to any PubChem ID. Residues: {v}')

250 if LIGANDS_MATCH_FLAG not in mercy:

251 raise InputError('Provide PubChem IDs for all ligands, a PDB code where it is '

252 f'present or use the flag -m {LIGANDS_MATCH_FLAG} to bypass this check.')

253

254 if not ligand_references:

255 print('No ligands were matched')

256

257 return ligand_references

258

259

260def pdb_ligand_2_prd(pdb_id: str) -> Optional[str]:

261 """Given a PDB ID, get its PRD ligand code."""

262 query = '''query structure($id: String!) {

263 entry(entry_id: $id) {

264 pdbx_molecule_features {

265 prd_id

266 }

267 }

268 }'''

269 parsed_response = request_pdb_data(pdb_id, query)

270 pdbx_id = parsed_response.get('pdbx_molecule_features', None)

271 if pdbx_id is None:

272 print(f'WARNING: Cannot find PRD ligand code for PDB ID {pdb_id}')

273 return None

274 prd_id = pdbx_id[0].get('prd_id', None) # AGUS: podría haber casos donde haya más de uno?

275 if prd_id is None:

276 print(f'WARNING: Cannot find PRD ligand code for PDB ID {pdb_id}')

277 return None

278 # If the PRD ID is not empty then return it

279 return prd_id

280

281

282def get_pdb_ligand_codes(pdb_id: str) -> list[str]:

283 """Given a PDB ID, get all its ligand codes.

284 e.g. 2I3I -> 618, BTB, ZN, EDO, LI.

285 """

286 # Set the request query

287 query = '''query structure($id: String!) {

288 entry(entry_id: $id) {

289 nonpolymer_entities { nonpolymer_comp { chem_comp { id } } }

290 }

291 }'''

292 # Request PDB data

293 parsed_response = request_pdb_data(pdb_id, query)

294 # Mine data for nonpolymer entities

295 nonpolymers = parsed_response['nonpolymer_entities']

296 # If there are no nonpolymer entities, another type of entitie could be used

297 # AGUS: esto es un caso muy concreto que me encontré con el PDB 1N3W

298 # AGUS: el ligando en este caso es una 'Biologically Interesting Molecules' y se muestran como 'PRD_'

299 prd_code = None

300 if nonpolymers is None:

301 # Get the prd ligand code

302 prd_code = pdb_ligand_2_prd(pdb_id)

303 if prd_code is not None:

304 return [prd_code]

305

306 if nonpolymers is None and prd_code is None: return []

307 # Iterate nonpolymer entities to mine each PDB code

308 ligand_codes = []

309 for nonpolymer in nonpolymers:

310 ligand_code = nonpolymer['nonpolymer_comp']['chem_comp']['id']

311 ligand_codes.append(ligand_code)

312 print(f' Ligand codes for PDB ID {pdb_id}: ' + ', '.join(ligand_codes))

313 return ligand_codes

314

315

316def pdb_ligand_2_pubchem(pdb_ligand_id: str) -> Optional[str]:

317 """Given a PDB ligand code, get its PubChem ID."""

318 # Set the request query

319 query = '''query molecule($id:String!){

320 chem_comp(comp_id:$id) { rcsb_chem_comp_related{ resource_name resource_accession_code } }

321 }'''

322 # Request PDB data

323 parsed_response = request_pdb_data(pdb_ligand_id, query)

324 related_resources = parsed_response['rcsb_chem_comp_related']

325 # It may happend that a ligand code has no related resources at all

326 # e.g. ZN

327 if not related_resources: return None

328 pubchem_resource = next((resource for resource in related_resources if resource['resource_name'] == 'PubChem'), None)

329 if not pubchem_resource: return None

330 return pubchem_resource['resource_accession_code']

331

332

333def pdb_ligand_2_pubchem_RAW(pdb_ligand_id: str) -> Optional[str]:

334 """Given a PDB ligand code, get its PubChem ID.

335 Use a web crawler to avoid having to use the PDB API.

336 """

337 # Set the request URL

338 request_url = f'https://www.rcsb.org/ligand/{pdb_ligand_id}'

339 # Run the query

340 parsed_response = None

341 try:

342 with urlopen(request_url) as response:

343 parsed_response = response.read().decode("utf-8")

344 # If the accession is not found in the PDB then we can stop here

345 except HTTPError as error:

346 if error.code == 404:

347 print(f' PDB ligand {pdb_ligand_id} not found')

348 return None

349 else:

350 print(error.msg)

351 raise ValueError('Something went wrong with the PDB ligand request: ' + request_url)

352 # Mine the PubChem ID out of the whole response

353 pattern = re.compile('pubchem.ncbi.nlm.nih.gov\/compound\/([0-9]*)\"')

354 match = re.search(pattern, parsed_response)

355 # If there is no PubChem ID then return none

356 # This is normal for some ligands such as counter ions (e.g. LI)

357 if not match:

358 return None

359 pubchem_id = match[1]

360 return pubchem_id

361

362

363# DANI: No se ha provado a fondo

364def pdb_ligand_2_pubchem_RAW_RAW(pdb_ligand_id: str) -> Optional[str]:

365 """Given a PDB ligand code, get its PubChem ID.

366 Ask to PubChem if the ligand exists and hope there is only one result.

367 """

368 # Set the request URL

369 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{pdb_ligand_id}/json'

370 # Run the query

371 parsed_response = None

372 try:

373 with urlopen(request_url) as response:

374 parsed_response = json.loads(response.read().decode("utf-8"))

375 # If the accession is not found in the PDB then we can stop here

376 except HTTPError as error:

377 # This may happen for weird things such as UNX (unknown atom or ion)

378 if error.code == 404:

379 return None

380 print(error.msg)

381 raise RuntimeError('Something went wrong with the PDB ligand request in PubChem: ' + request_url)

382 # Mine the PubChem ID

383 compounds = parsed_response['PC_Compounds']

384 if len(compounds) != 1:

385 # There could be more than one result

386 # For example, the case HEM: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/HEM/json

387 # In this case we picked the first result

388 compound = compounds[0]

389 # raise RuntimeError('We are not having 1 and only 1 result from PubChem: ' + request_url)

390 compound = compounds[0]

391 id1 = compound['id']

392 id2 = id1['id']

393 pubchem_id = str(id2['cid'])

394 return pubchem_id

395

396

397def pdb_2_pubchem(pdb_id: str) -> list[str]:

398 """Given a PDB ID, get its PubChem IDs."""

399 print(f'Searching PubChem IDs for PDB ID {pdb_id}')

400 pubchem_ids = []

401 # Iterate over pdb ligand codes

402 ligand_codes = get_pdb_ligand_codes(pdb_id)

403 for ligand_code in ligand_codes:

404 # Ask the PDB API for the ligand

405 pubchem_id = pdb_ligand_2_pubchem(ligand_code)

406 # If this did not work then try mining the PDB client with a web crawler

407 if not pubchem_id:

408 pubchem_id = pdb_ligand_2_pubchem_RAW(ligand_code)

409 # If this did not work then try it from PubChem

410 if not pubchem_id:

411 pubchem_id = pdb_ligand_2_pubchem_RAW_RAW(ligand_code)

412 # Otherwise we surrender

413 if not pubchem_id:

414 print(f' {ligand_code} -> No PubChem ID')

415 continue

416 print(f' {ligand_code} -> {pubchem_id}')

417 pubchem_ids.append(pubchem_id)

418

419 return pubchem_ids

420

421

422def pdbs_2_pubchems(pdb_ids: list[str], cache: 'Cache') -> list[dict]:

423 """Given a list of PDB IDs, get their PubChem IDs and add them to the ligands list."""

424 pdb_ligands = []

425 # Check we have cached pdb 2 PubChem values

426 pdb_2_pubchem_cache = cache.retrieve(PDB_TO_PUBCHEM, {})

427 new_data_to_cache = False

428 # Get input ligands from the pdb IDs, if any

429 for pdb_id in pdb_ids:

430 # Check we have cached this specific pdb

431 pubchem_ids_from_pdb = pdb_2_pubchem_cache.get(pdb_id, None)

432 if pubchem_ids_from_pdb is not None:

433 print(f' Retriving from cache PubChem IDs for PDB ID {pdb_id}: ')

434 if len(pubchem_ids_from_pdb) > 0:

435 print(' PubChem IDs: ' + ', '.join(pubchem_ids_from_pdb))

436 else:

437 print(' This PDB ID has no PubChem IDs')

438

439 # If we had no cached pdb 2 PubChem then ask for them

440 if pubchem_ids_from_pdb is None:

441 pubchem_ids_from_pdb = pdb_2_pubchem(pdb_id)

442 # Save the result in the cache object so it is saved to cache later

443 pdb_2_pubchem_cache[pdb_id] = pubchem_ids_from_pdb

444 new_data_to_cache = True

445 for pubchem_id in pubchem_ids_from_pdb:

446 # Ligands in the structure (PDB) and the 'inputs.json' could be the same so it's not necessary to do it twice

447 if not any('pubchem' in ligand and ligand['pubchem'] == pubchem_id for ligand in pdb_ligands):

448 pdb_ligands.append({'pubchem': pubchem_id, 'pdb': True})

449 # Save all pdb to PubChem results in cache, in case there is anything new

450 if new_data_to_cache: cache.update(PDB_TO_PUBCHEM, pdb_2_pubchem_cache)

451

452 return pdb_ligands

453

454

455def handle_http_request(request_url, error_context="request"):

456 """Make an HTTP request handler with consistent error handling."""

457 try:

458 with urlopen(request_url) as response:

459 return response.read().decode("utf-8", errors='ignore')

460 except HTTPError as error:

461 if error.code == 404:

462 return None

463 raise ValueError(f'Something went wrong with the {error_context} (error {error.code})')

464 except URLError as error:

465 raise ValueError(f'Something went wrong with the {error_context}: {error.reason}')

466

467

468def get_pubchem_data(pubchem_id: str) -> Optional[dict]:

469 """Given a PubChem ID, use the uniprot API to request its data and then mine what is needed for the database."""

470 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{pubchem_id}/JSON/'

471 parsed_response = handle_http_request(request_url, "PubChem request")

472 if parsed_response is None:

473 print('WARNING: Cannot find PubChem entry for accession ' + pubchem_id)

474 return None

475 parsed_response = json.loads(parsed_response)

476 # Mine target data: SMILES

477 record = parsed_response.get('Record', None)

478 if record is None:

479 raise RuntimeError('Wrong PubChem data structure: no record: ' + request_url)

480 sections = record.get('Section', None)

481 if sections is None:

482 raise RuntimeError('Wrong PubChem data structure: no sections: ' + request_url)

483 names_and_ids_section = next((section for section in sections if section.get('TOCHeading', None) == 'Names and Identifiers'), None)

484 if names_and_ids_section is None:

485 raise RuntimeError('Wrong PubChem data structure: no name and IDs section: ' + request_url)

486 names_and_ids_subsections = names_and_ids_section.get('Section', None)

487 if names_and_ids_subsections is None:

488 raise RuntimeError('Wrong PubChem data structure: no name and IDs subsections: ' + request_url)

489

490 # Mine the name

491 synonims = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Synonyms'), None)

492 if synonims is None:

493 descriptors = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None)

494 descriptors_subsections = descriptors.get('Section', None)

495 if descriptors_subsections is None:

496 raise RuntimeError('Wrong PubChem data structure: no name and IDs subsections: ' + request_url)

497 depositor_supplied_descriptors = next((s for s in descriptors_subsections if s.get('TOCHeading', None) == 'IUPAC Name'), None)

498 name_substance = depositor_supplied_descriptors.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

499 else:

500 synonims_subsections = synonims.get('Section', None)

501 if synonims_subsections is None:

502 raise RuntimeError('Wrong PubChem data structure: no name and IDs subsections: ' + request_url)

503 depositor_supplied_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Depositor-Supplied Synonyms'), None)

504 if depositor_supplied_synonims is None:

505 removed_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Removed Synonyms'), None)

506 name_substance = removed_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

507 else:

508 name_substance = depositor_supplied_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

509

510 # Mine the SMILES

511 computed_descriptors_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None)

512 if computed_descriptors_subsection is None:

513 raise RuntimeError('Wrong PubChem data structure: no computeed descriptors: ' + request_url)

514 canonical_smiles_section = computed_descriptors_subsection.get('Section', None)

515 if canonical_smiles_section is None:

516 raise RuntimeError('Wrong PubChem data structure: no canonical SMILES section: ' + request_url)

517 canonical_smiles = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'Canonical SMILES'), None)

518 if canonical_smiles is None:

519 # In some cases there is no canonical SMILES but a non-canonical one could exists

520 non_canonical_smiles_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'SMILES'), None)

521 if non_canonical_smiles_section is None:

522 raise RuntimeError('Wrong PubChem data structure: no canonical SMILES: ' + request_url)

523

524 if canonical_smiles:

525 smiles = canonical_smiles.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

526 if non_canonical_smiles_section:

527 smiles = non_canonical_smiles_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

528

529 if smiles is None:

530 raise RuntimeError('Wrong PubChem data structure: no SMILES: ' + request_url)

531

532 # Mine target data: MOLECULAR FORMULA

533 molecular_formula_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Molecular Formula'), None)

534 if molecular_formula_subsection is None:

535 raise RuntimeError('Wrong PubChem data structure: no molecular formula section: ' + request_url)

536 molecular_formula = molecular_formula_subsection.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

537 if molecular_formula is None:

538 raise RuntimeError('Wrong PubChem data structure: no molecular formula: ' + request_url)

539

540 # Mine target data: PDB ID

541 pdb_id = None

542 pdb_id_subsection = next((s for s in sections if s.get('TOCHeading', None) == 'Interactions and Pathways'), None)

543 # If this section is missing then it means this PubChem compound has no PDB ID

544 if pdb_id_subsection:

545 pdb_id_subsections = pdb_id_subsection.get('Section', None)

546 if pdb_id_subsections is None:

547 raise RuntimeError('Wrong PubChem data structure: no name and IDs subsections: ' + request_url)

548 bond_structures = next((s for s in pdb_id_subsections if s.get('TOCHeading', None) == 'Protein Bound 3D Structures'), None)

549 if bond_structures:

550 bond_structures_section = bond_structures.get('Section', None)

551 # If this section is missing then it means this PubChem compound has no PDB ID

552 if bond_structures_section:

553 ligands_structure = next((s for s in bond_structures_section if s.get('TOCHeading', None) == 'Ligands from Protein Bound 3D Structures'), None)

554 if ligands_structure is None:

555 raise RuntimeError('Wrong PubChem data structure: no Ligands from Protein Bound 3D Structures section: ' + request_url)

556 ligands_structure_subsections = ligands_structure.get('Section', None)

557 if ligands_structure_subsections is None:

558 raise RuntimeError('Wrong PubChem data structure: no Ligands from Protein Bound 3D Structures subsections: ' + request_url)

559 ligands_pdb = next((s for s in ligands_structure_subsections if s.get('TOCHeading', None) == 'PDBe Ligand Code'), None)

560 if ligands_pdb is None:

561 raise RuntimeError('Wrong PubChem data structure: no PDBe Ligand Code: ' + request_url)

562 pdb_id = ligands_pdb.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

563

564 # Mine de INCHI and INCHIKEY

565 inchi_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChI'), None)

566 if inchi_section is None:

567 raise RuntimeError('Wrong PubChem data structure: no InChI: ' + request_url)

568 if inchi_section:

569 inchi = inchi_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

570

571 inchikey_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChIKey'), None)

572 if inchikey_section is None:

573 raise RuntimeError('Wrong PubChem data structure: no InChIKey: ' + request_url)

574 if inchikey_section:

575 inchikey = inchikey_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)

576 # Prepare the PubChem data to be returned

577 return {'name': name_substance, 'smiles': smiles, 'formula': molecular_formula, 'pdbid': pdb_id, 'inchi': inchi, 'inchikey': inchikey}

578

579

580def drugbank_2_pubchem(drugbank_id) -> Optional[str]:

581 """Given a DrugBank ID, request its PubChem compound ID."""

582 url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{drugbank_id}/JSON'

583 r = requests.get(url)

584 if r.ok:

585 data = r.json()

586 return str(data['PC_Compounds'][0]['id']['id']['cid'])

587 else:

588 raise ValueError("Something went wrong with the DrugBank to PubChem request (error " + str(r.status_code) + ")")

589

590

591def chembl_2_pubchem(chembl_id) -> Optional[str]:

592 """Given a ChemBL ID, use the uniprot API to request its PubChem compound ID."""

593 url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/xref/RegistryID/{chembl_id}/cids/JSON'

594 r = requests.get(url)

595 if r.ok:

596 data = r.json()

597 return str(data['IdentifierList']['CID'][0])

598 else:

599 raise ValueError("Something went wrong with the ChEMBL to PubChem request (error " + str(r.status_code) + ")")

600

601

602def obtain_mordred_morgan_descriptors(mol: Chem.Mol) -> dict:

603 """Calculate the Morgan fingerprint and the Mordred descriptors from a molecule."""

604 Chem.AllChem.Compute2DCoords(mol)

605 mol_block = Chem.MolToMolBlock(mol)

606 # We can select the different submodules of mordred descriptors, avaible in: 'https://mordred-descriptor.github.io/documentation/master/'

607 calc = Calculator([

608 descriptors.ABCIndex, # Índice de ramificación

609 descriptors.AcidBase.AcidicGroupCount, # Grupos ácidos

610 descriptors.AcidBase.BasicGroupCount, # Grupos básicos

611 descriptors.RingCount, # Conteo de anillos

612 descriptors.Constitutional, # Propiedades generales como número de átomos, peso molecular

613 descriptors.TopologicalCharge, # Índices topológicos, Cargas parciales, polaridad

614 descriptors.HydrogenBond, # Donantes y aceptores de enlaces de hidrógeno

615 descriptors.Lipinski, # Reglas de Lipinski (drug-likeness)

616 descriptors.FragmentComplexity, # Identificación de subestructuras frecuentes

617 descriptors.PathCount, # Conteo de caminos moleculares

618 ], ignore_3D=True)

619

620 # Calculate Mordred results

621 mordred_results = calc(mol).drop_missing().asdict()

622

623 # MORGAN FINGERPRINT

624 morgan_fp_gen = Chem.rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

625 ao = Chem.rdFingerprintGenerator.AdditionalOutput()

626 ao.AllocateAtomCounts()

627 ao.AllocateAtomToBits()

628 ao.AllocateBitInfoMap()

629 fp = morgan_fp_gen.GetFingerprint(mol, additionalOutput=ao)

630 morgan_fp_bit_array = list(fp)

631 morgan_highlight_atoms = {}

632 for bit, atoms in ao.GetBitInfoMap().items():

633 morgan_highlight_atoms[bit] = list(set(atom for atom, radius in atoms))

634 data = {'mordred': mordred_results,

635 'morgan_fp_bit_array': morgan_fp_bit_array,

636 'morgan_highlight_atoms': morgan_highlight_atoms,

637 'mol_block': mol_block}

638 return data

639

640

641def tanimoto_similarity(fp1: list[int], fp2: list[int]) -> float:

642 """Calculate the Tanimoto similarity between two fingerprints."""

643 if len(fp1) != len(fp2):

644 raise ValueError('Fingerprints must be of the same length')

645 a = sum(1 for i in range(len(fp1)) if fp1[i] == 1 and fp2[i] == 1)

646 b = sum(1 for i in range(len(fp1)) if fp1[i] == 1 and fp2[i] == 0)

647 c = sum(1 for i in range(len(fp1)) if fp1[i] == 0 and fp2[i] == 1)

648 if (a + b + c) == 0:

649 return 0.0

650 return a / (a + b + c)

651

652

653def obtain_pubchem_data_from_input(ligand: dict) -> dict[str, Optional[str]]:

654 """Given an input ligand, obtain all necessary data."""

655 # Save in a dictionary all ligand data including its name and IDs

656 # The ID can be of the databases: 'drugbank' , 'pubchem' , 'chembl'

657 # Define the needed variables to check if the ligand has a database ID or it is None

658 ligand_data = {field: None for field in LIGAND_DATA_FIELDS}

659 # Set ligand data PubChem ID, even if the input ID is not from pubhcme (e.g. drugbank, chembl)

660 if 'pubchem' in ligand:

661 ligand_data['pubchem'] = str(ligand.get('pubchem'))

662 elif 'drugbank' in ligand:

663 ligand_data['drugbank'] = ligand.get('drugbank')

664 ligand_data['pubchem'] = str(drugbank_2_pubchem(ligand_data['drugbank']))

665 elif 'chembl' in ligand:

666 ligand_data['chembl'] = ligand.get('chembl')

667 ligand_data['pubchem'] = str(chembl_2_pubchem(ligand_data['chembl']))

668 else:

669 raise InputError('None of the ligand IDs are defined. Please provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.')

670

671 # Request ligand data from pubchem

672 pubchem_data = get_pubchem_data(ligand_data['pubchem'])

673 if not pubchem_data:

674 raise RuntimeError('No PubChem data avilable')

675

676 # Add PubChem data to ligand data

677 ligand_data = {**ligand_data, **pubchem_data}

678 return ligand_data

679

680

681@lru_cache(maxsize=None)

682def inchikey_2_pubchem(inchikey: str, conectivity_only=False) -> Optional[str]:

683 """Given an InChIKey, get the PubChem CID."""

684 inchikey = inchikey.split('-')[0] if conectivity_only else inchikey

685 url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/cids/JSON"

686 r = requests.get(url)

687 if r.ok:

688 data = r.json()

689 if conectivity_only:

690 return [str(cid) for cid in data['IdentifierList']['CID']]

691 else:

692 return str(data['IdentifierList']['CID'][0])

693 return None

694

695

696@lru_cache(maxsize=None)

697def pubchem_standardization(inchi: str) -> Optional[list[dict]]:

698 """Try PubChem standardization service to write things like the bonds from aromatic rings in the same order.

699

700 You can see examples on the paper https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0293-8

701 """

702 url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/standardize/inchi/JSON"

703 payload = {'inchi': inchi}

704 r = requests.post(url, data=payload)

705 if r.ok:

706 data = r.json()

707 results = []

708 for compound in data['PC_Compounds']:

709 if compound['id']['id']['cid']:

710 cid = str(compound['id']['id']['cid'])

711 else:

712 continue

713 standard_inchi = None

714 for prop in compound['props']:

715 if prop['urn']['label'] == 'InChI':

716 standard_inchi = prop['value']['sval']

717 results.append({'pubchem': cid, 'inchi': standard_inchi})

718 return results

719 return None

Coverage for mddb_workflow / tools / get_ligands.py: 68%

434 statements