Coverage for mddb_workflow / tools / get_ligands.py: 68%
434 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 18:45 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 18:45 +0000
1import re
2import json
3from rdkit import Chem
4from rdkit.Chem.MolStandardize import rdMolStandardize
5from mordred import Calculator, descriptors
6from mddb_workflow.utils.constants import LIGANDS_MATCH_FLAG, PDB_TO_PUBCHEM
7from mddb_workflow.utils.auxiliar import InputError, request_pdb_data, warn
8from mddb_workflow.utils.type_hints import *
9from urllib.request import urlopen
10from urllib.error import HTTPError, URLError
11from functools import lru_cache
12import requests
13from scipy.optimize import linear_sum_assignment
14import numpy as np
16# Set the expected ligand data fields
17LIGAND_DATA_FIELDS = set(['name', 'pubchem', 'drugbank', 'chembl', 'smiles', 'formula', 'morgan', 'mordred', 'pdbid', 'inchikey'])
18MINIMUM_TANIMOTO_THRESHOLD = 0.3
21def record_pubchem_match(
22 inchikey: str,
23 pubchem_id: str,
24 match_mol: Chem.Mol,
25 reference_fg: dict,
26 ligand_references: dict,
27 reason: str,
28 threshold: float = MINIMUM_TANIMOTO_THRESHOLD,
29) -> None:
30 """Set the pubchem id for the inchikey and compute/print Tanimoto vs descriptor_data if match_mol provided."""
31 matched_fg = obtain_mordred_morgan_descriptors(match_mol)
32 tc = tanimoto_similarity(reference_fg['morgan_fp_bit_array'], matched_fg['morgan_fp_bit_array'])
33 if tc >= threshold:
34 ligand_references[inchikey]['pubchem'] = str(pubchem_id)
35 ligand_references[inchikey]['tc'] = tc
36 print(f'Found CID {pubchem_id} after {reason}. Tanimoto coef. (morgan fingerprint): {tc:.3f}')
37 return tc
40def generate_ligand_references(
41 structure: 'Structure',
42 cache: 'Cache',
43 input_ligands: Optional[list[dict]],
44 pdb_ids: list[str],
45 inchikeys: dict[str, 'InChIKeyData'],
46 lipid_references: dict[str, dict],
47 membrane_map: dict,
48 mercy: list[str] = [],
49 ) -> dict[str, dict]:
50 """Generate a map of residues associated to ligands.
52 This function identifies and maps ligands in the molecular structure through a
53 multi-step matching process:
55 1. **Direct InChIKey matching**: Extracts InChIKeys from structure fragments
56 (excluding lipids and membrane components) and attempts direct matching with
57 PubChem database.
59 2. **Chemical similarity matching**: If direct matching fails, progressively
60 modifies the molecular structure and calculates Tanimoto coefficient (TC)
61 for similarity assessment:
62 1. Neutralize charges
63 2. Remove stereochemistry information
64 3. Apply PubChem standardization (tautomer, protonation, etc.)
65 4. Match against PDB-derived ligands (TC threshold ≥ 0.9)
66 5. Perform similarity search in PubChem/ChEMBL (TC threshold ≥ 0.9)
68 3. **Fallback handling**: Unmatched ligands are saved as-is with warnings.
70 4. **User-forced selections**: Respects user-specified ligand selections from
71 inputs.yaml, with warnings if TC compared to original fragment is insufficient.
72 """
73 ligand_references = {}
74 # Check input ligands format validity
75 input_ligands = input_ligands or []
76 for i, ligand in enumerate(input_ligands):
77 if type(ligand) is int or (type(ligand) is str and ligand.isnumeric()):
78 print(f'A ligand number ID has been identified {ligand}, assuming that is a PubChem ID...')
79 input_ligands[i] = {'pubchem': str(ligand)}
80 elif type(ligand) is str:
81 raise InputError(f'A name of ligand has been identified: {ligand}. Anyway, provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.')
82 # Get non-lipid inchikeys
83 ligand_keys = {key: {} for key in inchikeys if key not in lipid_references.keys()}
84 residx_2_inchikey = {resid: inchikey
85 for inchikey, data in inchikeys.items()
86 for resid in data.resindices}
87 # Add non-membrane lipid inchikeys
88 for resid in membrane_map['no_mem_lipid']:
89 # Some residues may not have InChIKeys (e.g. CG)
90 inchikey = residx_2_inchikey.get(resid, None)
91 if inchikey and inchikey not in ligand_keys:
92 ligand_keys[inchikey][resid] = {}
93 automatic_references = {key: {} for key in ligand_keys}
94 pubchem_ids_from_pdb = []
95 pdb_ids = pdb_ids or []
96 pubchem_ids_from_pdb = pdbs_2_pubchems(pdb_ids, cache)
97 for inchikey in ligand_keys:
98 # 1. Direct InChIKey match
99 pubchem_id = inchikey_2_pubchem(inchikey)
100 inchi = inchikeys[inchikey].inchi
101 mol = Chem.MolFromInchi(inchi)
102 # Original descriptors to calculate similarity later
103 descriptor_data = obtain_mordred_morgan_descriptors(mol)
104 automatic_references[inchikey].update(descriptor_data)
105 if pubchem_id:
106 automatic_references[inchikey]['pubchem'] = pubchem_id
107 print(f'Found CID {pubchem_id} by direct InChIKey match.')
108 continue
109 # 2.1. Neutralize charges
110 neutral_mol = rdMolStandardize.ChargeParent(mol)
111 neutral_inchikey = Chem.MolToInchiKey(neutral_mol)
112 if pubchem_id := inchikey_2_pubchem(neutral_inchikey):
113 record_pubchem_match(inchikey, pubchem_id, neutral_mol, descriptor_data,
114 automatic_references, 'neutralization')
115 continue
116 # 2.2. Remove stereochemistry
117 snon_inchikey = Chem.MolToInchiKey(mol, options='-SNon')
118 if pubchem_id := inchikey_2_pubchem(snon_inchikey):
119 Chem.RemoveStereochemistry(mol)
120 record_pubchem_match(inchikey, pubchem_id, mol, descriptor_data,
121 automatic_references, 'stereochemistry removal')
122 continue
123 # Neutralize charges and remove stereochemistry
124 snon_inchikey = Chem.MolToInchiKey(neutral_mol, options='-SNon')
125 if pubchem_id := inchikey_2_pubchem(snon_inchikey):
126 Chem.RemoveStereochemistry(neutral_mol)
127 record_pubchem_match(inchikey, pubchem_id, mol, descriptor_data,
128 automatic_references, 'neutralization + stereochemistry removal')
129 continue
130 # 2.3. Standardize molecule
131 standar_id = pubchem_standardization(inchi)
132 if standar_id and len(standar_id) == 1:
133 standar_pubchem = standar_id[0]['pubchem']
134 standar_mol = Chem.MolFromInchi(standar_id[0]['inchi'])
135 record_pubchem_match(inchikey, standar_pubchem, standar_mol, descriptor_data,
136 automatic_references, 'standardization')
137 continue
138 # 2.4. Match against PDB-derived ligands
139 if pubchem_ids_from_pdb:
140 for pdb_ligand in pubchem_ids_from_pdb:
141 if not pdb_ligand.get('inchi', None):
142 ligand_data = obtain_pubchem_data_from_input(pdb_ligand)
143 pdb_ligand.update(ligand_data)
145 pdb_mol = Chem.MolFromInchi(pdb_ligand['inchi'])
146 tc = record_pubchem_match(inchikey, pdb_ligand['pubchem'], pdb_mol, descriptor_data,
147 automatic_references, 'matching against PDB-derived ligands')
148 print(f'Tanimoto coefficient against PDB ligand {pdb_ligand["pubchem"]}: {tc:.3f}')
149 if tc >= MINIMUM_TANIMOTO_THRESHOLD:
150 break
151 # RUBEN: disabled as it does not work very well and it is slow
152 if False:
153 # 2.5. Similarity search in PubChem using neutral molecule
154 neutral_inchi = Chem.MolToInchi(neutral_mol)
155 for threshold in [95, 90]:
156 similarity_url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/fastsimilarity_2d/inchi/JSON?Threshold={threshold}&MaxRecords=5"
157 payload = {'inchi': neutral_inchi}
158 r = requests.post(similarity_url, data=payload)
159 if r.status_code != 200:
160 continue
161 data = r.json()
162 tc = 0
163 for compound in data['PC_Compounds']:
164 if compound['id']['id']['cid']:
165 cid = str(compound['id']['id']['cid'])
166 for prop in compound['props']:
167 if prop['urn']['label'] == 'InChI':
168 standard_inchi = prop['value']['sval']
169 if not compound or not standard_inchi:
170 assert False, "PubChem similarity search did not return valid compound data"
171 simil_mol = Chem.MolFromInchi(standard_inchi)
172 tc = record_pubchem_match(inchikey, cid, simil_mol, descriptor_data,
173 ligand_references, 'similarity search in PubChem')
174 if tc >= MINIMUM_TANIMOTO_THRESHOLD: break
175 if tc >= MINIMUM_TANIMOTO_THRESHOLD: break
177 for inchikey, ligand_data in automatic_references.items():
178 if ligand_data.get('pubchem', None):
179 automatic_references[inchikey].update(obtain_pubchem_data_from_input(ligand_data))
181 # 4. User-forced selections
182 input_ligands_data = []
183 for input_ligand in input_ligands:
184 ligand_data = obtain_pubchem_data_from_input(input_ligand)
185 # If the user defined a ligand name, it will be respected and added to the metadata
186 if forced_name := input_ligand.get('name', None):
187 ligand_data['forced_name'] = forced_name
188 forced_selection = input_ligand.get('selection', None)
189 if forced_selection:
190 # Could be a single residue or a list of residues
191 selection_atoms = structure.select(forced_selection)
192 residue_indices = structure.get_selection_residue_indices(selection_atoms)
193 print(f'User forced selection of {len(residue_indices)} residue/s for ligand {input_ligand}.')
194 if len(residue_indices) == 0 and LIGANDS_MATCH_FLAG not in mercy:
195 raise InputError(f'Ligand with PubChem ID {pubchem_id} did not map with any residue')
196 ligand_data['resindices'] = residue_indices
197 # Search for this pubchem ID in automatic_references
198 matched_inchikey = None
199 for auto_inchikey, auto_data in automatic_references.items():
200 if auto_data.get('pubchem') == ligand_data['pubchem']:
201 matched_inchikey = auto_inchikey
202 break
203 # If found, move from automatic_references to ligand_references
204 if matched_inchikey:
205 ligand_references[matched_inchikey] = automatic_references.pop(matched_inchikey)
206 if forced_selection:
207 ligand_references[matched_inchikey]['resindices'] = residue_indices
208 elif forced_selection:
209 # If not found, create a new entry in ligand_references
210 ligand_references[ligand_data['inchikey']] = ligand_data
211 else:
212 # Add to input ligands for similarity search
213 input_fg = obtain_mordred_morgan_descriptors(Chem.MolFromInchi(ligand_data['inchi']))
214 input_ligands_data.append({**input_ligand, **ligand_data, **input_fg})
215 # Create similarity matrix (N x M)
216 auto_inchikeys = list(automatic_references.keys())
217 similarity_matrix = np.zeros((len(input_ligands_data), len(auto_inchikeys)))
218 for i, input_fg in enumerate(input_ligands_data):
219 for j, auto_inchikey in enumerate(auto_inchikeys):
220 tc = tanimoto_similarity(
221 input_fg['morgan_fp_bit_array'],
222 automatic_references[auto_inchikey]['morgan_fp_bit_array'])
223 similarity_matrix[i, j] = tc
224 # Use Hungarian algorithm to find optimal assignment (maximize similarity)
225 # Convert to minimization problem by negating similarities
226 row_ind, col_ind = linear_sum_assignment(-similarity_matrix)
227 # Process matched pairs
228 for input_idx, ref_idx in zip(row_ind, col_ind):
229 input_data = input_ligands_data[input_idx]
230 matched_inchikey = auto_inchikeys[ref_idx]
231 tc = similarity_matrix[input_idx, ref_idx]
233 if tc < 0.6:
234 warn(f'Ligand "{input_data.get("name", input_data["inchikey"])}" input selection has Tanimoto coefficient {tc:.3f} '
235 f'which is below the threshold of 0.6 when compared to the matched fragment (InChIKey: {matched_inchikey}).')
237 # Update ligand reference with input data
238 ligand_references[matched_inchikey] = input_data
239 automatic_references.pop(matched_inchikey)
240 print(f'Matched input ligand {input_data["pubchem"]} to reference {matched_inchikey} with TC={tc:.3f}')
242 # Merge remaining automatic references into ligand_references
243 ligand_references.update(automatic_references)
244 not_matched_ligands = [(inchikeys[k].molname, inchikeys[k].resindices)
245 for k, v in ligand_references.items()
246 if 'pubchem' not in v]
247 if not_matched_ligands:
248 for k, v in not_matched_ligands:
249 warn(f'Ligand {inchikeys[k].molname} could not be matched to any PubChem ID. Residues: {v}')
250 if LIGANDS_MATCH_FLAG not in mercy:
251 raise InputError('Provide PubChem IDs for all ligands, a PDB code where it is '
252 f'present or use the flag -m {LIGANDS_MATCH_FLAG} to bypass this check.')
254 if not ligand_references:
255 print('No ligands were matched')
257 return ligand_references
260def pdb_ligand_2_prd(pdb_id: str) -> Optional[str]:
261 """Given a PDB ID, get its PRD ligand code."""
262 query = '''query structure($id: String!) {
263 entry(entry_id: $id) {
264 pdbx_molecule_features {
265 prd_id
266 }
267 }
268 }'''
269 parsed_response = request_pdb_data(pdb_id, query)
270 pdbx_id = parsed_response.get('pdbx_molecule_features', None)
271 if pdbx_id is None:
272 print(f'WARNING: Cannot find PRD ligand code for PDB ID {pdb_id}')
273 return None
274 prd_id = pdbx_id[0].get('prd_id', None) # AGUS: podría haber casos donde haya más de uno?
275 if prd_id is None:
276 print(f'WARNING: Cannot find PRD ligand code for PDB ID {pdb_id}')
277 return None
278 # If the PRD ID is not empty then return it
279 return prd_id
282def get_pdb_ligand_codes(pdb_id: str) -> list[str]:
283 """Given a PDB ID, get all its ligand codes.
284 e.g. 2I3I -> 618, BTB, ZN, EDO, LI.
285 """
286 # Set the request query
287 query = '''query structure($id: String!) {
288 entry(entry_id: $id) {
289 nonpolymer_entities { nonpolymer_comp { chem_comp { id } } }
290 }
291 }'''
292 # Request PDB data
293 parsed_response = request_pdb_data(pdb_id, query)
294 # Mine data for nonpolymer entities
295 nonpolymers = parsed_response['nonpolymer_entities']
296 # If there are no nonpolymer entities, another type of entitie could be used
297 # AGUS: esto es un caso muy concreto que me encontré con el PDB 1N3W
298 # AGUS: el ligando en este caso es una 'Biologically Interesting Molecules' y se muestran como 'PRD_'
299 prd_code = None
300 if nonpolymers is None:
301 # Get the prd ligand code
302 prd_code = pdb_ligand_2_prd(pdb_id)
303 if prd_code is not None:
304 return [prd_code]
306 if nonpolymers is None and prd_code is None: return []
307 # Iterate nonpolymer entities to mine each PDB code
308 ligand_codes = []
309 for nonpolymer in nonpolymers:
310 ligand_code = nonpolymer['nonpolymer_comp']['chem_comp']['id']
311 ligand_codes.append(ligand_code)
312 print(f' Ligand codes for PDB ID {pdb_id}: ' + ', '.join(ligand_codes))
313 return ligand_codes
316def pdb_ligand_2_pubchem(pdb_ligand_id: str) -> Optional[str]:
317 """Given a PDB ligand code, get its PubChem ID."""
318 # Set the request query
319 query = '''query molecule($id:String!){
320 chem_comp(comp_id:$id) { rcsb_chem_comp_related{ resource_name resource_accession_code } }
321 }'''
322 # Request PDB data
323 parsed_response = request_pdb_data(pdb_ligand_id, query)
324 related_resources = parsed_response['rcsb_chem_comp_related']
325 # It may happend that a ligand code has no related resources at all
326 # e.g. ZN
327 if not related_resources: return None
328 pubchem_resource = next((resource for resource in related_resources if resource['resource_name'] == 'PubChem'), None)
329 if not pubchem_resource: return None
330 return pubchem_resource['resource_accession_code']
333def pdb_ligand_2_pubchem_RAW(pdb_ligand_id: str) -> Optional[str]:
334 """Given a PDB ligand code, get its PubChem ID.
335 Use a web crawler to avoid having to use the PDB API.
336 """
337 # Set the request URL
338 request_url = f'https://www.rcsb.org/ligand/{pdb_ligand_id}'
339 # Run the query
340 parsed_response = None
341 try:
342 with urlopen(request_url) as response:
343 parsed_response = response.read().decode("utf-8")
344 # If the accession is not found in the PDB then we can stop here
345 except HTTPError as error:
346 if error.code == 404:
347 print(f' PDB ligand {pdb_ligand_id} not found')
348 return None
349 else:
350 print(error.msg)
351 raise ValueError('Something went wrong with the PDB ligand request: ' + request_url)
352 # Mine the PubChem ID out of the whole response
353 pattern = re.compile('pubchem.ncbi.nlm.nih.gov\/compound\/([0-9]*)\"')
354 match = re.search(pattern, parsed_response)
355 # If there is no PubChem ID then return none
356 # This is normal for some ligands such as counter ions (e.g. LI)
357 if not match:
358 return None
359 pubchem_id = match[1]
360 return pubchem_id
363# DANI: No se ha provado a fondo
364def pdb_ligand_2_pubchem_RAW_RAW(pdb_ligand_id: str) -> Optional[str]:
365 """Given a PDB ligand code, get its PubChem ID.
366 Ask to PubChem if the ligand exists and hope there is only one result.
367 """
368 # Set the request URL
369 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{pdb_ligand_id}/json'
370 # Run the query
371 parsed_response = None
372 try:
373 with urlopen(request_url) as response:
374 parsed_response = json.loads(response.read().decode("utf-8"))
375 # If the accession is not found in the PDB then we can stop here
376 except HTTPError as error:
377 # This may happen for weird things such as UNX (unknown atom or ion)
378 if error.code == 404:
379 return None
380 print(error.msg)
381 raise RuntimeError('Something went wrong with the PDB ligand request in PubChem: ' + request_url)
382 # Mine the PubChem ID
383 compounds = parsed_response['PC_Compounds']
384 if len(compounds) != 1:
385 # There could be more than one result
386 # For example, the case HEM: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/HEM/json
387 # In this case we picked the first result
388 compound = compounds[0]
389 # raise RuntimeError('We are not having 1 and only 1 result from PubChem: ' + request_url)
390 compound = compounds[0]
391 id1 = compound['id']
392 id2 = id1['id']
393 pubchem_id = str(id2['cid'])
394 return pubchem_id
397def pdb_2_pubchem(pdb_id: str) -> list[str]:
398 """Given a PDB ID, get its PubChem IDs."""
399 print(f'Searching PubChem IDs for PDB ID {pdb_id}')
400 pubchem_ids = []
401 # Iterate over pdb ligand codes
402 ligand_codes = get_pdb_ligand_codes(pdb_id)
403 for ligand_code in ligand_codes:
404 # Ask the PDB API for the ligand
405 pubchem_id = pdb_ligand_2_pubchem(ligand_code)
406 # If this did not work then try mining the PDB client with a web crawler
407 if not pubchem_id:
408 pubchem_id = pdb_ligand_2_pubchem_RAW(ligand_code)
409 # If this did not work then try it from PubChem
410 if not pubchem_id:
411 pubchem_id = pdb_ligand_2_pubchem_RAW_RAW(ligand_code)
412 # Otherwise we surrender
413 if not pubchem_id:
414 print(f' {ligand_code} -> No PubChem ID')
415 continue
416 print(f' {ligand_code} -> {pubchem_id}')
417 pubchem_ids.append(pubchem_id)
419 return pubchem_ids
422def pdbs_2_pubchems(pdb_ids: list[str], cache: 'Cache') -> list[dict]:
423 """Given a list of PDB IDs, get their PubChem IDs and add them to the ligands list."""
424 pdb_ligands = []
425 # Check we have cached pdb 2 PubChem values
426 pdb_2_pubchem_cache = cache.retrieve(PDB_TO_PUBCHEM, {})
427 new_data_to_cache = False
428 # Get input ligands from the pdb IDs, if any
429 for pdb_id in pdb_ids:
430 # Check we have cached this specific pdb
431 pubchem_ids_from_pdb = pdb_2_pubchem_cache.get(pdb_id, None)
432 if pubchem_ids_from_pdb is not None:
433 print(f' Retriving from cache PubChem IDs for PDB ID {pdb_id}: ')
434 if len(pubchem_ids_from_pdb) > 0:
435 print(' PubChem IDs: ' + ', '.join(pubchem_ids_from_pdb))
436 else:
437 print(' This PDB ID has no PubChem IDs')
439 # If we had no cached pdb 2 PubChem then ask for them
440 if pubchem_ids_from_pdb is None:
441 pubchem_ids_from_pdb = pdb_2_pubchem(pdb_id)
442 # Save the result in the cache object so it is saved to cache later
443 pdb_2_pubchem_cache[pdb_id] = pubchem_ids_from_pdb
444 new_data_to_cache = True
445 for pubchem_id in pubchem_ids_from_pdb:
446 # Ligands in the structure (PDB) and the 'inputs.json' could be the same so it's not necessary to do it twice
447 if not any('pubchem' in ligand and ligand['pubchem'] == pubchem_id for ligand in pdb_ligands):
448 pdb_ligands.append({'pubchem': pubchem_id, 'pdb': True})
449 # Save all pdb to PubChem results in cache, in case there is anything new
450 if new_data_to_cache: cache.update(PDB_TO_PUBCHEM, pdb_2_pubchem_cache)
452 return pdb_ligands
455def handle_http_request(request_url, error_context="request"):
456 """Make an HTTP request handler with consistent error handling."""
457 try:
458 with urlopen(request_url) as response:
459 return response.read().decode("utf-8", errors='ignore')
460 except HTTPError as error:
461 if error.code == 404:
462 return None
463 raise ValueError(f'Something went wrong with the {error_context} (error {error.code})')
464 except URLError as error:
465 raise ValueError(f'Something went wrong with the {error_context}: {error.reason}')
468def get_pubchem_data(pubchem_id: str) -> Optional[dict]:
469 """Given a PubChem ID, use the uniprot API to request its data and then mine what is needed for the database."""
470 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{pubchem_id}/JSON/'
471 parsed_response = handle_http_request(request_url, "PubChem request")
472 if parsed_response is None:
473 print('WARNING: Cannot find PubChem entry for accession ' + pubchem_id)
474 return None
475 parsed_response = json.loads(parsed_response)
476 # Mine target data: SMILES
477 record = parsed_response.get('Record', None)
478 if record is None:
479 raise RuntimeError('Wrong PubChem data structure: no record: ' + request_url)
480 sections = record.get('Section', None)
481 if sections is None:
482 raise RuntimeError('Wrong PubChem data structure: no sections: ' + request_url)
483 names_and_ids_section = next((section for section in sections if section.get('TOCHeading', None) == 'Names and Identifiers'), None)
484 if names_and_ids_section is None:
485 raise RuntimeError('Wrong PubChem data structure: no name and IDs section: ' + request_url)
486 names_and_ids_subsections = names_and_ids_section.get('Section', None)
487 if names_and_ids_subsections is None:
488 raise RuntimeError('Wrong PubChem data structure: no name and IDs subsections: ' + request_url)
490 # Mine the name
491 synonims = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Synonyms'), None)
492 if synonims is None:
493 descriptors = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None)
494 descriptors_subsections = descriptors.get('Section', None)
495 if descriptors_subsections is None:
496 raise RuntimeError('Wrong PubChem data structure: no name and IDs subsections: ' + request_url)
497 depositor_supplied_descriptors = next((s for s in descriptors_subsections if s.get('TOCHeading', None) == 'IUPAC Name'), None)
498 name_substance = depositor_supplied_descriptors.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
499 else:
500 synonims_subsections = synonims.get('Section', None)
501 if synonims_subsections is None:
502 raise RuntimeError('Wrong PubChem data structure: no name and IDs subsections: ' + request_url)
503 depositor_supplied_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Depositor-Supplied Synonyms'), None)
504 if depositor_supplied_synonims is None:
505 removed_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Removed Synonyms'), None)
506 name_substance = removed_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
507 else:
508 name_substance = depositor_supplied_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
510 # Mine the SMILES
511 computed_descriptors_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None)
512 if computed_descriptors_subsection is None:
513 raise RuntimeError('Wrong PubChem data structure: no computeed descriptors: ' + request_url)
514 canonical_smiles_section = computed_descriptors_subsection.get('Section', None)
515 if canonical_smiles_section is None:
516 raise RuntimeError('Wrong PubChem data structure: no canonical SMILES section: ' + request_url)
517 canonical_smiles = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'Canonical SMILES'), None)
518 if canonical_smiles is None:
519 # In some cases there is no canonical SMILES but a non-canonical one could exists
520 non_canonical_smiles_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'SMILES'), None)
521 if non_canonical_smiles_section is None:
522 raise RuntimeError('Wrong PubChem data structure: no canonical SMILES: ' + request_url)
524 if canonical_smiles:
525 smiles = canonical_smiles.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
526 if non_canonical_smiles_section:
527 smiles = non_canonical_smiles_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
529 if smiles is None:
530 raise RuntimeError('Wrong PubChem data structure: no SMILES: ' + request_url)
532 # Mine target data: MOLECULAR FORMULA
533 molecular_formula_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Molecular Formula'), None)
534 if molecular_formula_subsection is None:
535 raise RuntimeError('Wrong PubChem data structure: no molecular formula section: ' + request_url)
536 molecular_formula = molecular_formula_subsection.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
537 if molecular_formula is None:
538 raise RuntimeError('Wrong PubChem data structure: no molecular formula: ' + request_url)
540 # Mine target data: PDB ID
541 pdb_id = None
542 pdb_id_subsection = next((s for s in sections if s.get('TOCHeading', None) == 'Interactions and Pathways'), None)
543 # If this section is missing then it means this PubChem compound has no PDB ID
544 if pdb_id_subsection:
545 pdb_id_subsections = pdb_id_subsection.get('Section', None)
546 if pdb_id_subsections is None:
547 raise RuntimeError('Wrong PubChem data structure: no name and IDs subsections: ' + request_url)
548 bond_structures = next((s for s in pdb_id_subsections if s.get('TOCHeading', None) == 'Protein Bound 3D Structures'), None)
549 if bond_structures:
550 bond_structures_section = bond_structures.get('Section', None)
551 # If this section is missing then it means this PubChem compound has no PDB ID
552 if bond_structures_section:
553 ligands_structure = next((s for s in bond_structures_section if s.get('TOCHeading', None) == 'Ligands from Protein Bound 3D Structures'), None)
554 if ligands_structure is None:
555 raise RuntimeError('Wrong PubChem data structure: no Ligands from Protein Bound 3D Structures section: ' + request_url)
556 ligands_structure_subsections = ligands_structure.get('Section', None)
557 if ligands_structure_subsections is None:
558 raise RuntimeError('Wrong PubChem data structure: no Ligands from Protein Bound 3D Structures subsections: ' + request_url)
559 ligands_pdb = next((s for s in ligands_structure_subsections if s.get('TOCHeading', None) == 'PDBe Ligand Code'), None)
560 if ligands_pdb is None:
561 raise RuntimeError('Wrong PubChem data structure: no PDBe Ligand Code: ' + request_url)
562 pdb_id = ligands_pdb.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
564 # Mine de INCHI and INCHIKEY
565 inchi_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChI'), None)
566 if inchi_section is None:
567 raise RuntimeError('Wrong PubChem data structure: no InChI: ' + request_url)
568 if inchi_section:
569 inchi = inchi_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
571 inchikey_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChIKey'), None)
572 if inchikey_section is None:
573 raise RuntimeError('Wrong PubChem data structure: no InChIKey: ' + request_url)
574 if inchikey_section:
575 inchikey = inchikey_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
576 # Prepare the PubChem data to be returned
577 return {'name': name_substance, 'smiles': smiles, 'formula': molecular_formula, 'pdbid': pdb_id, 'inchi': inchi, 'inchikey': inchikey}
580def drugbank_2_pubchem(drugbank_id) -> Optional[str]:
581 """Given a DrugBank ID, request its PubChem compound ID."""
582 url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{drugbank_id}/JSON'
583 r = requests.get(url)
584 if r.ok:
585 data = r.json()
586 return str(data['PC_Compounds'][0]['id']['id']['cid'])
587 else:
588 raise ValueError("Something went wrong with the DrugBank to PubChem request (error " + str(r.status_code) + ")")
591def chembl_2_pubchem(chembl_id) -> Optional[str]:
592 """Given a ChemBL ID, use the uniprot API to request its PubChem compound ID."""
593 url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/xref/RegistryID/{chembl_id}/cids/JSON'
594 r = requests.get(url)
595 if r.ok:
596 data = r.json()
597 return str(data['IdentifierList']['CID'][0])
598 else:
599 raise ValueError("Something went wrong with the ChEMBL to PubChem request (error " + str(r.status_code) + ")")
602def obtain_mordred_morgan_descriptors(mol: Chem.Mol) -> dict:
603 """Calculate the Morgan fingerprint and the Mordred descriptors from a molecule."""
604 Chem.AllChem.Compute2DCoords(mol)
605 mol_block = Chem.MolToMolBlock(mol)
606 # We can select the different submodules of mordred descriptors, avaible in: 'https://mordred-descriptor.github.io/documentation/master/'
607 calc = Calculator([
608 descriptors.ABCIndex, # Índice de ramificación
609 descriptors.AcidBase.AcidicGroupCount, # Grupos ácidos
610 descriptors.AcidBase.BasicGroupCount, # Grupos básicos
611 descriptors.RingCount, # Conteo de anillos
612 descriptors.Constitutional, # Propiedades generales como número de átomos, peso molecular
613 descriptors.TopologicalCharge, # Índices topológicos, Cargas parciales, polaridad
614 descriptors.HydrogenBond, # Donantes y aceptores de enlaces de hidrógeno
615 descriptors.Lipinski, # Reglas de Lipinski (drug-likeness)
616 descriptors.FragmentComplexity, # Identificación de subestructuras frecuentes
617 descriptors.PathCount, # Conteo de caminos moleculares
618 ], ignore_3D=True)
620 # Calculate Mordred results
621 mordred_results = calc(mol).drop_missing().asdict()
623 # MORGAN FINGERPRINT
624 morgan_fp_gen = Chem.rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
625 ao = Chem.rdFingerprintGenerator.AdditionalOutput()
626 ao.AllocateAtomCounts()
627 ao.AllocateAtomToBits()
628 ao.AllocateBitInfoMap()
629 fp = morgan_fp_gen.GetFingerprint(mol, additionalOutput=ao)
630 morgan_fp_bit_array = list(fp)
631 morgan_highlight_atoms = {}
632 for bit, atoms in ao.GetBitInfoMap().items():
633 morgan_highlight_atoms[bit] = list(set(atom for atom, radius in atoms))
634 data = {'mordred': mordred_results,
635 'morgan_fp_bit_array': morgan_fp_bit_array,
636 'morgan_highlight_atoms': morgan_highlight_atoms,
637 'mol_block': mol_block}
638 return data
641def tanimoto_similarity(fp1: list[int], fp2: list[int]) -> float:
642 """Calculate the Tanimoto similarity between two fingerprints."""
643 if len(fp1) != len(fp2):
644 raise ValueError('Fingerprints must be of the same length')
645 a = sum(1 for i in range(len(fp1)) if fp1[i] == 1 and fp2[i] == 1)
646 b = sum(1 for i in range(len(fp1)) if fp1[i] == 1 and fp2[i] == 0)
647 c = sum(1 for i in range(len(fp1)) if fp1[i] == 0 and fp2[i] == 1)
648 if (a + b + c) == 0:
649 return 0.0
650 return a / (a + b + c)
653def obtain_pubchem_data_from_input(ligand: dict) -> dict[str, Optional[str]]:
654 """Given an input ligand, obtain all necessary data."""
655 # Save in a dictionary all ligand data including its name and IDs
656 # The ID can be of the databases: 'drugbank' , 'pubchem' , 'chembl'
657 # Define the needed variables to check if the ligand has a database ID or it is None
658 ligand_data = {field: None for field in LIGAND_DATA_FIELDS}
659 # Set ligand data PubChem ID, even if the input ID is not from pubhcme (e.g. drugbank, chembl)
660 if 'pubchem' in ligand:
661 ligand_data['pubchem'] = str(ligand.get('pubchem'))
662 elif 'drugbank' in ligand:
663 ligand_data['drugbank'] = ligand.get('drugbank')
664 ligand_data['pubchem'] = str(drugbank_2_pubchem(ligand_data['drugbank']))
665 elif 'chembl' in ligand:
666 ligand_data['chembl'] = ligand.get('chembl')
667 ligand_data['pubchem'] = str(chembl_2_pubchem(ligand_data['chembl']))
668 else:
669 raise InputError('None of the ligand IDs are defined. Please provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.')
671 # Request ligand data from pubchem
672 pubchem_data = get_pubchem_data(ligand_data['pubchem'])
673 if not pubchem_data:
674 raise RuntimeError('No PubChem data avilable')
676 # Add PubChem data to ligand data
677 ligand_data = {**ligand_data, **pubchem_data}
678 return ligand_data
681@lru_cache(maxsize=None)
682def inchikey_2_pubchem(inchikey: str, conectivity_only=False) -> Optional[str]:
683 """Given an InChIKey, get the PubChem CID."""
684 inchikey = inchikey.split('-')[0] if conectivity_only else inchikey
685 url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/cids/JSON"
686 r = requests.get(url)
687 if r.ok:
688 data = r.json()
689 if conectivity_only:
690 return [str(cid) for cid in data['IdentifierList']['CID']]
691 else:
692 return str(data['IdentifierList']['CID'][0])
693 return None
696@lru_cache(maxsize=None)
697def pubchem_standardization(inchi: str) -> Optional[list[dict]]:
698 """Try PubChem standardization service to write things like the bonds from aromatic rings in the same order.
700 You can see examples on the paper https://jcheminf.biomedcentral.com/articles/10.1186/s13321-018-0293-8
701 """
702 url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/standardize/inchi/JSON"
703 payload = {'inchi': inchi}
704 r = requests.post(url, data=payload)
705 if r.ok:
706 data = r.json()
707 results = []
708 for compound in data['PC_Compounds']:
709 if compound['id']['id']['cid']:
710 cid = str(compound['id']['id']['cid'])
711 else:
712 continue
713 standard_inchi = None
714 for prop in compound['props']:
715 if prop['urn']['label'] == 'InChI':
716 standard_inchi = prop['value']['sval']
717 results.append({'pubchem': cid, 'inchi': standard_inchi})
718 return results
719 return None