Coverage for mddb_workflow/tools/generate_ligands_desc.py: 60%
578 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-29 15:48 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-29 15:48 +0000
1import re
2import os
3import json
4import requests
5from rdkit import Chem
6from functools import lru_cache
7from mordred import Calculator, descriptors
8from mddb_workflow.utils.constants import LIGANDS_MATCH_FLAG, PDB_TO_PUBCHEM, NOT_MATCHED_LIGANDS
9from mddb_workflow.utils.auxiliar import InputError, load_json, save_json, request_pdb_data
10from mddb_workflow.utils.type_hints import *
11from mddb_workflow.utils.structures import Structure
12from mddb_workflow.tools.get_inchi_keys import obtain_inchikey_from_pdb
13from urllib.request import Request, urlopen
14from urllib.parse import urlencode
15from urllib.error import HTTPError, URLError
17def get_drugbank_smiles (id_drugbank : str) -> Optional[str]:
18 # Request Drugbank
19 request_url = Request(
20 url= f'https://go.drugbank.com/structures/small_molecule_drugs/{id_drugbank}.smiles',
21 headers={'User-Agent': 'Mozilla/5.0'}
22 )
23 try:
24 with urlopen(request_url) as response:
25 smiles = response.read()
26 # If the accession is not found in the database then we stop here
27 except HTTPError as error:
28 # If the drugbank ID is not yet in the Drugbank references then return None
29 if error.code == 404:
30 return None
31 else:
32 print('Error when requesting ' + request_url)
33 raise ValueError('Something went wrong with the Drugbank request (error ' + str(error.code) + ')')
34 # This error may occur if there is no internet connection
35 except URLError as error:
36 print('Error when requesting ' + request_url)
37 raise ValueError('Something went wrong with the MDposit request')
39 return smiles
41# Given a ChemBL ID, use the uniprot API to request its data and then mine what is needed for the database
42def get_chembl_smiles (id_chembl : str) -> Optional[str]:
43 # Request ChemBL
44 parsed_response = None
45 request_url = Request(
46 url= f'https://www.ebi.ac.uk/chembl/interface_api/es_proxy/es_data/get_es_document/chembl_molecule/{id_chembl}',
47 headers={'User-Agent': 'Mozilla/5.0'}
48 )
49 try:
50 with urlopen(request_url) as response:
51 parsed_response = json.loads(response.read().decode("utf-8"))
52 smiles = parsed_response['_source']['molecule_structures']['canonical_smiles']
53 pubchem_id = parsed_response['_source']['_metadata']['unichem'][8]['id']
54 # If the accession is not found in ChemBL then the id is not valid
55 except HTTPError as error:
56 if error.code == 404:
57 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)
58 return None
59 print('Error when requesting ' + request_url)
60 raise ValueError('Something went wrong with the ChemBL request (error ' + str(error.code) + ')')
61 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607)
62 if parsed_response == None:
63 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)
64 return None
65 return smiles
67# Given a PubChem ID, use the uniprot API to request its data and then mine what is needed for the database
68def get_pubchem_data (id_pubchem : str) -> Optional[dict]:
69 # Request PubChem
70 parsed_response = None
71 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{id_pubchem}/JSON/'
72 try:
73 with urlopen(request_url) as response:
74 #parsed_response = json.loads(response.read().decode("windows-1252"))
75 parsed_response = json.loads(response.read().decode("utf-8", errors='ignore'))
76 # If the accession is not found in PubChem then the id is not valid
77 # This may happen with pubchem ids of non-discrete compounds (e.g. 483927498)
78 except HTTPError as error:
79 if error.code == 404:
80 print('WARNING: Cannot find PubChem entry for accession ', id_pubchem)
81 return None
82 print('Error when requesting ', request_url)
83 raise ValueError('Something went wrong with the PubChem request (error ', str(error.code), ')')
84 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607)
85 if parsed_response == None:
86 print('WARNING: Cannot find PubChem entry for accession ' + id_pubchem)
87 return None
88 # Mine target data: SMILES
89 record = parsed_response.get('Record', None)
90 if record == None:
91 raise RuntimeError('Wrong Pubchem data structure: no record: ' + request_url)
92 sections = record.get('Section', None)
93 if sections == None:
94 raise RuntimeError('Wrong Pubchem data structure: no sections: ' + request_url)
95 names_and_ids_section = next((section for section in sections if section.get('TOCHeading', None) == 'Names and Identifiers'), None)
96 if names_and_ids_section == None:
97 raise RuntimeError('Wrong Pubchem data structure: no name and ids section: ' + request_url)
98 names_and_ids_subsections = names_and_ids_section.get('Section', None)
99 if names_and_ids_subsections == None:
100 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)
102 # Mine the name
103 synonims = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Synonyms'), None)
104 if synonims == None:
105 descriptors = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None)
106 descriptors_subsections = descriptors.get('Section', None)
107 if descriptors_subsections == None:
108 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)
109 depositor_supplied_descriptors = next((s for s in descriptors_subsections if s.get('TOCHeading', None) == 'IUPAC Name'), None)
110 name_substance = depositor_supplied_descriptors.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
111 else:
112 synonims_subsections = synonims.get('Section', None)
113 if synonims_subsections == None:
114 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)
115 depositor_supplied_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Depositor-Supplied Synonyms'), None)
116 if depositor_supplied_synonims == None:
117 removed_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Removed Synonyms'), None)
118 name_substance = removed_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
119 else:
120 name_substance = depositor_supplied_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
122 # Mine the SMILES
123 computed_descriptors_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None)
124 if computed_descriptors_subsection == None:
125 raise RuntimeError('Wrong Pubchem data structure: no computeed descriptors: ' + request_url)
126 canonical_smiles_section = computed_descriptors_subsection.get('Section', None)
127 if canonical_smiles_section == None:
128 raise RuntimeError('Wrong Pubchem data structure: no canonical SMILES section: ' + request_url)
129 canonical_smiles = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'Canonical SMILES'), None)
130 if canonical_smiles == None:
131 # In some cases there is no canonical SMILES but a non-canonical one could exists
132 non_canonical_smiles_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'SMILES'), None)
133 if non_canonical_smiles_section == None:
134 raise RuntimeError('Wrong Pubchem data structure: no canonical SMILES: ' + request_url)
136 if canonical_smiles:
137 smiles = canonical_smiles.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
138 if non_canonical_smiles_section:
139 smiles = non_canonical_smiles_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
141 if smiles == None:
142 raise RuntimeError('Wrong Pubchem data structure: no SMILES: ' + request_url)
144 # Mine target data: MOLECULAR FORMULA
145 molecular_formula_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Molecular Formula'), None)
146 if molecular_formula_subsection == None:
147 raise RuntimeError('Wrong Pubchem data structure: no molecular formula section: ' + request_url)
148 molecular_formula = molecular_formula_subsection.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
149 if molecular_formula == None:
150 raise RuntimeError('Wrong Pubchem data structure: no molecular formula: ' + request_url)
152 # Mine target data: PDB ID
153 pdb_id = None
154 pdb_id_subsection = next((s for s in sections if s.get('TOCHeading', None) == 'Interactions and Pathways'), None)
155 # If this section is missing then it means this PubChem compound has no PDB id
156 if pdb_id_subsection:
157 pdb_id_subsections = pdb_id_subsection.get('Section', None)
158 if pdb_id_subsections == None:
159 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)
160 bond_structures = next((s for s in pdb_id_subsections if s.get('TOCHeading', None) == 'Protein Bound 3D Structures'), None)
161 if bond_structures:
162 bond_structures_section = bond_structures.get('Section', None)
163 # If this section is missing then it means this PubChem compound has no PDB id
164 if bond_structures_section:
165 ligands_structure = next((s for s in bond_structures_section if s.get('TOCHeading', None) == 'Ligands from Protein Bound 3D Structures'), None)
166 if ligands_structure == None:
167 raise RuntimeError('Wrong Pubchem data structure: no Ligands from Protein Bound 3D Structures section: ' + request_url)
168 ligands_structure_subsections = ligands_structure.get('Section', None)
169 if ligands_structure_subsections == None:
170 raise RuntimeError('Wrong Pubchem data structure: no Ligands from Protein Bound 3D Structures subsections: ' + request_url)
171 ligands_pdb = next((s for s in ligands_structure_subsections if s.get('TOCHeading', None) == 'PDBe Ligand Code'), None)
172 if ligands_pdb == None:
173 raise RuntimeError('Wrong Pubchem data structure: no PDBe Ligand Code: ' + request_url)
174 pdb_id = ligands_pdb.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
176 # Mine de INCHI and INCHIKEY
177 inchi_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChI'), None)
178 if inchi_section == None:
179 raise RuntimeError('Wrong Pubchem data structure: no InChI: ' + request_url)
180 if inchi_section:
181 inchi = inchi_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
183 inchikey_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChIKey'), None)
184 if inchikey_section == None:
185 raise RuntimeError('Wrong Pubchem data structure: no InChIKey: ' + request_url)
186 if inchikey_section:
187 inchikey = inchikey_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
188 # Prepare the pubchem data to be returned
189 return { 'name': name_substance, 'smiles': smiles, 'formula': molecular_formula , 'pdbid': pdb_id, 'inchi': inchi, 'inchikey': inchikey }
191def find_drugbank_pubchem (drugbank_id):
192 # Request Drugbank
193 request_url = Request(
194 url=f'https://go.drugbank.com/drugs/{drugbank_id}',
195 headers={
196 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
197 'Accept-Language': 'en-US,en;q=0.9',
198 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
199 }
200)
201 pubchem_id = None
202 try:
203 with urlopen(request_url) as response:
204 content = response.read().decode("utf-8")
205 pattern = re.compile("http\:\/\/pubchem.ncbi.nlm.nih.gov\/summary\/summary.cgi\?cid\=([0-9]*)")
206 match = re.search(pattern, str(content))
207 if not match:
208 raise ValueError("No se encontró información sobre el Pubchem compound en esta página.")
209 pubchem_id = match[1]
210 if not pubchem_id:
211 raise ValueError("No se encontró información sobre el Pubchem compound en esta página.")
212 # If the accession is not found in the database then we stop here
213 except HTTPError as error:
214 # If the drugbank ID is not yet in the Drugbank references then return None
215 raise ValueError(f'Wrong request. Code: {error.code}')
216 # This error may occur if there is no internet connection
217 except URLError as error:
218 print('Error when requesting ' + request_url)
219 raise ValueError('Something went wrong with the DrugBank request')
221 return pubchem_id
223def find_chembl_pubchem (id_chembl):
224 # Request ChemBL
225 parsed_response = None
226 request_url = Request(
227 url= f'https://www.ebi.ac.uk/chembl/interface_api/es_proxy/es_data/get_es_document/chembl_molecule/{id_chembl}',
228 headers={'User-Agent': 'Mozilla/5.0'}
229 )
230 try:
231 with urlopen(request_url) as response:
232 parsed_response = json.loads(response.read().decode("utf-8"))
233 unichem = parsed_response['_source']['_metadata']['unichem']
234 if unichem == None:
235 raise RuntimeError('Wrong Pubchem data structure: no unichem section')
236 pubchem_section = next((section for section in unichem if section.get('src_url', None) == "http://pubchem.ncbi.nlm.nih.gov"), None)
237 if pubchem_section == None:
238 raise RuntimeError('Wrong Pubchem data structure: no pubchem section')
239 pubchem_id = pubchem_section.get('id', None)
240 if pubchem_id == None:
241 raise RuntimeError('Wrong Pubchem data structure: no pubchem ID')
242 # If the accession is not found in ChemBL then the id is not valid
243 except HTTPError as error:
244 if error.code == 404:
245 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)
246 return None
247 print('Error when requesting ' + request_url)
248 raise ValueError('Something went wrong with the ChemBL request (error ' + str(error.code) + ')')
249 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607)
250 if parsed_response == None:
251 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)
252 return None
254 return pubchem_id
256# Calculate the Morgan fingerprint and the Mordred descriptors from a ligand SMILES
257def obtain_mordred_morgan_descriptors (smiles : str) -> tuple:
258 mol = Chem.MolFromSmiles(smiles)
259 if mol is None:
260 print(f'WARNING: Cannot generate a molecule from SMILES {smiles}')
261 return None
263 Chem.AllChem.Compute2DCoords(mol)
264 mol_block = Chem.MolToMolBlock(mol)
265 # We can select the different submodules of mordred descriptors, avaible in: 'https://mordred-descriptor.github.io/documentation/master/'
266 calc = Calculator([
267 descriptors.ABCIndex, # Índice de ramificación
268 descriptors.AcidBase.AcidicGroupCount, # Grupos ácidos
269 descriptors.AcidBase.BasicGroupCount, # Grupos básicos
270 descriptors.RingCount, # Conteo de anillos
271 descriptors.Constitutional, # Propiedades generales como número de átomos, peso molecular
272 descriptors.TopologicalCharge, # Índices topológicos, Cargas parciales, polaridad
273 descriptors.HydrogenBond, # Donantes y aceptores de enlaces de hidrógeno
274 descriptors.Lipinski, # Reglas de Lipinski (drug-likeness)
275 descriptors.FragmentComplexity, # Identificación de subestructuras frecuentes
276 descriptors.PathCount, # Conteo de caminos moleculares
277 ], ignore_3D=True)
280 # Calculate Mordred results
281 mordred_results = calc(mol).drop_missing().asdict()
283 ######## MORGAN FINGERPRINT ###########
284 morgan_fp_gen = Chem.rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
285 ao = Chem.rdFingerprintGenerator.AdditionalOutput()
286 ao.AllocateAtomCounts()
287 ao.AllocateAtomToBits()
288 ao.AllocateBitInfoMap()
289 fp = morgan_fp_gen.GetFingerprint(mol,additionalOutput=ao)
290 morgan_fp_bit_array = list(fp)
291 morgan_highlight_atoms = {}
292 for bit, atoms in ao.GetBitInfoMap().items():
293 morgan_highlight_atoms[bit] = list(set(atom for atom, radius in atoms))
295 return mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block
297# Check the cache to see if the ligand has already been matched or not, if not the ligand is skipped
298def check_matched_ligand (ligand: dict, ligand_data: dict, cache: 'Cache') -> bool:
299 # The pubchem id could be in the ligands data or the ligand input
300 pubchem_id_1 = ligand.get('pubchem', None)
301 if ligand_data != None:
302 pubchem_id_2 = ligand_data.get('pubchem', None)
303 else:
304 pubchem_id_2 = None
305 # If we have no pubchem id then we cannot check if the ligand matched
306 if not pubchem_id_1 and not pubchem_id_2:
307 return False
308 # If we have a pubchem id then check if it is already in the cache
309 cache_not_matched = cache.retrieve(NOT_MATCHED_LIGANDS, [])
310 for pubchem in cache_not_matched:
311 if pubchem == pubchem_id_1 or pubchem == pubchem_id_2:
312 print(f" Ligand with PubChem id {pubchem} didn't match before, skipping it")
313 return True
315 return False
317def generate_ligand_mapping (
318 structure : 'Structure',
319 cache : 'Cache',
320 input_ligands : Optional[list[dict]],
321 pdb_ids : list[str],
322 output_filepath : str,
323 inchikeys: dict,
324 mercy : list[str] = [],
325 ) -> list[dict]:
326 """Generate a map of residues associated to ligands."""
327 # Merge input ligands and pdb ligands
328 ligands = input_ligands or []
329 # Check we have cached pdb 2 pubchem values
330 pdb_to_pubchem_cache = cache.retrieve(PDB_TO_PUBCHEM, {})
331 new_data_to_cache = False
332 # Get input ligands from the pdb ids, if any
333 for pdb_id in pdb_ids:
334 # Check we have cached this specific pdb
335 pubchem_ids_from_pdb = pdb_to_pubchem_cache.get(pdb_id, None)
336 if pubchem_ids_from_pdb != None:
337 print(f' Retriving from cache PubChem ids for PDB id {pdb_id}: ')
338 if len(pubchem_ids_from_pdb) > 0:
339 print(' PubChem ids: ' + ', '.join(pubchem_ids_from_pdb))
340 else:
341 print(' This PDB id has no PubChem ids')
343 # If we had no cached pdb 2 pubchem then ask for them
344 if pubchem_ids_from_pdb == None:
345 pubchem_ids_from_pdb = pdb_to_pubchem(pdb_id)
346 # Save the result in the cache object so it is saved to cache later
347 pdb_to_pubchem_cache[pdb_id] = pubchem_ids_from_pdb
348 new_data_to_cache = True
349 for pubchem_id in pubchem_ids_from_pdb:
350 # Ligands in the structure (PDB) and the 'inputs.json' could be the same so it's not necessary to do it twice
351 if not any('pubchem' in ligand and ligand['pubchem'] == pubchem_id for ligand in ligands):
352 ligands.append({ 'pubchem': pubchem_id, 'pdb': True })
353 # Save all pdb to pubchem results in cache, in case there is anything new
354 if new_data_to_cache: cache.update(PDB_TO_PUBCHEM, pdb_to_pubchem_cache)
356 # If no input ligands are passed then stop here
357 # if len(ligands) == 0:
358 # return []
360 # Save data from all ligands to be saved in a file
361 json_ligands_data = []
362 ligands_data = []
364 # Load the ligands file if exists already
365 if os.path.exists(output_filepath):
366 json_ligands_data += import_ligands(output_filepath)
367 # If json ligands exists and it is empty means that ligands analysis has been already done but no ligands were matched
368 # so the file will contain an empty list []
369 if len(json_ligands_data) == 0:
370 print('No ligands have been matched yet.\nIf you want to force a ligand to be matched, please provide the field "residues" in the inputs.json file.')
371 return []
373 # Visited formulas
374 visited_formulas = []
375 # Save the maps of every ligand
376 ligand_maps = []
377 # Get cached ligand data
378 # AGUS: esto lo creamos por alguna razón para que funcione sin internet (en el cluster) pero realmente
379 # AGUS: llena el cache con datos que no son necesarios porque toda esa info está en el ligand_references.json
380 #ligand_data_cache = cache.retrieve(LIGANDS_DATA, {})
381 # Iterate input ligands
382 for ligand in ligands:
383 # Set the pubchem id which may be assigned in different steps
384 pubchem_id = None
385 # If input ligand is not a dict but a single int/string then handle it
386 if type(ligand) == int:
387 print(f'A ligand number ID has been identified {ligand}, assuming that is a PubChem ID...')
388 ligand = { 'pubchem': str(ligand) }
389 elif type(ligand) == str:
390 raise InputError(f'A name of ligand has been identified: {ligand}. Anyway, provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.')
391 # Check if we already have this ligand data
392 ligand_data = obtain_ligand_data_from_file(json_ligands_data, ligand)
393 # Check if the ligand didn't match before
394 if check_matched_ligand(ligand, ligand_data, cache): continue
395 # If we do not have its data try to get from the cache
396 # if not ligand_data:
397 # # If this is a ligand not in ligans.json but in cache it means it comes from PDB, never form user inputs
398 # # For this reason, this ligand will always have a PubChem id
399 # pubchem_id = ligand.get('pubchem', None)
400 # if pubchem_id:
401 # ligand_data = ligand_data_cache.get(pubchem_id, None)
402 # # If we still have no ligand data then request it to PubChem
403 # if not ligand_data:
404 # ligand_data = obtain_ligand_data_from_pubchem(ligand)
405 # # Save data mined from PubChem in the cache
406 # pubchem_id = ligand_data['pubchem']
407 # ligand_data_cache[pubchem_id] = { **ligand_data }
408 # cache.update(LIGANDS_DATA, ligand_data_cache)
409 # Add current ligand data to the general list
410 if not ligand_data:
411 ligand_data = obtain_ligand_data_from_pubchem(ligand)
412 ligands_data.append(ligand_data)
413 # Get pubchem id
414 pubchem_id = ligand_data.get('pubchem', None)
415 # If we already visited a different ligand but with identical formula then we skip this ligand
416 # Note that the mapping will be identical thus overwritting the previous map
417 # However, ligands forced by the user are processed before so we keep them as priority
418 formula = ligand_data.get('formula', None)
419 if formula in visited_formulas:
420 print(f'WARNING: Ligand with PubChem Id {pubchem_id} has a formula which has been already matched')
421 # AGUS: en este punto si el usuario ha definido el mismo ligando con diferente selección quiere decir que está repetido
422 # AGUS: y que deberíamos mantener el ligando para diferentes residuos matcheados
423 if forced_selection:
424 visited_formulas.pop()
425 ligands_data.pop()
426 print(f'WARNING: Ligand with PubChem Id {pubchem_id} has been forced by the user and its repeated')
427 else:
428 ligands_data.pop()
429 continue
430 # Add it to the list of visited formulas
431 visited_formulas.append(formula)
432 # If the user defined a ligand name, it will be respectedand added to the metadata
433 # if there isn't a defined name, it will be mined from PubChem
434 user_forced_ligand_name = ligand.get('name', None)
435 # Map structure residues with the current ligand
436 # If the user forced the residues then use them
437 forced_selection = ligand.get('selection', None)
438 if forced_selection:
439 # Could be a single residue or a list of residues
440 selection_atoms = structure.select(forced_selection)
441 residue_indices = structure.get_selection_residue_indices(selection_atoms)
442 ligand_map = { 'name': pubchem_id, 'residue_indices': residue_indices, 'match': { 'ref': { 'pubchem': pubchem_id } } }
443 if user_forced_ligand_name: ligand_map['forced_name'] = user_forced_ligand_name
444 # If the user did not force the residues then map them
445 else:
446 ligand_map = map_ligand_residues(structure, ligand_data)
447 # Add current ligand map to the general list
448 ligand_maps.append(ligand_map)
449 # If the ligand did not map then discard it
450 did_not_match = len(ligand_map['residue_indices']) == 0
451 if did_not_match:
452 is_forced_by_user = not bool(ligand.get('pdb', False))
453 if is_forced_by_user:
454 # At this point we should have macthed all sequences
455 # If not, kill the process unless mercy was given
456 must_be_killed = LIGANDS_MATCH_FLAG not in mercy
457 if must_be_killed:
458 raise InputError(f'Ligand with PubChem Id {pubchem_id} did not map with any residue')
459 # Otherwise simply remove it from the final ligand references file and the forwarded maps
460 ligands_data.pop()
461 ligand_maps.pop()
462 # Update the cache with the pubchem id of the ligands that didn't match
463 not_matched_pubchems = cache.retrieve(NOT_MATCHED_LIGANDS, [])
464 not_matched_pubchems.append(ligand_map['name'])
465 cache.update(NOT_MATCHED_LIGANDS, not_matched_pubchems)
466 # If the ligand matched then calculate some additional data
467 # Get morgan and mordred descriptors, the SMILES is needed for this part
468 smiles = ligand_data['smiles']
469 mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block = obtain_mordred_morgan_descriptors(smiles)
470 ligand_data['mordred'] = mordred_results
471 ligand_data['morgan_fp_bit_array'] = morgan_fp_bit_array
472 ligand_data['morgan_highlight_atoms'] = morgan_highlight_atoms
473 ligand_data['mol_block'] = mol_block
475 # At this point maybe some ligands in the structure may not match because they have not been included by either the author or the user. Alternatively, the authors may have added them
476 # So those "ligands" are residues that are not matched to any ligand in the structure and the analyses will not run
477 # LORE AGUS: en el dataset de MDBind (cineca) hay muchas simulaciones con ligandos modificados (incluso solo un átomo) y no matchean
478 # LORE AGUS: con los que están en el PDB, por lo que no se hace ningún análisis sobre estos (y se debería)
479 # LORE AGUS: ahora se intenta buscar su pubchem ID para extraer la información a partir del inchikey (da menos problemas que el smiles) y se hacen los análisis pertinentes.
480 # AGUS: si no se puede extraer información de estos porque el inchikey no funciona, se muestra la información más básica posible y se genera una referencia vacía (será problemático)
482 # Obtain a list of the possible residues in the structure that are not either protein/nucleic/water/lipid or matched ligands
483 for residue in structure.residues:
484 # Skip solvent, lipid or ions
485 if residue.classification in ['solvent', 'fatty', 'steroid', 'ion']: continue
486 # Make sure the residue is not bonded to other things
487 # If we are missing bonds then we can not rely in this rule
488 if residue.is_missing_any_bonds(): continue
489 if len(residue.get_bonded_residue_indices()) > 0: continue
490 # Check if the residue is already matched to a ligand
491 matched = False
492 for ligand_map in ligand_maps:
493 if residue.index in ligand_map['residue_indices']:
494 matched = True
495 break
496 # If the residue is not matched then add it to the ligands data
497 if matched: continue
498 # Create a new ligand data with the residue name and its atoms count
499 ligand_data = { field: None for field in LIGAND_DATA_FIELDS }
500 ligand_data['name'] = residue.name
501 # Add the residue as a ligand map
502 ligand_map = {
503 'name': residue.name,
504 'residue_indices': [residue.index],
505 'match': { 'ref': { 'pubchem': None } }
506 }
508 inchikey = inchikeys['residx_2_key'].get(residue.index, None)
509 # If the InChIKey is not None then try to obtain the PubChem CID
510 if inchikey:
511 ligand_data['inchikey'] = inchikey
512 if cid := search_cid_by_inchikey(inchikey):
513 ligand_data = obtain_ligand_data_from_pubchem( {'pubchem': cid} )
514 smiles = ligand_data['smiles']
515 mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block = obtain_mordred_morgan_descriptors(smiles)
516 ligand_data['mordred'] = mordred_results
517 ligand_data['morgan_fp_bit_array'] = morgan_fp_bit_array
518 ligand_data['morgan_highlight_atoms'] = morgan_highlight_atoms
519 ligand_data['mol_block'] = mol_block
520 ligand_data['pubchem'] = cid
521 ligand_map['match']['ref']['pubchem'] = str(cid)
522 # Add the ligand data to the list of ligands data
523 ligands_data.append(ligand_data)
524 # Add the ligand map to the list of ligand maps
525 ligand_maps.append(ligand_map)
527 # Export ligands to a file
528 save_json(ligands_data, output_filepath)
530 # Log matched ligands
531 if not ligand_maps:
532 print('No ligands were matched')
533 else:
534 print('Matched ligands:')
535 for ligand_map in ligand_maps:
536 residue_indices = ligand_map["residue_indices"]
537 residue_count = len(residue_indices)
538 plural_suffix = '' if residue_count == 1 else 's'
539 print(f' - {ligand_map["name"]}: {residue_count} residue{plural_suffix}')
541 return ligand_maps
543# Set the expected ligand data fields
544LIGAND_DATA_FIELDS = set(['name', 'pubchem', 'drugbank', 'chembl', 'smiles', 'formula', 'morgan', 'mordred', 'pdbid'])
546# Import ligands json file so we do not have to rerun this process
547def import_ligands (output_filepath : str) -> dict:
548 # Read the file
549 imported_ligands = load_json(output_filepath)
550 # Format data as the process expects to find it
551 for imported_ligand in imported_ligands:
552 for expected_field in LIGAND_DATA_FIELDS:
553 if expected_field not in imported_ligand:
554 imported_ligand[expected_field] = None
555 return imported_ligands
557# Check if the current input ligand is already among the ligands we already have data for
558def obtain_ligand_data_from_file ( ligands_data : list[dict], ligand: dict ) -> Optional[dict]:
559 for ligand_data in ligands_data:
560 if (ligand.get('pubchem') and ligand['pubchem'] == ligand_data.get('pubchem')) or \
561 (ligand.get('drugbank') and ligand['drugbank'] == ligand_data.get('drugbank')) or \
562 (ligand.get('chembl') and ligand['chembl'] == ligand_data.get('chembl')):
563 return ligand_data
564 return None
566# Given an input ligand, obtain all necessary data
567def obtain_ligand_data_from_pubchem (ligand : dict) -> dict:
568 # Save in a dictionary all ligand data including its name and ids
569 # The ID can be of the databases: 'drugbank' , 'pubchem' , 'chembl'
570 # Define the needed variables to check if the ligand has a database ID or it is None
571 ligand_data = {
572 'name': None,
573 'pubchem': None,
574 'drugbank': None,
575 'chembl': None,
576 'smiles': None,
577 'formula': None,
578 'pdbid': None,
579 }
580 # Set ligand data pubchem id, even if the input id is not from pubhcme (e.g. drugbank, chembl)
581 if 'pubchem' in ligand:
582 ligand_data['pubchem'] = str(ligand.get('pubchem'))
583 elif 'drugbank' in ligand:
584 ligand_data['drugbank'] = ligand.get('drugbank')
585 ligand_data['pubchem'] = str(find_drugbank_pubchem(ligand_data['drugbank']))
586 elif 'chembl' in ligand:
587 ligand_data['chembl'] = ligand.get('chembl')
588 ligand_data['pubchem'] = str(find_chembl_pubchem(ligand_data['chembl']))
589 else:
590 raise InputError('None of the ligand IDs are defined. Please provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.')
592 # Request ligand data from pubchem
593 pubchem_data = get_pubchem_data(ligand_data['pubchem'])
594 if not pubchem_data:
595 raise RuntimeError('No PubChem data avilable')
597 # Add pubchem data to ligand data
598 ligand_data = { **ligand_data, **pubchem_data }
599 return ligand_data
601# RUBEN: mover a Structure.count_atom_elements_per_residue
602def count_atom_elements_per_residue ( structure : 'Structure' ) -> dict:
603 """For each residue, count the number of atom elements."""
604 # Obtain a list of residues of ligand candidates
605 residue_element_count = {}
606 # Iterate over the residue(s)
607 for residue in structure.residues:
608 # Obtain the list of elements for each residue
609 atom_elements = [ atom.element for atom in residue.atoms ]
610 # Generate a dict with elements (keys) and their counting (values)
611 atoms_count = {}
612 for atom in atom_elements:
613 if atom in atoms_count:
614 atoms_count[atom] += 1
615 else:
616 atoms_count[atom] = 1
617 # Define the key of the dictionary as the residue name and save it in a list with the all the residues
618 residue_element_count[residue] = atoms_count
620 return residue_element_count
622def nest_brackets(tokens, i = 0):
623 l = []
624 while i < len(tokens):
625 if tokens[i] == ')':
626 return i,l
627 elif tokens[i] == '(':
628 i,subl = nest_brackets(tokens, i+1)
629 l.append(subl)
630 else:
631 l.append(tokens[i])
632 i += 1
633 return i,l
635def parse_compound(formula : str) -> list[str]:
636 tokens = [''.join(t) for t in split_when(formula, lambda a,b: b.isupper() or b in '()' or (b.isdigit() and not a.isdigit()))]
637 tokens = [(int(t) if t.isdigit() else t) for t in tokens]
638 i, l = nest_brackets(tokens)
639 assert(i == len(tokens)) # crash if unmatched ')'
640 return l
642# Split a string using a function
643def split_when(string : str, func : Callable) -> list[str]:
644 splits = []
645 last_split = string[0]
646 for s in string[1:]:
647 if func(last_split, s):
648 splits.append(last_split)
649 last_split = s
650 else:
651 last_split += s
652 splits.append(last_split)
653 return splits
655# Given a chemical formula, get the count of atoms per element
656def count_atom_elements (molecular_formula : str) -> dict:
657 # Clean the formula from charges
658 # These charges include numbers which are not atom counts (e.g. 49867169 -> C18H16NO5PS-2)
659 parsed_molecular_formula = re.sub('[+-][0-9]*', '', molecular_formula)
660 l = parse_compound(parsed_molecular_formula)
661 c = parse_splits(l)
662 return c
664# This function associates elements in a list
665# If a string is followed by a number then they go together
666# If a string has no number then the number is 1 for this specific string
667def parse_splits (splits : list[str]) -> dict:
668 parsed = {}
669 previous_key = None
670 for split in splits:
671 if type(split) == str:
672 if previous_key:
673 parsed[previous_key] = 1
674 previous_key = split
675 elif type(split) == int:
676 parsed[previous_key] = split
677 previous_key = None
678 else:
679 raise ValueError('Not supported type ' + type(split))
680 if previous_key:
681 parsed[previous_key] = 1
682 return parsed
684def match_ligandsID_to_res (ligand_atom_element_count : dict, residue_atom_element_count : dict) -> bool:
685 # Remove hydrogens since we only pay attention to heavy atoms
686 # This is to avoid having mismatched_residues due to different protonation states
687 if 'H' in ligand_atom_element_count:
688 del ligand_atom_element_count['H']
689 if 'H' in residue_atom_element_count:
690 del residue_atom_element_count['H']
691 #shared_items = {k: ligand_atom_element_count[k] for k in ligand_atom_element_count if k in residue_atom_element_count and ligand_atom_element_count[k] == residue_atom_element_count[k]}
692 ligand_elements = set(ligand_atom_element_count.keys())
693 residue_elements = set(residue_atom_element_count.keys())
695 # Check if there are different elements
696 # If so, we are done
697 different_keys = ligand_elements.symmetric_difference(residue_elements)
698 if len(different_keys) > 0:
699 # print("Elements that are not matching:")
700 # for atom in different_keys:
701 # if atom in ligand_atom_element_count:
702 # print(f"In Pubchem {list(ligand_atom_element_count.keys())[0]} : {atom}: {ligand_atom_element_count[atom]}")
703 # else:
704 # print(f"In PDB residue {list(residue_atom_element_count.keys())[0]}: {atom}: {residue_atom_element_count[atom]}")
705 return False
707 # Different values
708 different_values = []
709 for element in ligand_elements:
710 if ligand_atom_element_count[element] != residue_atom_element_count[element]:
711 different_values.append(element)
713 # If the count of elements does not match then stop here
714 if len(different_values) > 0:
715 # Print the differences found between the elements
716 # print("Number of atoms that are different:")
717 # for atom in different_values:
718 # print(f"In Pubchem {list(ligand_atom_element_count.keys())[0]} {atom}: {ligand_atom_element_count[atom]}, In PDB residue {list(pdb_dict.keys())[0]}: {atom}: {residue_atom_element_count[atom]}\n")
719 return False
721 return True
723def map_ligand_residues (structure : 'Structure', ligand_data : dict) -> dict:
724 # Load the structure where the atoms will be minresidue.indexed and obtain a residues dict
725 atom_elements_per_residue = count_atom_elements_per_residue(structure)
726 # Get the ligand formula
727 ligand_formula = ligand_data['formula']
728 if not ligand_formula:
729 raise RuntimeError(f'Ligand with PubChem id {ligand_data["pubchem"]} is missing formula')
730 # From pubchem mine the atoms of the ligands and save it in a dict
731 ligand_atom_element_count = count_atom_elements(ligand_data['formula'])
732 matched_residues = []
733 # Get ligand pubchem id
734 pubchem_id = ligand_data['pubchem']
735 for residue, residue_atom_element_count in atom_elements_per_residue.items():
736 if match_ligandsID_to_res(ligand_atom_element_count, residue_atom_element_count):
737 matched_residues.append(residue.index)
738 # Format the output as we expect it
739 ligand_map = { 'name': pubchem_id, 'residue_indices': matched_residues, 'match': { 'ref': { 'pubchem': pubchem_id } } }
740 return ligand_map
742# Given a smiles, get the pubchem id
743# e.g. CC1=C(C=NC=C1)NC(=O)CC2=CC(=CC(=C2)Cl)OC -> 154876006
744# e.g. O=C4N3C(C(=O)Nc2cc(nn2c1ccccc1)C)C(SC3CC=CC4NC(=O)C(NC)C)(C)C -> None
745def smiles_to_pubchem_id (smiles : str) -> Optional[str]:
746 # Set the request URL
747 request_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON'
748 # Set the POST data
749 data = urlencode({ 'smiles': smiles }).encode()
750 try:
751 with urlopen(request_url, data=data) as response:
752 parsed_response = json.loads(response.read().decode("utf-8"))
753 # If the smiles is not found in pubchem then we can stop here
754 except HTTPError as error:
755 if error.code == 404:
756 print(f' Smiles {smiles} not found')
757 return None
758 else:
759 raise ValueError('Something went wrong with the PubChem request: ' + request_url)
760 # Get the PubChem id
761 compounds = parsed_response.get('PC_Compounds', None)
762 if not compounds:
763 raise RuntimeError(f'Something went wrong when mining pubchem data for SMILES {smiles}')
764 # Make sure there is only 1 compound
765 # DANI: Esto algún día podría no ser así, ya lo gestionaremos
766 if len(compounds) != 1:
767 raise RuntimeError(f'There should be one and only one compound for SMILES {smiles}')
768 # Keep mining
769 compound = compounds[0]
770 first_id = compound.get('id', None)
771 # If there is no first id then it means there is no direct match with pubchem
772 if not first_id:
773 return None
774 second_id = first_id.get('id', None)
775 if not second_id:
776 raise RuntimeError(f'Missing second id when mining pubchem data for SMILES {smiles}')
777 pubchem_id = second_id.get('cid', None)
778 if not pubchem_id:
779 raise RuntimeError(f'Missing pubchem id when mining pubchem data for SMILES {smiles}')
780 return str(pubchem_id)
782# Given a PDB ligand code, get its pubchem
783def pdb_ligand_to_pubchem (pdb_ligand_id : str) -> Optional[str]:
784 # Set the request query
785 query = '''query molecule($id:String!){
786 chem_comp(comp_id:$id) { rcsb_chem_comp_related{ resource_name resource_accession_code } }
787 }'''
788 # Request PDB data
789 parsed_response = request_pdb_data(pdb_ligand_id, query)
790 related_resources = parsed_response['rcsb_chem_comp_related']
791 # It may happend that a ligand code has no related resources at all
792 # e.g. ZN
793 if not related_resources: return None
794 pubchem_resource = next((resource for resource in related_resources if resource['resource_name'] == 'PubChem'), None)
795 if not pubchem_resource: return None
796 return pubchem_resource['resource_accession_code']
798# Given a PDB ligand code, get its pubchem
799# Use a web crawler to avoid having to use the PDB API
800def pdb_ligand_to_pubchem_RAW (pdb_ligand_id : str) -> Optional[str]:
801 # Set the request URL
802 request_url = f'https://www.rcsb.org/ligand/{pdb_ligand_id}'
803 # Run the query
804 parsed_response = None
805 try:
806 with urlopen(request_url) as response:
807 parsed_response = response.read().decode("utf-8")
808 # If the accession is not found in the PDB then we can stop here
809 except HTTPError as error:
810 if error.code == 404:
811 print(f' PDB ligand {pdb_ligand_id} not found')
812 return None
813 else:
814 print(error.msg)
815 raise ValueError('Something went wrong with the PDB ligand request: ' + request_url)
816 # Mine the pubchem id out of the whole response
817 pattern = re.compile('pubchem.ncbi.nlm.nih.gov\/compound\/([0-9]*)\"')
818 match = re.search(pattern, parsed_response)
819 # If there is no pubchem id then return none
820 # This is normal for some ligands such as counter ions (e.g. LI)
821 if not match:
822 return None
823 pubchem_id = match[1]
824 return pubchem_id
826# Given a PDB ligand code, get its pubchem
827# Ask to pubchem if the ligand exists and hope there is only one result
828# DANI: No se ha provado a fondo
829def pdb_ligand_to_pubchem_RAW_RAW (pdb_ligand_id : str) -> Optional[str]:
830 # Set the request URL
831 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{pdb_ligand_id}/json'
832 # Run the query
833 parsed_response = None
834 try:
835 with urlopen(request_url) as response:
836 parsed_response = json.loads(response.read().decode("utf-8"))
837 # If the accession is not found in the PDB then we can stop here
838 except HTTPError as error:
839 # This may happen for weird things such as UNX (unknown atom or ion)
840 if error.code == 404:
841 return None
842 print(error.msg)
843 raise RuntimeError('Something went wrong with the PDB ligand request in PubChem: ' + request_url)
844 # Mine the pubchem id
845 compounds = parsed_response['PC_Compounds']
846 if len(compounds) != 1:
847 # There could be more than one result
848 # For example, the case HEM: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/HEM/json
849 # In this case we picked the first result
850 compound = compounds[0]
851 #raise RuntimeError('We are not having 1 and only 1 result from PubChem: ' + request_url)
852 compound = compounds[0]
853 id1 = compound['id']
854 id2 = id1['id']
855 pubchem_id = str(id2['cid'])
856 return pubchem_id
858# Given a PDB id, get all its ligand codes
859# e.g. 2I3I -> 618, BTB, ZN, EDO, LI
860def get_pdb_ligand_codes (pdb_id : str) -> list[str]:
861 # Set the request query
862 query = '''query structure($id: String!) {
863 entry(entry_id: $id) {
864 nonpolymer_entities { nonpolymer_comp { chem_comp { id } } }
865 }
866 }'''
867 # Request PDB data
868 parsed_response = request_pdb_data(pdb_id, query)
869 # Mine data for nonpolymer entities
870 nonpolymers = parsed_response['nonpolymer_entities']
871 # If there are no nonpolymer entities, another type of entitie could be used
872 # AGUS: esto es un caso muy concreto que me encontré con el PDB 1N3W
873 # AGUS: el ligando en este caso es una 'Biologically Interesting Molecules' y se muestran como 'PRD_'
874 prd_code = None
875 if nonpolymers == None:
876 # Get the prd ligand code
877 prd_code = get_prd_ligand_code(pdb_id)
878 if prd_code != None:
879 return [prd_code]
881 if nonpolymers == None and prd_code == None: return []
882 # Iterate nonpolymer entities to mine each PDB code
883 ligand_codes = []
884 for nonpolymer in nonpolymers:
885 ligand_code = nonpolymer['nonpolymer_comp']['chem_comp']['id']
886 ligand_codes.append(ligand_code)
887 print(f' Ligand codes for PDB id {pdb_id}: ' + ', '.join(ligand_codes))
888 return ligand_codes
890# Given a PDB id, get its PRD ligand code
891def get_prd_ligand_code (pdb_id : str) -> Optional[str]:
892 query = '''query structure($id: String!) {
893 entry(entry_id: $id) {
894 pdbx_molecule_features {
895 prd_id
896 }
897 }
898 }'''
899 parsed_response = request_pdb_data(pdb_id, query)
900 pdbx_id = parsed_response.get('pdbx_molecule_features', None)
901 if pdbx_id is None:
902 print(f'WARNING: Cannot find PRD ligand code for PDB id {pdb_id}')
903 return None
904 prd_id = pdbx_id[0].get('prd_id', None) # AGUS: podría haber casos donde haya más de uno?
905 if prd_id is None:
906 print(f'WARNING: Cannot find PRD ligand code for PDB id {pdb_id}')
907 return None
908 # If the PRD id is not empty then return it
909 return prd_id
911# Given a pdb id, get its pubchem ids
912# DANI: De momento no usamos las SMILES que alguna vez me han dado problemas (e.g. 2I3I)
913# e.g. 4YDF ->
914def pdb_to_pubchem (pdb_id : str) -> list[str]:
915 print(f'Searching PubChem ids for PDB {pdb_id}')
916 pubchem_ids = []
917 # Iterate over pdb ligand codes
918 ligand_codes = get_pdb_ligand_codes(pdb_id)
919 for ligand_code in ligand_codes:
920 # Ask the PDB API for the ligand
921 pubchem_id = pdb_ligand_to_pubchem(ligand_code)
922 # If this did not work then try mining the PDB client with a web crawler
923 if not pubchem_id:
924 pubchem_id = pdb_ligand_to_pubchem_RAW(ligand_code)
925 # If this did not work then try it from PubChem
926 if not pubchem_id:
927 pubchem_id = pdb_ligand_to_pubchem_RAW_RAW(ligand_code)
928 # Otherwise we surrender
929 if not pubchem_id:
930 print(f' {ligand_code} -> No PubChem id')
931 continue
932 print(f' {ligand_code} -> {pubchem_id}')
933 pubchem_ids.append(pubchem_id)
935 return pubchem_ids
937@lru_cache(maxsize=None)
938def search_cid_by_inchikey(inchikey : str) -> Optional[str]:
939 url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/{inchikey}/cids/JSON"
940 r = requests.get(url)
941 if r.ok:
942 data = r.json()
943 return data['IdentifierList']['CID'][0]
944 else:
945 return None