Coverage for model_workflow/tools/generate_ligands_desc.py: 62%
547 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
1import os
2import json
3from rdkit import Chem
4from rdkit.Chem import AllChem
5from rdkit.Chem import rdFingerprintGenerator
6from mordred import Calculator, descriptors
7from model_workflow.utils.constants import LIGANDS_MATCH_FLAG, PDB_TO_PUBCHEM, NOT_MATCHED_LIGANDS
8from model_workflow.utils.auxiliar import InputError, load_json, save_json, request_pdb_data
9from model_workflow.utils.type_hints import *
10from model_workflow.utils.structures import Structure
11from urllib.request import Request, urlopen
12from urllib.parse import urlencode
13from urllib.error import HTTPError, URLError
14import re
16def get_drugbank_smiles (id_drugbank : str) -> Optional[str]:
17 # Request Drugbank
18 request_url = Request(
19 url= f'https://go.drugbank.com/structures/small_molecule_drugs/{id_drugbank}.smiles',
20 headers={'User-Agent': 'Mozilla/5.0'}
21 )
22 try:
23 with urlopen(request_url) as response:
24 smiles = response.read()
25 # If the accession is not found in the database then we stop here
26 except HTTPError as error:
27 # If the drugbank ID is not yet in the Drugbank references then return None
28 if error.code == 404:
29 return None
30 else:
31 print('Error when requesting ' + request_url)
32 raise ValueError('Something went wrong with the Drugbank request (error ' + str(error.code) + ')')
33 # This error may occur if there is no internet connection
34 except URLError as error:
35 print('Error when requesting ' + request_url)
36 raise ValueError('Something went wrong with the MDposit request')
38 return smiles
42# Given a ChemBL ID, use the uniprot API to request its data and then mine what is needed for the database
43def get_chembl_smiles (id_chembl : str) -> Optional[str]:
44 # Request ChemBL
45 parsed_response = None
46 request_url = Request(
47 url= f'https://www.ebi.ac.uk/chembl/interface_api/es_proxy/es_data/get_es_document/chembl_molecule/{id_chembl}',
48 headers={'User-Agent': 'Mozilla/5.0'}
49 )
50 try:
51 with urlopen(request_url) as response:
52 parsed_response = json.loads(response.read().decode("utf-8"))
53 smiles = parsed_response['_source']['molecule_structures']['canonical_smiles']
54 pubchem_id = parsed_response['_source']['_metadata']['unichem'][8]['id']
55 # If the accession is not found in ChemBL then the id is not valid
56 except HTTPError as error:
57 if error.code == 404:
58 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)
59 return None
60 print('Error when requesting ' + request_url)
61 raise ValueError('Something went wrong with the ChemBL request (error ' + str(error.code) + ')')
62 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607)
63 if parsed_response == None:
64 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)
65 return None
66 return smiles
68# Given a PubChem ID, use the uniprot API to request its data and then mine what is needed for the database
69def get_pubchem_data (id_pubchem : str) -> Optional[dict]:
70 # Request PubChem
71 parsed_response = None
72 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{id_pubchem}/JSON/'
73 try:
74 with urlopen(request_url) as response:
75 #parsed_response = json.loads(response.read().decode("windows-1252"))
76 parsed_response = json.loads(response.read().decode("utf-8", errors='ignore'))
77 # If the accession is not found in PubChem then the id is not valid
78 # This may happen with pubchem ids of non-discrete compounds (e.g. 483927498)
79 except HTTPError as error:
80 if error.code == 404:
81 print('WARNING: Cannot find PubChem entry for accession ', id_pubchem)
82 return None
83 print('Error when requesting ', request_url)
84 raise ValueError('Something went wrong with the PubChem request (error ', str(error.code), ')')
85 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607)
86 if parsed_response == None:
87 print('WARNING: Cannot find PubChem entry for accession ' + id_pubchem)
88 return None
89 # Mine target data: SMILES
90 record = parsed_response.get('Record', None)
91 if record == None:
92 raise RuntimeError('Wrong Pubchem data structure: no record: ' + request_url)
93 sections = record.get('Section', None)
94 if sections == None:
95 raise RuntimeError('Wrong Pubchem data structure: no sections: ' + request_url)
96 names_and_ids_section = next((section for section in sections if section.get('TOCHeading', None) == 'Names and Identifiers'), None)
97 if names_and_ids_section == None:
98 raise RuntimeError('Wrong Pubchem data structure: no name and ids section: ' + request_url)
99 names_and_ids_subsections = names_and_ids_section.get('Section', None)
100 if names_and_ids_subsections == None:
101 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)
103 # Mine the name
104 synonims = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Synonyms'), None)
105 if synonims == None:
106 descriptors = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None)
107 descriptors_subsections = descriptors.get('Section', None)
108 if descriptors_subsections == None:
109 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)
110 depositor_supplied_descriptors = next((s for s in descriptors_subsections if s.get('TOCHeading', None) == 'IUPAC Name'), None)
111 name_substance = depositor_supplied_descriptors.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
112 else:
113 synonims_subsections = synonims.get('Section', None)
114 if synonims_subsections == None:
115 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)
116 depositor_supplied_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Depositor-Supplied Synonyms'), None)
117 if depositor_supplied_synonims == None:
118 removed_synonims = next((s for s in synonims_subsections if s.get('TOCHeading', None) == 'Removed Synonyms'), None)
119 name_substance = removed_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
120 else:
121 name_substance = depositor_supplied_synonims.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
123 # Mine the SMILES
124 computed_descriptors_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Computed Descriptors'), None)
125 if computed_descriptors_subsection == None:
126 raise RuntimeError('Wrong Pubchem data structure: no computeed descriptors: ' + request_url)
127 canonical_smiles_section = computed_descriptors_subsection.get('Section', None)
128 if canonical_smiles_section == None:
129 raise RuntimeError('Wrong Pubchem data structure: no canonical SMILES section: ' + request_url)
130 canonical_smiles = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'Canonical SMILES'), None)
131 if canonical_smiles == None:
132 # In some cases there is no canonical SMILES but a non-canonical one could exists
133 non_canonical_smiles_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'SMILES'), None)
134 if non_canonical_smiles_section == None:
135 raise RuntimeError('Wrong Pubchem data structure: no canonical SMILES: ' + request_url)
137 if canonical_smiles:
138 smiles = canonical_smiles.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
139 if non_canonical_smiles_section:
140 smiles = non_canonical_smiles_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
142 if smiles == None:
143 raise RuntimeError('Wrong Pubchem data structure: no SMILES: ' + request_url)
145 # Mine target data: MOLECULAR FORMULA
146 molecular_formula_subsection = next((s for s in names_and_ids_subsections if s.get('TOCHeading', None) == 'Molecular Formula'), None)
147 if molecular_formula_subsection == None:
148 raise RuntimeError('Wrong Pubchem data structure: no molecular formula section: ' + request_url)
149 molecular_formula = molecular_formula_subsection.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
150 if molecular_formula == None:
151 raise RuntimeError('Wrong Pubchem data structure: no molecular formula: ' + request_url)
153 # Mine target data: PDB ID
154 pdb_id = None
155 pdb_id_subsection = next((s for s in sections if s.get('TOCHeading', None) == 'Interactions and Pathways'), None)
156 # If this section is missing then it means this PubChem compound has no PDB id
157 if pdb_id_subsection:
158 pdb_id_subsections = pdb_id_subsection.get('Section', None)
159 if pdb_id_subsections == None:
160 raise RuntimeError('Wrong Pubchem data structure: no name and ids subsections: ' + request_url)
161 bond_structures = next((s for s in pdb_id_subsections if s.get('TOCHeading', None) == 'Protein Bound 3D Structures'), None)
162 if bond_structures:
163 bond_structures_section = bond_structures.get('Section', None)
164 # If this section is missing then it means this PubChem compound has no PDB id
165 if bond_structures_section:
166 ligands_structure = next((s for s in bond_structures_section if s.get('TOCHeading', None) == 'Ligands from Protein Bound 3D Structures'), None)
167 if ligands_structure == None:
168 raise RuntimeError('Wrong Pubchem data structure: no Ligands from Protein Bound 3D Structures section: ' + request_url)
169 ligands_structure_subsections = ligands_structure.get('Section', None)
170 if ligands_structure_subsections == None:
171 raise RuntimeError('Wrong Pubchem data structure: no Ligands from Protein Bound 3D Structures subsections: ' + request_url)
172 ligands_pdb = next((s for s in ligands_structure_subsections if s.get('TOCHeading', None) == 'PDBe Ligand Code'), None)
173 if ligands_pdb == None:
174 raise RuntimeError('Wrong Pubchem data structure: no PDBe Ligand Code: ' + request_url)
175 pdb_id = ligands_pdb.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
177 # Mine de INCHI and INCHIKEY
178 inchi_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChI'), None)
179 if inchi_section == None:
180 raise RuntimeError('Wrong Pubchem data structure: no InChI: ' + request_url)
181 if inchi_section:
182 inchi = inchi_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
184 inchikey_section = next((s for s in canonical_smiles_section if s.get('TOCHeading', None) == 'InChIKey'), None)
185 if inchikey_section == None:
186 raise RuntimeError('Wrong Pubchem data structure: no InChIKey: ' + request_url)
187 if inchikey_section:
188 inchikey = inchikey_section.get('Information', None)[0].get('Value', {}).get('StringWithMarkup', None)[0].get('String', None)
189 # Prepare the pubchem data to be returned
190 return { 'name': name_substance, 'smiles': smiles, 'formula': molecular_formula , 'pdbid': pdb_id, 'inchi': inchi, 'inchikey': inchikey }
193def find_drugbank_pubchem (drugbank_id):
194 # Request Drugbank
195 request_url = Request(
196 url=f'https://go.drugbank.com/drugs/{drugbank_id}',
197 headers={
198 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
199 'Accept-Language': 'en-US,en;q=0.9',
200 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
201 }
202)
203 pubchem_id = None
204 try:
205 with urlopen(request_url) as response:
206 content = response.read().decode("utf-8")
207 pattern = re.compile("http\:\/\/pubchem.ncbi.nlm.nih.gov\/summary\/summary.cgi\?cid\=([0-9]*)")
208 match = re.search(pattern, str(content))
209 if not match:
210 raise ValueError("No se encontró información sobre el Pubchem compound en esta página.")
211 pubchem_id = match[1]
212 if not pubchem_id:
213 raise ValueError("No se encontró información sobre el Pubchem compound en esta página.")
214 # If the accession is not found in the database then we stop here
215 except HTTPError as error:
216 # If the drugbank ID is not yet in the Drugbank references then return None
217 raise ValueError(f'Wrong request. Code: {error.code}')
218 # This error may occur if there is no internet connection
219 except URLError as error:
220 print('Error when requesting ' + request_url)
221 raise ValueError('Something went wrong with the DrugBank request')
223 return pubchem_id
226def find_chembl_pubchem (id_chembl):
227 # Request ChemBL
228 parsed_response = None
229 request_url = Request(
230 url= f'https://www.ebi.ac.uk/chembl/interface_api/es_proxy/es_data/get_es_document/chembl_molecule/{id_chembl}',
231 headers={'User-Agent': 'Mozilla/5.0'}
232 )
233 try:
234 with urlopen(request_url) as response:
235 parsed_response = json.loads(response.read().decode("utf-8"))
236 unichem = parsed_response['_source']['_metadata']['unichem']
237 if unichem == None:
238 raise RuntimeError('Wrong Pubchem data structure: no unichem section')
239 pubchem_section = next((section for section in unichem if section.get('src_url', None) == "http://pubchem.ncbi.nlm.nih.gov"), None)
240 if pubchem_section == None:
241 raise RuntimeError('Wrong Pubchem data structure: no pubchem section')
242 pubchem_id = pubchem_section.get('id', None)
243 if pubchem_id == None:
244 raise RuntimeError('Wrong Pubchem data structure: no pubchem ID')
245 # If the accession is not found in ChemBL then the id is not valid
246 except HTTPError as error:
247 if error.code == 404:
248 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)
249 return None
250 print('Error when requesting ' + request_url)
251 raise ValueError('Something went wrong with the ChemBL request (error ' + str(error.code) + ')')
252 # If we have not a response at this point then it may mean we are trying to access an obsolete entry (e.g. P01607)
253 if parsed_response == None:
254 print('WARNING: Cannot find ChemBL entry for accession ' + id_chembl)
255 return None
257 return pubchem_id
259# Calculate the Morgan fingerprint and the Mordred descriptors from a ligand SMILES
260def obtain_mordred_morgan_descriptors (smiles : str) -> Tuple:
261 mol = Chem.MolFromSmiles(smiles)
262 if mol is None:
263 print(f'WARNING: Cannot generate a molecule from SMILES {smiles}')
264 return None
266 AllChem.Compute2DCoords(mol)
267 mol_block = Chem.MolToMolBlock(mol)
268 # We can select the different submodules of mordred descriptors, avaible in: 'https://mordred-descriptor.github.io/documentation/master/'
269 calc = Calculator([
270 descriptors.ABCIndex, # Índice de ramificación
271 descriptors.AcidBase.AcidicGroupCount, # Grupos ácidos
272 descriptors.AcidBase.BasicGroupCount, # Grupos básicos
273 descriptors.RingCount, # Conteo de anillos
274 descriptors.Constitutional, # Propiedades generales como número de átomos, peso molecular
275 descriptors.TopologicalCharge, # Índices topológicos, Cargas parciales, polaridad
276 descriptors.HydrogenBond, # Donantes y aceptores de enlaces de hidrógeno
277 descriptors.Lipinski, # Reglas de Lipinski (drug-likeness)
278 descriptors.FragmentComplexity, # Identificación de subestructuras frecuentes
279 descriptors.PathCount, # Conteo de caminos moleculares
280 ], ignore_3D=True)
283 # Calculate Mordred results
284 mordred_results = calc(mol).drop_missing().asdict()
286 ######## MORGAN FINGERPRINT ###########
287 morgan_fp_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
288 ao = rdFingerprintGenerator.AdditionalOutput()
289 ao.AllocateAtomCounts()
290 ao.AllocateAtomToBits()
291 ao.AllocateBitInfoMap()
292 fp = morgan_fp_gen.GetFingerprint(mol,additionalOutput=ao)
293 morgan_fp_bit_array = list(fp)
294 morgan_highlight_atoms = {}
295 for bit, atoms in ao.GetBitInfoMap().items():
296 morgan_highlight_atoms[bit] = list(set(atom for atom, radius in atoms))
298 return mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block
300# Check the cache to see if the ligand has already been matched or not, if not the ligand is skipped
301def check_matched_ligand (ligand: dict, ligand_data: dict, cache: 'Cache') -> bool:
302 # The pubchem id could be in the ligands data or the ligand input
303 pubchem_id_1 = ligand.get('pubchem', None)
304 if ligand_data != None:
305 pubchem_id_2 = ligand_data.get('pubchem', None)
306 else:
307 pubchem_id_2 = None
308 # If we have no pubchem id then we cannot check if the ligand matched
309 if not pubchem_id_1 and not pubchem_id_2:
310 return False
311 # If we have a pubchem id then check if it is already in the cache
312 cache_not_matched = cache.retrieve(NOT_MATCHED_LIGANDS, [])
313 for pubchem in cache_not_matched:
314 if pubchem == pubchem_id_1 or pubchem == pubchem_id_2:
315 print(f" Ligand with PubChem id {pubchem} didn't match before, skipping it")
316 return True
318 return False
320def generate_ligand_mapping (
321 structure : 'Structure',
322 cache : 'Cache',
323 input_ligands : Optional[List[dict]],
324 pdb_ids : List[str],
325 output_filepath : str,
326 mercy : List[str] = [],
327 ) -> List[dict]:
329 """Generate a map of residues associated to ligands."""
330 # Merge input ligands and pdb ligands
331 ligands = []
332 if input_ligands:
333 ligands += input_ligands
334 # Check we have cached pdb 2 pubchem values
335 pdb_to_pubchem_cache = cache.retrieve(PDB_TO_PUBCHEM, {})
336 new_data_to_cache = False
337 # Get input ligands from the pdb ids, if any
338 for pdb_id in pdb_ids:
339 # Check we have cached this specific pdb
340 pubchem_ids_from_pdb = pdb_to_pubchem_cache.get(pdb_id, None)
341 if pubchem_ids_from_pdb != None:
342 print(f' Retriving from cache PubChem ids for PDB id {pdb_id}: ')
343 if len(pubchem_ids_from_pdb) > 0:
344 print(' PubChem ids: ' + ', '.join(pubchem_ids_from_pdb))
345 else:
346 print(' This PDB id has no PubChem ids')
348 # If we had no cached pdb 2 pubchem then ask for them
349 if pubchem_ids_from_pdb == None:
350 pubchem_ids_from_pdb = pdb_to_pubchem(pdb_id)
351 # Save the result in the cache object so it is saved to cache later
352 pdb_to_pubchem_cache[pdb_id] = pubchem_ids_from_pdb
353 new_data_to_cache = True
354 for pubchem_id in pubchem_ids_from_pdb:
355 # Ligands in the structure (PDB) and the 'inputs.json' could be the same so it's not necessary to do it twice
356 if not any('pubchem' in ligand and ligand['pubchem'] == pubchem_id for ligand in ligands):
357 ligands.append({ 'pubchem': pubchem_id, 'pdb': True })
358 # Save all pdb to pubchem results in cache, in case there is anything new
359 if new_data_to_cache: cache.update(PDB_TO_PUBCHEM, pdb_to_pubchem_cache)
361 # If no input ligands are passed then stop here
362 if len(ligands) == 0:
363 return []
365 # Save data from all ligands to be saved in a file
366 json_ligands_data = []
367 ligands_data = []
369 # Load the ligands file if exists already
370 if os.path.exists(output_filepath):
371 json_ligands_data += import_ligands(output_filepath)
372 # If json ligands exists and it is empty means that ligands analysis has been already done but no ligands were matched
373 # so the file will contain an empty list []
374 if len(json_ligands_data) == 0:
375 print('No ligands have been matched yet.\nIf you want to force a ligand to be matched, please provide the field "residues" in the inputs.json file.')
376 return []
378 # Visited formulas
379 visited_formulas = []
380 # Save the maps of every ligand
381 ligand_maps = []
382 # Get cached ligand data
383 # AGUS: esto lo creamos por alguna razón para que funcione sin internet (en el cluster) pero realmente
384 # AGUS: llena el cache con datos que no son necesarios porque toda esa info está en el ligand_references.json
385 #ligand_data_cache = cache.retrieve(LIGANDS_DATA, {})
386 # Iterate input ligands
387 for ligand in ligands:
388 # Set the pubchem id which may be assigned in different steps
389 pubchem_id = None
390 # If input ligand is not a dict but a single int/string then handle it
391 if type(ligand) == int:
392 print(f'A ligand number ID has been identified {ligand}, assuming that is a PubChem ID...')
393 ligand = { 'pubchem': str(ligand) }
394 elif type(ligand) == str:
395 raise InputError(f'A name of ligand has been identified: {ligand}. Anyway, provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.')
396 # Check if we already have this ligand data
397 ligand_data = obtain_ligand_data_from_file(json_ligands_data, ligand)
398 # Check if the ligand didn't match before
399 if check_matched_ligand(ligand, ligand_data, cache): continue
400 # If we do not have its data try to get from the cache
401 # if not ligand_data:
402 # # If this is a ligand not in ligans.json but in cache it means it comes from PDB, never form user inputs
403 # # For this reason, this ligand will always have a PubChem id
404 # pubchem_id = ligand.get('pubchem', None)
405 # if pubchem_id:
406 # ligand_data = ligand_data_cache.get(pubchem_id, None)
407 # # If we still have no ligand data then request it to PubChem
408 # if not ligand_data:
409 # ligand_data = obtain_ligand_data_from_pubchem(ligand)
410 # # Save data mined from PubChem in the cache
411 # pubchem_id = ligand_data['pubchem']
412 # ligand_data_cache[pubchem_id] = { **ligand_data }
413 # cache.update(LIGANDS_DATA, ligand_data_cache)
414 # Add current ligand data to the general list
415 if not ligand_data:
416 ligand_data = obtain_ligand_data_from_pubchem(ligand)
417 ligands_data.append(ligand_data)
418 # Get pubchem id
419 pubchem_id = ligand_data.get('pubchem', None)
420 # If we already visited a different ligand but with identical formula then we skip this ligand
421 # Note that the mapping will be identical thus overwritting the previous map
422 # However, ligands forced by the user are processed before so we keep them as priority
423 formula = ligand_data.get('formula', None)
424 if formula in visited_formulas:
425 print(f'WARNING: Ligand with PubChem Id {pubchem_id} has a formula which has been already matched')
426 # AGUS: en este punto si el usuario ha definido el mismo ligando con diferente selección quiere decir que está repetido
427 # AGUS: y que deberíamos mantener el ligando para diferentes residuos matcheados
428 if forced_selection:
429 visited_formulas.pop()
430 ligands_data.pop()
431 print(f'WARNING: Ligand with PubChem Id {pubchem_id} has been forced by the user and its repeated')
432 else:
433 ligands_data.pop()
434 continue
435 # Add it to the list of visited formulas
436 visited_formulas.append(formula)
437 # If the user defined a ligand name, it will be respectedand added to the metadata
438 # if there isn't a defined name, it will be mined from PubChem
439 user_forced_ligand_name = ligand.get('name', None)
440 # Map structure residues with the current ligand
441 # If the user forced the residues then use them
442 forced_selection = ligand.get('selection', None)
443 if forced_selection:
444 # Could be a single residue or a list of residues
445 selection_atoms = structure.select(forced_selection)
446 residue_indices = structure.get_selection_residue_indices(selection_atoms)
447 ligand_map = { 'name': pubchem_id, 'residue_indices': residue_indices, 'match': { 'ref': { 'pubchem': pubchem_id } } }
448 if user_forced_ligand_name: ligand_map['forced_name'] = user_forced_ligand_name
449 # If the user did not force the residues then map them
450 else:
451 ligand_map = map_ligand_residues(structure, ligand_data)
452 # Add current ligand map to the general list
453 ligand_maps.append(ligand_map)
454 # If the ligand did not map then discard it
455 did_not_match = len(ligand_map['residue_indices']) == 0
456 if did_not_match:
457 is_forced_by_user = not bool(ligand.get('pdb', False))
458 if is_forced_by_user:
459 # At this point we should have macthed all sequences
460 # If not, kill the process unless mercy was given
461 must_be_killed = LIGANDS_MATCH_FLAG not in mercy
462 if must_be_killed:
463 raise InputError(f'Ligand with PubChem Id {pubchem_id} did not map with any residue')
464 # Otherwise simply remove it from the final ligand references file and the forwarded maps
465 ligands_data.pop()
466 ligand_maps.pop()
467 # Update the cache with the pubchem id of the ligands that didn't match
468 not_matched_pubchems = cache.retrieve(NOT_MATCHED_LIGANDS, [])
469 not_matched_pubchems.append(ligand_map['name'])
470 cache.update(NOT_MATCHED_LIGANDS, not_matched_pubchems)
471 # If the ligand matched then calculate some additional data
472 # Get morgan and mordred descriptors, the SMILES is needed for this part
473 smiles = ligand_data['smiles']
474 mordred_results, morgan_fp_bit_array, morgan_highlight_atoms, mol_block = obtain_mordred_morgan_descriptors(smiles)
475 ligand_data['mordred'] = mordred_results
476 ligand_data['morgan_fp_bit_array'] = morgan_fp_bit_array
477 ligand_data['morgan_highlight_atoms'] = morgan_highlight_atoms
478 ligand_data['mol_block'] = mol_block
479 # Export ligands to a file
480 save_json(ligands_data, output_filepath)
482 # Log matched ligands
483 if not ligand_maps:
484 print('No ligands were matched')
485 else:
486 print('Matched ligands:')
487 for ligand_map in ligand_maps:
488 residue_indices = ligand_map["residue_indices"]
489 residue_count = len(residue_indices)
490 plural_suffix = '' if residue_count == 1 else 's'
491 print(f' - {ligand_map["name"]}: {residue_count} residue{plural_suffix}')
493 return ligand_maps
495# Set the expected ligand data fields
496LIGAND_DATA_FIELDS = set(['name', 'pubchem', 'drugbank', 'chembl', 'smiles', 'formula', 'morgan', 'mordred', 'pdbid'])
498# Import ligands json file so we do not have to rerun this process
499def import_ligands (output_filepath : str) -> dict:
500 # Read the file
501 imported_ligands = load_json(output_filepath)
502 # Format data as the process expects to find it
503 for imported_ligand in imported_ligands:
504 for expected_field in LIGAND_DATA_FIELDS:
505 if expected_field not in imported_ligand:
506 imported_ligand[expected_field] = None
507 return imported_ligands
509# Check if the current input ligand is already among the ligands we already have data for
510def obtain_ligand_data_from_file ( ligands_data : List[dict], ligand: dict ) -> Optional[dict]:
511 for ligand_data in ligands_data:
512 if (ligand.get('pubchem') and ligand['pubchem'] == ligand_data.get('pubchem')) or \
513 (ligand.get('drugbank') and ligand['drugbank'] == ligand_data.get('drugbank')) or \
514 (ligand.get('chembl') and ligand['chembl'] == ligand_data.get('chembl')):
515 return ligand_data
516 return None
518# Given an input ligand, obtain all necessary data
519def obtain_ligand_data_from_pubchem (ligand : dict) -> dict:
520 # Save in a dictionary all ligand data including its name and ids
521 # The ID can be of the databases: 'drugbank' , 'pubchem' , 'chembl'
522 # Define the needed variables to check if the ligand has a database ID or it is None
523 ligand_data = {
524 'name': None,
525 'pubchem': None,
526 'drugbank': None,
527 'chembl': None,
528 'smiles': None,
529 'formula': None,
530 'pdbid': None,
531 }
532 # Set ligand data pubchem id, even if the input id is not from pubhcme (e.g. drugbank, chembl)
533 if 'pubchem' in ligand:
534 ligand_data['pubchem'] = str(ligand.get('pubchem'))
535 elif 'drugbank' in ligand:
536 ligand_data['drugbank'] = ligand.get('drugbank')
537 ligand_data['pubchem'] = str(find_drugbank_pubchem(ligand_data['drugbank']))
538 elif 'chembl' in ligand:
539 ligand_data['chembl'] = ligand.get('chembl')
540 ligand_data['pubchem'] = str(find_chembl_pubchem(ligand_data['chembl']))
541 else:
542 raise InputError('None of the ligand IDs are defined. Please provide at least one of the following IDs: DrugBank, PubChem, ChEMBL.')
544 # Request ligand data from pubchem
545 pubchem_data = get_pubchem_data(ligand_data['pubchem'])
546 if not pubchem_data:
547 raise RuntimeError('No PubChem data avilable')
549 # Add pubchem data to ligand data
550 ligand_data = { **ligand_data, **pubchem_data }
551 return ligand_data
554# For each residue, count the number of atom elements
555def count_atom_elements_per_residue ( structure : 'Structure' ) -> dict:
556 # Obtain a list of residues of ligand candidates
557 residue_element_count = {}
558 # Iterate over the residue(s)
559 for residue in structure.residues:
560 # Skip protein residues
561 if residue.classification == 'protein':
562 continue
563 # Obtain the list of elements for each residue
564 atom_elements = [ atom.element for atom in residue.atoms ]
565 # Generate a dict with elements (keys) and their counting (values)
566 atoms_count = {}
567 for atom in atom_elements:
568 if atom in atoms_count:
569 atoms_count[atom] += 1
570 else:
571 atoms_count[atom] = 1
572 # Define the key of the dictionary as the residue name and save it in a list with the all the residues
573 residue_element_count[residue] = atoms_count
575 return residue_element_count
578def nest_brackets(tokens, i = 0):
579 l = []
580 while i < len(tokens):
581 if tokens[i] == ')':
582 return i,l
583 elif tokens[i] == '(':
584 i,subl = nest_brackets(tokens, i+1)
585 l.append(subl)
586 else:
587 l.append(tokens[i])
588 i += 1
589 return i,l
591def parse_compound(formula : str) -> List[str]:
592 tokens = [''.join(t) for t in split_when(formula, lambda a,b: b.isupper() or b in '()' or (b.isdigit() and not a.isdigit()))]
593 tokens = [(int(t) if t.isdigit() else t) for t in tokens]
594 i, l = nest_brackets(tokens)
595 assert(i == len(tokens)) # crash if unmatched ')'
596 return l
598# Split a string using a function
599def split_when(string : str, func : Callable) -> List[str]:
600 splits = []
601 last_split = string[0]
602 for s in string[1:]:
603 if func(last_split, s):
604 splits.append(last_split)
605 last_split = s
606 else:
607 last_split += s
608 splits.append(last_split)
609 return splits
611# Given a chemical formula, get the count of atoms per element
612def count_atom_elements (molecular_formula : str) -> dict:
613 # Clean the formula from charges
614 # These charges include numbers which are not atom counts (e.g. 49867169 -> C18H16NO5PS-2)
615 parsed_molecular_formula = re.sub('[+-][0-9]*', '', molecular_formula)
616 l = parse_compound(parsed_molecular_formula)
617 c = parse_splits(l)
618 return c
620# This function associates elements in a list
621# If a string is followed by a number then they go together
622# If a string has no number then the number is 1 for this specific string
623def parse_splits (splits : List[str]) -> dict:
624 parsed = {}
625 previous_key = None
626 for split in splits:
627 if type(split) == str:
628 if previous_key:
629 parsed[previous_key] = 1
630 previous_key = split
631 elif type(split) == int:
632 parsed[previous_key] = split
633 previous_key = None
634 else:
635 raise ValueError('Not supported type ' + type(split))
636 if previous_key:
637 parsed[previous_key] = 1
638 return parsed
641def match_ligandsID_to_res (ligand_atom_element_count : dict, residue_atom_element_count : dict) -> bool:
642 # Remove hydrogens since we only pay attention to heavy atoms
643 # This is to avoid having mismatched_residues due to different protonation states
644 if 'H' in ligand_atom_element_count:
645 del ligand_atom_element_count['H']
646 if 'H' in residue_atom_element_count:
647 del residue_atom_element_count['H']
648 #shared_items = {k: ligand_atom_element_count[k] for k in ligand_atom_element_count if k in residue_atom_element_count and ligand_atom_element_count[k] == residue_atom_element_count[k]}
649 ligand_elements = set(ligand_atom_element_count.keys())
650 residue_elements = set(residue_atom_element_count.keys())
652 # Check if there are different elements
653 # If so, we are done
654 different_keys = ligand_elements.symmetric_difference(residue_elements)
655 if len(different_keys) > 0:
656 # print("Elements that are not matching:")
657 # for atom in different_keys:
658 # if atom in ligand_atom_element_count:
659 # print(f"In Pubchem {list(ligand_atom_element_count.keys())[0]} : {atom}: {ligand_atom_element_count[atom]}")
660 # else:
661 # print(f"In PDB residue {list(residue_atom_element_count.keys())[0]}: {atom}: {residue_atom_element_count[atom]}")
662 return False
664 # Different values
665 different_values = []
666 for element in ligand_elements:
667 if ligand_atom_element_count[element] != residue_atom_element_count[element]:
668 different_values.append(element)
670 # If the count of elements does not match then stop here
671 if len(different_values) > 0:
672 # Print the differences found between the elements
673 # print("Number of atoms that are different:")
674 # for atom in different_values:
675 # print(f"In Pubchem {list(ligand_atom_element_count.keys())[0]} {atom}: {ligand_atom_element_count[atom]}, In PDB residue {list(pdb_dict.keys())[0]}: {atom}: {residue_atom_element_count[atom]}\n")
676 return False
678 return True
680def map_ligand_residues (structure : 'Structure', ligand_data : dict) -> dict:
681 # Load the structure where the atoms will be minresidue.indexed and obtain a residues dict
682 atom_elements_per_residue = count_atom_elements_per_residue(structure)
683 # Get the ligand formula
684 ligand_formula = ligand_data['formula']
685 if not ligand_formula:
686 raise RuntimeError(f'Ligand with PubChem id {ligand_data["pubchem"]} is missing formula')
687 # From pubchem mine the atoms of the ligands and save it in a dict
688 ligand_atom_element_count = count_atom_elements(ligand_data['formula'])
689 matched_residues = []
690 # Get ligand pubchem id
691 pubchem_id = ligand_data['pubchem']
692 for residue, residue_atom_element_count in atom_elements_per_residue.items():
693 if match_ligandsID_to_res(ligand_atom_element_count, residue_atom_element_count):
694 matched_residues.append(residue.index)
695 # Format the output as we expect it
696 ligand_map = { 'name': pubchem_id, 'residue_indices': matched_residues, 'match': { 'ref': { 'pubchem': pubchem_id } } }
697 return ligand_map
699# Given a smiles, get the pubchem id
700# e.g. CC1=C(C=NC=C1)NC(=O)CC2=CC(=CC(=C2)Cl)OC -> 154876006
701# e.g. O=C4N3C(C(=O)Nc2cc(nn2c1ccccc1)C)C(SC3CC=CC4NC(=O)C(NC)C)(C)C -> None
702def smiles_to_pubchem_id (smiles : str) -> Optional[str]:
703 # Set the request URL
704 request_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/JSON'
705 # Set the POST data
706 data = urlencode({ 'smiles': smiles }).encode()
707 try:
708 with urlopen(request_url, data=data) as response:
709 parsed_response = json.loads(response.read().decode("utf-8"))
710 # If the smiles is not found in pubchem then we can stop here
711 except HTTPError as error:
712 if error.code == 404:
713 print(f' Smiles {smiles} not found')
714 return None
715 else:
716 raise ValueError('Something went wrong with the PubChem request: ' + request_url)
717 # Get the PubChem id
718 compounds = parsed_response.get('PC_Compounds', None)
719 if not compounds:
720 raise RuntimeError(f'Something went wrong when mining pubchem data for SMILES {smiles}')
721 # Make sure there is only 1 compound
722 # DANI: Esto algún día podría no ser así, ya lo gestionaremos
723 if len(compounds) != 1:
724 raise RuntimeError(f'There should be one and only one compound for SMILES {smiles}')
725 # Keep mining
726 compound = compounds[0]
727 first_id = compound.get('id', None)
728 # If there is no first id then it means there is no direct match with pubchem
729 if not first_id:
730 return None
731 second_id = first_id.get('id', None)
732 if not second_id:
733 raise RuntimeError(f'Missing second id when mining pubchem data for SMILES {smiles}')
734 pubchem_id = second_id.get('cid', None)
735 if not pubchem_id:
736 raise RuntimeError(f'Missing pubchem id when mining pubchem data for SMILES {smiles}')
737 return str(pubchem_id)
739# Given a PDB ligand code, get its pubchem
740def pdb_ligand_to_pubchem (pdb_ligand_id : str) -> Optional[str]:
741 # Set the request query
742 query = '''query molecule($id:String!){
743 chem_comp(comp_id:$id) { rcsb_chem_comp_related{ resource_name resource_accession_code } }
744 }'''
745 # Request PDB data
746 parsed_response = request_pdb_data(pdb_ligand_id, query)
747 related_resources = parsed_response['rcsb_chem_comp_related']
748 # It may happend that a ligand code has no related resources at all
749 # e.g. ZN
750 if not related_resources: return None
751 pubchem_resource = next((resource for resource in related_resources if resource['resource_name'] == 'PubChem'), None)
752 if not pubchem_resource: return None
753 return pubchem_resource['resource_accession_code']
755# Given a PDB ligand code, get its pubchem
756# Use a web crawler to avoid having to use the PDB API
757def pdb_ligand_to_pubchem_RAW (pdb_ligand_id : str) -> Optional[str]:
758 # Set the request URL
759 request_url = f'https://www.rcsb.org/ligand/{pdb_ligand_id}'
760 # Run the query
761 parsed_response = None
762 try:
763 with urlopen(request_url) as response:
764 parsed_response = response.read().decode("utf-8")
765 # If the accession is not found in the PDB then we can stop here
766 except HTTPError as error:
767 if error.code == 404:
768 print(f' PDB ligand {pdb_ligand_id} not found')
769 return None
770 else:
771 print(error.msg)
772 raise ValueError('Something went wrong with the PDB ligand request: ' + request_url)
773 # Mine the pubchem id out of the whole response
774 pattern = re.compile('pubchem.ncbi.nlm.nih.gov\/compound\/([0-9]*)\"')
775 match = re.search(pattern, parsed_response)
776 # If there is no pubchem id then return none
777 # This is normal for some ligands such as counter ions (e.g. LI)
778 if not match:
779 return None
780 pubchem_id = match[1]
781 return pubchem_id
783# Given a PDB ligand code, get its pubchem
784# Ask to pubchem if the ligand exists and hope there is only one result
785# DANI: No se ha provado a fondo
786def pdb_ligand_to_pubchem_RAW_RAW (pdb_ligand_id : str) -> Optional[str]:
787 # Set the request URL
788 request_url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{pdb_ligand_id}/json'
789 # Run the query
790 parsed_response = None
791 try:
792 with urlopen(request_url) as response:
793 parsed_response = json.loads(response.read().decode("utf-8"))
794 # If the accession is not found in the PDB then we can stop here
795 except HTTPError as error:
796 # This may happen for weird things such as UNX (unknown atom or ion)
797 if error.code == 404:
798 return None
799 print(error.msg)
800 raise RuntimeError('Something went wrong with the PDB ligand request in PubChem: ' + request_url)
801 # Mine the pubchem id
802 compounds = parsed_response['PC_Compounds']
803 if len(compounds) != 1:
804 # There could be more than one result
805 # For example, the case HEM: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/HEM/json
806 # In this case we picked the first result
807 compound = compounds[0]
808 #raise RuntimeError('We are not having 1 and only 1 result from PubChem: ' + request_url)
809 compound = compounds[0]
810 id1 = compound['id']
811 id2 = id1['id']
812 pubchem_id = str(id2['cid'])
813 return pubchem_id
815# Given a PDB id, get all its ligand codes
816# e.g. 2I3I -> 618, BTB, ZN, EDO, LI
817def get_pdb_ligand_codes (pdb_id : str) -> List[str]:
818 # Set the request query
819 query = '''query structure($id: String!) {
820 entry(entry_id: $id) {
821 nonpolymer_entities { nonpolymer_comp { chem_comp { id } } }
822 }
823 }'''
824 # Request PDB data
825 parsed_response = request_pdb_data(pdb_id, query)
826 # Mine data for nonpolymer entities
827 nonpolymers = parsed_response['nonpolymer_entities']
828 # If there are no nonpolymer entities, another type of entitie could be used
829 # AGUS: esto es un caso muy concreto que me encontré con el PDB 1N3W
830 # AGUS: el ligando en este caso es una 'Biologically Interesting Molecules' y se muestran como 'PRD_'
831 prd_code = None
832 if nonpolymers == None:
833 # Get the prd ligand code
834 prd_code = get_prd_ligand_code(pdb_id)
835 if prd_code != None:
836 return [prd_code]
838 if nonpolymers == None and prd_code == None: return []
839 # Iterate nonpolymer entities to mine each PDB code
840 ligand_codes = []
841 for nonpolymer in nonpolymers:
842 ligand_code = nonpolymer['nonpolymer_comp']['chem_comp']['id']
843 ligand_codes.append(ligand_code)
844 print(f' Ligand codes for PDB id {pdb_id}: ' + ', '.join(ligand_codes))
845 return ligand_codes
847# Given a PDB id, get its PRD ligand code
848def get_prd_ligand_code (pdb_id : str) -> Optional[str]:
849 query = '''query structure($id: String!) {
850 entry(entry_id: $id) {
851 pdbx_molecule_features {
852 prd_id
853 }
854 }
855 }'''
856 parsed_response = request_pdb_data(pdb_id, query)
857 pdbx_id = parsed_response.get('pdbx_molecule_features', None)
858 if pdbx_id is None:
859 print(f'WARNING: Cannot find PRD ligand code for PDB id {pdb_id}')
860 return None
861 prd_id = pdbx_id[0].get('prd_id', None) # AGUS: podría haber casos donde haya más de uno?
862 if prd_id is None:
863 print(f'WARNING: Cannot find PRD ligand code for PDB id {pdb_id}')
864 return None
865 # If the PRD id is not empty then return it
866 return prd_id
868# Given a pdb id, get its pubchem ids
869# DANI: De momento no usamos las SMILES que alguna vez me han dado problemas (e.g. 2I3I)
870# e.g. 4YDF ->
871def pdb_to_pubchem (pdb_id : str) -> List[str]:
872 print(f'Searching PubChem ids for PDB {pdb_id}')
873 pubchem_ids = []
874 # Iterate over pdb ligand codes
875 ligand_codes = get_pdb_ligand_codes(pdb_id)
876 for ligand_code in ligand_codes:
877 # Ask the PDB API for the ligand
878 pubchem_id = pdb_ligand_to_pubchem(ligand_code)
879 # If this did not work then try mining the PDB client with a web crawler
880 if not pubchem_id:
881 pubchem_id = pdb_ligand_to_pubchem_RAW(ligand_code)
882 # If this did not work then try it from PubChem
883 if not pubchem_id:
884 pubchem_id = pdb_ligand_to_pubchem_RAW_RAW(ligand_code)
885 # Otherwise we surrender
886 if not pubchem_id:
887 print(f' {ligand_code} -> No PubChem id')
888 continue
889 print(f' {ligand_code} -> {pubchem_id}')
890 pubchem_ids.append(pubchem_id)
892 return pubchem_ids