Coverage for model_workflow/tools/generate_pdb_references.py: 53%
87 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
1import json
2import urllib.request
4from model_workflow.utils.auxiliar import RemoteServiceError, load_json, save_json, request_pdb_data
5from model_workflow.utils.file import File
6from model_workflow.utils.type_hints import *
def prepare_pdb_references (pdb_ids : List[str], output_filepath : str):
    """Prepare the PDB references json file to be uploaded to the database.

    References already stored in the output file are reused, so only the PDB ids
    missing from it are requested again. The resulting list of references, one per
    requested PDB id and in the same order, overwrites the output file.

    Args:
        pdb_ids: PDB ids whose references must be included in the output file.
        output_filepath: path of the json file where references are written.
    """
    references_file = File(output_filepath)
    # Index the references saved by a previous run, if any, by their PDB id
    cached_references = {}
    if references_file.exists:
        cached_references = {
            reference['id']: reference
            for reference in load_json(references_file.path)
        }
    # Reuse the cached reference of every PDB id when there is one
    # Otherwise download and mine the PDB data
    pdb_references = [
        cached_references.get(pdb_id, None) or get_pdb_reference(pdb_id)
        for pdb_id in pdb_ids
    ]
    # Write references to a json file
    save_json(pdb_references, references_file.path, indent = 4)
    # Nothing is returned on purpose: the workflow does not use the PDB references
36# Download PDB data from the PDB API
def get_pdb_reference (pdb_id : str) -> dict:
    """Request the data of a PDB entry and mine it to build its reference.

    Args:
        pdb_id: id of the PDB entry to be requested.

    Returns:
        A dict with the fields 'id', 'title', 'class', 'authors', 'date', 'method',
        'chain_uniprots' (chain id -> UniProt id) and 'organisms' (unique scientific
        names in order of first appearance).
        If the response cannot be mined then {'id': 'noref'} is returned instead.
    """
    # Set the request query
    query = '''query ($id: String!) {
 entry(entry_id: $id) {
 rcsb_id
 struct { title }
 struct_keywords { pdbx_keywords }
 refine { pdbx_refine_id ls_d_res_high }
 rcsb_accession_info { initial_release_date }
 audit_author { name }
 polymer_entities {
 rcsb_polymer_entity_container_identifiers { asym_ids uniprot_ids }
 rcsb_entity_source_organism { scientific_name }
 }
 exptl { method }
 }
 }'''
    # Request PDB data
    parsed_response = request_pdb_data(pdb_id, query)
    # Mining is best-effort: any unexpected response shape results in a 'noref' reference
    try:
        pdb_data = {}
        pdb_data['id'] = parsed_response['rcsb_id']
        pdb_data['title'] = parsed_response['struct']['title']
        pdb_data['class'] = parsed_response['struct_keywords']['pdbx_keywords']
        pdb_data['authors'] = [ author['name'] for author in parsed_response['audit_author'] ]
        pdb_data['date'] = parsed_response['rcsb_accession_info']['initial_release_date']
        # The method is taken from 'exptl' since 'refine' > 'pdbx_refine_id' is not on every PDB (e.g. 1FI3)
        # For the same reason the resolution ('refine' > 'ls_d_res_high') is not mined
        pdb_data['method'] = parsed_response['exptl'][0]['method']
        chain_uniprots = {}
        organisms = []
        for polymer in parsed_response['polymer_entities']:
            identifier = polymer['rcsb_polymer_entity_container_identifiers']
            # Get the organisms
            organism_entries = polymer['rcsb_entity_source_organism']
            if organism_entries is not None:
                organisms += [ organism['scientific_name'] for organism in organism_entries ]
            # Get the uniprot
            uniprots = identifier.get('uniprot_ids', None)
            if not uniprots:
                continue
            if len(uniprots) > 1:
                print(f'PDB {pdb_id} has multiple uniprots: {uniprots}. Saving only the first one')
            uniprot_id = uniprots[0]
            # Every chain of this polymer entity is assigned the same UniProt id
            for chain in identifier['asym_ids']:
                chain_uniprots[chain] = uniprot_id
        pdb_data['chain_uniprots'] = chain_uniprots
        # Remove duplicates keeping the order of first appearance
        # This way the output is the same between runs, which is not guaranteed when using a set
        pdb_data['organisms'] = list(dict.fromkeys(organisms))
    except Exception as error:
        print(f'Error when mining PDB data for {pdb_id}')
        # Report what actually failed so the problem is not silently lost
        print(f'{type(error).__name__}: {error}')
        print('Got the response:', parsed_response, '.Setting noref')
        pdb_data = {'id': 'noref'}
    return pdb_data
94# Set service URLs to be requested
# Base URL of every remote service available to request PDB data, by service name
pdb_data_services = dict(
    IRB = 'https://mmb.irbbarcelona.org',
    BSC = 'https://mdb-login.bsc.es',
)
99# Download PDB data from a remote service
100# DEPRECATED: our custom PDB API may return UniProt ids not matching the ones in the PDB
101# e.g. 1AK4 and 4Z80
102def DEPRECATED_download_pdb_data (pdb_id : str, service = 'IRB') -> dict:
103 # Set the request URL
104 service_url = pdb_data_services[service]
105 request_url = f'{service_url}/api/pdb/{pdb_id}/entry'
106 # Send the request and save the response
107 print(f'Requesting {request_url} (...)', end='\r')
108 parsed_response = None
109 try:
110 with urllib.request.urlopen(request_url) as response:
111 parsed_response = json.loads(response.read().decode("utf-8"))
112 # Handle HTTP errors
113 except urllib.error.HTTPError as error:
114 print(f'There was a problem when requesting {request_url}')
115 print(f' Error code: {error.code}')
116 # If the PDB id is not found then we can stop here
117 if error.code == 404:
118 print(f' PDB id {pdb_id} not found')
119 return None
120 # If the API is not responding try another service
121 elif error.code == 500 or error.code == 502 or error.code == 503:
122 print(f' {service} API to retrieve PDB data may be out of service')
123 # Before we surrender we try with the other available service
124 if service == 'IRB':
125 print(' Retrying with a different service')
126 return DEPRECATED_download_pdb_data(pdb_id, service='BSC')
127 # If we already tried with the other service then surrender
128 raise RemoteServiceError('All APIs to retrieve PDB data may be out of service')
129 # If the error is not known then stop here
130 else:
131 raise RemoteServiceError(f'Something went wrong with the PDB data request')
132 # Handle URL errors
133 except urllib.error.URLError as error:
134 print(f'There was a problem when requesting {request_url}')
135 print(f' Error reason: {error.reason}')
136 # These errors are not our fault, but the service is unsatable
137 print(f' {service} API to retrieve PDB data may be out of service')
138 # Try with a different service
139 if service == 'IRB':
140 print(' Retrying with a different service')
141 return DEPRECATED_download_pdb_data(pdb_id, service='BSC')
142 raise RemoteServiceError(f'Something went wrong with the PDB data request')
143 # Return the response
144 print(f'Successfully requested {request_url}')
145 return parsed_response