Coverage for mddb_workflow / tools / generate_pdb_references.py: 52%
85 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 18:45 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 18:45 +0000
1import json
2import urllib.request
4from mddb_workflow.utils.auxiliar import RemoteServiceError, load_json, save_json, request_pdb_data
5from mddb_workflow.utils.type_hints import *
def prepare_pdb_references (pdb_ids : list[str], pdb_references_file : 'File'):
    """Prepare the PDB references json file to be uploaded to the database.

    References already stored in the references file are reused. Any PDB id
    which is missing among them is resolved through get_pdb_reference.
    The resulting list follows the order of 'pdb_ids' and it is saved to the
    references file path. Nothing is returned, since the PDB references are
    not used anywhere else in the workflow.
    """
    # Index any previously saved reference by its PDB id
    cached_references = {}
    if pdb_references_file.exists:
        cached_references = {
            reference['id']: reference
            for reference in load_json(pdb_references_file.path)
        }
    # Reuse the cached reference when available, otherwise download and mine the PDB data
    pdb_references = [
        cached_references.get(pdb_id) or get_pdb_reference(pdb_id)
        for pdb_id in pdb_ids
    ]
    # Write references to a json file
    save_json(pdb_references, pdb_references_file.path, indent = 4)
def get_pdb_reference (pdb_id : str) -> dict:
    """Download PDB data from the PDB API and mine it into a reference.

    Args:
        pdb_id: The id of the PDB entry to be requested.

    Returns:
        A dict with the keys 'id', 'title', 'class', 'authors', 'date',
        'method', 'chain_uniprots' (chain id -> UniProt id) and 'organisms'
        (unique scientific names, in order of first appearance).
        If the response cannot be mined then {'id': 'noref'} is returned.
    """
    # Set the request query
    query = '''query ($id: String!) {
        entry(entry_id: $id) {
            rcsb_id
            struct { title }
            struct_keywords { pdbx_keywords }
            refine { pdbx_refine_id ls_d_res_high }
            rcsb_accession_info { initial_release_date }
            audit_author { name }
            polymer_entities {
                rcsb_polymer_entity_container_identifiers { asym_ids uniprot_ids }
                rcsb_entity_source_organism { scientific_name }
            }
            exptl { method }
        }
    }'''
    # Request PDB data
    # NOTE(review): the response is indexed as the 'entry' object itself - confirm against request_pdb_data
    parsed_response = request_pdb_data(pdb_id, query)
    try:
        # Mine data
        pdb_data = {
            'id': parsed_response['rcsb_id'],
            'title': parsed_response['struct']['title'],
            'class': parsed_response['struct_keywords']['pdbx_keywords'],
            'authors': [ author['name'] for author in parsed_response['audit_author'] ],
            'date': parsed_response['rcsb_accession_info']['initial_release_date'],
            # The method is taken from 'exptl' and not from 'refine' since
            # pdbx_refine_id is not on every PDB (e.g. 1FI3)
            'method': parsed_response['exptl'][0]['method'],
            # The resolution is not mined since ls_d_res_high is not on every PDB (e.g. 1FI3)
        }
        chain_uniprots = {}
        organisms = []
        for polymer in parsed_response['polymer_entities']:
            identifier = polymer['rcsb_polymer_entity_container_identifiers']
            # Get the organisms
            organism_entries = polymer['rcsb_entity_source_organism']
            if organism_entries is not None:
                organisms += [ organism['scientific_name'] for organism in organism_entries ]
            # Get the uniprot
            uniprots = identifier.get('uniprot_ids', None)
            if not uniprots:
                continue
            if len(uniprots) > 1:
                print(f'PDB {pdb_id} has multiple uniprots: {uniprots}. Saving only the first one')
            uniprot_id = uniprots[0]
            for chain in identifier['asym_ids']:
                chain_uniprots[chain] = uniprot_id
        pdb_data['chain_uniprots'] = chain_uniprots
        # Remove duplicates while keeping the order of first appearance
        # This way the output is deterministic, which is not the case when using a set
        pdb_data['organisms'] = list(dict.fromkeys(organisms))
    # Missing fields or null values in the response end up here
    # This is a best-effort process: we report the problem and set a placeholder reference
    except (LookupError, TypeError, AttributeError) as error:
        print(f'Error when mining PDB data for {pdb_id}')
        print(f' {type(error).__name__}: {error}')
        print('Got the response:', parsed_response, '.Setting noref')
        pdb_data = {'id': 'noref'}
    return pdb_data
# Set service URLs to be requested
# Keys are the service names and values are the base URL of every service
pdb_data_services = {
    'IRB': 'https://mmb.irbbarcelona.org',
    'BSC': 'https://mdb-login.bsc.es'
}
# Download PDB data from a remote service
# DEPRECATED: our custom PDB API may return UniProt ids not matching the ones in the PDB
# e.g. 1AK4 and 4Z80
def DEPRECATED_download_pdb_data (pdb_id : str, service = 'IRB') -> 'dict | None':
    """Download the entry data of a PDB id from one of our custom PDB APIs.

    If the 'IRB' service seems to be out of service then the request is
    retried once with the 'BSC' service.

    Args:
        pdb_id: The id of the PDB entry to be requested.
        service: Key of the service to be requested in 'pdb_data_services'.

    Returns:
        The parsed json response, or None if the PDB id is not found (404).

    Raises:
        RemoteServiceError: If the request fails and there is no other
            service left to try, or if the HTTP error code is not expected.
    """
    # 'urllib.error' is not imported at the top of the file
    # It was reachable only as a side effect of importing 'urllib.request', so we import it explicitly
    from urllib.error import HTTPError, URLError
    # Set the request URL
    service_url = pdb_data_services[service]
    request_url = f'{service_url}/api/pdb/{pdb_id}/entry'
    # Send the request and save the response
    print(f'Requesting {request_url} (...)', end='\r')
    try:
        with urllib.request.urlopen(request_url) as response:
            parsed_response = json.loads(response.read().decode("utf-8"))
    # Handle HTTP errors
    # Note that HTTPError is a subclass of URLError so it must be handled first
    except HTTPError as error:
        print(f'There was a problem when requesting {request_url}')
        print(f' Error code: {error.code}')
        # If the PDB id is not found then we can stop here
        if error.code == 404:
            print(f' PDB id {pdb_id} not found')
            return None
        # If the API is not responding try another service
        if error.code in (500, 502, 503):
            print(f' {service} API to retrieve PDB data may be out of service')
            # Before we surrender we try with the other available service
            if service == 'IRB':
                print(' Retrying with a different service')
                return DEPRECATED_download_pdb_data(pdb_id, service='BSC')
            # If we already tried with the other service then surrender
            raise RemoteServiceError('All APIs to retrieve PDB data may be out of service') from error
        # If the error is not known then stop here
        raise RemoteServiceError('Something went wrong with the PDB data request') from error
    # Handle URL errors
    except URLError as error:
        print(f'There was a problem when requesting {request_url}')
        print(f' Error reason: {error.reason}')
        # These errors are not our fault, but the service is unstable
        print(f' {service} API to retrieve PDB data may be out of service')
        # Try with a different service
        if service == 'IRB':
            print(' Retrying with a different service')
            return DEPRECATED_download_pdb_data(pdb_id, service='BSC')
        raise RemoteServiceError('Something went wrong with the PDB data request') from error
    # Return the response
    print(f'Successfully requested {request_url}')
    return parsed_response