Coverage for mddb_workflow / tools / generate_pdb_references.py: 52%

85 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-03 18:45 +0000

1import json 

2import urllib.request 

3 

4from mddb_workflow.utils.auxiliar import RemoteServiceError, load_json, save_json, request_pdb_data 

5from mddb_workflow.utils.type_hints import * 

6 

7 

def prepare_pdb_references (pdb_ids : list[str], pdb_references_file : 'File'):
    """Prepare the PDB references json file to be uploaded to the database.

    References already stored in the references file are reused as they are,
    so only PDB ids missing from that file are requested and mined again.

    Args:
        pdb_ids: PDB ids to be included in the references file, in output order.
        pdb_references_file: File handler of the json file. It is read when it
            already exists and it is always (over)written at the end.

    Returns:
        Nothing. The outcome of this function is the written json file.
    """
    # Index the references saved by a previous run by their PDB id
    known_references = {}
    if pdb_references_file.exists:
        for stored_reference in load_json(pdb_references_file.path):
            known_references[stored_reference['id']] = stored_reference
    # Set the reference of every PDB id, keeping the order of the input ids
    references = []
    for pdb_id in pdb_ids:
        # Reuse the stored reference if we have it
        reference = known_references.get(pdb_id)
        # Otherwise download and mine the PDB data
        if not reference:
            reference = get_pdb_reference(pdb_id)
        references.append(reference)
    # Write references to a json file
    save_json(references, pdb_references_file.path, indent = 4)
    # DANI: There is no need to return PDB references since it is not used in the workflow

32 

# Download PDB data from the PDB API
def get_pdb_reference (pdb_id : str) -> dict:
    """Request the data of a PDB entry and mine it to build its reference.

    Args:
        pdb_id: The PDB id of the entry to be requested.

    Returns:
        A dict with the fields 'id', 'title', 'class', 'authors', 'date',
        'method', 'chain_uniprots' (chain -> UniProt id) and 'organisms'
        (unique scientific names, in order of first appearance).
        If anything fails while mining the response then the fallback
        {'id': 'noref'} is returned instead of raising.
    """
    # Set the request query
    query = '''query ($id: String!) {
        entry(entry_id: $id) {
            rcsb_id
            struct { title }
            struct_keywords { pdbx_keywords }
            refine { pdbx_refine_id ls_d_res_high }
            rcsb_accession_info { initial_release_date }
            audit_author { name }
            polymer_entities {
                rcsb_polymer_entity_container_identifiers { asym_ids uniprot_ids }
                rcsb_entity_source_organism { scientific_name }
            }
            exptl { method }
        }
    }'''
    # Request PDB data
    # NOTE(review): the response is read as if it was the 'entry' object itself, confirm in request_pdb_data
    parsed_response = request_pdb_data(pdb_id, query)
    try:
        # Mine data
        pdb_data = {}
        pdb_data['id'] = parsed_response['rcsb_id']
        pdb_data['title'] = parsed_response['struct']['title']
        pdb_data['class'] = parsed_response['struct_keywords']['pdbx_keywords']
        pdb_data['authors'] = [ author['name'] for author in parsed_response['audit_author'] ]
        pdb_data['date'] = parsed_response['rcsb_accession_info']['initial_release_date']
        # The method is taken from 'exptl' since 'refine' fields (pdbx_refine_id, ls_d_res_high)
        # are not on every PDB like in 1FI3. For the same reason the resolution is not mined.
        pdb_data['method'] = parsed_response['exptl'][0]['method']
        chain_uniprots = {}
        organisms = []
        for polymer in parsed_response['polymer_entities']:
            identifier = polymer['rcsb_polymer_entity_container_identifiers']
            # Get the organisms, which may be missing (None) for some polymers
            organism_entries = polymer['rcsb_entity_source_organism']
            if organism_entries is not None:
                organisms += [ organism['scientific_name'] for organism in organism_entries ]
            # Get the uniprot
            uniprots = identifier.get('uniprot_ids', None)
            if not uniprots: continue
            if len(uniprots) > 1:
                print(f'PDB {pdb_id} has multiple uniprots: {uniprots}. Saving only the first one')
            uniprot_id = uniprots[0]
            # Every chain of this polymer is assigned the same uniprot id
            for chain in identifier['asym_ids']:
                chain_uniprots[chain] = uniprot_id
        pdb_data['chain_uniprots'] = chain_uniprots
        # Remove duplicates keeping the order of first appearance
        # A set would return them in an arbitrary order, thus changing the output file between runs
        pdb_data['organisms'] = list(dict.fromkeys(organisms))
    except Exception:
        # Mining is best-effort: any unexpected response shape results in a 'noref' reference
        print(f'Error when mining PDB data for {pdb_id}')
        print('Got the response:', parsed_response, '.Setting noref')
        pdb_data = {'id': 'noref'}
    return pdb_data

90 

# Set service URLs to be requested
# Keys are the service names accepted by DEPRECATED_download_pdb_data
# Values are the base URLs to which the API path is appended
pdb_data_services = {
    'IRB': 'https://mmb.irbbarcelona.org',
    'BSC': 'https://mdb-login.bsc.es'
}

# Download PDB data from a remote service
# DEPRECATED: our custom PDB API may return UniProt ids not matching the ones in the PDB
# e.g. 1AK4 and 4Z80
def DEPRECATED_download_pdb_data (pdb_id : str, service = 'IRB') -> dict:
    """Download the entry data of a PDB id from one of our custom PDB APIs.

    Args:
        pdb_id: The PDB id of the entry to be requested.
        service: Key in pdb_data_services of the service to be requested.
            When the 'IRB' service seems to be down the request is retried
            once using the 'BSC' service.

    Returns:
        The parsed json response, or None if the PDB id is not found (404).

    Raises:
        RemoteServiceError: If the services are not available or the
            request fails with an unexpected error.
    """
    # Import the error classes explicitly
    # Otherwise 'urllib.error' is available only as a side effect of importing 'urllib.request'
    from urllib.error import HTTPError, URLError
    # Set the request URL
    service_url = pdb_data_services[service]
    request_url = f'{service_url}/api/pdb/{pdb_id}/entry'
    # Send the request and save the response
    print(f'Requesting {request_url} (...)', end='\r')
    parsed_response = None
    try:
        with urllib.request.urlopen(request_url) as response:
            parsed_response = json.loads(response.read().decode("utf-8"))
    # Handle HTTP errors
    # Note that HTTPError must be handled before URLError since it is a subclass of it
    except HTTPError as error:
        print(f'There was a problem when requesting {request_url}')
        print(f' Error code: {error.code}')
        # If the PDB id is not found then we can stop here
        if error.code == 404:
            print(f' PDB id {pdb_id} not found')
            return None
        # If the error is not known then stop here
        if error.code not in (500, 502, 503):
            raise RemoteServiceError('Something went wrong with the PDB data request') from error
        # If the API is not responding try another service
        print(f' {service} API to retrieve PDB data may be out of service')
        # Before we surrender we try with the other available service
        if service == 'IRB':
            print(' Retrying with a different service')
            return DEPRECATED_download_pdb_data(pdb_id, service='BSC')
        # If we already tried with the other service then surrender
        raise RemoteServiceError('All APIs to retrieve PDB data may be out of service') from error
    # Handle URL errors
    except URLError as error:
        print(f'There was a problem when requesting {request_url}')
        print(f' Error reason: {error.reason}')
        # These errors are not our fault, but the service is unstable
        print(f' {service} API to retrieve PDB data may be out of service')
        # Try with a different service
        if service == 'IRB':
            print(' Retrying with a different service')
            return DEPRECATED_download_pdb_data(pdb_id, service='BSC')
        raise RemoteServiceError('Something went wrong with the PDB data request') from error
    # Return the response
    print(f'Successfully requested {request_url}')
    return parsed_response