Coverage for model_workflow/tools/generate_pdb_references.py: 53%

87 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-23 10:54 +0000

1import json 

2import urllib.request 

3 

4from model_workflow.utils.auxiliar import RemoteServiceError, load_json, save_json, request_pdb_data 

5from model_workflow.utils.file import File 

6from model_workflow.utils.type_hints import * 

7 

8 

def prepare_pdb_references (pdb_ids : List[str], output_filepath : str):
    """Prepare the PDB references json file to be uploaded to the database.

    References already present in the output file are reused, so only the PDB
    ids missing from that file are downloaded and mined.

    Args:
        pdb_ids: The PDB ids whose references must be in the output file.
        output_filepath: Path to the json file to be read (if it exists) and written.
    """
    output_file = File(output_filepath)
    # Index the references from a previous run by their PDB id
    known_references = {}
    if output_file.exists:
        known_references = { entry['id']: entry for entry in load_json(output_file.path) }
    # Reuse the known reference when there is one, otherwise download and mine the PDB data
    references = [ known_references.get(pdb_id) or get_pdb_reference(pdb_id) for pdb_id in pdb_ids ]
    # Write references to a json file
    save_json(references, output_file.path, indent = 4)
    # Nothing is returned on purpose: the references are not used in the workflow

35 

# Download PDB data from the PDB API
def get_pdb_reference (pdb_id : str) -> dict:
    """Download and mine the reference data of a PDB entry.

    The entry is requested through request_pdb_data with a GraphQL query and
    the response is condensed into a dictionary with the following keys:
    id, title, class, authors, date, method, chain_uniprots and organisms.

    Args:
        pdb_id: The PDB id of the entry to be requested.

    Returns:
        The mined reference data. If anything fails while mining the response
        then the placeholder {'id': 'noref'} is returned instead.
    """
    # Set the request query
    query = '''query ($id: String!) {
        entry(entry_id: $id) {
            rcsb_id
            struct { title }
            struct_keywords { pdbx_keywords }
            refine { pdbx_refine_id ls_d_res_high }
            rcsb_accession_info { initial_release_date }
            audit_author { name }
            polymer_entities {
                rcsb_polymer_entity_container_identifiers { asym_ids uniprot_ids }
                rcsb_entity_source_organism { scientific_name }
            }
            exptl { method }
        }
    }'''
    # Request PDB data
    parsed_response = request_pdb_data(pdb_id, query)
    try:
        # Mine data
        pdb_data = {}
        pdb_data['id'] = parsed_response['rcsb_id']
        pdb_data['title'] = parsed_response['struct']['title']
        pdb_data['class'] = parsed_response['struct_keywords']['pdbx_keywords']
        pdb_data['authors'] = [ author['name'] for author in parsed_response['audit_author'] ]
        pdb_data['date'] = parsed_response['rcsb_accession_info']['initial_release_date']
        # The method is read from 'exptl' since the 'refine' fields (pdbx_refine_id and
        # ls_d_res_high) are not on every PDB, like in 1FI3
        # For the same reason the resolution is not mined
        pdb_data['method'] = parsed_response['exptl'][0]['method']
        chain_uniprots = {}
        organisms = []
        for polymer in parsed_response['polymer_entities']:
            identifier = polymer['rcsb_polymer_entity_container_identifiers']
            # Get the organisms
            organism_entries = polymer['rcsb_entity_source_organism']
            if organism_entries is not None:
                organisms += [ organism['scientific_name'] for organism in organism_entries ]
            # Get the uniprot
            uniprots = identifier.get('uniprot_ids', None)
            if not uniprots: continue
            if len(uniprots) > 1:
                print(f'PDB {pdb_id} has multiple uniprots: {uniprots}. Saving only the first one')
            uniprot_id = uniprots[0]
            # Every chain of the polymer is assigned the same uniprot id
            chains = identifier['asym_ids']
            for chain in chains:
                chain_uniprots[chain] = uniprot_id
        pdb_data['chain_uniprots'] = chain_uniprots
        # Remove duplicated organisms keeping the order in which they were found
        # A set would make the order change between runs, thus making the output not reproducible
        pdb_data['organisms'] = list(dict.fromkeys(organisms))
    except Exception:
        # Mining is best-effort: any unexpected response is reported and replaced by a placeholder
        print(f'Error when mining PDB data for {pdb_id}')
        print('Got the response:', parsed_response, '.Setting noref')
        pdb_data = {'id': 'noref'}
    return pdb_data

93 

# Set service URLs to be requested
# Every service name is mapped to the base URL of its host
pdb_data_services = {
    'IRB': 'https://mmb.irbbarcelona.org',
    'BSC': 'https://mdb-login.bsc.es'
}

# Download PDB data from a remote service
# DEPRECATED: our custom PDB API may return UniProt ids not matching the ones in the PDB
# e.g. 1AK4 and 4Z80
def DEPRECATED_download_pdb_data (pdb_id : str, service = 'IRB') -> dict:
    """Request the entry of a PDB id to one of the services in pdb_data_services.

    When the 'IRB' service seems to be down the request is repeated once
    against the 'BSC' service.

    Args:
        pdb_id: The PDB id of the entry to be requested.
        service: Name of the service to be requested, a key of pdb_data_services.

    Returns:
        The parsed json response, or None when the service answers with a 404.

    Raises:
        RemoteServiceError: If the request fails and there is no other service to try.
    """
    # Only the IRB service has a fallback (BSC)
    has_fallback = service == 'IRB'
    # Set the request URL
    request_url = f'{pdb_data_services[service]}/api/pdb/{pdb_id}/entry'
    # Send the request and save the response
    print(f'Requesting {request_url} (...)', end='\r')
    parsed_response = None
    try:
        with urllib.request.urlopen(request_url) as response:
            raw_content = response.read()
            parsed_response = json.loads(raw_content.decode("utf-8"))
    # Handle HTTP errors
    except urllib.error.HTTPError as error:
        status = error.code
        print(f'There was a problem when requesting {request_url}')
        print(f' Error code: {status}')
        # If the PDB id is not found then we can stop here
        if status == 404:
            print(f' PDB id {pdb_id} not found')
            return None
        # If the error is not known then stop here
        if status not in (500, 502, 503):
            raise RemoteServiceError('Something went wrong with the PDB data request')
        # Otherwise the API is not responding
        print(f' {service} API to retrieve PDB data may be out of service')
        # Before we surrender we try with the other available service
        if has_fallback:
            print(' Retrying with a different service')
            return DEPRECATED_download_pdb_data(pdb_id, service='BSC')
        # If we already tried with the other service then surrender
        raise RemoteServiceError('All APIs to retrieve PDB data may be out of service')
    # Handle URL errors
    except urllib.error.URLError as error:
        print(f'There was a problem when requesting {request_url}')
        print(f' Error reason: {error.reason}')
        # These errors are not our fault, but the service is unstable
        print(f' {service} API to retrieve PDB data may be out of service')
        # Try with a different service
        if has_fallback:
            print(' Retrying with a different service')
            return DEPRECATED_download_pdb_data(pdb_id, service='BSC')
        raise RemoteServiceError('Something went wrong with the PDB data request')
    # Return the response
    print(f'Successfully requested {request_url}')
    return parsed_response