Coverage for mddb_workflow/tools/generate_pdb_references.py

1import json

2import urllib.request

4from mddb_workflow.utils.auxiliar import RemoteServiceError, load_json, save_json, request_pdb_data

5from mddb_workflow.utils.type_hints import *

def prepare_pdb_references (pdb_ids : list[str], pdb_references_file : 'File'):
    """Build the PDB references json file which is to be uploaded to the database.

    References already stored in the file are reused, so only the PDB ids
    missing from it are actually requested. The file is then overwritten with
    one reference per PDB id, in the same order as `pdb_ids`.
    """
    # Index any previously saved reference by its PDB id
    cached_references = {}
    if pdb_references_file.exists:
        cached_references = {
            reference['id']: reference
            for reference in load_json(pdb_references_file.path)
        }
    # Reuse the cached reference of every PDB id when there is one
    # Otherwise (no cached reference, or an empty one) download and mine the PDB data
    references = [
        cached_references.get(pdb_id) or get_pdb_reference(pdb_id)
        for pdb_id in pdb_ids
    ]
    # Write references to a json file
    save_json(references, pdb_references_file.path, indent = 4)
    # DANI: PDB references are not returned since they are not used in the workflow

33# Download PDB data from the PDB API

def get_pdb_reference (pdb_id : str) -> dict:
    """Request the data of a PDB entry and mine a reference out of it.

    Args:
        pdb_id: Id of the PDB entry to be requested.

    Returns:
        A dict with the mined fields: 'id', 'title', 'class', 'authors', 'date',
        'method', 'chain_uniprots' and 'organisms'.
        'chain_uniprots' maps every asym id of a polymer entity to the first
        UniProt id of that entity. 'organisms' holds every scientific name once,
        in the order they are first found in the response.
        If the response cannot be mined for any reason then the problem is
        printed and {'id': 'noref'} is returned instead.
    """
    # Set the request query
    query = '''query ($id: String!) {
        entry(entry_id: $id) {
            rcsb_id
            struct { title }
            struct_keywords { pdbx_keywords }
            refine { pdbx_refine_id ls_d_res_high }
            rcsb_accession_info { initial_release_date }
            audit_author { name }
            polymer_entities {
                rcsb_polymer_entity_container_identifiers { asym_ids uniprot_ids }
                rcsb_entity_source_organism { scientific_name }
            }
            exptl { method }
        }
    }'''
    # Request PDB data
    parsed_response = request_pdb_data(pdb_id, query)
    try:
        # Mine data
        pdb_data = {}
        pdb_data['id'] = parsed_response['rcsb_id']
        pdb_data['title'] = parsed_response['struct']['title']
        pdb_data['class'] = parsed_response['struct_keywords']['pdbx_keywords']
        pdb_data['authors'] = [ author['name'] for author in parsed_response['audit_author'] ]
        pdb_data['date'] = parsed_response['rcsb_accession_info']['initial_release_date']
        # The method is read from 'exptl' and not from 'refine' (pdbx_refine_id)
        # since 'refine' fields are not on every PDB, like in 1FI3
        # For the same reason the resolution (ls_d_res_high) is not mined
        pdb_data['method'] = parsed_response['exptl'][0]['method']
        chain_uniprots = {}
        organisms = []
        for polymer in parsed_response['polymer_entities']:
            identifier = polymer['rcsb_polymer_entity_container_identifiers']
            # Get the organisms, which may be missing for some polymer entities
            organism_entries = polymer['rcsb_entity_source_organism']
            if organism_entries is not None:
                organisms += [ organism['scientific_name'] for organism in organism_entries ]
            # Get the uniprot, which may be missing as well
            uniprots = identifier.get('uniprot_ids', None)
            if not uniprots: continue
            if len(uniprots) > 1:
                print(f'PDB {pdb_id} has multiple uniprots: {uniprots}. Saving only the first one')
            uniprot_id = uniprots[0]
            # Every chain of this polymer entity is assigned the same uniprot
            for chain in identifier['asym_ids']:
                chain_uniprots[chain] = uniprot_id
        pdb_data['chain_uniprots'] = chain_uniprots
        # Remove duplicates keeping the order of appearance
        # A set would make the order change between runs and so would the output file
        pdb_data['organisms'] = list(dict.fromkeys(organisms))
    except Exception as error:
        # Failing to mine a reference is not fatal: report it and flag the reference
        print(f'Error when mining PDB data for {pdb_id}')
        print(f' {type(error).__name__}: {error}')
        print('Got the response:', parsed_response, '.Setting noref')
        pdb_data = {'id': 'noref'}
    return pdb_data

91# Set service URLs to be requested

pdb_data_services = {
    # Service requested by default
    'IRB': 'https://mmb.irbbarcelona.org',
    # Service to fall back on when the default one does not respond
    'BSC': 'https://mdb-login.bsc.es',
}

96# Download PDB data from a remote service

97# DEPRECATED: our custom PDB API may return UniProt ids not matching the ones in the PDB

98# e.g. 1AK4 and 4Z80

def DEPRECATED_download_pdb_data (pdb_id : str, service = 'IRB') -> dict:
    """Download the data of a PDB entry from one of our remote services.

    DEPRECATED: our custom PDB API may return UniProt ids not matching the ones
    in the PDB (e.g. 1AK4 and 4Z80).

    Args:
        pdb_id: Id of the PDB entry to be requested.
        service: Key in `pdb_data_services` of the service to be requested.
            When the 'IRB' service does not respond the 'BSC' service is tried.

    Returns:
        The parsed json response, or None if the PDB id is not found (404).

    Raises:
        RemoteServiceError: If the request fails and there is no other service
            left to try, or if it fails with an unexpected HTTP error code.
    """
    # Build the request URL
    request_url = f'{pdb_data_services[service]}/api/pdb/{pdb_id}/entry'
    # Send the request and parse the response
    print(f'Requesting {request_url} (...)', end='\r')
    parsed_response = None
    try:
        with urllib.request.urlopen(request_url) as response:
            raw_content = response.read()
            parsed_response = json.loads(raw_content.decode("utf-8"))
    # HTTP errors: the server answered, but with an error code
    except urllib.error.HTTPError as error:
        print(f'There was a problem when requesting {request_url}')
        print(f' Error code: {error.code}')
        # There is nothing else to do if the PDB id is not found
        if error.code == 404:
            print(f' PDB id {pdb_id} not found')
            return None
        # Any error code other than the server-side ones is not known, so stop here
        if error.code not in (500, 502, 503):
            raise RemoteServiceError('Something went wrong with the PDB data request')
        # Otherwise the API is not responding
        print(f' {service} API to retrieve PDB data may be out of service')
        # Give the other available service a chance before surrendering
        if service == 'IRB':
            print(' Retrying with a different service')
            return DEPRECATED_download_pdb_data(pdb_id, service='BSC')
        # The other service was already tried, so surrender
        raise RemoteServiceError('All APIs to retrieve PDB data may be out of service')
    # URL errors: the server could not even be reached
    except urllib.error.URLError as error:
        print(f'There was a problem when requesting {request_url}')
        print(f' Error reason: {error.reason}')
        # These errors are not our fault, but the service is unstable
        print(f' {service} API to retrieve PDB data may be out of service')
        # Give the other available service a chance before surrendering
        if service == 'IRB':
            print(' Retrying with a different service')
            return DEPRECATED_download_pdb_data(pdb_id, service='BSC')
        raise RemoteServiceError('Something went wrong with the PDB data request')
    # The request succeeded, so hand over the parsed response
    print(f'Successfully requested {request_url}')
    return parsed_response

Coverage for mddb_workflow / tools / generate_pdb_references.py: 52%

85 statements