Coverage for mddb_workflow / tools / chains.py: 42%

127 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-03 18:45 +0000

1 

2import json 

3import time 

4 

5from urllib.request import urlopen 

6from urllib.parse import urlencode 

7from urllib.error import HTTPError 

8 

9from mddb_workflow.utils.auxiliar import warn, load_json, save_json, protein_residue_name_to_letter 

10from mddb_workflow.utils.auxiliar import RemoteServiceError 

11from mddb_workflow.utils.type_hints import * 

12 

# Set analysis version
# Stored alongside each chain record so results from older analysis versions can be told apart
CHAINS_VERSION = '0.1'

15 

# Get the sequence and name of the chain in the structure and request the InterProScan
# NOTE(review): 'interpsocan' is a typo for 'interproscan', kept to avoid breaking callers
def request_interpsocan (sequence : str) -> str:
    """Submit a protein sequence to the EBI InterProScan 5 REST service.

    Args:
        sequence: protein sequence in one-letter code.

    Returns:
        The job id returned by the service, or None when it answers 404.

    Raises:
        ValueError: for any HTTP error other than 404.
    """
    # Set the request URL
    request_url = 'https://www.ebi.ac.uk/Tools/services/rest/iprscan5/run'
    # Set the POST data
    # The sequence is sent in FASTA format, as the service expects
    data = urlencode({
        'email': 'daniel.beltran@irbbarcelona.org',
        'title': 'Chain X',
        'sequence': f'>chain X\n{sequence}'
    }).encode()
    parsed_response = None
    try:
        with urlopen(request_url, data=data) as response:
            # On success the response body is the plain-text job id
            parsed_response = response.read().decode("utf-8")
    except HTTPError as error:
        print(error.read().decode())
        if error.code == 404:
            print(' Not found')
            return None
        raise ValueError('Something went wrong with the InterProScan request: ' + request_url)
    return parsed_response

38 

# Check the status of the InterProScan job
def check_interproscan_status (jobid : str) -> str:
    """Query the current status of an InterProScan job.

    Args:
        jobid: the job id returned when the job was submitted.

    Returns:
        The status string (e.g. 'RUNNING', 'FINISHED'), or None on a 404.

    Raises:
        ValueError: for any HTTP error other than 404.
    """
    # Set the request URL
    request_url = f'https://www.ebi.ac.uk/Tools/services/rest/iprscan5/status/{jobid}'
    parsed_response = None
    try:
        with urlopen(request_url) as response:
            # The response body is the plain-text job status
            parsed_response = response.read().decode("utf-8")
    except HTTPError as error:
        print(error.read().decode())
        if error.code == 404:
            print(' Not found')
            return None
        raise ValueError('Something went wrong with the InterProScan status request: ' + request_url)
    return parsed_response

55 

# Obtain the result of the InterProScan job in json format
def check_interproscan_result (jobid : str) -> dict:
    """Download and parse the JSON results of a finished InterProScan job.

    Args:
        jobid: the job id returned when the job was submitted.

    Returns:
        The parsed results dict, or None on a 404.

    Raises:
        RemoteServiceError: when the service answers 503 (unavailable).
        ValueError: for any other HTTP error.
    """
    # Set the request URL
    request_url = f'https://www.ebi.ac.uk/Tools/services/rest/iprscan5/result/{jobid}/json'
    parsed_response = None
    try:
        with urlopen(request_url) as response:
            parsed_response = json.loads(response.read().decode("utf-8"))
    except HTTPError as error:
        print(error.read().decode())
        if error.code == 404:
            print(' Not found')
            return None
        elif error.code == 503:
            raise RemoteServiceError('InterProScan Service unavailable. Please try again later.')
        else:
            raise ValueError('Something went wrong with the InterProScan results request: ' + request_url)
    return parsed_response

74 

# Get the parsed chains from the structure
def get_protein_parsed_chains (structure : 'Structure') -> list:
    """Extract the name and one-letter sequence of every protein chain.

    Chains whose residues all translate to 'X' are skipped since they are
    not proteins.

    Args:
        structure: the parsed 'Structure' whose chains are scanned.

    Returns:
        A list of dicts with 'name' and 'sequence' keys, one per protein chain.
    """
    parsed_chains = []
    # Iterate over the chains in the structure
    for chain in structure.chains:
        # Translate every residue name to its equivalent letter in the aminoacids library
        # Joining once avoids the quadratic cost of string += inside a loop
        sequence = ''.join(
            protein_residue_name_to_letter(residue.name)
            for residue in chain.residues
        )
        # If all residues are 'X' then it means this is not a protein
        # NOTE: all() is True on an empty sequence, so empty chains are skipped too
        if all(letter == 'X' for letter in sequence):
            continue
        # Create a dictionary with the chain name and sequence to be returned
        parsed_chains.append({ 'name': chain.name, 'sequence': sequence })
    return parsed_chains

96 

# Set the expected chain data fields
# Every chain record loaded from file or database must carry all of these keys
CHAIN_DATA_FIELDS = { 'sequence', 'interproscan' }

99 

# Import the chains data from a file if exists
def import_chains (chains_references_file : 'File') -> dict:
    """Load previously saved chain records and normalize their fields.

    Args:
        chains_references_file: backup 'File' holding the chains JSON.

    Returns:
        The loaded chain records, each guaranteed to expose every key in
        CHAIN_DATA_FIELDS (missing ones are filled with None).
    """
    # Read the backup file
    records = load_json(chains_references_file.path)
    # Make sure every record carries all the fields the process expects to find
    for record in records:
        for field in CHAIN_DATA_FIELDS:
            record.setdefault(field, None)
    return records

110 

def prepare_chain_references (
    structure : 'Structure',
    chains_references_file : 'File',
    database : 'Database',
):
    """Define the main function that will be called from the main script.
    This function will get the parsed chains from the structure and request the InterProScan service
    to obtain the data for each chain.

    Args:
        structure: the parsed 'Structure' whose protein chains are processed.
        chains_references_file: local backup 'File' where chain data is cached.
        database: 'Database' handler used to look up already-known chains.
    """

    # Obtain the protein parsed chains from the structure
    protein_parsed_chains = get_protein_parsed_chains(structure)

    # Get unique sequences
    protein_sequences = { chain['sequence'] for chain in protein_parsed_chains }

    print(f' Found {len(protein_parsed_chains)} protein chains with {len(protein_sequences)} unique sequences')

    # Accumulate data from all chains to be saved in a file
    chains_data = []
    # Load the chains file if it exists already
    if chains_references_file.exists:
        chains_data += import_chains(chains_references_file)

    # Save the jobids of every call to InterProScan
    interproscan_jobids = {}

    # Iterate protein sequences
    for sequence in protein_sequences:
        # Check if the chain data already exists in the chains file
        chain_data = next((data for data in chains_data if data['sequence'] == sequence), None)
        # If we have no previous chain data then check if the sequence is already in the MDDB database
        if chain_data is None:
            chain_data = database.get_reference_data('chains', sequence)
            if chain_data is not None:
                chains_data.append(chain_data)
                # Save the chains data at this point
                # This may seem redundant since data will be not loaded further in the database
                # However, saving the chain in the local backup file is useful to further run without internet connection
                save_json(chains_data, chains_references_file.path)
        # If we still have no chain data then create a new chain data dict
        # Set an object with the results of every call to InterProScan
        if chain_data is None:
            chain_data = {
                'sequence': sequence,
                'interproscan': None
            }
            chains_data.append(chain_data)
        # If chain data is missing any analysis then send a job
        # Request the InterProScan service
        # Keep the returned job ids to check the status and get the results later
        if chain_data['interproscan'] is None:
            interproscan_jobid = request_interpsocan(sequence)
            # The request returns None when the service answers 404
            # Skip it so we never poll a bogus '.../status/None' URL later
            if interproscan_jobid is None:
                warn(f'Could not submit the InterProScan job for sequence {sequence}')
                continue
            interproscan_jobids[sequence] = interproscan_jobid

    # Get the pending InterProScan jobids
    pending_jobids = list(interproscan_jobids.values())

    # If we already have the results of all the chains then we can skip the next steps
    if len(pending_jobids) == 0:
        print(' All reference chains are already in the backup file')
    # DANI: Returning the data is not necessary since it is not used in the workflow
    # return chains_data
    # RUBEN: Separated functions so they can be used in the references updater
    get_interproscan_results(pending_jobids, interproscan_jobids, chains_data, chains_references_file)

175 

176 

def get_interproscan_results (
    pending_jobids : list,
    interproscan_jobids : dict,
    chains_data : list,
    chains_references_file : 'File',
) -> None:
    """Poll the InterProScan service until every pending job finishes and
    store the results in the chain records and the local backup file.

    Args:
        pending_jobids: job ids still waiting for a result (consumed in place).
        interproscan_jobids: map from chain sequence to its job id.
        chains_data: chain records to be completed with results (mutated in place).
        chains_references_file: backup 'File' updated after every retrieved result.
    """
    # Set the timeout for the InterProScan jobs
    # AGUS: it has sometimes taken ~6 minutes, which is excessive; 5 minutes of waiting should be enough
    TIMEOUT = 300 # 5 min (seconds)
    start_time = time.time()
    # Iterate over the jobids to check the status and get the results
    # If the status is 'FINISHED' then we can get the results and eliminate the jobid from the list
    # until there are no more jobids in either list
    while len(pending_jobids) >= 1:
        if time.time() - start_time > TIMEOUT:
            warn("Waiting time exceeded the limit. Chains data could not be obtained. Exiting analysis.")
            return
        time.sleep(3) # Wait for 3 seconds between polling rounds
        print(f' We are still waiting for {len(pending_jobids)} jobs to finish', end='\r')
        for sequence, interproscan_jobid in interproscan_jobids.items():
            # If the jobid is already processed then skip it
            if interproscan_jobid not in pending_jobids:
                continue
            status = check_interproscan_status(interproscan_jobid)
            # We only know four possible statuses for InterProScan jobs, but there could be more
            if status in ('RUNNING', 'PENDING', 'QUEUED'):
                continue
            # If the status is something that we don't know then raise an error so the problem gets noticed
            if status != 'FINISHED':
                raise ValueError('Something went wrong with the InterProScan job: ' + interproscan_jobid)
            # Retrieve the results from InterProScan
            interproscan_result = check_interproscan_result(interproscan_jobid)
            # Get the corresponding chain data and add the InterProScan results
            chain_data = next(data for data in chains_data if data['sequence'] == sequence)
            chain_data['version'] = CHAINS_VERSION
            chain_data['interproscan'] = interproscan_result
            # Remove version and pathways so Mongo doesn't get confused when they change
            del chain_data['interproscan']['interproscan-version']
            # RUBEN: I think 'results' always has a single element, but we iterate just in case
            for result in chain_data['interproscan']['results']:
                for match in result['matches']:
                    if match['signature']['entry'] is not None:
                        del match['signature']['entry']['pathwayXRefs']
            # Remove the jobid from the queue list
            pending_jobids.remove(interproscan_jobid)
            # Save the result
            save_json(chains_data, chains_references_file.path)

    print(' Protein chains data obtained ')