Coverage for model_workflow/tools/chains.py: 43%

143 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-23 10:54 +0000

1 

2import json 

3import time 

4 

5from urllib.request import urlopen 

6from urllib.parse import urlencode 

7from urllib.error import HTTPError 

8 

9from model_workflow.utils.auxiliar import warn, load_json, save_json, protein_residue_name_to_letter 

10from model_workflow.utils.file import File 

11from model_workflow.utils.type_hints import * 

12 

# Set analysis version; stored in every processed chain record (see get_interproscan_results)
CHAINS_VERSION = '0.1'

15 

# Get the sequence and name of the chain in the structure and request the InterProScan
def request_interpsocan (sequence : str) -> str:
    """Submit a protein sequence to the EBI InterProScan 5 REST service.

    Returns the raw response body (the job id, as plain text) on success,
    or None when the service answers 404. Any other HTTP error raises a
    ValueError. NOTE: the function name keeps its historical misspelling
    ("interpsocan") since callers depend on it.
    """
    # Set the request URL
    request_url = 'https://www.ebi.ac.uk/Tools/services/rest/iprscan5/run'
    # Set the POST data (plain strings; the former f-prefixes had no placeholders)
    data = urlencode({
        'email': 'daniel.beltran@irbbarcelona.org',
        'title': 'Chain X',
        'sequence': f'>chain X\n{sequence}'
    }).encode()
    parsed_response = None
    try:
        with urlopen(request_url, data=data) as response:
            parsed_response = response.read().decode("utf-8")
    except HTTPError as error:
        # Show the service's error payload to help diagnose the failure
        print(error.read().decode())
        if error.code == 404:
            print(' Not found')
            return None
        else:
            raise ValueError('Something went wrong with the InterProScan request: ' + request_url)
    return parsed_response

38 

# Check the status of the InterProScan job
def check_interproscan_status (jobid : str) -> str:
    """Query the status of an InterProScan job by its job id.

    Returns the status string (e.g. 'RUNNING', 'FINISHED') as returned by
    the service, or None when the job is not found (404). Any other HTTP
    error raises a ValueError.
    """
    # Set the request URL
    request_url = f'https://www.ebi.ac.uk/Tools/services/rest/iprscan5/status/{jobid}'
    parsed_response = None
    try:
        with urlopen(request_url) as response:
            parsed_response = response.read().decode("utf-8")
    except HTTPError as error:
        # Show the service's error payload to help diagnose the failure
        print(error.read().decode())
        if error.code == 404:
            print(' Not found')
            return None
        else:
            raise ValueError('Something went wrong with the InterProScan status request: ' + request_url)
    return parsed_response

55 

# Obtain the result of the InterProScan job in json format
def check_interproscan_result (jobid : str) -> dict:
    """Download the JSON result of a finished InterProScan job.

    Returns the parsed result dict, or None when the job result is not
    found (404). A 503 aborts the program (service down); any other HTTP
    error raises a ValueError.
    """
    # Set the request URL
    request_url = f'https://www.ebi.ac.uk/Tools/services/rest/iprscan5/result/{jobid}/json'
    parsed_response = None
    try:
        with urlopen(request_url) as response:
            parsed_response = json.loads(response.read().decode("utf-8"))
    except HTTPError as error:
        # Show the service's error payload to help diagnose the failure
        print(error.read().decode())
        if error.code == 404:
            print(' Not found')
            return None
        elif error.code == 503:
            raise SystemExit('InterProScan Service unavailable. Please try again later.')
        else:
            raise ValueError('Something went wrong with the InterProScan results request: ' + request_url)
    return parsed_response

74 

# Check if the required sequence is already in the MDDB database
def check_sequence_in_mddb (sequence : str, database_url : str) -> dict:
    """Query the MDDB references service for a chain sequence.

    Returns the parsed JSON reference data, or None when the sequence is
    not found, the service is unavailable, or any other HTTP error occurs
    (best-effort lookup: failures are reported, never raised).
    NOTE(review): assumes database_url ends with a trailing slash — confirm with callers.
    """
    # Build the MDDB chain references request URL
    request_url = f'{database_url}rest/v1/references/chains/{sequence}'
    parsed_response = None
    try:
        with urlopen(request_url) as response:
            # Decode as UTF-8, silently dropping undecodable bytes
            parsed_response = json.loads(response.read().decode("utf-8", errors='ignore'))
    except HTTPError as error:
        # A 404 simply means the sequence has no reference in the database yet
        if error.code == 404:
            return None
        elif error.code == 503:
            print('MDDB Service unavailable. Please try again later.')
            return None
        # Any other HTTP error: report it and treat the lookup as a miss
        print('Error when requesting ', request_url)
        return None
    return parsed_response

95 

96 

# Get the parsed chains from the structure
def get_protein_parsed_chains (structure : 'Structure') -> list:
    """Extract the protein chains from a structure as name/sequence dicts.

    Each chain's residue names are translated to one-letter aminoacid
    codes. Chains whose residues are all unknown ('X') are not proteins
    and are skipped (empty chains are skipped too, since all() on an
    empty sequence is True).
    """
    parsed_chains = []
    # Iterate over the chains in the structure
    for chain in structure.chains:
        # Translate every residue name to its one-letter equivalent
        # (''.join avoids the quadratic cost of repeated string concatenation)
        sequence = ''.join(protein_residue_name_to_letter(residue.name) for residue in chain.residues)
        # If all residues are 'X' then it means this is not a protein
        if all(letter == 'X' for letter in sequence):
            continue
        # Create a dictionary with the chain name and sequence, to be returned
        parsed_chains.append({ 'name': chain.name, 'sequence': sequence })
    return parsed_chains

118 

# Set the expected chain data fields (every imported chain record is normalized to hold these)
CHAIN_DATA_FIELDS = set(['sequence', 'interproscan'])

121 

# Import the chains data from a file if exists
def import_chains (chains_references_file : 'File') -> list:
    """Load previously saved chains data from the backup JSON file.

    Every imported record is normalized so that all CHAIN_DATA_FIELDS are
    present (missing fields become None), which is what the rest of the
    process expects. Returns the list of chain data dicts.
    (Return annotation fixed: the function returns a list, not a dict.)
    """
    # Read the file
    imported_chains = load_json(chains_references_file.path)
    # Format data as the process expects to find it
    for imported_chain in imported_chains:
        for expected_field in CHAIN_DATA_FIELDS:
            # setdefault keeps existing values and adds None for missing fields
            imported_chain.setdefault(expected_field, None)
    return imported_chains

132 

def prepare_chain_references (
    structure : 'Structure',
    output_filepath : 'File',
    database_url : str,
):
    """Define the main function that will be called from the main script.

    This function gets the parsed protein chains from the structure,
    reuses previously saved or MDDB-known chain references where
    possible, and requests the InterProScan service for the rest.
    Results are collected by get_interproscan_results and written to the
    chain references backup file.
    """
    # Set the chain references output file
    chains_references_file = File(output_filepath)

    # Obtain the protein parsed chains from the structure
    protein_parsed_chains = get_protein_parsed_chains(structure)

    # Get unique sequences
    protein_sequences = set(chain['sequence'] for chain in protein_parsed_chains)

    print(f' Found {len(protein_parsed_chains)} protein chains with {len(protein_sequences)} unique sequences')

    # Accumulate data from all chains, to be saved in a file
    chains_data = []
    # Load the chains file if it exists already
    if chains_references_file.exists:
        chains_data += import_chains(chains_references_file)

    # Save the job ids of every call to InterProScan, keyed by sequence
    interproscan_jobids = {}

    # Iterate protein sequences
    for sequence in protein_sequences:
        # Check if the chain data already exists in the chains file
        chain_data = next((data for data in chains_data if data['sequence'] == sequence), None)
        # If we have no previous chain data then check if the sequence is already in the MDDB database
        if chain_data is None:
            chain_data = check_sequence_in_mddb(sequence, database_url)
            if chain_data is not None:
                chains_data.append(chain_data)
                # Save the chains data at this point
                # This may seem redundant since data will not be loaded further in the database
                # However, saving the chain in the local backup file is useful to further run without internet connection
                save_json(chains_data, chains_references_file.path)
        # If we still have no chain data then create a new chain data dict
        # with a slot for the results of the InterProScan call
        if chain_data is None:
            chain_data = {
                'sequence': sequence,
                'interproscan': None
            }
            chains_data.append(chain_data)
        # If chain data is missing the InterProScan analysis then request the service
        # Keep the returned job ids to check the status and get the results later
        if chain_data['interproscan'] is None:
            interproscan_jobids[sequence] = request_interpsocan(sequence)

    # Get the pending InterProScan job ids
    pending_jobids = list(interproscan_jobids.values())

    # If we already have the results of all the chains then there is nothing to wait for
    if len(pending_jobids) == 0:
        print(' All reference chains are already in the backup file')
    # NOTE: returning chains_data is not needed since it is not used in the workflow
    # Result collection lives in a separate function so the references updater can reuse it
    get_interproscan_results(pending_jobids, interproscan_jobids, chains_data, chains_references_file)

199 

200 

def get_interproscan_results (
    pending_jobids : list,
    interproscan_jobids : dict,
    chains_data : list,
    chains_references_file : 'File',
) -> None:
    """Poll InterProScan until every pending job finishes and store the results.

    Each finished job's result is attached to its chain entry in
    chains_data (tagged with CHAINS_VERSION) and the backup file is
    rewritten after every job. pending_jobids is consumed in place.
    Gives up with a warning once the overall TIMEOUT is exceeded.
    """
    # Set the timeout for the InterProScan jobs
    # NOTE: jobs have been seen to take ~6 minutes, which is excessive; 5 minutes is deemed enough waiting time
    TIMEOUT = 300  # 5 min, in seconds
    start_time = time.time()
    # Iterate over the job ids to check the status and get the results.
    # Finished jobs are removed from the pending list until none remain.
    while pending_jobids:
        if time.time() - start_time > TIMEOUT:
            warn("Waiting time exceeded the limit. Chains data could not be obtained. Exiting analysis.")
            return
        time.sleep(3) # Wait for 3 seconds between polling rounds
        print(f' We are still waiting for {len(pending_jobids)} jobs to finish', end='\r')
        for sequence, interproscan_jobid in interproscan_jobids.items():
            # If the job is already processed then skip it
            if interproscan_jobid not in pending_jobids:
                continue
            status = check_interproscan_status(interproscan_jobid)
            # We only know four possible status values for InterProScan jobs, but there could be more
            if status in ('RUNNING', 'PENDING', 'QUEUED'):
                continue
            # If the status is something unexpected then raise an error so the new case gets handled
            if status != 'FINISHED':
                raise ValueError('Something went wrong with the InterProScan job: ' + interproscan_jobid)
            # Retrieve the results from InterProScan
            interproscan_result = check_interproscan_result(interproscan_jobid)
            # Get the corresponding chain data and add the InterProScan results
            chain_data = next(data for data in chains_data if data['sequence'] == sequence)
            chain_data['version'] = CHAINS_VERSION
            chain_data['interproscan'] = interproscan_result
            # A finished job may still yield no result (e.g. a 404 on the result request);
            # guard so we do not subscript None below
            if interproscan_result is not None:
                # Remove version and pathways so Mongo doesn't get confused when they change
                del chain_data['interproscan']['interproscan-version']
                # NOTE: 'results' is believed to always hold a single element, but iterate just in case
                for result in chain_data['interproscan']['results']:
                    for match in result['matches']:
                        if match['signature']['entry'] is not None:
                            del match['signature']['entry']['pathwayXRefs']
            # Remove the job id from the queue list
            pending_jobids.remove(interproscan_jobid)
            # Save the result
            save_json(chains_data, chains_references_file.path)

    print(' Protein chains data obtained ')