Coverage for model_workflow/analyses/pockets.py: 90%

247 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-23 10:54 +0000

1# Pockets analysis 

2#  

3# The pockets analysis is carried by the 'Fpocket' software. Fpocket is a fast open source protein  

4# pocket (cavity) detection algorithm based on Voronoi tessellation. 

5# 

6# The analysis is performed over 100 frames selected along the trajectory. First of all, an 'MDpocket' 

7# analysis is run over these frames thus generating an output grid file. This grid contains a measure  

8# of frequency of how many times the pocket was open during the trajectory. The most stable and wide  

9# pockets are selected from this grid and then analyzed again with MDpocket individually. This second  

10# run returns dynamic data of the pocket volume and drugability score, among others. 

11# 

12# Vincent Le Guilloux, Peter Schmidtke and Pierre Tuffery, “Fpocket: An open source platform for ligand  

13# pocket detection”, BMC Bioinformatics 2009, 10:168 

14# 

15# Peter Schmidtke & Xavier Barril “Understanding and predicting druggability. A high-throughput method  

16# for detection of drug binding sites.”, J Med Chem, 2010, 53(15):5858-67 

17# 

18# Peter Schmidtke, Axel Bidon-Chanal, Javier Luque, Xavier Barril, “MDpocket: open-source cavity detection 

19# and characterization on molecular dynamics trajectories.”, Bioinformatics. 2011 Dec 1;27(23):3276-85 

20 

21from os.path import exists, getsize, split 

22from os import remove, chdir, getcwd 

23import re 

24import collections 

25 

26from subprocess import run, PIPE 

27 

28from model_workflow.tools.get_reduced_trajectory import get_reduced_trajectory 

29from model_workflow.utils.auxiliar import warn, ToolError, save_json 

30from model_workflow.utils.file import File 

31from model_workflow.utils.constants import GREY_HEADER, COLOR_END 

32from model_workflow.utils.constants import OUTPUT_POCKETS_FILENAME, OUTPUT_POCKET_STRUCTURES_PREFIX 

33from model_workflow.utils.type_hints import * 

34 

35CURSOR_UP_ONE = '\x1b[1A' 

36ERASE_LINE = '\x1b[2K' 

37ERASE_PREVIOUS_LINE = CURSOR_UP_ONE + ERASE_LINE 

38ERASE_4_PREVIOUS_LINES = ERASE_PREVIOUS_LINE + ERASE_PREVIOUS_LINE + ERASE_PREVIOUS_LINE + ERASE_PREVIOUS_LINE + CURSOR_UP_ONE 

39 

40# Set some known error logs to check if we are having them 

41KNOWN_MDPOCKET_ERRORS = set([ 

42 'Error in creating clustering tree, return NULL pointer...breaking up' 

43]) 

44 

def pockets (
    structure_file : 'File',
    trajectory_file : 'File',
    output_directory : str,
    pbc_selection : 'Selection',
    snapshots : int,
    frames_limit : int = 100,
    # Get only the 10 first pockets since the analysis is quite slow by now
    # DANI: Once we do threading and there is no size limit to load in mongo we will be able to do more pockets
    maximum_pockets_number : int = 10):
    """Perform the pockets analysis.

    Run MDpocket over a reduced trajectory (up to 'frames_limit' frames) to
    produce a pocket frequency grid, group connected grid points into pockets,
    select the biggest pockets and re-run MDpocket over each of them to mine
    per-frame descriptors (e.g. pocket volumes) and implicated atoms.
    Results are exported to a JSON file in the output directory.

    Args:
        structure_file: structure (topology) file of the simulated system.
        trajectory_file: trajectory file of the simulated system.
        output_directory: directory where all output files are written.
        pbc_selection: atom selection under periodic boundary conditions.
            If truthy, the whole analysis is skipped.
        snapshots: total number of frames in the input trajectory.
        frames_limit: maximum number of frames in the reduced trajectory.
        maximum_pockets_number: maximum number of pockets to analyze in depth.

    Raises:
        ToolError: if the first mdpocket run fails or produces known errors.
        Exception: if grid dimensions cannot be mined or a per-pocket
            mdpocket run fails.
    """

    # DANI: For now, pockets are not computed for simulations with residues in PBC (e.g. membranes)
    # DANI: This is because the atoms where we do NOT want pockets to be found cannot be discarded
    # DANI: Discarding them means removing them, but then pockets may be found where those atoms are
    # DANI: We are waiting for the mdpocket developers to include a flag for these cases
    # https://github.com/Discngine/fpocket/issues/77#issuecomment-974193129
    if pbc_selection:
        print(' Pockets analysis will be skipped since we have PBC atoms')
        return

    # Set output filepaths
    output_analysis_filepath = f'{output_directory}/{OUTPUT_POCKETS_FILENAME}'

    # Set a reduced trajectory with only 100 frames
    # Get the step between frames of the new reduced trajectory, since it will be append to the output
    pockets_trajectory, step, frames = get_reduced_trajectory(
        structure_file,
        trajectory_file,
        snapshots,
        frames_limit,
    )
    # Save the pockets trajectory as a file
    pockets_trajectory_file = File(pockets_trajectory)

    # WARNING: There is a silent sharp limit of characters here
    # https://github.com/Discngine/fpocket/issues/130
    # To avoid the problem we must change the directory where we run pockets so all the paths are as short as possible

    # Save the MD path
    md_path, output_directory_name = split(output_directory)
    # Save the current working directory so we can recover it later
    recovery_path = getcwd()

    # Move to the MD path so all relative paths become shorter
    chdir(md_path)

    # Now set the structure and trajectory paths relative to the current new location
    # HARDCODE: We asume the trajectory and structure files are in the current location
    # This is now true, but if this changes in the future this will fail, although the error will be obvious
    auxiliar_trajectory_filepath = f'./{pockets_trajectory_file.filename}'
    auxiliar_structure_filepath = f'./{structure_file.filename}'

    # Run the mdpocket analysis to find new pockets
    mdpocket_output = output_directory_name + '/mdpout'
    # Set the filename of the fpocket output we are interested in
    grid_filename = mdpocket_output + '_freq.dx'
    # Skip this step if the output file already exists and is not empty
    # WARNING: The file is created as soon as mdpocket starts to run
    # WARNING: However the file remains empty until the end of mdpocket
    if not exists(grid_filename) or getsize(grid_filename) == 0:

        print('Searching new pockets')
        print(GREY_HEADER)
        process = run([
            "mdpocket",
            "--trajectory_file",
            # WARNING: There is a silent sharp limit of characters here
            # To avoid the problem we must use the relative path instead of the absolute path
            auxiliar_trajectory_filepath,
            "--trajectory_format",
            "xtc",
            "-f",
            # WARNING: There is a silent sharp limit of characters here
            # To avoid the problem we must use the relative path instead of the absolute path
            auxiliar_structure_filepath,
            "-o",
            mdpocket_output,
        ], stderr=PIPE)
        error_logs = process.stderr.decode()
        print(COLOR_END)

        # If file does not exist or is still empty at this point then something went wrong
        if not exists(grid_filename) or getsize(grid_filename) == 0:
            print(error_logs)
            raise ToolError('Something went wrong with mdpocket while searching pockets')

        # Check if we are having concerning error logs
        errored = False
        for error in KNOWN_MDPOCKET_ERRORS:
            matches = re.findall(error, error_logs)
            count = len(matches)
            if count > 0:
                warn(f'We are having "{error}" ({count})')
                errored = True

        # If so we can stop here since we will not be able to mine the grid filename
        # Remove the (corrupt) grid so the next run does not skip the mdpocket step
        if errored:
            if exists(grid_filename):
                remove(grid_filename)
            raise ToolError('We had errors with mdpocket while searching pockets')

    # Read and harvest the grid file
    with open(grid_filename, 'r') as file:

        # First, mine the grid dimensions and origin
        dimensions = None
        dimensions_pattern = "counts ([0-9]+) ([0-9]+) ([0-9]+)"
        origin = None
        origin_pattern = "origin ([.0-9-]+) ([.0-9-]+) ([.0-9-]+)"
        for line in file:
            search = re.search(dimensions_pattern, line)
            if search != None:
                dimensions = (int(search.group(1)),int(search.group(2)),int(search.group(3)))
            search = re.search(origin_pattern, line)
            if search != None:
                origin = (float(search.group(1)),float(search.group(2)),float(search.group(3)))
            if origin and dimensions:
                break
        if not dimensions:
            # This may happen when one of the dimensions is negative, which is an mdpocket error
            # DANI: The origin of this is not clear, but when it happened to me it came with many errors:
            # DANI: '! No Pockets Found while refining' and '! No pocket to reindex.'
            # DANI: Plus some 'Error in creating clustering tree, return NULL pointer...breaking up!'
            raise Exception('Failed to mine dimensions')

        # Next, mine the grid values
        # Note that the file iterator continues right after the header lines consumed above
        grid_values = []
        grid_values_pattern = "^([.0-9]+) ([.0-9]+) ([.0-9]+) $"
        # The last line of the grid may have less than 3 values
        last_line_grid_values_pattern_1 = "^([.0-9]+) ([.0-9]+) $"
        last_line_grid_values_pattern_2 = "^([.0-9]+) $"
        for line in file:
            search = re.search(grid_values_pattern, line)
            if (search != None):
                grid_values.append(float(search.group(1)))
                grid_values.append(float(search.group(2)))
                grid_values.append(float(search.group(3)))
                continue
            # This should only happen at the end
            search = re.search(last_line_grid_values_pattern_1, line)
            if (search != None):
                grid_values.append(float(search.group(1)))
                grid_values.append(float(search.group(2)))
                continue
            search = re.search(last_line_grid_values_pattern_2, line)
            if (search != None):
                grid_values.append(float(search.group(1)))
                continue

    # Set a function to get the value of a given 'x, y, z' grid point
    # Grid points are disposed first in 'z' order, then 'y', and finally 'x'
    xl, yl, zl = dimensions
    def get_index (x : int, y : int, z : int) -> int:
        index = x*yl*zl + y*zl + z
        return index

    # Get the coordinates of the colliding points
    # Set the x, y and z index limit (i.e. the limit - 1)
    xil, yil, zil = (xl-1, yl-1, zl-1)
    def get_colliding_points (point : tuple) -> list:
        """Return the (up to 26) neighbour grid points of 'point', clipped to the grid boundaries."""
        x, y, z = point[0], point[1], point[2]
        colliding_points = [
            (x-1, y-1, z-1),
            (x-1, y-1, z ),
            (x-1, y-1, z+1),
            (x-1, y , z-1),
            (x-1, y , z ),
            (x-1, y , z+1),
            (x-1, y+1, z-1),
            (x-1, y+1, z ),
            (x-1, y+1, z+1),
            (x , y-1, z-1),
            (x , y-1, z ),
            (x , y-1, z+1),
            (x , y , z-1),
            (x , y , z+1),
            (x , y+1, z-1),
            (x , y+1, z ),
            (x , y+1, z+1),
            (x+1, y-1, z-1),
            (x+1, y-1, z ),
            (x+1, y-1, z+1),
            (x+1, y , z-1),
            (x+1, y , z ),
            (x+1, y , z+1),
            (x+1, y+1, z-1),
            (x+1, y+1, z ),
            (x+1, y+1, z+1),
        ]
        # Discard neighbours which fall outside the grid on each axis
        if x <= 0:
            colliding_points = [ point for point in colliding_points if point[0] != x-1 ]
        if x >= xil:
            colliding_points = [ point for point in colliding_points if point[0] != x+1 ]
        if y <= 0:
            colliding_points = [ point for point in colliding_points if point[1] != y-1 ]
        if y >= yil:
            colliding_points = [ point for point in colliding_points if point[1] != y+1 ]
        if z <= 0:
            colliding_points = [ point for point in colliding_points if point[2] != z-1 ]
        if z >= zil:
            colliding_points = [ point for point in colliding_points if point[2] != z+1 ]
        return colliding_points

    # Set a cuttoff value to consider a point valid
    cuttoff = 0.5

    # Classify all non-zero values by 'pocket' groups according to if they are connected
    # Each grid point holds the pocket number it belongs to (0 means no pocket)
    # NOTE: this list intentionally shadows the module-level 'pockets' function name
    pockets = [0] * xl * yl * zl
    pockets_count = 0

    # Given a point which belongs to a pocket, find all pocket points connected to it
    # This connection must be wide enought for a water molecule to fit in
    # Grid points are 1 Ångstrom wide and a water molecule has a diameter of 2.75 Ångstroms aprox
    # So, pocket points must be connected by a net of points at least 3 points wide in all dimensions (x,y,z)
    # Tag all points with the same pocket number
    def set_pocket (start_point : tuple, pocket_number : int):
        """Flood-fill: tag every grid point connected to 'start_point' with 'pocket_number'."""
        # NOTE: 'points' is deliberately extended while being iterated, acting as an expansion queue
        points = [start_point]
        for point in points:
            # Get current point colliders
            colliding_points = get_colliding_points(point)
            # Filter only the pocket points
            pocket_points = []
            for point in colliding_points:
                index = get_index(*point)
                value = grid_values[index]
                # If it is a pocket then set it as part of the current pocket
                if value >= cuttoff:
                    # Tag it as the current pocket
                    pockets[index] = pocket_number
                    pocket_points.append(point)
            # In case all its surrounding points are pockets add the points to the list so they can keep expanding the pocket
            # This is done because if all surrounding points are pockets it means we have a wide enought region of the pocket
            # It is a 3 x 3 x 3 region.
            # Otherwise stop here since the surrounding points may not we wide enough to expand the pocket
            if len(pocket_points) < 26:
                continue
            # Filter points which are not in the pockets list already
            # NOTE(review): 'point not in points' is an O(n) list scan per candidate — potentially slow for big pockets
            new_points = [ point for point in pocket_points if point not in points ]
            # Update the points list
            points += new_points

    # Find out if a point is surrounded by all pocket points so it could be considered a pocket alone
    def is_pocket_base (start_point : tuple) -> bool:
        # Get current point colliders
        colliding_points = get_colliding_points(start_point)
        # Search if these points match the cutoff
        # If only 1 point does not match then this is not a pocket base
        for point in colliding_points:
            index = get_index(*point)
            value = grid_values[index]
            if value < cuttoff:
                return False
        return True

    # Scan the whole grid and start a new pocket at every untagged, wide-enough pocket point
    for x in range(xl):
        for y in range(yl):
            for z in range(zl):
                index = get_index(x,y,z)
                # If it is not a pocket then pass
                value = grid_values[index]
                if value < cuttoff:
                    continue
                # If it is a pocket but it has been already tagged then pass
                pocket = pockets[index]
                if pocket:
                    continue
                # If it is not wide enought to start a pocket then pass
                point = (x,y,z)
                if not is_pocket_base(point):
                    continue
                # If none of the previous values was marked as a pocket then we set a new pocket number
                pockets_count += 1
                set_pocket(start_point=point, pocket_number=pockets_count)

    # Exclude the first result which will always be 0 and it stands for no-pocket points
    # NOTE(review): this assumes the 0 (no-pocket) tag is always the most common entry — confirm
    biggest_pockets = collections.Counter(pockets).most_common()
    if len(biggest_pockets) == 1:
        print(' No pockets were found')
        # Recover the original directory
        chdir(recovery_path)
        return
    biggest_pockets = biggest_pockets[1:]
    pockets_number = len(biggest_pockets)
    # If we have more than the maximum number of pockets then get the first pockets and discard the rest
    if pockets_number > maximum_pockets_number:
        biggest_pockets = biggest_pockets[0:maximum_pockets_number]
        pockets_number = maximum_pockets_number

    # First of all, get all header lines from the original grid file
    # We need them to write grid files further
    with open(grid_filename,'r') as file:
        header_lines = []
        header_pattern = "^[a-z]"
        for line in file:
            if re.match(header_pattern, line):
                header_lines.append(line)
            elif re.match(grid_values_pattern, line):
                break

    # Set the dict where all output data will be stored
    output_analysis = []

    # Set the output pocket structure files prefix
    output_pocket_structure_prefix = f'{output_directory_name}/{OUTPUT_POCKET_STRUCTURES_PREFIX}_'

    # Next, we analyze each selected pocket independently. The steps for each pocket are:
    # 1 - Create a new grid file
    # 2 - Conver the grid to pdb
    # 3 - Analyze this pdb with mdpocket
    # 4 - Harvest the volumes over time and write them in the pockets analysis file
    for p, pock in enumerate(biggest_pockets, 1):
        # WARNING: This name must match the final name of the pocket file once loaded in the database
        pocket_name = 'pocket_' + str(p).zfill(2)
        pocket_output = output_directory_name + '/' + pocket_name
        # Check if current pocket files already exist and are complete. If so, skip this pocket
        # Output files:
        # - pX.dx: it is created and completed at the begining by this workflow
        # - pX_descriptors.txt: it is created at the begining and completed along the mdpocket progress
        # - pX_mdpocket_atoms.pdb: it is completed at the begining but remains size 0 until the end of mdpocket
        # - pX_mdpocket.pdb: it is completed at the begining but remains size 0 until the end of mdpocket
        # Note that checking pX_mdpocket_atoms.pdb or pX_mdpocket.pdb is enought to know if mdpocket was completed
        checking_filename = pocket_output + '_mdpocket.pdb'
        if not (exists(checking_filename) and getsize(checking_filename) > 0):

            # Update the logs
            print(f' Analyzing pocket {p}/{pockets_number}', end='\r')

            # Create the new grid for this pocket, where all values from other pockets are set to 0
            # 'pock' is a Counter entry: (pocket number, point count)
            pocket_value = pock[0]
            new_grid_values = [str(value).ljust(5,'0') if pockets[v] == pocket_value else '0.000' for v, value in enumerate(grid_values)]
            new_grid_filename = output_directory_name + '/' + pocket_name + '.dx'
            with open(new_grid_filename,'w') as file:
                # Write the header lines
                for line in header_lines:
                    file.write(line)
                # Write values in lines of 3 values
                # NOTE(review): a trailing partial line (1-2 leftover values) would never be written — confirm
                # the total number of grid values is always a multiple of 3 or that mdpocket tolerates the loss
                count = 0
                line = ''
                for value in new_grid_values:
                    if count == 0:
                        line = value
                        count += 1
                    elif count == 1:
                        line = line + ' ' + value
                        count += 1
                    elif count == 2:
                        line = line + ' ' + value + ' \n'
                        file.write(line)
                        count = 0

            # Set a function to get the 'x, y, z' coordinates of a given grid point index
            def get_coordinates (index : int) -> tuple:
                x, rem = divmod(index, yl*zl)
                y, z = divmod(rem, zl)
                return (x,y,z)

            # Convert the grid coordinates to pdb
            new_pdb_lines = []
            lines_count = 0
            new_pdb_filename = f'{output_pocket_structure_prefix}{str(p).zfill(2)}.pdb'
            for j, pocket in enumerate(pockets):
                if pocket != pocket_value:
                    continue
                x,y,z = get_coordinates(j)
                lines_count += 1
                atom_num = str(lines_count).rjust(6,' ')
                # Coordinates are rounded to 3 decimals and right-justified to fit the PDB fixed columns
                x_coordinates = str(round((origin[0] + x) * 1000) / 1000).rjust(8, ' ')
                y_coordinates = str(round((origin[1] + y) * 1000) / 1000).rjust(8, ' ')
                z_coordinates = str(round((origin[2] + z) * 1000) / 1000).rjust(8, ' ')
                line = "ATOM "+ atom_num +" C PTH 1 "+ x_coordinates + y_coordinates + z_coordinates +" 0.00 0.00\n"
                new_pdb_lines.append(line)

            # Write the pdb file
            with open(new_pdb_filename,'w') as file:
                for line in new_pdb_lines:
                    file.write(line)

            # Run the mdpocket analysis focusing in this specific pocket
            print(GREY_HEADER)
            error_logs = run([
                "mdpocket",
                "--trajectory_file",
                auxiliar_trajectory_filepath,
                "--trajectory_format",
                "xtc",
                "-f",
                # WARNING: There is a silent sharp limit of characters here
                # To avoid the problem we must use the relative path instead of the absolute path
                auxiliar_structure_filepath,
                "-o",
                pocket_output,
                "--selected_pocket",
                new_pdb_filename,
            ], stderr=PIPE).stderr.decode()
            print(COLOR_END)

            # If file does not exist or is still empty at this point then something went wrong
            if not exists(checking_filename) or getsize(checking_filename) == 0:
                print(error_logs)
                raise Exception(f'Something went wrong with mdpocket while analysing pocket {p}')

            # Remove previous lines
            print(ERASE_4_PREVIOUS_LINES)

        # Mine data from the mdpocket 'descriptors' output file
        # The first line holds the column names; following lines hold one value per column per frame
        descriptors_data = {}
        mdpocket_descriptors_filename = pocket_output + '_descriptors.txt'
        with open(mdpocket_descriptors_filename,'r') as file:
            # The '[:-1]' is to remove the break line at the end of each line
            entries = re.split("[ ]+", next(file)[:-1])
            for entry in entries:
                descriptors_data[entry] = []
            for line in file:
                line_data = re.split("[ ]+", line[:-1])
                for v, value in enumerate(line_data):
                    descriptors_data[entries[v]].append(value)

        # Mine the Voronoi vertices which represent the pocket each frame
        # Mine their positions and radius for each frame
        # Example of lines in the file to be mined:
        # ATOM 0 O STP 105 79.352 75.486 84.079 0.00 4.25
        # ATOM 1 C STP 105 79.735 75.228 84.482 0.00 4.13
        # ATOM 2 C STP 105 79.624 74.251 84.263 0.00 4.52
        # vertices = []
        # vertices_filename = pocket_output + '_mdpocket.pdb'
        # with open(vertices_filename,'r') as file:
        #     frame_vertices = []
        #     for line in file:
        #         line_data = re.split("[ ]+", line[:-1])
        #         if line_data[0] == 'ATOM':
        #             frame_vertices.append({
        #                 'e': line_data[2],
        #                 'n': line_data[4],
        #                 'x': line_data[5],
        #                 'y': line_data[6],
        #                 'z': line_data[7],
        #                 'r': line_data[9],
        #             })
        #         if line_data[0] == 'ENDMDL':
        #             vertices.append(frame_vertices)
        #             frame_vertices = []

        # Mine the atoms implicated in each pocket
        # In this file atoms are listed for each frame, but they are always the same
        # For this reason, we mine only atoms in the first frame
        atoms = []
        atoms_filename = pocket_output + '_mdpocket_atoms.pdb'
        with open(atoms_filename,'r') as file:
            for line in file:
                line_data = re.split("[ ]+", line[:-1])
                if line_data[0] == 'MODEL':
                    continue
                if line_data[0] == 'ATOM':
                    atoms.append(int(line_data[1]))
                if line_data[0] == 'ENDMDL':
                    break

        # Format the mined data and append it to the output data
        # NEVER FORGET: The 'descriptors_data' object contains a lot of data, not only pocket volumes
        # (e.g. drugability score)
        # DANI: We should sit down one day and decide which other values we want to keep
        output = {
            'name': pocket_name,
            'volumes': list(map(float, descriptors_data['pock_volume'])),
            #'vertices': vertices,
            'atoms': atoms,
        }

        output_analysis.append(output)

    # Recover the original directory
    chdir(recovery_path)

    # By default, the starting frame is always 0
    start = 0

    # Export the analysis in json format
    save_json({ 'data': output_analysis, 'start': start, 'step': step }, output_analysis_filepath)