Coverage for model_workflow/analyses/pockets.py: 90%
247 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
1# Pockets analysis
2#
3# The pockets analysis is carried out by the 'Fpocket' software. Fpocket is a fast open source protein
4# pocket (cavity) detection algorithm based on Voronoi tessellation.
5#
6# The analysis is performed over 100 frames selected along the trajectory. First of all, an 'MDpocket'
7# analysis is run over these frames thus generating an output grid file. This grid contains a measure
8# of frequency of how many times the pocket was open during the trajectory. The most stable and wide
9# pockets are selected from this grid and then analyzed again with MDpocket individually. This second
10# run returns dynamic data of the pocket volume and drugability score, among others.
11#
12# Vincent Le Guilloux, Peter Schmidtke and Pierre Tuffery, “Fpocket: An open source platform for ligand
13# pocket detection”, BMC Bioinformatics 2009, 10:168
14#
15# Peter Schmidtke & Xavier Barril “Understanding and predicting druggability. A high-throughput method
16# for detection of drug binding sites.”, J Med Chem, 2010, 53(15):5858-67
17#
18# Peter Schmidtke, Axel Bidon-Chanal, Javier Luque, Xavier Barril, “MDpocket: open-source cavity detection
19# and characterization on molecular dynamics trajectories.”, Bioinformatics. 2011 Dec 1;27(23):3276-85
21from os.path import exists, getsize, split
22from os import remove, chdir, getcwd
23import re
24import collections
26from subprocess import run, PIPE
28from model_workflow.tools.get_reduced_trajectory import get_reduced_trajectory
29from model_workflow.utils.auxiliar import warn, ToolError, save_json
30from model_workflow.utils.file import File
31from model_workflow.utils.constants import GREY_HEADER, COLOR_END
32from model_workflow.utils.constants import OUTPUT_POCKETS_FILENAME, OUTPUT_POCKET_STRUCTURES_PREFIX
33from model_workflow.utils.type_hints import *
# ANSI terminal escape sequences used to erase previous progress log lines
CURSOR_UP_ONE = '\x1b[1A'
ERASE_LINE = '\x1b[2K'
# Move the cursor up one line and wipe it
ERASE_PREVIOUS_LINE = CURSOR_UP_ONE + ERASE_LINE
# Wipe the 4 previous lines and leave the cursor one line above them
ERASE_4_PREVIOUS_LINES = ERASE_PREVIOUS_LINE * 4 + CURSOR_UP_ONE

# Set some known error logs to check if we are having them
KNOWN_MDPOCKET_ERRORS = {
    'Error in creating clustering tree, return NULL pointer...breaking up'
}
def pockets (
    structure_file : 'File',
    trajectory_file : 'File',
    output_directory : str,
    pbc_selection : 'Selection',
    snapshots : int,
    frames_limit : int = 100,
    # Get only the 10 first pockets since the analysis is quite slow by now
    # DANI: Once we have threading and no size limit to load in mongo we will be able to do more pockets
    maximum_pockets_number : int = 10):
    """Perform the pockets analysis.

    Run MDpocket over a reduced trajectory to produce a pocket frequency grid,
    group connected grid points into pockets, select the biggest ones and run
    MDpocket again over each selected pocket to mine per-frame volumes and the
    atoms implicated in the pocket. Results are exported as a JSON file in the
    output directory.

    Args:
        structure_file: structure (topology) file of the system.
        trajectory_file: trajectory file of the system.
        output_directory: directory where all output files are written.
        pbc_selection: atom selection under periodic boundary conditions.
            If truthy the whole analysis is skipped.
        snapshots: total number of frames in the trajectory.
        frames_limit: maximum number of frames in the reduced trajectory.
        maximum_pockets_number: maximum number of pockets to be analyzed.

    Raises:
        ToolError: if mdpocket fails or logs a known concerning error.
        Exception: if grid dimensions cannot be mined or a per-pocket
            mdpocket run produces no output.
    """

    # DANI: For now, pockets are not run for simulations with residues in PBC (e.g. membrane)
    # DANI: This is because the atoms where we do NOT want pockets to be found cannot be discarded
    # DANI: Discarding them means removing them, but if we remove them then pockets may be found where these atoms were
    # DANI: We are waiting for the mdpocket developers to include a flag for these cases
    # https://github.com/Discngine/fpocket/issues/77#issuecomment-974193129
    if pbc_selection:
        print(' Pockets analysis will be skipped since we have PBC atoms')
        return

    # Set output filepaths
    output_analysis_filepath = f'{output_directory}/{OUTPUT_POCKETS_FILENAME}'

    # Set a reduced trajectory with only 100 frames
    # Get the step between frames of the new reduced trajectory, since it will be appended to the output
    pockets_trajectory, step, frames = get_reduced_trajectory(
        structure_file,
        trajectory_file,
        snapshots,
        frames_limit,
    )
    # Save the pockets trajectory as a file
    pockets_trajectory_file = File(pockets_trajectory)

    # WARNING: There is a silent sharp limit of characters here
    # https://github.com/Discngine/fpocket/issues/130
    # To avoid the problem we must change the directory where we run pockets so all the paths are as short as possible

    # Save the MD path
    md_path, output_directory_name = split(output_directory)
    # Save the current working directory so we can recover it later
    recovery_path = getcwd()

    # Move to the MD path so all relative paths become shorter
    # NOTE(review): if any exception is raised below, the working directory is
    # never restored — consider wrapping the rest of this function in try/finally
    chdir(md_path)

    # Now set the structure and trajectory paths relative to the current new location
    # HARDCODE: We assume the trajectory and structure files are in the current location
    # This is now true, but if this changes in the future this will fail, although the error will be obvious
    auxiliar_trajectory_filepath = f'./{pockets_trajectory_file.filename}'
    auxiliar_structure_filepath = f'./{structure_file.filename}'

    # Run the mdpocket analysis to find new pockets
    mdpocket_output = output_directory_name + '/mdpout'
    # Set the filename of the fpocket output we are interested in
    grid_filename = mdpocket_output + '_freq.dx'
    # Skip this step if the output file already exists and is not empty
    # WARNING: The file is created as soon as mdpocket starts to run
    # WARNING: However the file remains empty until the end of mdpocket
    if not exists(grid_filename) or getsize(grid_filename) == 0:

        print('Searching new pockets')
        print(GREY_HEADER)
        process = run([
            "mdpocket",
            "--trajectory_file",
            # WARNING: There is a silent sharp limit of characters here
            # To avoid the problem we must use the relative path instead of the absolute path
            auxiliar_trajectory_filepath,
            "--trajectory_format",
            "xtc",
            "-f",
            # WARNING: There is a silent sharp limit of characters here
            # To avoid the problem we must use the relative path instead of the absolute path
            auxiliar_structure_filepath,
            "-o",
            mdpocket_output,
        ], stderr=PIPE)
        error_logs = process.stderr.decode()
        print(COLOR_END)

        # If the file does not exist or is still empty at this point then something went wrong
        if not exists(grid_filename) or getsize(grid_filename) == 0:
            print(error_logs)
            raise ToolError('Something went wrong with mdpocket while searching pockets')

        # Check if we are having concerning error logs
        errored = False
        for error in KNOWN_MDPOCKET_ERRORS:
            matches = re.findall(error, error_logs)
            count = len(matches)
            if count > 0:
                warn(f'We are having "{error}" ({count})')
                errored = True

        # If so we can stop here since we will not be able to mine the grid filename
        # The (possibly corrupted) grid file is removed so the next run starts clean
        if errored:
            if exists(grid_filename):
                remove(grid_filename)
            raise ToolError('We had errors with mdpocket while searching pockets')

    # Read and harvest the grid file
    with open(grid_filename, 'r') as file:

        # First, mine the grid dimensions and origin
        dimensions = None
        dimensions_pattern = "counts ([0-9]+) ([0-9]+) ([0-9]+)"
        origin = None
        origin_pattern = "origin ([.0-9-]+) ([.0-9-]+) ([.0-9-]+)"
        for line in file:
            search = re.search(dimensions_pattern, line)
            if search != None:
                dimensions = (int(search.group(1)),int(search.group(2)),int(search.group(3)))
            search = re.search(origin_pattern, line)
            if search != None:
                origin = (float(search.group(1)),float(search.group(2)),float(search.group(3)))
            if origin and dimensions:
                break
        if not dimensions:
            # This may happen when one of the dimensions is negative, which is an mdpocket error
            # DANI: The origin of this is not clear, but when it happened to me it came along with many errors:
            # DANI: '! No Pockets Found while refining' and '! No pocket to reindex.'
            # DANI: As well as some 'Error in creating clustering tree, return NULL pointer...breaking up!'
            raise Exception('Failed to mine dimensions')

        # Next, mine the grid values
        # Note this loop resumes reading the same file handle right after the header break above
        grid_values = []
        grid_values_pattern = "^([.0-9]+) ([.0-9]+) ([.0-9]+) $"
        # The last line of the grid may have less than 3 values
        last_line_grid_values_pattern_1 = "^([.0-9]+) ([.0-9]+) $"
        last_line_grid_values_pattern_2 = "^([.0-9]+) $"
        for line in file:
            search = re.search(grid_values_pattern, line)
            if (search != None):
                grid_values.append(float(search.group(1)))
                grid_values.append(float(search.group(2)))
                grid_values.append(float(search.group(3)))
                continue
            # This should only happen at the end
            search = re.search(last_line_grid_values_pattern_1, line)
            if (search != None):
                grid_values.append(float(search.group(1)))
                grid_values.append(float(search.group(2)))
                continue
            search = re.search(last_line_grid_values_pattern_2, line)
            if (search != None):
                grid_values.append(float(search.group(1)))
                continue

    # Set a function to get the value of a given 'x, y, z' grid point
    # Grid points are disposed first in 'z' order, then 'y', and finally 'x'
    xl, yl, zl = dimensions
    def get_index (x : int, y : int, z : int) -> int:
        index = x*yl*zl + y*zl + z
        return index

    # Get the coordinates of the colliding points
    # Set the x, y and z index limit (i.e. the limit - 1)
    xil, yil, zil = (xl-1, yl-1, zl-1)
    def get_colliding_points (point : tuple) -> list:
        # Return the (up to) 26 grid neighbours of 'point', clipped to the grid bounds
        x, y, z = point[0], point[1], point[2]
        colliding_points = [
            (x-1, y-1, z-1),
            (x-1, y-1, z ),
            (x-1, y-1, z+1),
            (x-1, y , z-1),
            (x-1, y , z ),
            (x-1, y , z+1),
            (x-1, y+1, z-1),
            (x-1, y+1, z ),
            (x-1, y+1, z+1),
            (x , y-1, z-1),
            (x , y-1, z ),
            (x , y-1, z+1),
            (x , y , z-1),
            (x , y , z+1),
            (x , y+1, z-1),
            (x , y+1, z ),
            (x , y+1, z+1),
            (x+1, y-1, z-1),
            (x+1, y-1, z ),
            (x+1, y-1, z+1),
            (x+1, y , z-1),
            (x+1, y , z ),
            (x+1, y , z+1),
            (x+1, y+1, z-1),
            (x+1, y+1, z ),
            (x+1, y+1, z+1),
        ]
        # Discard neighbours which would fall out of the grid on any axis
        if x <= 0:
            colliding_points = [ point for point in colliding_points if point[0] != x-1 ]
        if x >= xil:
            colliding_points = [ point for point in colliding_points if point[0] != x+1 ]
        if y <= 0:
            colliding_points = [ point for point in colliding_points if point[1] != y-1 ]
        if y >= yil:
            colliding_points = [ point for point in colliding_points if point[1] != y+1 ]
        if z <= 0:
            colliding_points = [ point for point in colliding_points if point[2] != z-1 ]
        if z >= zil:
            colliding_points = [ point for point in colliding_points if point[2] != z+1 ]
        return colliding_points

    # Set a cutoff value to consider a point valid
    cuttoff = 0.5

    # Classify all non-zero values by 'pocket' groups according to if they are connected
    # Note this local list shadows the 'pockets' function name from here on
    pockets = [0] * xl * yl * zl
    pockets_count = 0

    # Given a point which belongs to a pocket, find all pocket points connected to it
    # This connection must be wide enough for a water molecule to fit in
    # Grid points are 1 Ångstrom wide and a water molecule has a diameter of 2.75 Ångstroms approx
    # So, pocket points must be connected by a net of points at least 3 points wide in all dimensions (x,y,z)
    # Tag all points with the same pocket number
    def set_pocket (start_point : tuple, pocket_number : int):
        # Breadth-first expansion: 'points' is both the frontier and the visited list,
        # and it is intentionally extended while being iterated
        points = [start_point]
        for point in points:
            # Get current point colliders
            colliding_points = get_colliding_points(point)
            # Filter only the pocket points
            # NOTE(review): the inner loop reuses the name 'point'; harmless here
            # since the outer for re-binds it each iteration, but confusing to read
            pocket_points = []
            for point in colliding_points:
                index = get_index(*point)
                value = grid_values[index]
                # If it is a pocket then set it as part of the current pocket
                if value >= cuttoff:
                    # Tag it as the current pocket
                    pockets[index] = pocket_number
                    pocket_points.append(point)
            # In case all its surrounding points are pockets add the points to the list so they can keep expanding the pocket
            # This is done because if all surrounding points are pockets it means we have a wide enough region of the pocket
            # It is a 3 x 3 x 3 region.
            # Otherwise stop here since the surrounding points may not be wide enough to expand the pocket
            if len(pocket_points) < 26:
                continue
            # Filter points which are not in the pockets list already
            # NOTE(review): 'point not in points' is a linear scan over a growing list
            # (O(n^2) overall) — a companion set would make this lookup O(1)
            new_points = [ point for point in pocket_points if point not in points ]
            # Update the points list
            points += new_points

    # Find out if a point is surrounded by all pocket points so it could be considered a pocket alone
    def is_pocket_base (start_point : tuple) -> bool:
        # Get current point colliders
        colliding_points = get_colliding_points(start_point)
        # Search if these points match the cutoff
        # If only 1 point does not match then this is not a pocket base
        for point in colliding_points:
            index = get_index(*point)
            value = grid_values[index]
            if value < cuttoff:
                return False
        return True

    # Save also each point coordinates
    # Scan the whole grid and seed a new pocket from every untagged, wide-enough point
    for x in range(xl):
        for y in range(yl):
            for z in range(zl):
                index = get_index(x,y,z)
                # If it is not a pocket then pass
                value = grid_values[index]
                if value < cuttoff:
                    continue
                # If it is a pocket but it has been already tagged then pass
                pocket = pockets[index]
                if pocket:
                    continue
                # If it is not wide enough to start a pocket then pass
                point = (x,y,z)
                if not is_pocket_base(point):
                    continue
                # If none of the previous values was marked as a pocket then we set a new pocket number
                pockets_count += 1
                set_pocket(start_point=point, pocket_number=pockets_count)

    # Exclude the first result which will always be 0 and it stands for no-pocket points
    # most_common() yields (pocket_number, point_count) pairs sorted by size
    biggest_pockets = collections.Counter(pockets).most_common()
    if len(biggest_pockets) == 1:
        print(' No pockets were found')
        # Recover the original directory
        chdir(recovery_path)
        return
    biggest_pockets = biggest_pockets[1:]
    pockets_number = len(biggest_pockets)
    # If we have more than the maximum number of pockets then get the first pockets and discard the rest
    if pockets_number > maximum_pockets_number:
        biggest_pockets = biggest_pockets[0:maximum_pockets_number]
        pockets_number = maximum_pockets_number

    # First of all, get all header lines from the original grid file
    # We need them to write grid files further
    with open(grid_filename,'r') as file:
        header_lines = []
        header_pattern = "^[a-z]"
        for line in file:
            if re.match(header_pattern, line):
                header_lines.append(line)
            elif re.match(grid_values_pattern, line):
                break

    # Set the dict where all output data will be stored
    output_analysis = []

    # Set the output pocket structure files prefix
    output_pocket_structure_prefix = f'{output_directory_name}/{OUTPUT_POCKET_STRUCTURES_PREFIX}_'

    # Next, we analyze each selected pocket independently. The steps for each pocket are:
    # 1 - Create a new grid file
    # 2 - Convert the grid to pdb
    # 3 - Analyze this pdb with mdpocket
    # 4 - Harvest the volumes over time and write them in the pockets analysis file
    for p, pock in enumerate(biggest_pockets, 1):
        # WARNING: This name must match the final name of the pocket file once loaded in the database
        pocket_name = 'pocket_' + str(p).zfill(2)
        pocket_output = output_directory_name + '/' + pocket_name
        # Check if current pocket files already exist and are complete. If so, skip this pocket
        # Output files:
        # - pX.dx: it is created and completed at the beginning by this workflow
        # - pX_descriptors.txt: it is created at the beginning and completed along the mdpocket progress
        # - pX_mdpocket_atoms.pdb: it is completed at the beginning but remains size 0 until the end of mdpocket
        # - pX_mdpocket.pdb: it is completed at the beginning but remains size 0 until the end of mdpocket
        # Note that checking pX_mdpocket_atoms.pdb or pX_mdpocket.pdb is enough to know if mdpocket was completed
        checking_filename = pocket_output + '_mdpocket.pdb'
        if not (exists(checking_filename) and getsize(checking_filename) > 0):

            # Update the logs
            print(f' Analyzing pocket {p}/{pockets_number}', end='\r')

            # Create the new grid for this pocket, where all values from other pockets are set to 0
            pocket_value = pock[0]
            new_grid_values = [str(value).ljust(5,'0') if pockets[v] == pocket_value else '0.000' for v, value in enumerate(grid_values)]
            new_grid_filename = output_directory_name + '/' + pocket_name + '.dx'
            with open(new_grid_filename,'w') as file:
                # Write the header lines
                for line in header_lines:
                    file.write(line)
                # Write values in lines of 3 values
                count = 0
                line = ''
                for value in new_grid_values:
                    if count == 0:
                        line = value
                        count += 1
                    elif count == 1:
                        line = line + ' ' + value
                        count += 1
                    elif count == 2:
                        line = line + ' ' + value + ' \n'
                        file.write(line)
                        count = 0

            # Set a function to get the 'x, y, z' coordinates of a given grid point index
            # This is the inverse of get_index above
            def get_coordinates (index : int) -> tuple:
                x, rem = divmod(index, yl*zl)
                y, z = divmod(rem, zl)
                return (x,y,z)

            # Convert the grid coordinates to pdb
            new_pdb_lines = []
            lines_count = 0
            new_pdb_filename = f'{output_pocket_structure_prefix}{str(p).zfill(2)}.pdb'
            for j, pocket in enumerate(pockets):
                if pocket != pocket_value:
                    continue
                x,y,z = get_coordinates(j)
                lines_count += 1
                atom_num = str(lines_count).rjust(6,' ')
                # Grid indices are translated back to absolute coordinates through the grid origin
                x_coordinates = str(round((origin[0] + x) * 1000) / 1000).rjust(8, ' ')
                y_coordinates = str(round((origin[1] + y) * 1000) / 1000).rjust(8, ' ')
                z_coordinates = str(round((origin[2] + z) * 1000) / 1000).rjust(8, ' ')
                line = "ATOM "+ atom_num +" C PTH 1 "+ x_coordinates + y_coordinates + z_coordinates +" 0.00 0.00\n"
                new_pdb_lines.append(line)

            # Write the pdb file
            with open(new_pdb_filename,'w') as file:
                for line in new_pdb_lines:
                    file.write(line)

            # Run the mdpocket analysis focusing in this specific pocket
            print(GREY_HEADER)
            error_logs = run([
                "mdpocket",
                "--trajectory_file",
                auxiliar_trajectory_filepath,
                "--trajectory_format",
                "xtc",
                "-f",
                # WARNING: There is a silent sharp limit of characters here
                # To avoid the problem we must use the relative path instead of the absolute path
                auxiliar_structure_filepath,
                "-o",
                pocket_output,
                "--selected_pocket",
                new_pdb_filename,
            ], stderr=PIPE).stderr.decode()
            print(COLOR_END)

            # If the file does not exist or is still empty at this point then something went wrong
            if not exists(checking_filename) or getsize(checking_filename) == 0:
                print(error_logs)
                raise Exception(f'Something went wrong with mdpocket while analysing pocket {p}')

            # Remove previous lines
            print(ERASE_4_PREVIOUS_LINES)

        # Mine data from the mdpocket 'descriptors' output file
        # First line holds column names; following lines hold one value per column per frame
        descriptors_data = {}
        mdpocket_descriptors_filename = pocket_output + '_descriptors.txt'
        with open(mdpocket_descriptors_filename,'r') as file:
            # The '[:-1]' is to remove the break line at the end of each line
            entries = re.split("[ ]+", next(file)[:-1])
            for entry in entries:
                descriptors_data[entry] = []
            for line in file:
                line_data = re.split("[ ]+", line[:-1])
                for v, value in enumerate(line_data):
                    descriptors_data[entries[v]].append(value)

        # Mine the Voronoi vertices which represent the pocket each frame
        # Mine their positions and radius for each frame
        # Example of lines in the file to be mined:
        # ATOM 0 O STP 105 79.352 75.486 84.079 0.00 4.25
        # ATOM 1 C STP 105 79.735 75.228 84.482 0.00 4.13
        # ATOM 2 C STP 105 79.624 74.251 84.263 0.00 4.52
        # vertices = []
        # vertices_filename = pocket_output + '_mdpocket.pdb'
        # with open(vertices_filename,'r') as file:
        # frame_vertices = []
        # for line in file:
        # line_data = re.split("[ ]+", line[:-1])
        # if line_data[0] == 'ATOM':
        # frame_vertices.append({
        # 'e': line_data[2],
        # 'n': line_data[4],
        # 'x': line_data[5],
        # 'y': line_data[6],
        # 'z': line_data[7],
        # 'r': line_data[9],
        # })
        # if line_data[0] == 'ENDMDL':
        # vertices.append(frame_vertices)
        # frame_vertices = []

        # Mine the atoms implicated in each pocket
        # In this file atoms are listed for each frame, but they are always the same
        # For this reason, we mine only atoms in the first frame
        atoms = []
        atoms_filename = pocket_output + '_mdpocket_atoms.pdb'
        with open(atoms_filename,'r') as file:
            for line in file:
                line_data = re.split("[ ]+", line[:-1])
                if line_data[0] == 'MODEL':
                    continue
                if line_data[0] == 'ATOM':
                    atoms.append(int(line_data[1]))
                if line_data[0] == 'ENDMDL':
                    break

        # Format the mined data and append it to the output data
        # NEVER FORGET: The 'descriptors_data' object contains a lot of data, not only pocket volumes
        # (e.g. druggability score)
        # DANI: We should sit down some day and decide which other values we want to keep
        output = {
            'name': pocket_name,
            'volumes': list(map(float, descriptors_data['pock_volume'])),
            #'vertices': vertices,
            'atoms': atoms,
        }

        output_analysis.append(output)

    # Recover the original directory
    chdir(recovery_path)

    # By default, the starting frame is always 0
    start = 0

    # Export the analysis in json format
    save_json({ 'data': output_analysis, 'start': start, 'step': step }, output_analysis_filepath)