Coverage for model_workflow/analyses/pockets.py: 90%
247 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
1# Pockets analysis
2#
3# The pockets analysis is carried out by the 'Fpocket' software. Fpocket is a fast open source protein
4# pocket (cavity) detection algorithm based on Voronoi tessellation.
5#
6# The analysis is performed over 100 frames selected along the trajectory. First of all, an 'MDpocket'
7# analysis is run over these frames thus generating an output grid file. This grid contains a measure
8# of frequency of how many times the pocket was open during the trajectory. The most stable and wide
9# pockets are selected from this grid and then analyzed again with MDpocket individually. This second
10# run returns dynamic data of the pocket volume and drugability score, among others.
11#
12# Vincent Le Guilloux, Peter Schmidtke and Pierre Tuffery, “Fpocket: An open source platform for ligand
13# pocket detection”, BMC Bioinformatics 2009, 10:168
14#
15# Peter Schmidtke & Xavier Barril “Understanding and predicting druggability. A high-throughput method
16# for detection of drug binding sites.”, J Med Chem, 2010, 53(15):5858-67
17#
18# Peter Schmidtke, Axel Bidon-Chanal, Javier Luque, Xavier Barril, “MDpocket: open-source cavity detection
19# and characterization on molecular dynamics trajectories.”, Bioinformatics. 2011 Dec 1;27(23):3276-85
21from os.path import exists, getsize, split
22from os import remove, chdir, getcwd
23import re
24import collections
26from subprocess import run, PIPE
28from model_workflow.tools.get_reduced_trajectory import get_reduced_trajectory
29from model_workflow.utils.auxiliar import warn, ToolError, save_json
30from model_workflow.utils.file import File
31from model_workflow.utils.constants import GREY_HEADER, COLOR_END
32from model_workflow.utils.constants import OUTPUT_POCKETS_FILENAME, OUTPUT_POCKET_STRUCTURES_PREFIX
33from model_workflow.utils.type_hints import *
# ANSI terminal escape sequences used to erase previous progress log lines
CURSOR_UP_ONE = '\x1b[1A'
ERASE_LINE = '\x1b[2K'
# Move the cursor up one line and wipe it
ERASE_PREVIOUS_LINE = CURSOR_UP_ONE + ERASE_LINE
# Wipe the 4 previous lines and leave the cursor one line above them
ERASE_4_PREVIOUS_LINES = ERASE_PREVIOUS_LINE * 4 + CURSOR_UP_ONE

# Set some known error logs to check if we are having them
KNOWN_MDPOCKET_ERRORS = {
    'Error in creating clustering tree, return NULL pointer...breaking up'
}
def pockets (
    structure_file : 'File',
    trajectory_file : 'File',
    output_directory : str,
    pbc_selection : 'Selection',
    snapshots : int,
    frames_limit : int = 100,
    # Get only the 10 first pockets since the analysis is quite slow by now
    # DANI: Once we have threading and no size limit to load in mongo we will be able to do more pockets
    maximum_pockets_number : int = 10):
    """Perform the pockets analysis.

    Run MDpocket over a reduced trajectory to produce a pocket frequency grid,
    group connected grid points into pockets, select the biggest ones and run
    MDpocket again over each selected pocket to mine per-frame volumes and the
    atoms implicated in the pocket. Results are exported as a JSON file in the
    output directory.

    Args:
        structure_file: structure (topology) file of the system.
        trajectory_file: trajectory file of the system.
        output_directory: directory where all output files are written.
        pbc_selection: atom selection under periodic boundary conditions.
            If truthy the whole analysis is skipped.
        snapshots: total number of frames in the trajectory.
        frames_limit: maximum number of frames in the reduced trajectory.
        maximum_pockets_number: maximum number of pockets to be analyzed.

    Raises:
        ToolError: if mdpocket fails or logs a known concerning error.
        Exception: if grid dimensions cannot be mined or a per-pocket
            mdpocket run produces no output.
    """

    # DANI: For now, pockets are not run for simulations with residues in PBC (e.g. membrane)
    # DANI: This is because the atoms where we do NOT want pockets to be found cannot be discarded
    # DANI: Discarding them means removing them, but if we remove them then pockets may be found where these atoms were
    # DANI: We are waiting for the mdpocket developers to include a flag for these cases
    # https://github.com/Discngine/fpocket/issues/77#issuecomment-974193129
    if pbc_selection:
        print(' Pockets analysis will be skipped since we have PBC atoms')
        return

    # Set output filepaths
    output_analysis_filepath = f'{output_directory}/{OUTPUT_POCKETS_FILENAME}'

    # Set a reduced trajectory with only 100 frames
    # Get the step between frames of the new reduced trajectory, since it will be appended to the output
    pockets_trajectory, step, frames = get_reduced_trajectory(
        structure_file,
        trajectory_file,
        snapshots,
        frames_limit,
    )
    # Save the pockets trajectory as a file
    pockets_trajectory_file = File(pockets_trajectory)

    # WARNING: There is a silent sharp limit of characters here
    # https://github.com/Discngine/fpocket/issues/130
    # To avoid the problem we must change the directory where we run pockets so all the paths are as short as possible

    # Save the MD path
    md_path, output_directory_name = split(output_directory)
    # Save the current working directory so we can recover it later
    recovery_path = getcwd()

    # Move to the MD path so all relative paths become shorter
    # NOTE(review): if any exception is raised below, the working directory is
    # never restored — consider wrapping the rest of this function in try/finally
    chdir(md_path)

    # Now set the structure and trajectory paths relative to the current new location
    # HARDCODE: We assume the trajectory and structure files are in the current location
    # This is now true, but if this changes in the future this will fail, although the error will be obvious
    auxiliar_trajectory_filepath = f'./{pockets_trajectory_file.filename}'
    auxiliar_structure_filepath = f'./{structure_file.filename}'

    # Run the mdpocket analysis to find new pockets
    mdpocket_output = output_directory_name + '/mdpout'
    # Set the filename of the fpocket output we are interested in
    grid_filename = mdpocket_output + '_freq.dx'
    # Skip this step if the output file already exists and is not empty
    # WARNING: The file is created as soon as mdpocket starts to run
    # WARNING: However the file remains empty until the end of mdpocket
    if not exists(grid_filename) or getsize(grid_filename) == 0:

        print('Searching new pockets')
        print(GREY_HEADER)
        process = run([
            "mdpocket",
            "--trajectory_file",
            # WARNING: There is a silent sharp limit of characters here
            # To avoid the problem we must use the relative path instead of the absolute path
            auxiliar_trajectory_filepath,
            "--trajectory_format",
            "xtc",
            "-f",
            # WARNING: There is a silent sharp limit of characters here
            # To avoid the problem we must use the relative path instead of the absolute path
            auxiliar_structure_filepath,
            "-o",
            mdpocket_output,
        ], stderr=PIPE)
        error_logs = process.stderr.decode()
        print(COLOR_END)

        # If the file does not exist or is still empty at this point then something went wrong
        if not exists(grid_filename) or getsize(grid_filename) == 0:
            print(error_logs)
            raise ToolError('Something went wrong with mdpocket while searching pockets')

        # Check if we are having concerning error logs
        errored = False
        for error in KNOWN_MDPOCKET_ERRORS:
            matches = re.findall(error, error_logs)
            count = len(matches)
            if count > 0:
                warn(f'We are having "{error}" ({count})')
                errored = True

        # If so we can stop here since we will not be able to mine the grid filename
        # The (possibly corrupted) grid file is removed so the next run starts clean
        if errored:
            if exists(grid_filename):
                remove(grid_filename)
            raise ToolError('We had errors with mdpocket while searching pockets')

    # Read and harvest the grid file
    with open(grid_filename, 'r') as file:

        # First, mine the grid dimensions and origin
        dimensions = None
        dimensions_pattern = "counts ([0-9]+) ([0-9]+) ([0-9]+)"
        origin = None
        origin_pattern = "origin ([.0-9-]+) ([.0-9-]+) ([.0-9-]+)"
        for line in file:
            search = re.search(dimensions_pattern, line)
            if search != None:
                dimensions = (int(search.group(1)),int(search.group(2)),int(search.group(3)))
            search = re.search(origin_pattern, line)
            if search != None:
                origin = (float(search.group(1)),float(search.group(2)),float(search.group(3)))
            if origin and dimensions:
                break
        if not dimensions:
            # This may happen when one of the dimensions is negative, which is an mdpocket error
            # DANI: The origin of this is not clear, but when it happened to me it came along with many errors:
            # DANI: '! No Pockets Found while refining' and '! No pocket to reindex.'
            # DANI: As well as some 'Error in creating clustering tree, return NULL pointer...breaking up!'
            raise Exception('Failed to mine dimensions')

        # Next, mine the grid values
        # Note this loop resumes reading the same file handle right after the header break above
        grid_values = []
        grid_values_pattern = "^([.0-9]+) ([.0-9]+) ([.0-9]+) $"
        # The last line of the grid may have less than 3 values
        last_line_grid_values_pattern_1 = "^([.0-9]+) ([.0-9]+) $"
        last_line_grid_values_pattern_2 = "^([.0-9]+) $"
        for line in file:
            search = re.search(grid_values_pattern, line)
            if (search != None):
                grid_values.append(float(search.group(1)))
                grid_values.append(float(search.group(2)))
                grid_values.append(float(search.group(3)))
                continue
            # This should only happen at the end
            search = re.search(last_line_grid_values_pattern_1, line)
            if (search != None):
                grid_values.append(float(search.group(1)))
                grid_values.append(float(search.group(2)))
                continue
            search = re.search(last_line_grid_values_pattern_2, line)
            if (search != None):
                grid_values.append(float(search.group(1)))
                continue

    # Set a function to get the value of a given 'x, y, z' grid point
    # Grid points are disposed first in 'z' order, then 'y', and finally 'x'
    xl, yl, zl = dimensions
    def get_index (x : int, y : int, z : int) -> int:
        index = x*yl*zl + y*zl + z
        return index

    # Get the coordinates of the colliding points
    # Set the x, y and z index limit (i.e. the limit - 1)
    xil, yil, zil = (xl-1, yl-1, zl-1)
    def get_colliding_points (point : tuple) -> list:
        # Return the (up to) 26 grid neighbours of 'point', clipped to the grid bounds
        x, y, z = point[0], point[1], point[2]
        colliding_points = [
            (x-1, y-1, z-1),
            (x-1, y-1, z ),
            (x-1, y-1, z+1),
            (x-1, y , z-1),
            (x-1, y , z ),
            (x-1, y , z+1),
            (x-1, y+1, z-1),
            (x-1, y+1, z ),
            (x-1, y+1, z+1),
            (x , y-1, z-1),
            (x , y-1, z ),
            (x , y-1, z+1),
            (x , y , z-1),
            (x , y , z+1),
            (x , y+1, z-1),
            (x , y+1, z ),
            (x , y+1, z+1),
            (x+1, y-1, z-1),
            (x+1, y-1, z ),
            (x+1, y-1, z+1),
            (x+1, y , z-1),
            (x+1, y , z ),
            (x+1, y , z+1),
            (x+1, y+1, z-1),
            (x+1, y+1, z ),
            (x+1, y+1, z+1),
        ]
        # Discard neighbours which would fall out of the grid on any axis
        if x <= 0:
            colliding_points = [ point for point in colliding_points if point[0] != x-1 ]
        if x >= xil:
            colliding_points = [ point for point in colliding_points if point[0] != x+1 ]
        if y <= 0:
            colliding_points = [ point for point in colliding_points if point[1] != y-1 ]
        if y >= yil:
            colliding_points = [ point for point in colliding_points if point[1] != y+1 ]
        if z <= 0:
            colliding_points = [ point for point in colliding_points if point[2] != z-1 ]
        if z >= zil:
            colliding_points = [ point for point in colliding_points if point[2] != z+1 ]
        return colliding_points

    # Set a cutoff value to consider a point valid
    cuttoff = 0.5

    # Classify all non-zero values by 'pocket' groups according to if they are connected
    # Note this local list shadows the 'pockets' function name from here on
    pockets = [0] * xl * yl * zl
    pockets_count = 0

    # Given a point which belongs to a pocket, find all pocket points connected to it
    # This connection must be wide enough for a water molecule to fit in
    # Grid points are 1 Ångstrom wide and a water molecule has a diameter of 2.75 Ångstroms approx
    # So, pocket points must be connected by a net of points at least 3 points wide in all dimensions (x,y,z)
    # Tag all points with the same pocket number
    def set_pocket (start_point : tuple, pocket_number : int):
        # Breadth-first expansion: 'points' is both the frontier and the visited list,
        # and it is intentionally extended while being iterated
        points = [start_point]
        for point in points:
            # Get current point colliders
            colliding_points = get_colliding_points(point)
            # Filter only the pocket points
            # NOTE(review): the inner loop reuses the name 'point'; harmless here
            # since the outer for re-binds it each iteration, but confusing to read
            pocket_points = []
            for point in colliding_points:
                index = get_index(*point)
                value = grid_values[index]
                # If it is a pocket then set it as part of the current pocket
                if value >= cuttoff:
                    # Tag it as the current pocket
                    pockets[index] = pocket_number
                    pocket_points.append(point)
            # In case all its surrounding points are pockets add the points to the list so they can keep expanding the pocket
            # This is done because if all surrounding points are pockets it means we have a wide enough region of the pocket
            # It is a 3 x 3 x 3 region.
            # Otherwise stop here since the surrounding points may not be wide enough to expand the pocket
            if len(pocket_points) < 26:
                continue
            # Filter points which are not in the pockets list already
            # NOTE(review): 'point not in points' is a linear scan over a growing list
            # (O(n^2) overall) — a companion set would make this lookup O(1)
            new_points = [ point for point in pocket_points if point not in points ]
            # Update the points list
            points += new_points

    # Find out if a point is surrounded by all pocket points so it could be considered a pocket alone
    def is_pocket_base (start_point : tuple) -> bool:
        # Get current point colliders
        colliding_points = get_colliding_points(start_point)
        # Search if these points match the cutoff
        # If only 1 point does not match then this is not a pocket base
        for point in colliding_points:
            index = get_index(*point)
            value = grid_values[index]
            if value < cuttoff:
                return False
        return True

    # Save also each point coordinates
    # Scan the whole grid and seed a new pocket from every untagged, wide-enough point
    for x in range(xl):
        for y in range(yl):
            for z in range(zl):
                index = get_index(x,y,z)
                # If it is not a pocket then pass
                value = grid_values[index]
                if value < cuttoff:
                    continue
                # If it is a pocket but it has been already tagged then pass
                pocket = pockets[index]
                if pocket:
                    continue
                # If it is not wide enough to start a pocket then pass
                point = (x,y,z)
                if not is_pocket_base(point):
                    continue
                # If none of the previous values was marked as a pocket then we set a new pocket number
                pockets_count += 1
                set_pocket(start_point=point, pocket_number=pockets_count)

    # Exclude the first result which will always be 0 and it stands for no-pocket points
    # most_common() yields (pocket_number, point_count) pairs sorted by size
    biggest_pockets = collections.Counter(pockets).most_common()
    if len(biggest_pockets) == 1:
        print(' No pockets were found')
        # Recover the original directory
        chdir(recovery_path)
        return
    biggest_pockets = biggest_pockets[1:]
    pockets_number = len(biggest_pockets)
    # If we have more than the maximum number of pockets then get the first pockets and discard the rest
    if pockets_number > maximum_pockets_number:
        biggest_pockets = biggest_pockets[0:maximum_pockets_number]
        pockets_number = maximum_pockets_number

    # First of all, get all header lines from the original grid file
    # We need them to write grid files further
    with open(grid_filename,'r') as file:
        header_lines = []
        header_pattern = "^[a-z]"
        for line in file:
            if re.match(header_pattern, line):
                header_lines.append(line)
            elif re.match(grid_values_pattern, line):
                break

    # Set the dict where all output data will be stored
    output_analysis = []

    # Set the output pocket structure files prefix
    output_pocket_structure_prefix = f'{output_directory_name}/{OUTPUT_POCKET_STRUCTURES_PREFIX}_'

    # Next, we analyze each selected pocket independently. The steps for each pocket are:
    # 1 - Create a new grid file
    # 2 - Convert the grid to pdb
    # 3 - Analyze this pdb with mdpocket
    # 4 - Harvest the volumes over time and write them in the pockets analysis file
    for p, pock in enumerate(biggest_pockets, 1):
        # WARNING: This name must match the final name of the pocket file once loaded in the database
        pocket_name = 'pocket_' + str(p).zfill(2)
        pocket_output = output_directory_name + '/' + pocket_name
        # Check if current pocket files already exist and are complete. If so, skip this pocket
        # Output files:
        # - pX.dx: it is created and completed at the beginning by this workflow
        # - pX_descriptors.txt: it is created at the beginning and completed along the mdpocket progress
        # - pX_mdpocket_atoms.pdb: it is completed at the beginning but remains size 0 until the end of mdpocket
        # - pX_mdpocket.pdb: it is completed at the beginning but remains size 0 until the end of mdpocket
        # Note that checking pX_mdpocket_atoms.pdb or pX_mdpocket.pdb is enough to know if mdpocket was completed
        checking_filename = pocket_output + '_mdpocket.pdb'
        if not (exists(checking_filename) and getsize(checking_filename) > 0):

            # Update the logs
            print(f' Analyzing pocket {p}/{pockets_number}', end='\r')

            # Create the new grid for this pocket, where all values from other pockets are set to 0
            pocket_value = pock[0]
            new_grid_values = [str(value).ljust(5,'0') if pockets[v] == pocket_value else '0.000' for v, value in enumerate(grid_values)]
            new_grid_filename = output_directory_name + '/' + pocket_name + '.dx'
            with open(new_grid_filename,'w') as file:
                # Write the header lines
                for line in header_lines:
                    file.write(line)
                # Write values in lines of 3 values
                count = 0
                line = ''
                for value in new_grid_values:
                    if count == 0:
                        line = value
                        count += 1
                    elif count == 1:
                        line = line + ' ' + value
                        count += 1
                    elif count == 2:
                        line = line + ' ' + value + ' \n'
                        file.write(line)
                        count = 0

            # Set a function to get the 'x, y, z' coordinates of a given grid point index
            # This is the inverse of get_index above
            def get_coordinates (index : int) -> tuple:
                x, rem = divmod(index, yl*zl)
                y, z = divmod(rem, zl)
                return (x,y,z)

            # Convert the grid coordinates to pdb
            new_pdb_lines = []
            lines_count = 0
            new_pdb_filename = f'{output_pocket_structure_prefix}{str(p).zfill(2)}.pdb'
            for j, pocket in enumerate(pockets):
                if pocket != pocket_value:
                    continue
                x,y,z = get_coordinates(j)
                lines_count += 1
                atom_num = str(lines_count).rjust(6,' ')
                # Grid indices are translated back to absolute coordinates through the grid origin
                x_coordinates = str(round((origin[0] + x) * 1000) / 1000).rjust(8, ' ')
                y_coordinates = str(round((origin[1] + y) * 1000) / 1000).rjust(8, ' ')
                z_coordinates = str(round((origin[2] + z) * 1000) / 1000).rjust(8, ' ')
                line = "ATOM "+ atom_num +" C PTH 1 "+ x_coordinates + y_coordinates + z_coordinates +" 0.00 0.00\n"
                new_pdb_lines.append(line)

            # Write the pdb file
            with open(new_pdb_filename,'w') as file:
                for line in new_pdb_lines:
                    file.write(line)

            # Run the mdpocket analysis focusing in this specific pocket
            print(GREY_HEADER)
            error_logs = run([
                "mdpocket",
                "--trajectory_file",
                auxiliar_trajectory_filepath,
                "--trajectory_format",
                "xtc",
                "-f",
                # WARNING: There is a silent sharp limit of characters here
                # To avoid the problem we must use the relative path instead of the absolute path
                auxiliar_structure_filepath,
                "-o",
                pocket_output,
                "--selected_pocket",
                new_pdb_filename,
            ], stderr=PIPE).stderr.decode()
            print(COLOR_END)

            # If the file does not exist or is still empty at this point then something went wrong
            if not exists(checking_filename) or getsize(checking_filename) == 0:
                print(error_logs)
                raise Exception(f'Something went wrong with mdpocket while analysing pocket {p}')

            # Remove previous lines
            print(ERASE_4_PREVIOUS_LINES)

        # Mine data from the mdpocket 'descriptors' output file
        # First line holds column names; following lines hold one value per column per frame
        descriptors_data = {}
        mdpocket_descriptors_filename = pocket_output + '_descriptors.txt'
        with open(mdpocket_descriptors_filename,'r') as file:
            # The '[:-1]' is to remove the break line at the end of each line
            entries = re.split("[ ]+", next(file)[:-1])
            for entry in entries:
                descriptors_data[entry] = []
            for line in file:
                line_data = re.split("[ ]+", line[:-1])
                for v, value in enumerate(line_data):
                    descriptors_data[entries[v]].append(value)

        # Mine the Voronoi vertices which represent the pocket each frame
        # Mine their positions and radius for each frame
        # Example of lines in the file to be mined:
        # ATOM 0 O STP 105 79.352 75.486 84.079 0.00 4.25
        # ATOM 1 C STP 105 79.735 75.228 84.482 0.00 4.13
        # ATOM 2 C STP 105 79.624 74.251 84.263 0.00 4.52
        # vertices = []
        # vertices_filename = pocket_output + '_mdpocket.pdb'
        # with open(vertices_filename,'r') as file:
        # frame_vertices = []
        # for line in file:
        # line_data = re.split("[ ]+", line[:-1])
        # if line_data[0] == 'ATOM':
        # frame_vertices.append({
        # 'e': line_data[2],
        # 'n': line_data[4],
        # 'x': line_data[5],
        # 'y': line_data[6],
        # 'z': line_data[7],
        # 'r': line_data[9],
        # })
        # if line_data[0] == 'ENDMDL':
        # vertices.append(frame_vertices)
        # frame_vertices = []

        # Mine the atoms implicated in each pocket
        # In this file atoms are listed for each frame, but they are always the same
        # For this reason, we mine only atoms in the first frame
        atoms = []
        atoms_filename = pocket_output + '_mdpocket_atoms.pdb'
        with open(atoms_filename,'r') as file:
            for line in file:
                line_data = re.split("[ ]+", line[:-1])
                if line_data[0] == 'MODEL':
                    continue
                if line_data[0] == 'ATOM':
                    atoms.append(int(line_data[1]))
                if line_data[0] == 'ENDMDL':
                    break

        # Format the mined data and append it to the output data
        # NEVER FORGET: The 'descriptors_data' object contains a lot of data, not only pocket volumes
        # (e.g. druggability score)
        # DANI: We should sit down some day and decide which other values we want to keep
        output = {
            'name': pocket_name,
            'volumes': list(map(float, descriptors_data['pock_volume'])),
            #'vertices': vertices,
            'atoms': atoms,
        }

        output_analysis.append(output)

    # Recover the original directory
    chdir(recovery_path)

    # By default, the starting frame is always 0
    start = 0

    # Export the analysis in json format
    save_json({ 'data': output_analysis, 'start': start, 'step': step }, output_analysis_filepath)