Coverage for mddb_workflow/analyses/sasa.py: 90%

1from mddb_workflow.tools.xvg_parse import xvg_parse

2from mddb_workflow.tools.get_pdb_frames import get_pdb_frames

3from mddb_workflow.utils.auxiliar import save_json

4from mddb_workflow.utils.constants import OUTPUT_SASA_FILENAME

5from mddb_workflow.utils.gmx_spells import run_gromacs

6from mddb_workflow.utils.type_hints import *

8import os

9import numpy

11# This is a residual file produced by the sasa analysis

12# It must be deleted after each

13AREA_FILENAME = 'area.xvg'

15def sasa(

16 structure_file: 'File',

17 trajectory_file: 'File',

18 output_directory: str,

19 structure : 'Structure',

20 pbc_residues : list[int],

21 snapshots : int,

22 frames_limit : int = 100,

23):

24 """Perform the Solvent Accessible Surface Analysis."""

26 # If all residues are to be excluded since the whole system is in PCB then stop here

27 if len(pbc_residues) == len(structure.residues):

28 print(' No residues to run the analysis')

29 return

31 # Set the output filepath

32 output_analysis_filepath = f'{output_directory}/{OUTPUT_SASA_FILENAME}'

34 # For this analysis me must filter out hydrogens

35 heavy_atoms_selection = '( not name "H.*" )'

37 # Execute the atom selection over the structure

38 structure_selection = structure.select(heavy_atoms_selection, syntax='vmd')

40 # At this point the number of residues between the original and the non-hydrogens structure should match

41 # We rely on this, so we must check

42 filtered_structure = structure.filter(structure_selection)

43 filtered_residues_count = len(filtered_structure.residues)

44 original_residues_count = len(structure.residues)

45 if filtered_residues_count != original_residues_count:

46 raise ValueError('The number of residues does not match after filtering out hydrogens')

48 # Convert the structure selection to a ndx file

49 selection_name = 'sasa'

50 ndx_selection = structure_selection.to_ndx(selection_name)

51 ndx_filename = 'indices.ndx'

52 with open(ndx_filename, 'w') as file:

53 file.write(ndx_selection)

55 # We must exclude PBC residues (e.g. membranes) from sasa results

56 # PBC residues close to the boundary will always have unrealistic sasa values

57 # WARNING: These results must be exlucded from the analysis afterwards, but the membrane can not be excluded from the structure

58 # Otherwise, those residues which are covered by the membrane would be exposed to the solvent during the analysis

59 skipped_residue_indices = pbc_residues

61 # Calculate the sasa for each frame

62 sasa_per_frame = []

63 frames, step, count = get_pdb_frames(structure_file.path, trajectory_file.path, snapshots, frames_limit)

64 for f, current_frame in enumerate(frames):

66 # Run the sasa analysis over the current frame

67 # WARNING: We want the SASA per residue, and this could be obtained replacing '-oa' per -'or'

68 # WARNING: However, residues are not enumerated the same in Gromacs and other tools (e.g. pytraj)

69 # For this reason we must always rely on atom numeration, which is the same along different tools

70 current_frame_sasa = f'sasa{f}.xvg'

71 run_gromacs(f'sasa -s {current_frame} -oa {current_frame_sasa} \

72 -n {ndx_filename} -surface 0', expected_output_filepath = current_frame_sasa)

74 # Mine the sasa results (.xvg file)

75 # Hydrogen areas are not recorded in the xvg file

76 sasa = xvg_parse(current_frame_sasa, ['n', 'area', 'sd'])

77 # Restructure data by adding all atoms sas per residue

78 atom_numbers = sasa['n']

79 atom_areas = sasa['area']

80 sas_per_residues = [0.0] * len(structure.residues)

81 for atom_number, atom_area in zip(atom_numbers, atom_areas):

82 atom_index = int(atom_number) - 1

83 atom = structure.atoms[atom_index]

84 residue_index = atom.residue_index

85 sas_per_residues[residue_index] += atom_area

86 sasa_per_frame.append(sas_per_residues)

87 # Delete current frame files before going for the next frame

88 os.remove(current_frame_sasa)

89 os.remove(AREA_FILENAME)

91 # Remove the indices file since it is not required anymore

92 os.remove(ndx_filename)

94 # Format output data

95 # Sasa values must be separated by residue and then ordered by frame

96 saspf = []

97 means = []

98 stdvs = []

99 for r, residue in enumerate(structure.residues):

100 # Skip membrane residues

101 if r in skipped_residue_indices:

102 saspf.append(None)

103 means.append(None)

104 stdvs.append(None)

105 continue

106 # Get the number of atoms in current residue

107 atom_count = len(residue.atoms)

108 # Harvest its sasa along each frame

109 residue_saspf = []

110 for frame in sasa_per_frame:

111 # IMPORTANT: The original SASA value is modified to be normalized

112 # We divide the value by the number of atoms

113 frame_sas = frame[r]

114 normalized_frame_sas = frame_sas / atom_count

115 # To make is standard with the rest of analyses we pass the results from nm² to A²

116 standard_frame_sas = normalized_frame_sas * 100

117 residue_saspf.append(standard_frame_sas)

118 # Add current resiude sas per frame to the overall list

119 saspf.append(residue_saspf)

120 # Calculate the mean and standard deviation of the residue sasa values

121 mean = numpy.mean(residue_saspf)

122 means.append(mean)

123 stdv = numpy.std(residue_saspf)

124 stdvs.append(stdv)

125 # Set the content of the analysis output file

126 output = {

127 'step': step,

128 'saspf': saspf,

129 'means': means,

130 'stdvs': stdvs,

131 }

132

133 # Export the analysis in json format

134 save_json(output, output_analysis_filepath)