Coverage for mddb_workflow / analyses / hydrogen_bonds.py: 84%

108 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-03 18:45 +0000

1# Hydrogen bonds analysis 

2 

3# The pytraj 'hbond' analysis finds possible hydrogen bonds interactions between closer atoms and 

4# calculates the percent of frames where the hydrogen bond would happen according to the distance 

5# between two atoms and the bond angle. 

6# 

7# Pytraj analysis will return the residue number, residue type and atom name of each pair of bonded 

8# atoms. WARNING: In irregular pdb topologies, there may be atoms with identical residue number, 

9# residue type and atom name. This makes impossible to know the real resulted atom. For this reason 

10# topology has been corrected previously and duplicated atoms have been renamed 

11 

12import pytraj as pt 

13import re 

14from math import ceil 

15from os.path import exists 

16 

17from mddb_workflow.utils.pyt_spells import get_pytraj_trajectory 

18from mddb_workflow.utils.auxiliar import save_json, numerate_filename, get_analysis_name, reprint 

19from mddb_workflow.utils.constants import OUTPUT_HBONDS_FILENAME 

20from mddb_workflow.utils.type_hints import * 

21 

22 

23# WARNING: the output file size depends on the number of hydrogen bonds 

24# WARNING: analyses must be no heavier than 16Mb in BSON format 

25# WARNING: In case of large surface interaction the output analysis may be larger than the limit 

26def hydrogen_bonds( 

27 structure_file: 'File', 

28 trajectory_file: 'File', 

29 output_directory: str, 

30 populations: Optional[list[float]], 

31 structure: 'Structure', 

32 interactions: list, 

33 is_time_dependent: bool, 

34 # Number of splits along the trajectory 

35 time_splits: int = 100, 

36 # Explicit values for the most populated frames is saved apart 

37 # Set how many frames (in order of population) are saved 

38 most_populated_frames_number: int = 5, 

39): 

40 """Perform an hydrogen bonds analysis for each interaction interface. 

41 The 'interactions' input may be an empty list (i.e. there are no interactions). 

42 In case there are no interactions the analysis stops. 

43 Note that this analysis find hydrogen bonds in a subset of frames along the trajectory. 

44 Storing the results for the whole trajectory is not possible due to storage limits. 

45 """ 

46 # Return before doing anything if there are no interactions 

47 if not interactions or len(interactions) == 0: 

48 print('No interactions were specified') 

49 return 

50 

51 # Set the main output filepath 

52 output_analysis_filepath = f'{output_directory}/{OUTPUT_HBONDS_FILENAME}' 

53 

54 # Set a reference system to handle conversions to pytraj topology 

55 # First set the pytraj topology 

56 pytraj_topology = structure.get_pytraj_topology() 

57 pytraj_residues = list(pytraj_topology.residues) 

58 

59 # Transform a pytraj residue numeration (1-based) and an atom name to a 0-based atom index 

60 # Double check the residue is the one we expect to find and, if not, use another strategy 

61 # Remember that sometimes pytraj may parse residues differently for chaotic topologies 

62 def pytraj_residue_number_2_residue(pytraj_residue_index: int) -> 'Residue': 

63 residue_index = pytraj_residue_index - 1 

64 pytraj_residue = pytraj_residues[residue_index] 

65 expected_number = pytraj_residue.original_resid 

66 expected_name = pytraj_residue.name 

67 # In the canonical way this index is equivalent to the structure resiude index 

68 if residue_index < len(structure.residues): 

69 residue = structure.residues[residue_index] 

70 if residue.number == expected_number and residue.name[0:3] == expected_name: 

71 return residue 

72 # Pytraj index may not match the structure index in caotic topologies 

73 # (i.e. when heavy atoms and hydrogen are listed independently) 

74 # When this happens we can try to find the residue by comparing resnum and resname 

75 # WARNING: Note that this alternative method is nos sensitive to chains or icodes 

76 # WARNING: This is because pytraj does not deal with chains or icodes 

77 for residue in structure.residues: 

78 if residue.number == expected_number and residue.name[0:3] == expected_name: 

79 return residue 

80 # Return None if there are no results 

81 return None 

82 

83 # Parse the trajectory intro ptraj 

84 # Reduce it in case it exceeds the frames limit 

85 pt_trajectory = get_pytraj_trajectory(structure_file.path, trajectory_file.path) 

86 

87 # Save each analysis to a dict which will be parsed to json 

88 output_summary = [] 

89 

90 # Iterate interactions 

91 print() 

92 for i, interaction in enumerate(interactions): 

93 

94 # If the interaction has coarse grain atoms then just skip it 

95 # Note that this analysis makes not sense if the interaction is not atomistic 

96 # FUN FACT: If tried, pytraj fails so spectacularly it is is not even able to report the error 

97 # RuntimeError: Failed to setup action. Use pytraj._verbose() to turn on the error report. 

98 if interaction.get('has_cg', False): continue 

99 

100 # Get the interaction name 

101 name = interaction['name'] 

102 reprint(f' Processing {name} ({i+1}/{len(interactions)})') 

103 # Set a filename for the current interaction data 

104 numbered_output_analysis_filepath = numerate_filename(output_analysis_filepath, i) 

105 

106 # Add the root of the output analysis filename to the run data 

107 analysis_name = get_analysis_name(numbered_output_analysis_filepath) 

108 # Append current interaction to the summary 

109 analysis_entry = {'name': name, 'analysis': analysis_name} 

110 output_summary.append(analysis_entry) 

111 

112 # If the analysis already exists then proceed to the next interaction 

113 if exists(numbered_output_analysis_filepath): continue 

114 

115 # Get interface atom indices 

116 interface_atom_indices_1 = interaction['interface_atom_indices_1'] 

117 interface_atom_indices_2 = interaction['interface_atom_indices_2'] 

118 # Select all interface residues in pytraj notation 

119 interface_atom_indices = interface_atom_indices_1 + interface_atom_indices_2 

120 if len(interface_atom_indices) == 0: 

121 raise ValueError(f'There are no interface atoms for interaction "{name}"') 

122 interface_selection = structure.select_atom_indices(interface_atom_indices) 

123 interface_pt_mask = interface_selection.to_pytraj() 

124 

125 # Run the analysis 

126 hbonds = pt.hbond(pt_trajectory, mask=interface_pt_mask) 

127 

128 # Save appart hbond 'old keys' which are not accessible for each hb individually 

129 # WARNING: These keys are internal pytraj keys but they are crucial 

130 # e.g. current key: 'CYS44_O-THR25_OG1-HG1', old key: 'CYS_44@O-THR_25@OG1-HG1' 

131 # WARNING: The fact that residue name and residue number is separated by '_' is critical 

132 # Some topologies have residue names with different length (2 or 3 characters) 

133 # Some topologies have residue names which end in numeric characters (e.g. 'P1', 'P2', etc.) 

134 # For this reason we can not use current keys where residue name and number are merged 

135 # This makes impossible to know what is the name and what is the number of the residue sometimes 

136 hbond_keys = hbonds._old_keys 

137 

138 # Get the number of snapshots from one analysis result 

139 snapshots = len(hbonds[0].values) 

140 # Calculate in how many frames we must split every time slot 

141 step = ceil(snapshots / time_splits) 

142 # Calculate how many time slots we will have at the end 

143 # Note that there is a residual number of frames at the end which is discarded 

144 nsteps = ceil(snapshots / step) 

145 # In case we have populations 

146 if populations: 

147 # Set a function to calculate the population-weighted value of a given frame 

148 def get_populated_value(hbond_value: float, frame: int) -> float: 

149 population = populations[frame] 

150 multiplier = population * snapshots 

151 return hbond_value * multiplier 

152 # Get the most populated frames numbers 

153 population_per_frames = [(population, frame) for frame, population in enumerate(populations)] 

154 population_sorted_frames = [frame for population, frame in sorted(population_per_frames, reverse=True)] 

155 most_populated_frames = population_sorted_frames[0:most_populated_frames_number] 

156 

157 # Save all atom indices for the final output 

158 acceptor_atom_index_list = [] 

159 donor_atom_index_list = [] 

160 hydrogen_atom_index_list = [] 

161 hbond_overall = [] 

162 hbond_timed = [] 

163 hbond_framed = [] 

164 

165 # Search all predicted hydrogen bonds 

166 for i, hb in enumerate(hbonds): 

167 key = hbond_keys[i] 

168 # hb.key example: "ASN15_OD1-LEU373_N-H" 

169 # matchObj = re.match( r'\w{3}(\d*)_(.*)-\w{3}(\d*)_(.*)-(.*)', hb.key, re.M|re.I) 

170 # hb.key example: "ASN_15@OD1-LEU_373@N-H" 

171 matchObj = re.match(r'\w*_(\d*)@(.*)-\w*_(\d*)@(.*)-(.*)', key, re.M|re.I) 

172 if matchObj is None: continue 

173 # Mine all data from the parsed results 

174 acceptor_resnum = int(matchObj.group(1)) 

175 acceptor_atom_name = matchObj.group(2) 

176 donor_resnum = int(matchObj.group(3)) 

177 donor_atom_name = matchObj.group(4) 

178 hydrogen_atom_name = matchObj.group(5) 

179 # Get the acceptor and donor parsed atoms 

180 acceptor_residue = pytraj_residue_number_2_residue(acceptor_resnum) 

181 acceptor_atom = acceptor_residue.get_atom_by_name(acceptor_atom_name) 

182 donor_residue = pytraj_residue_number_2_residue(donor_resnum) 

183 donor_atom = donor_residue.get_atom_by_name(donor_atom_name) 

184 hydrogen_atom = donor_residue.get_atom_by_name(hydrogen_atom_name) 

185 # WARNING: The analysis may return hydrogen bonds between residues from the same agent 

186 # Accept the hydrogen bond only if its residues belong to different interaction agents 

187 if acceptor_atom.index in interface_atom_indices_1 and donor_atom.index in interface_atom_indices_1: continue 

188 if acceptor_atom.index in interface_atom_indices_2 and donor_atom.index in interface_atom_indices_2: continue 

189 # Get the absolute index of each atom 

190 acceptor_atom_index_list.append(acceptor_atom.index) 

191 donor_atom_index_list.append(donor_atom.index) 

192 hydrogen_atom_index_list.append(hydrogen_atom.index) 

193 # List 'hb.values' because it is a ndarray, which is not JSON serializable 

194 hbond_values = list(map(int, hb.values)) 

195 # In case we have populations 

196 if populations: 

197 # Save if there is or not hydrogen bond for the most populated frames 

198 framed_hbonds = {frame: bool(hbond_values[frame]) for frame in most_populated_frames} 

199 hbond_framed.append(framed_hbonds) 

200 # Weigth hbond values according to populations 

201 hbond_values = [get_populated_value(hbond_value, frame) for frame, hbond_value in enumerate(hbond_values)] 

202 # Get the overall percent of frames where the hydrogen bond exists 

203 overall_percent = sum(hbond_values) / snapshots 

204 hbond_overall.append(overall_percent) 

205 # The last part is done only when the simulation is time dependent 

206 if not is_time_dependent: continue 

207 # Now split the whole trajectory in time slots and calculate hbonds percents for every slot 

208 temporal_percents = [] 

209 for i in range(nsteps): 

210 slot_hbond_values = hbond_values[i*step:(i+1)*step] 

211 temporal_percent = sum(slot_hbond_values) / step 

212 temporal_percents.append(temporal_percent) 

213 hbond_timed.append(temporal_percents) 

214 

215 # If no hydrogen bonds were found then do not append the analysis 

216 if len(acceptor_atom_index_list) == 0: 

217 output_summary.pop() 

218 continue 

219 

220 # Set the interaction output 

221 interaction_data = { 

222 'name': interaction['name'], 

223 'acceptors': acceptor_atom_index_list, 

224 'donors': donor_atom_index_list, 

225 'hydrogens': hydrogen_atom_index_list, 

226 'hbonds': hbond_overall, 

227 'version': '1.0.0' 

228 } 

229 if is_time_dependent: 

230 interaction_data['hbonds_timed'] = hbond_timed 

231 # DANI: Esto el cliente aún no lo soporta y me he quedado sin tiempo 

232 # DANI: De momento todo lo que sea ensemble que tire de mostar el overall y listo 

233 # if populations: 

234 # interaction_data['hbonds_framed'] = hbond_framed 

235 

236 # Write the interaction analysis output to a file 

237 save_json(interaction_data, numbered_output_analysis_filepath) 

238 

239 # Export the analysis in json format 

240 if len(output_summary) > 0: 

241 save_json(output_summary, output_analysis_filepath)