Coverage for model_workflow/analyses/hydrogen_bonds.py: 84%
108 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
1# Hydrogen bonds analysis
3# The pytraj 'hbond' analysis finds possible hydrogen bonds interactions between closer atoms and
4# calculates the percent of frames where the hydrogen bond would happen according to the distance
5# between two atoms and the bond angle.
6#
7# Pytraj analysis will return the residue number, residue type and atom name of each pair of bonded
8# atoms. WARNING: In irregular pdb topologies, there may be atoms with identical residue number,
9# residue type and atom name. This makes impossible to know the real resulted atom. For this reason
10# topology has been corrected previously and duplicated atoms have been renamed
12import pytraj as pt
13import re
14from math import ceil
15from os.path import exists
17from model_workflow.utils.pyt_spells import get_pytraj_trajectory
18from model_workflow.utils.auxiliar import save_json, numerate_filename, get_analysis_name, reprint
19from model_workflow.utils.constants import OUTPUT_HBONDS_FILENAME
20from model_workflow.utils.type_hints import *
22# WARNING: the output file size depends on the number of hydrogen bonds
23# WARNING: analyses must be no heavier than 16Mb in BSON format
24# WARNING: In case of large surface interaction the output analysis may be larger than the limit
25def hydrogen_bonds (
26 structure_file : 'File',
27 trajectory_file : 'File',
28 output_directory : str,
29 populations : Optional[List[float]],
30 structure : 'Structure',
31 interactions : list,
32 is_time_dependent : bool,
33 # Number of splits along the trajectory
34 time_splits : int = 100,
35 # Explicit values for the most populated frames is saved apart
36 # Set how many frames (in order of population) are saved
37 most_populated_frames_number : int = 5,
38 ):
39 """Perform an hydrogen bonds analysis for each interaction interface.
40 The 'interactions' input may be an empty list (i.e. there are no interactions).
41 In case there are no interactions the analysis stops.
42 Note that this analysis find hydrogen bonds in a subset of frames along the trajectory.
43 Storing the results for the whole trajectory is not possible due to storage limits."""
44 # Return before doing anything if there are no interactions
45 if not interactions or len(interactions) == 0:
46 print('No interactions were specified')
47 return
49 # Set the main output filepath
50 output_analysis_filepath = f'{output_directory}/{OUTPUT_HBONDS_FILENAME}'
52 # Set a reference system to handle conversions to pytraj topology
53 # First set the pytraj topology
54 pytraj_topology = structure.get_pytraj_topology()
55 pytraj_residues = list(pytraj_topology.residues)
57 # Transform a pytraj residue numeration (1-based) and an atom name to a 0-based atom index
58 # Double check the residue is the one we expect to find and, if not, use another strategy
59 # Remember that sometimes pytraj may parse residues differently for chaotic topologies
60 def pytraj_residue_number_2_residue (pytraj_residue_index : int) -> 'Residue':
61 residue_index = pytraj_residue_index - 1
62 pytraj_residue = pytraj_residues[residue_index]
63 expected_number = pytraj_residue.original_resid
64 expected_name = pytraj_residue.name
65 # In the canonical way this index is equivalent to the structure resiude index
66 if residue_index < len(structure.residues):
67 residue = structure.residues[residue_index]
68 if residue.number == expected_number and residue.name[0:3] == expected_name:
69 return residue
70 # Pytraj index may not match the structure index in caotic topologies
71 # (i.e. when heavy atoms and hydrogen are listed independently)
72 # When this happens we can try to find the residue by comparing resnum and resname
73 # WARNING: Note that this alternative method is nos sensitive to chains or icodes
74 # WARNING: This is because pytraj does not deal with chains or icodes
75 for residue in structure.residues:
76 if residue.number == expected_number and residue.name[0:3] == expected_name:
77 return residue
78 # Return None if there are no results
79 return None
81 # Parse the trajectory intro ptraj
82 # Reduce it in case it exceeds the frames limit
83 pt_trajectory = get_pytraj_trajectory(structure_file.path, trajectory_file.path)
85 # Save each analysis to a dict which will be parsed to json
86 output_summary = []
88 # Iterate interactions
89 print()
90 for i, interaction in enumerate(interactions):
92 # If the interaction has coarse grain atoms then just skip it
93 # Note that this analysis makes not sense if the interaction is not atomistic
94 # FUN FACT: If tried, pytraj fails so spectacularly it is is not even able to report the error
95 # RuntimeError: Failed to setup action. Use pytraj._verbose() to turn on the error report.
96 if interaction.get('has_cg', False): continue
98 # Get the interaction name
99 name = interaction['name']
100 reprint(f' Processing {name} ({i+1}/{len(interactions)})')
101 # Set a filename for the current interaction data
102 numbered_output_analysis_filepath = numerate_filename(output_analysis_filepath, i)
104 # Add the root of the output analysis filename to the run data
105 analysis_name = get_analysis_name(numbered_output_analysis_filepath)
106 # Append current interaction to the summary
107 analysis_entry = { 'name': name, 'analysis': analysis_name }
108 output_summary.append(analysis_entry)
110 # If the analysis already exists then proceed to the next interaction
111 if exists(numbered_output_analysis_filepath): continue
113 # Get interface atom indices
114 interface_atom_indices_1 = interaction['interface_atom_indices_1']
115 interface_atom_indices_2 = interaction['interface_atom_indices_2']
116 # Select all interface residues in pytraj notation
117 interface_atom_indices = interface_atom_indices_1 + interface_atom_indices_2
118 if len(interface_atom_indices) == 0:
119 raise ValueError(f'There are no interface atoms for interaction "{name}"')
120 interface_selection = structure.select_atom_indices(interface_atom_indices)
121 interface_pt_mask = interface_selection.to_pytraj()
123 # Run the analysis
124 hbonds = pt.hbond(pt_trajectory, mask=interface_pt_mask)
126 # Save appart hbond 'old keys' which are not accessible for each hb individually
127 # WARNING: These keys are internal pytraj keys but they are crucial
128 # e.g. current key: 'CYS44_O-THR25_OG1-HG1', old key: 'CYS_44@O-THR_25@OG1-HG1'
129 # WARNING: The fact that residue name and residue number is separated by '_' is critical
130 # Some topologies have residue names with different length (2 or 3 characters)
131 # Some topologies have residue names which end in numeric characters (e.g. 'P1', 'P2', etc.)
132 # For this reason we can not use current keys where residue name and number are merged
133 # This makes impossible to know what is the name and what is the number of the residue sometimes
134 hbond_keys = hbonds._old_keys
136 # Get the number of snapshots from one analysis result
137 snapshots = len(hbonds[0].values)
138 # Calculate in how many frames we must split every time slot
139 step = ceil(snapshots / time_splits)
140 # Calculate how many time slots we will have at the end
141 # Note that there is a residual number of frames at the end which is discarded
142 nsteps = ceil(snapshots / step)
143 # In case we have populations
144 if populations:
145 # Set a function to calculate the population-weighted value of a given frame
146 def get_populated_value (hbond_value : float, frame : int) -> float:
147 population = populations[frame]
148 multiplier = population * snapshots
149 return hbond_value * multiplier
150 # Get the most populated frames numbers
151 population_per_frames = [ (population, frame) for frame, population in enumerate(populations) ]
152 population_sorted_frames = [frame for population, frame in sorted(population_per_frames, reverse=True)]
153 most_populated_frames = population_sorted_frames[0:most_populated_frames_number]
155 # Save all atom indices for the final output
156 acceptor_atom_index_list = []
157 donor_atom_index_list = []
158 hydrogen_atom_index_list = []
159 hbond_overall = []
160 hbond_timed = []
161 hbond_framed = []
163 # Search all predicted hydrogen bonds
164 for i, hb in enumerate(hbonds):
165 key = hbond_keys[i]
166 # hb.key example: "ASN15_OD1-LEU373_N-H"
167 # matchObj = re.match( r'\w{3}(\d*)_(.*)-\w{3}(\d*)_(.*)-(.*)', hb.key, re.M|re.I)
168 # hb.key example: "ASN_15@OD1-LEU_373@N-H"
169 matchObj = re.match( r'\w*_(\d*)@(.*)-\w*_(\d*)@(.*)-(.*)', key, re.M|re.I)
170 if matchObj == None: continue
171 # Mine all data from the parsed results
172 acceptor_resnum = int(matchObj.group(1))
173 acceptor_atom_name = matchObj.group(2)
174 donor_resnum = int(matchObj.group(3))
175 donor_atom_name = matchObj.group(4)
176 hydrogen_atom_name = matchObj.group(5)
177 # Get the acceptor and donor parsed atoms
178 acceptor_residue = pytraj_residue_number_2_residue(acceptor_resnum)
179 acceptor_atom = acceptor_residue.get_atom_by_name(acceptor_atom_name)
180 donor_residue = pytraj_residue_number_2_residue(donor_resnum)
181 donor_atom = donor_residue.get_atom_by_name(donor_atom_name)
182 hydrogen_atom = donor_residue.get_atom_by_name(hydrogen_atom_name)
183 # WARNING: The analysis may return hydrogen bonds between residues from the same agent
184 # Accept the hydrogen bond only if its residues belong to different interaction agents
185 if acceptor_atom.index in interface_atom_indices_1 and donor_atom.index in interface_atom_indices_1: continue
186 if acceptor_atom.index in interface_atom_indices_2 and donor_atom.index in interface_atom_indices_2: continue
187 # Get the absolute index of each atom
188 acceptor_atom_index_list.append(acceptor_atom.index)
189 donor_atom_index_list.append(donor_atom.index)
190 hydrogen_atom_index_list.append(hydrogen_atom.index)
191 # List 'hb.values' because it is a ndarray, which is not JSON serializable
192 hbond_values = list(map(int, hb.values))
193 # In case we have populations
194 if populations:
195 # Save if there is or not hydrogen bond for the most populated frames
196 framed_hbonds = { frame: bool(hbond_values[frame]) for frame in most_populated_frames }
197 hbond_framed.append(framed_hbonds)
198 # Weigth hbond values according to populations
199 hbond_values = [ get_populated_value(hbond_value, frame) for frame, hbond_value in enumerate(hbond_values) ]
200 # Get the overall percent of frames where the hydrogen bond exists
201 overall_percent = sum(hbond_values) / snapshots
202 hbond_overall.append(overall_percent)
203 # The last part is done only when the simulation is time dependent
204 if not is_time_dependent: continue
205 # Now split the whole trajectory in time slots and calculate hbonds percents for every slot
206 temporal_percents = []
207 for i in range(nsteps):
208 slot_hbond_values = hbond_values[i*step:(i+1)*step]
209 temporal_percent = sum(slot_hbond_values) / step
210 temporal_percents.append(temporal_percent)
211 hbond_timed.append(temporal_percents)
213 # If no hydrogen bonds were found then do not append the analysis
214 if len(acceptor_atom_index_list) == 0:
215 output_summary.pop()
216 continue
218 # Set the interaction output
219 interaction_data = {
220 'name': interaction['name'],
221 'acceptors': acceptor_atom_index_list,
222 'donors': donor_atom_index_list,
223 'hydrogens': hydrogen_atom_index_list,
224 'hbonds': hbond_overall,
225 'version': '1.0.0'
226 }
227 if is_time_dependent:
228 interaction_data['hbonds_timed'] = hbond_timed
229 # DANI: Esto el cliente aún no lo soporta y me he quedado sin tiempo
230 # DANI: De momento todo lo que sea ensemble que tire de mostar el overall y listo
231 # if populations:
232 # interaction_data['hbonds_framed'] = hbond_framed
234 # Write the interaction analysis output to a file
235 save_json(interaction_data, numbered_output_analysis_filepath)
237 # Export the analysis in json format
238 if len(output_summary) > 0:
239 save_json(output_summary, output_analysis_filepath)