Coverage for mddb_workflow/tools/generate_metadata.py

1from mddb_workflow.tools.get_box_size import get_box_size

2from mddb_workflow.tools.get_atoms_count import get_atoms_count

3from mddb_workflow.tools.generate_map import get_sequence_metadata

4from mddb_workflow.utils.auxiliar import InputError, save_json

5from mddb_workflow.utils.constants import MD_DIRECTORY

6from mddb_workflow.utils.type_hints import *

# Keys kept from every processed interaction when it is written to the project metadata:
# the input interaction fields plus the interaction type
METADATA_INTERACTION_FIELDS = {
    "name",
    "agent_1",
    "agent_2",
    "selection_1",
    "selection_2",
    "type",
}

def prepare_project_metadata (
    structure_file : 'File',
    trajectory_file : 'File',
    output_filepath : str,
    structure : 'Structure',
    residue_map : dict,
    protein_references_file : 'File',
    pdb_ids : list[str],
    ligand_map : dict,
    input_protein_references : list[str] | dict,
    input_ligands : list[dict],
    interactions : list[dict],
    warnings : dict,
    # Set all inputs to be loaded as they are
    input_force_fields : list[str],
    input_collections : list[str],
    input_chain_names : dict,
    input_type : str,
    input_framestep : float,
    input_name : str,
    input_description : str,
    input_authors : list[str],
    input_groups : list[str],
    input_contact : str,
    input_program : str,
    input_version : str,
    input_method : str,
    input_license : str,
    input_linkcense : str,
    input_citation : str,
    input_thanks : str,
    input_links : list[dict],
    input_timestep : float,
    input_temperature : float,
    input_ensemble : str,
    input_water : str,
    input_boxtype : str,
    input_pbc_selection : str,
    input_cg_selection : str,
    input_customs : list[dict],
    input_orientation : list[float],
    input_multimeric : list[str],
    # Additional topic-specific inputs
    input_cv19_unit : str,
    input_cv19_startconf : str,
    input_cv19_abs : bool,
    input_cv19_nanobs : bool,
    ):
    """ Prepare a JSON file with all project metadata.

    Most 'input_*' arguments are written to the metadata as they are.
    The remaining values are derived here from the structure, the trajectory,
    the residue map and the ligand map. The result is saved to 'output_filepath'.

    Notes on a few arguments:
    - residue_map: must contain the 'references' and 'reference_types' keys.
    - ligand_map: iterated as a sequence of dicts with a 'name' key and an optional 'forced_name' key.
      NOTE(review): the 'dict' annotation does not match this usage, confirm against the caller.
    - input_force_fields: a single string is also accepted and wrapped in a list.
    - input_chain_names: mapping whose keys are chain names; it may be empty or None.
    - input_links: list of dicts with 'name' and 'url'; a single dict is also accepted and wrapped in a list.

    Raises:
        InputError: if a chain in 'input_chain_names' does not exist in the structure
            or if any link is not a dict including both 'name' and 'url'.
    """

    # Find out the box size (x, y and z)
    (boxsizex, boxsizey, boxsizez) = get_box_size(
        structure_file.path, trajectory_file.path)

    # Count different types of atoms and residues
    (system_atoms, system_residues, protein_atoms, protein_residues,
    nucleic_atoms, nucleic_residues, lipid_atoms, lipid_residues,
    carbohydrates_atoms, carbohydrates_residues, solvent_atoms, solvent_residues,
    counter_cations, counter_anions, counter_ions) = get_atoms_count(structure)

    # Split the residue map references according to their type:
    # protein references, ligand references and InChI key references
    protein_references = []
    ligand_references = []
    inchikey_references = []
    references = residue_map['references']
    if references:
        for ref, ref_type in zip(references, residue_map['reference_types']):
            if ref_type == 'protein':
                protein_references.append(ref)
            elif ref_type == 'ligand':
                ligand_references.append(ref)
            elif ref_type == 'inchikey':
                inchikey_references.append(ref)

    # Get forced ligand names, if any
    # The field is set to None when no ligand has a forced name
    forced_ligand_names = {
        lig['name']: lig['forced_name'] for lig in ligand_map if lig.get('forced_name', False) }
    if not forced_ligand_names:
        forced_ligand_names = None

    # Make the forcefields a list in case it is a single string
    forcefields = input_force_fields
    if isinstance(forcefields, str):
        forcefields = [forcefields]

    # Collections must be an empty list in case there are no collections
    collections = input_collections
    if not collections:
        collections = []

    # Get additional metadata related to the aminoacids sequence
    sequence_metadata = get_sequence_metadata(structure, protein_references_file, residue_map)

    # Find the PTMs
    # Save only their names for now
    # DANI: This is temporary and for now it only aims to be an easy-to-query parameter
    # DANI: Once it is more mature we will also store the affected residue, at least
    ptms = structure.find_ptms()
    ptm_names = list({ ptm['name'] for ptm in ptms })

    # Check chainnames to actually exist in the structure
    structure_chains = { chain.name for chain in structure.chains }
    chainnames = input_chain_names
    if chainnames:
        for chain in chainnames.keys():
            if chain not in structure_chains:
                raise InputError(f'Chain {chain} from chainnames does not exist in the structure')

    # Get the MD type
    md_type = input_type
    # The framestep is not set when the MD type is 'ensemble'
    framestep = None if md_type == 'ensemble' else input_framestep

    # Metadata interactions are input interactions and the interaction types combined
    # Thus we take the processed interactions and remove the fields we are not interested in
    metadata_interactions = []
    if interactions is not None:
        for interaction in interactions:
            metadata_interaction = { k: v for k, v in interaction.items() if k in METADATA_INTERACTION_FIELDS }
            metadata_interactions.append(metadata_interaction)

    # Make sure links are correct
    # A single link object is wrapped in a list
    # Note that both the validation and the metadata must use the normalized list, not the raw input
    links = input_links
    if links is not None:
        if not isinstance(links, list):
            links = [ links ]
        for link in links:
            if not isinstance(link, dict):
                raise InputError('Links must be a list of objects')
            if link.get('name', None) is None:
                raise InputError('Links must have a name')
            if link.get('url', None) is None:
                raise InputError('Links must have a URL')

    # Write the metadata file
    # Metadata keys must be in CAPS, as they are in the client
    metadata = {
        'NAME': input_name,
        'DESCRIPTION': input_description,
        'AUTHORS': input_authors,
        'GROUPS': input_groups,
        'CONTACT': input_contact,
        'PROGRAM': input_program,
        'VERSION': input_version,
        'TYPE': md_type,
        'METHOD': input_method,
        'LICENSE': input_license,
        'LINKCENSE': input_linkcense,
        'CITATION': input_citation,
        'THANKS': input_thanks,
        'LINKS': links,
        'PDBIDS': pdb_ids,
        'FORCED_REFERENCES': input_protein_references,
        'REFERENCES': protein_references,
        'INPUT_LIGANDS': input_ligands,
        'LIGANDS': ligand_references,
        'LIGANDNAMES': forced_ligand_names,
        'INCHIKEYS': inchikey_references,
        'PROTSEQ': sequence_metadata['protein_sequences'],
        'NUCLSEQ': sequence_metadata['nucleic_sequences'],
        'DOMAINS': sequence_metadata['domains'],
        'FRAMESTEP': framestep,
        'TIMESTEP': input_timestep,
        'TEMP': input_temperature,
        'ENSEMBLE': input_ensemble,
        'FF': forcefields,
        'WAT': input_water,
        'BOXTYPE': input_boxtype,
        'SYSTATS': system_atoms,
        'SYSTRES': system_residues,
        'PROTATS': protein_atoms,
        'PROTRES': protein_residues,
        'NUCLATS': nucleic_atoms,
        'NUCLRES': nucleic_residues,
        'LIPIATS': lipid_atoms,
        'LIPIRES': lipid_residues,
        'CARBATS': carbohydrates_atoms,
        'CARBRES': carbohydrates_residues,
        'SOLVATS': solvent_atoms,
        'SOLVRES': solvent_residues,
        'COUNCAT': counter_cations,
        'COUNANI': counter_anions,
        'COUNION': counter_ions,
        'INTERACTIONS': metadata_interactions,
        'PBC_SELECTION': input_pbc_selection,
        'CG_SELECTION': input_cg_selection,
        'CHAINNAMES': chainnames,
        'CUSTOMS': input_customs,
        'ORIENTATION': input_orientation,
        'PTM': ptm_names,
        'MULTIMERIC' : input_multimeric,
        'COLLECTIONS': collections,
        'WARNINGS': warnings,
    }
    # Add box sizes only if all of them are greater than 0
    if boxsizex > 0 and boxsizey > 0 and boxsizez > 0:
        metadata['BOXSIZEX'] = boxsizex
        metadata['BOXSIZEY'] = boxsizey
        metadata['BOXSIZEZ'] = boxsizez
    # Add collection specific fields
    # Every field is added only when it has a value
    if 'cv19' in collections:
        cv19_unit = input_cv19_unit
        cv19_startconf = input_cv19_startconf
        cv19_abs = input_cv19_abs
        cv19_nanobs = input_cv19_nanobs
        cv19_variant = sequence_metadata['cv19_variant']

        if cv19_unit is not None:
            metadata['CV19_UNIT'] = cv19_unit

        if cv19_startconf is not None:
            metadata['CV19_STARTCONF'] = cv19_startconf

        if cv19_abs is not None:
            metadata['CV19_ABS'] = cv19_abs

        if cv19_nanobs is not None:
            metadata['CV19_NANOBS'] = cv19_nanobs

        if cv19_variant is not None:
            metadata['CV19_VARIANT'] = cv19_variant

    # Write metadata to a file
    save_json(metadata, output_filepath)

232

# Project metadata field names (in CAPS) which may be inherited by MD metadata
metadata_fields = {
    'NAME', 'DESCRIPTION', 'AUTHORS', 'GROUPS', 'CONTACT', 'PROGRAM', 'VERSION',
    'TYPE', 'METHOD', 'LICENSE', 'LINKCENSE', 'CITATION', 'THANKS', 'LINKS',
    'PDBIDS', 'FORCED_REFERENCES', 'REFERENCES', 'INPUT_LIGANDS', 'LIGANDS',
    'LIGANDNAMES', 'PROTSEQ', 'NUCLSEQ', 'DOMAINS', 'FRAMESTEP', 'TIMESTEP',
    'TEMP', 'ENSEMBLE', 'FF', 'WAT', 'BOXTYPE', 'SYSTATS', 'PROTATS', 'PROT',
    'DPPC', 'SOL', 'NA', 'CL', 'INTERACTIONS', 'PBC_SELECTION', 'CHAINNAMES',
    'MEMBRANES', 'CUSTOMS', 'ORIENTATION', 'PTM', 'MULTIMERIC', 'COLLECTIONS',
    'WARNINGS', 'BOXSIZEX', 'BOXSIZEY', 'BOXSIZEZ', 'CV19_UNIT',
    'CV19_STARTCONF', 'CV19_ABS', 'CV19_NANOBS', 'CV19_VARIANT',
}

241

def generate_md_metadata (
    md_inputs : dict,
    structure : 'Structure',
    snapshots : int,
    reference_frame : int,
    warnings : dict,
    output_filepath : str
    ):
    """Build the MD metadata and save it as a JSON file in the output filepath.

    The output always includes the MD name, the number of frames, the number of atoms,
    the reference frame and the warnings. Any MD input matching a project metadata field
    (lowercased) and having a truthy value is stored under the 'metadata' key.
    """

    # Work on a copy of the MD inputs, thus the original inputs are never modified
    md_name = md_inputs.get('name', None)
    md_directory = md_inputs.get(MD_DIRECTORY, None)
    remaining_inputs = dict(md_inputs)
    # The name has its own field, so it must not overwrite the project metadata name
    if md_name:
        remaining_inputs.pop('name')
    # The directory is not to be uploaded to the database
    if md_directory:
        remaining_inputs.pop(MD_DIRECTORY)

    # Inherit every metadata field which has a truthy value among the remaining MD inputs
    inherited_metadata = {
        field: value
        for field in metadata_fields
        if (value := remaining_inputs.get(field.lower(), None))
    }

    # Set the MD metadata to be written
    output = {
        'name': md_name,
        'frames': snapshots,
        # Should be always the same but we better have explicit confirmation
        'atoms': len(structure.atoms),
        'refframe': reference_frame,
        'warnings': warnings,
    }

    # Add the metadata field only if there is at least one value
    if inherited_metadata:
        output['metadata'] = inherited_metadata

    # Write metadata to a file
    save_json(output, output_filepath)

Coverage for mddb_workflow/tools/generate_metadata.py: 73%

95 statements