Coverage for model_workflow/tools/generate_metadata.py

1from model_workflow.tools.get_box_size import get_box_size

2from model_workflow.tools.get_atoms_count import get_atoms_count

3from model_workflow.tools.generate_map import get_sequence_metadata

4from model_workflow.utils.auxiliar import InputError, save_json

5from model_workflow.utils.constants import MD_DIRECTORY

6from model_workflow.utils.type_hints import *

def prepare_project_metadata (
    structure_file : 'File',
    trajectory_file : 'File',
    output_filepath : str,
    structure : 'Structure',
    residue_map : dict,
    protein_references_file : 'File',
    pdb_ids : List[str],
    ligand_map : dict,
    input_protein_references : Union[ List[str], dict ],
    input_ligands : List[dict],
    input_interactions : list,
    interaction_types : dict,
    warnings : dict,
    # Set all inputs to be loaded as they are
    input_force_fields : Union[ str, List[str] ],
    input_collections : List[str],
    input_chain_names : dict,
    input_type : str,
    input_framestep : float,
    input_name : str,
    input_description : str,
    input_authors : List[str],
    input_groups : List[str],
    input_contact : str,
    input_program : str,
    input_version : str,
    input_method : str,
    input_license : str,
    input_linkcense : str,
    input_citation : str,
    input_thanks : str,
    input_links : Union[ dict, List[dict] ],
    input_timestep : float,
    input_temperature : float,
    input_ensemble : str,
    input_water : str,
    input_boxtype : str,
    input_pbc_selection : str,
    input_cg_selection : str,
    input_customs : List[dict],
    input_orientation : List[float],
    input_multimeric : List[str],
    # Additional topic-specific inputs
    input_cv19_unit : str,
    input_cv19_startconf : str,
    input_cv19_abs : bool,
    input_cv19_nanobs : bool,
    ):
    """Prepare a JSON file with all project metadata.

    Values computed here (box size, atom and residue counts, protein and ligand
    references, sequence metadata, PTM names) are combined with the user inputs
    in a single dictionary, which is written to ``output_filepath`` with
    ``save_json``. Nothing is returned.

    Args:
        structure_file: Its ``path`` is passed to ``get_box_size``.
        trajectory_file: Its ``path`` is passed to ``get_box_size``.
        output_filepath: Path handed to ``save_json``.
        structure: Used to count atoms, to list chain names and to find PTMs.
        residue_map: Read through its 'references' key and, when that value is
            not empty, through its 'reference_types' key.
        protein_references_file: Passed to ``get_sequence_metadata``.
        pdb_ids: Stored as 'PDBIDS'.
        ligand_map: Iterated to collect forced ligand names; every item is
            queried with ``.get('forced_name')``.
            NOTE(review): annotated as dict but consumed as a collection of
            dicts -- confirm against the caller.
        input_protein_references: Stored as 'FORCED_REFERENCES'.
        input_ligands: Stored as 'INPUT_LIGANDS'.
        input_interactions: None or an iterable of interactions, each one
            having a 'name' key.
        interaction_types: Indexed by interaction name to set the 'type' of
            every input interaction.
        warnings: Stored as 'WARNINGS'.
        input_force_fields: A single string is wrapped in a list.
        input_collections: Any falsy value is stored as an empty list.
        input_chain_names: When not empty, every key must be the name of a
            chain in the structure.
        input_type: When it is 'ensemble' the 'FRAMESTEP' is stored as None.
        input_links: None, a single link or a list of links. A single link is
            wrapped in a list. Every link must be a dict with 'name' and 'url'.
        input_cv19_unit, input_cv19_startconf, input_cv19_abs, input_cv19_nanobs:
            Stored only when 'cv19' is among the collections and the value is
            not None.

    Every other ``input_*`` argument is stored unchanged under its metadata key.

    Raises:
        InputError: If a chain name is not found in the structure or if a link
            is not a dict or has no 'name' or no 'url'.
    """

    # Find out the box size (x, y and z)
    boxsizex, boxsizey, boxsizez = get_box_size(
        structure_file.path, trajectory_file.path)

    # Count different types of atoms and residues
    (system_atoms, system_residues, protein_atoms, protein_residues,
    nucleic_atoms, nucleic_residues, lipid_atoms, lipid_residues,
    carbohydrates_atoms, carbohydrates_residues, solvent_atoms, solvent_residues,
    counter_cations, counter_anions, counter_ions) = get_atoms_count(structure)

    # Get protein references from the residues map
    # Get ligand references from the residues map
    protein_references = []
    ligand_references = []
    references = residue_map['references']
    if references:
        for ref, ref_type in zip(references, residue_map['reference_types']):
            if ref_type == 'protein':
                protein_references.append(ref)
            elif ref_type == 'ligand':
                ligand_references.append(ref)

    # Get ligand names if any
    forced_ligand_names = {
        lig['name']: lig['forced_name']
        for lig in ligand_map if lig.get('forced_name', False) }
    if not forced_ligand_names:
        forced_ligand_names = None

    # Make the forcefields a list in case it is a single string
    forcefields = input_force_fields
    if isinstance(forcefields, str):
        forcefields = [forcefields]

    # Collections must be an empty list in case there are no collections
    collections = input_collections
    if not collections:
        collections = []

    # Get additional metadata related to the aminoacids sequence
    sequence_metadata = get_sequence_metadata(structure, protein_references_file, residue_map)

    # Find the PTMs
    # Save only their names for now
    # DANI: This is temporary and for now it only aims to be an easy-to-query parameter
    # DANI: Once it is more mature we will also store the affected residue, at least
    ptms = structure.find_ptms()
    ptm_names = list({ ptm['name'] for ptm in ptms })

    # Check chainnames to actually exist in the structure
    structure_chains = { chain.name for chain in structure.chains }
    chainnames = input_chain_names
    if chainnames:
        for chain in chainnames:
            if chain not in structure_chains:
                raise InputError(f'Chain {chain} from chainnames does not exist in the structure')

    # Get the MD type
    md_type = input_type
    # In case this is an ensemble and not a time related trajectory, the framestep may be missing
    framestep = None if md_type == 'ensemble' else input_framestep

    # Metadata interactions are input interactions and the interaction types combined
    metadata_interactions = []
    if input_interactions is not None:
        for interaction in input_interactions:
            # Copy the interaction so the input itself is not modified
            metadata_interaction = dict(interaction)
            interaction_name = metadata_interaction['name']
            metadata_interaction['type'] = interaction_types[interaction_name]
            metadata_interactions.append(metadata_interaction)

    # Make sure links are correct
    # Both the validation and the stored value rely on the normalized list
    # Otherwise a single link would be iterated key by key and wrongly rejected
    links = input_links
    if links is not None:
        if not isinstance(links, list):
            links = [ links ]
        for link in links:
            if not isinstance(link, dict):
                raise InputError('Links must be a list of objects')
            if link.get('name', None) is None:
                raise InputError('Links must have a name')
            if link.get('url', None) is None:
                raise InputError('Links must have a URL')

    # Write the metadata file
    # Metadata keys must be in CAPS, as they are in the client
    metadata = {
        'NAME': input_name,
        'DESCRIPTION': input_description,
        'AUTHORS': input_authors,
        'GROUPS': input_groups,
        'CONTACT': input_contact,
        'PROGRAM': input_program,
        'VERSION': input_version,
        'TYPE': md_type,
        'METHOD': input_method,
        'LICENSE': input_license,
        'LINKCENSE': input_linkcense,
        'CITATION': input_citation,
        'THANKS': input_thanks,
        'LINKS': links,
        'PDBIDS': pdb_ids,
        'FORCED_REFERENCES': input_protein_references,
        'REFERENCES': protein_references,
        'INPUT_LIGANDS': input_ligands,
        'LIGANDS': ligand_references,
        'LIGANDNAMES': forced_ligand_names,
        'PROTSEQ': sequence_metadata['protein_sequences'],
        'NUCLSEQ': sequence_metadata['nucleic_sequences'],
        'DOMAINS': sequence_metadata['domains'],
        'FRAMESTEP': framestep,
        'TIMESTEP': input_timestep,
        'TEMP': input_temperature,
        'ENSEMBLE': input_ensemble,
        'FF': forcefields,
        'WAT': input_water,
        'BOXTYPE': input_boxtype,
        'SYSTATS': system_atoms,
        'SYSTRES': system_residues,
        'PROTATS': protein_atoms,
        'PROTRES': protein_residues,
        'NUCLATS': nucleic_atoms,
        'NUCLRES': nucleic_residues,
        'LIPIATS': lipid_atoms,
        'LIPIRES': lipid_residues,
        'CARBATS': carbohydrates_atoms,
        'CARBRES': carbohydrates_residues,
        'SOLVATS': solvent_atoms,
        'SOLVRES': solvent_residues,
        'COUNCAT': counter_cations,
        'COUNANI': counter_anions,
        'COUNION': counter_ions,
        'INTERACTIONS': metadata_interactions,
        'PBC_SELECTION': input_pbc_selection,
        'CG_SELECTION': input_cg_selection,
        'CHAINNAMES': chainnames,
        'CUSTOMS': input_customs,
        'ORIENTATION': input_orientation,
        'PTM': ptm_names,
        'MULTIMERIC' : input_multimeric,
        'COLLECTIONS': collections,
        'WARNINGS': warnings,
    }

    # Add box sizes only if all of them are greater than 0
    if boxsizex > 0 and boxsizey > 0 and boxsizez > 0:
        metadata['BOXSIZEX'] = boxsizex
        metadata['BOXSIZEY'] = boxsizey
        metadata['BOXSIZEZ'] = boxsizez

    # Add collection specific fields
    # Fields whose value is None are not written at all
    if 'cv19' in collections:
        cv19_fields = {
            'CV19_UNIT': input_cv19_unit,
            'CV19_STARTCONF': input_cv19_startconf,
            'CV19_ABS': input_cv19_abs,
            'CV19_NANOBS': input_cv19_nanobs,
            'CV19_VARIANT': sequence_metadata['cv19_variant'],
        }
        for field_name, field_value in cv19_fields.items():
            if field_value is not None:
                metadata[field_name] = field_value

    # Write metadata to a file
    save_json(metadata, output_filepath)

227

# Names of the metadata fields, in CAPS
# generate_md_metadata searches every one of them, in lowercase, among the MD inputs
# NOTE(review): this set misses some keys written by prepare_project_metadata (e.g. 'SYSTRES',
# 'CG_SELECTION') and has some keys not written there (e.g. 'PROT', 'MEMBRANES') -- confirm it is intended
metadata_fields = {
    # General description
    'NAME', 'DESCRIPTION', 'AUTHORS', 'GROUPS', 'CONTACT', 'PROGRAM', 'VERSION',
    'TYPE', 'METHOD', 'LICENSE', 'LINKCENSE', 'CITATION', 'THANKS', 'LINKS',
    # References and sequences
    'PDBIDS', 'FORCED_REFERENCES', 'REFERENCES', 'INPUT_LIGANDS', 'LIGANDS', 'LIGANDNAMES',
    'PROTSEQ', 'NUCLSEQ', 'DOMAINS',
    # Simulation parameters
    'FRAMESTEP', 'TIMESTEP', 'TEMP', 'ENSEMBLE', 'FF', 'WAT', 'BOXTYPE',
    # System counts
    'SYSTATS', 'PROTATS', 'PROT', 'DPPC', 'SOL', 'NA', 'CL',
    # Analysis related fields
    'INTERACTIONS', 'PBC_SELECTION', 'CHAINNAMES', 'MEMBRANES', 'CUSTOMS', 'ORIENTATION', 'PTM',
    'MULTIMERIC', 'COLLECTIONS', 'WARNINGS',
    # Box size
    'BOXSIZEX', 'BOXSIZEY', 'BOXSIZEZ',
    # Collection specific fields
    'CV19_UNIT', 'CV19_STARTCONF', 'CV19_ABS', 'CV19_NANOBS', 'CV19_VARIANT',
}

236

def generate_md_metadata (
    md_inputs : dict,
    structure : 'Structure',
    snapshots : int,
    reference_frame : int,
    warnings : dict,
    output_filepath : str
    ):
    """Write the metadata file of a single MD, which is to be uploaded to the database.

    The file always contains the MD name, the number of frames, the number of
    atoms in the structure, the reference frame and the warnings. MD inputs whose
    lowercase name matches one of the ``metadata_fields`` and whose value is
    truthy are also written, grouped under the 'metadata' key.

    Args:
        md_inputs: Inputs of this MD. Its 'name' is stored as the MD name and its
            ``MD_DIRECTORY`` value is never written.
        structure: Only the length of its ``atoms`` is used.
        snapshots: Stored as 'frames'.
        reference_frame: Stored as 'refframe'.
        warnings: Stored as 'warnings'.
        output_filepath: Path handed to ``save_json``.
    """

    # Mine name and directory from MD inputs
    md_name = md_inputs.get('name', None)
    md_directory = md_inputs.get(MD_DIRECTORY, None)

    # Set the fields which are always written
    md_metadata = {
        'name': md_name,
        'frames': snapshots,
        # Should be always the same but we better have explicit confirmation
        'atoms': len(structure.atoms),
        'refframe': reference_frame,
        'warnings': warnings,
    }

    # Work with a copy of the MD inputs so the original inputs are not modified
    remaining_inputs = dict(md_inputs)
    # Remove name from MD inputs to not further overwrite project metadata
    if md_name:
        del remaining_inputs['name']
    # Remove the directory name from MD inputs since it is not to be uploaded to the database
    if md_directory:
        del remaining_inputs[MD_DIRECTORY]

    # Inherit all metadata fields found among the remaining MD inputs
    # NOTE(review): values are filtered by truthiness, so 0, False or empty values are skipped -- confirm it is intended
    candidates = (
        (field, remaining_inputs.get(field.lower(), None)) for field in metadata_fields )
    inherited_metadata = { field: value for field, value in candidates if value }

    # Add the metadata field only if there is at least one value
    if inherited_metadata:
        md_metadata['metadata'] = inherited_metadata

    # Write metadata to a file
    save_json(md_metadata, output_filepath)

Coverage for model_workflow/tools/generate_metadata.py: 72%

93 statements