Coverage for model_workflow/analyses/nassa.py: 59%

684 statements  

coverage.py v7.9.2, created at 2025-07-23 10:54 +0000

1from itertools import product, combinations_with_replacement 

2import numpy.ma as ma 

3import pandas as pd 

4import numpy as np 

5import os 

6from os import chdir, getcwd 

7from pathlib import Path 

8 

9from model_workflow.tools.nassa_base import Base 

10from model_workflow.tools.nassa_loaders import load_sequence 

11from model_workflow.tools.nassa_loaders import load_serfile 

12from model_workflow.utils.constants import NASSA_ANALYSES_CANALS, BLUE_HEADER, COLOR_END, CYAN_HEADER, GREEN_HEADER 

13 

14from model_workflow.utils.heatmaps_nassa import basepair_plot 

15from model_workflow.utils.heatmaps_nassa import bconf_heatmap 

16from model_workflow.utils.heatmaps_nassa import correlation_plot 

17from model_workflow.utils.heatmaps_nassa import arlequin_plot 

18from model_workflow.utils.bibitransformer_nassa import BiBiTransformer 

19from model_workflow.utils.auxiliar import InputError 

20from model_workflow.utils.nassa_file import generate_nassa_config 

21from typing import Optional, List 

22import yaml 

23 

24# There are five analyses in total 

25 
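# Hedged sketch of the assumed shape of NASSA_ANALYSES_CANALS (defined in
# model_workflow.utils.constants): a dict mapping each analysis key to the Canal
# coordinate names whose .ser files it consumes. The values below are inferred from
# the code in this module and are illustrative only.
EXAMPLE_NASSA_ANALYSES_CANALS = {
    "bpcorr": ["shift", "slide", "rise", "tilt", "roll", "twist"],
    "bconf": ["epsilC", "zetaC", "epsilW", "zetaW"],
    "stiff": ["shear", "stretch", "stagger", "buckle", "propel", "opening", "chiW", "chiC"],
    # "crdcorr" and "coordist" map to their own coordinate lists in the same way.
}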

26# BASE PAIR CORRELATION ANALYSIS 

27class BasePairCorrelation(Base): 

28 """ 

29 Execution plan and methods for basepair correlation analysis. 

30 """ 

31 

32 def __init__(self, **kwargs): 

33 super().__init__(**kwargs) 

34 

35 def extract(self): 

36 extracted = {} 

37 sequences = [] 

38 for seq_file in self.sequence_files: 

39 seq = load_sequence( 

40 seq_file, 

41 unit_name=self.unit_name, 

42 unit_len=self.unit_len) 

43 sequences.append(seq) 

44 self.Ac = seq.Ac 

45 self.logger.debug(f"Adenine complement set to: <{self.Ac}>") 

46 extracted["sequences"] = sequences 

47 self.logger.info(f"loaded {len(sequences)} sequences") 

48 

49 # Iterate over the different parameters: shift, twist, roll, slide, tilt, rise 

50 bpcorr_coordinates = NASSA_ANALYSES_CANALS['bpcorr'] 

51 for bpcorr_coord in bpcorr_coordinates: 

52 for coord in self.coordinate_info.keys(): 

53 if coord == bpcorr_coord: 

54 crd_data = [] 

55 for ser_file in self.coordinate_info[coord]: 

56 ser = load_serfile( 

57 ser_file, 

58 self.tail, 

59 self.n_lines) 

60 crd_data.append(ser) 

61 extracted[coord.lower()] = crd_data 

62 self.logger.info(f"loaded {len(crd_data)} files for coordinate <{coord}>") 

63 

64 return extracted 

65 

66 def extraer_tetramero_central(hexamero): # helper: extracts the central tetramer of a hexamer (note: defined without self, effectively a static helper) 

67 return hexamero[1:5] 

68 

69 def transform(self, data): 

70 sequences = data.pop("sequences") 

71 # iterate over trajectories 

72 corr_results = {} 

73 for traj, seq in enumerate(sequences): 

74 trajectory_series = {coord.lower(): data[coord][traj] 

75 for coord in data.keys()} 

76 coordinate_corr = self.iterate_trajectory( 

77 seq, trajectory_series) 

78 corr_results[seq.sequence] = coordinate_corr 

79 joined_df = [] 

80 for seq, val in corr_results.items(): 

81 df = pd.DataFrame.from_dict(val).T 

82 joined_df.append(df) 

83 joined_df = pd.concat(joined_df) 

84 return joined_df 

85 

86 def iterate_trajectory(self, sequence, coordinates): 

87 coordinate_correlations = {} 

88 # iterate over subunits 

89 start = 2 + sequence.flanksize 

90 end = sequence.size - (2 + sequence.baselen + sequence.flanksize - 1) 

91 # subtract 1 from end in order to stop at the last subunit 

92 for idx in range(start, end-1): 

93 subunit = sequence.get_subunit(idx) 

94 next_subunit = sequence.get_subunit(idx+1) 

95 if subunit in coordinate_correlations: 

96 self.logger.info( 

97 f"skipping repeated {self.unit_name} {subunit}...") 

98 continue 

99 self.logger.info( 

100 f"analyzing {self.unit_name} {subunit}...") 

101 # add 1 to idx since .ser table includes an index 

102 unit_df = pd.DataFrame( 

103 {coord: coordinates[coord][idx+1] 

104 for coord in coordinates.keys()}) 

105 next_unit_df = pd.DataFrame( 

106 {coord: coordinates[coord][idx+2] 

107 for coord in coordinates.keys()}) 

108 crd_corr = self.get_correlation(next_unit_df, unit_df) # note: next_unit_df is passed as 'unit' and unit_df as 'next_unit' (order swapped relative to the signature) 

109 coordinate_correlations[f"{subunit}/{next_subunit}"] = crd_corr 

110 return coordinate_correlations 

111 

112 def get_correlation(self, unit, next_unit): 

113 method = { 

114 "shift": "linear", 

115 "slide": "linear", 

116 "rise": "linear", 

117 "tilt": "circular", 

118 "roll": "circular", 

119 "twist": "circular"} 

120 coordinates = method.keys() 

121 combos = product(coordinates, repeat=2) 

122 result = {} 

123 for crd1, crd2 in combos: 

124 method1 = method[crd1] 

125 method2 = method[crd2] 

126 arr1 = unit[crd1] 

127 arr2 = next_unit[crd2] 

128 value = self.get_corr_by_method( 

129 method1, 

130 method2, 

131 arr1, 

132 arr2) 

133 result[f"{crd1}/{crd2}"] = value 

134 return result 

135 

136 def get_corr_by_method(self, method1, method2, arr1, arr2): 

137 if method1 == "circular" and method2 == "linear": 

138 value = self.circlineal(arr2, arr1) 

139 elif method1 == "linear" and method2 == "circular": 

140 value = self.circlineal(arr1, arr2) 

141 elif method1 == "circular" and method2 == "circular": 

142 value = self.circular(arr1, arr2) 

143 else: 

144 value = ma.corrcoef(ma.masked_invalid(arr1), ma.masked_invalid(arr2))[1, 0] 

145 return value 

146 

147 @staticmethod 

148 def circular(x1, x2): 

149 x1 = x1 * np.pi / 180 

150 x2 = x2 * np.pi / 180 

151 diff_1 = np.sin(x1 - x1.mean()) 

152 diff_2 = np.sin(x2 - x2.mean()) 

153 num = (diff_1 * diff_2).sum() 

154 den = np.sqrt((diff_1 ** 2).sum() * (diff_2 ** 2).sum()) 

155 return num / den 

156 

157 @staticmethod 

158 def circlineal(x1, x2): 

159 x2 = x2 * np.pi / 180 

160 x1 = x1.to_numpy() 

161 x2 = x2.to_numpy() 

162 rc = ma.corrcoef(ma.masked_invalid(x1), ma.masked_invalid(np.cos(x2)))[1, 0] 

163 rs = ma.corrcoef(ma.masked_invalid(x1), ma.masked_invalid(np.sin(x2)))[1, 0] 

164 rcs = ma.corrcoef(ma.masked_invalid(np.sin(x2)), ma.masked_invalid(np.cos(x2)))[1, 0] 

165 num = (rc ** 2) + (rs ** 2) - 2 * rc * rs * rcs 

166 den = 1 - (rcs ** 2) 

167 correlation = np.sqrt(num / den) 

168 if ma.corrcoef(ma.masked_invalid(x1), ma.masked_invalid(x2))[1, 0] < 0: 

169 correlation *= -1 

170 return correlation 

171 

172 def make_tables(self, dataset): 

173 dataset.to_csv(self.save_path / "all_basepairs.csv") 

174 

175 def make_plots(self, dataset): 

176 basepair_plot(dataset, "all_basepairs", self.save_path) 

177 
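# Illustrative, self-contained sketch of the circular-circular correlation used by
# BasePairCorrelation.circular above (the angle values are made up).
import numpy as np

def circular_corr(x1_deg, x2_deg):
    x1 = np.asarray(x1_deg) * np.pi / 180
    x2 = np.asarray(x2_deg) * np.pi / 180
    diff_1 = np.sin(x1 - x1.mean())
    diff_2 = np.sin(x2 - x2.mean())
    return (diff_1 * diff_2).sum() / np.sqrt((diff_1 ** 2).sum() * (diff_2 ** 2).sum())

twist = np.array([34.0, 36.5, 33.2, 35.8, 34.9])  # degrees
roll = np.array([2.1, 4.0, 1.5, 3.8, 2.9])        # degrees
print(circular_corr(twist, roll))                 # a single value in [-1, 1]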

178# BASE PAIR CONFORMATION ANALYSIS 

179 

180class BConformations(Base): 

181 """Execution plan and methods for BI/BII conformations analysis pipeline""" 

182 

183 def __init__(self, **kwargs): 

184 super().__init__(**kwargs) 

185 valid_coordinates = ["epsilC", "zetaC", "epsilW", "zetaW"] 

186 for coord in self.coordinate_info.keys(): 

187 try: 

188 assert coord in valid_coordinates 

189 except AssertionError as e: 

190 raise ValueError( 

191 f"{coord} is not a valid coordinate! " 

192 "Please rename coordinates in your configuration file " 

193 f"to match any of {valid_coordinates}") from e 

194 

195 def extract(self): 

196 extracted = {} 

197 sequences = [] 

198 for seq_file in self.sequence_files: 

199 seq = load_sequence( 

200 seq_file, 

201 unit_name=self.unit_name, 

202 unit_len=self.unit_len) 

203 sequences.append(seq) 

204 self.Ac = seq.Ac 

205 self.logger.debug(f"Adenine complement set to: <{self.Ac}>") 

206 extracted["sequences"] = sequences 

207 self.logger.info(f"loaded {len(sequences)} sequences") 

208 bconf_coordinates = NASSA_ANALYSES_CANALS['bconf'] 

209 for bconf_coord in bconf_coordinates: 

210 for coord in self.coordinate_info.keys(): 

211 if coord == bconf_coord: 

212 crd_data = [] 

213 for ser_file in self.coordinate_info[coord]: 

214 ser = load_serfile( 

215 ser_file, 

216 self.tail, 

217 self.n_lines) 

218 crd_data.append(ser) 

219 extracted[coord.lower()] = crd_data 

220 self.logger.info( 

221 f"loaded {len(crd_data)} files for coordinate <{coord}>") 

222 return extracted 

223 

224 def transform(self, data): 

225 sequences = data.pop("sequences") 

226 angles_df = [] 

227 # get dataframe for each coordinate 

228 for traj, seq in enumerate(sequences): 

229 # start reading from the 4th column: 

230 # skip index, first two bases and first flanks 

231 start = 3 + seq.flanksize 

232 # skip last two bases 

233 end = seq.size - (seq.baselen + seq.flanksize) 

234 # select relevant subset of columns 

235 epsilC = data["epsilc"][traj][start:end] 

236 zetaC = data["zetac"][traj][start:end] 

237 epsilW = data["epsilw"][traj][start:end] 

238 zetaW = data["zetaw"][traj][start:end] 

239 traj_df = self.get_angles_difference( 

240 seq, 

241 epsilC, 

242 zetaC, 

243 epsilW, 

244 zetaW) 

245 angles_df.append(traj_df) 

246 angles_df = pd.concat(angles_df, axis=1) 

247 # percentages BI 

248 B_I = (angles_df < 0).sum(axis=0) * 100 / len(angles_df) # len(angles_df) is the number of frames (cf. self.n_lines) 

249 # clean dataset 

250 B_I = B_I[~B_I.index.duplicated(keep='first')] 

251 B_I = B_I.reset_index() 

252 B_I = B_I.rename(columns={"index": self.unit_name, 0: "pct"}) 

253 return B_I 

254 

255 def get_angles_difference(self, seq, epsilC, zetaC, epsilW, zetaW): 

256 # get list of tetramers, except first and last two bases 

257 all_subunits = seq.all_subunits[2:-2] 

258 all_ic_subunits = seq.all_ic_subunits[2:-2] 

259 

260 # concatenate zeta and epsil arrays 

261 zeta = pd.concat([ 

262 pd.DataFrame(zetaW.T), 

263 pd.DataFrame(zetaC[::-1].T)], 

264 axis=1) 

265 zeta.columns = all_subunits * 2 

266 epsil = pd.concat([ 

267 pd.DataFrame(epsilW.T), 

268 pd.DataFrame(epsilC[::-1].T)], 

269 axis=1) 

270 

271 # This function was contained in a separate script in the original NASSA code (angle_utils.py) 

272 def fix_angle_range(x, domain=[0, 360]): 

273 """Fix an angle so it falls within the given range (degrees) by adding or subtracting 360. 

274 

275 :param float x: angle value (assumed to be in degrees) 

276 :param sequence domain: start and end of the angle range. 

277 

278 :return float: angle value within the given range """ 

279 while x < domain[0]: 

280 x += 360 

281 while x > domain[1]: 

282 x -= 360 

283 return x 

284 

285 epsil.columns = all_subunits * 2 

286 # difference between epsilon and zeta coordinates 

287 diff = epsil - zeta 

288 diff = diff.applymap(lambda x: fix_angle_range(x, domain=[-180, 180])) 

289 

290 # repeat with inverse-complementary sequence 

291 zeta_ic = pd.concat([ 

292 pd.DataFrame(zetaW.T), 

293 pd.DataFrame(zetaC[::-1].T)], 

294 axis=1) 

295 zeta_ic.columns = all_ic_subunits * 2 

296 epsil_ic = pd.concat([ 

297 pd.DataFrame(epsilW.T), 

298 pd.DataFrame(epsilC[::-1].T)], 

299 axis=1) 

300 epsil_ic.columns = all_ic_subunits * 2 

301 diff_ic = epsil_ic - zeta_ic 

302 diff_ic = diff_ic.applymap( 

303 lambda x: fix_angle_range(x, domain=[-180, 180])) 

304 

305 diff_df = pd.concat([diff, diff_ic], axis=1) 

306 

307 return diff_df 

308 

309 def make_tables(self, B_I): 

310 B_I.to_csv(self.save_path / "BI.csv", index=False) 

311 # create BII 

312 B_II = B_I.copy() 

313 B_II["pct"] = 100 - B_I["pct"] 

314 B_II.to_csv(self.save_path / "BII.csv", index=False) 

315 

316 def make_plots(self, B_I): 

317 bconf_heatmap(B_I, "BI", self.save_path, self.unit_len, self.Ac) 

318 B_II = B_I.copy() 

319 B_II["pct"] = 100 - B_I["pct"] 

320 bconf_heatmap(B_II, "BII", self.save_path, self.unit_len, self.Ac) 

321 
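# Hedged sketch of the BI/BII rule applied by BConformations.transform above:
# a frame is counted as BI when (epsilon - zeta), wrapped to [-180, 180], is negative.
# The angle values are made up.
import pandas as pd

def fix_angle_range(x, domain=(-180.0, 180.0)):
    while x < domain[0]:
        x += 360.0
    while x > domain[1]:
        x -= 360.0
    return x

epsil = pd.Series([190.0, 250.0, 170.0, 300.0])  # epsilon angles (degrees)
zeta = pd.Series([260.0, 170.0, 270.0, 100.0])   # zeta angles (degrees)
diff = (epsil - zeta).apply(fix_angle_range)
pct_BI = (diff < 0).sum() * 100 / len(diff)      # percentage of frames in BI
pct_BII = 100 - pct_BI
print(pct_BI, pct_BII)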

322# COORDINATE CORRELATION ANALYSIS 

323class CoordinateCorrelation(Base): 

324 """ 

325 Execution plan and methods for correlation analyses 

326 """ 

327 

328 def __init__(self, **kwargs): 

329 super().__init__(**kwargs) 

330 

331 def extract(self): 

332 extracted = {} 

333 sequences = [] 

334 for seq_file in self.sequence_files: 

335 seq = load_sequence( 

336 seq_file, 

337 unit_name=self.unit_name, 

338 unit_len=self.unit_len) 

339 sequences.append(seq) 

340 self.Ac = seq.Ac 

341 self.logger.debug(f"Adenine complement set to: <{self.Ac}>") 

342 extracted["sequences"] = sequences 

343 self.logger.info(f"loaded {len(sequences)} sequences") 

344 

345 crdcorr_coordinates = NASSA_ANALYSES_CANALS['crdcorr'] 

346 for crdcorr_coord in crdcorr_coordinates: 

347 for coord in self.coordinate_info.keys(): 

348 if coord == crdcorr_coord: 

349 crd_data = [] 

350 for ser_file in self.coordinate_info[coord]: 

351 ser = load_serfile( 

352 ser_file, 

353 self.tail, 

354 self.n_lines) 

355 crd_data.append(ser) 

356 extracted[coord.lower()] = crd_data 

357 self.logger.info( 

358 f"loaded {len(crd_data)} files for coordinate <{coord}>") 

359 return extracted 

360 

361 def transform(self, data): 

362 sequences = data.pop("sequences") 

363 # iterate over trajectories 

364 corr_results = {} 

365 for traj, seq in enumerate(sequences): 

366 trajectory_series = {coord.lower(): data[coord][traj] 

367 for coord in data.keys()} 

368 correlations = self.iterate_trajectory( 

369 seq, trajectory_series) 

370 corr_results[traj] = correlations 

371 return corr_results 

372 

373 def iterate_trajectory(self, sequence, coordinates): 

374 corrtype = { 

375 "shift": "linear", 

376 "slide": "linear", 

377 "rise": "linear", 

378 "tilt": "circular", 

379 "roll": "circular", 

380 "twist": "circular"} 

381 start = 3 + sequence.flanksize 

382 end = sequence.size - sequence.flanksize - sequence.baselen 

383 all_subunits = sequence.all_subunits[2:-2] 

384 results_df = {} 

385 for crd1, crd2 in combinations_with_replacement( 

386 coordinates.keys(), 

387 r=2): 

388 crd1_series = pd.DataFrame(data=coordinates[crd1][start:end]).T 

389 crd2_series = pd.DataFrame(data=coordinates[crd2][start:end]).T 

390 crd1_series.columns = all_subunits 

391 crd2_series.columns = all_subunits 

392 method = self.get_corr_method(corrtype[crd1], corrtype[crd2]) 

393 corr_matrix = pd.DataFrame({ 

394 col: crd1_series.corrwith( 

395 crd2_series[col], 

396 method=method 

397 ) for col in crd2_series.columns}) 

398 crd1_inner_dict = results_df.get(crd1, {}) 

399 crd1_inner_dict[crd2] = corr_matrix 

400 results_df[crd1] = crd1_inner_dict 

401 

402 # build complete dataset 

403 coords = list(results_df.keys()) 

404 for crd1, inner_dict in results_df.items(): 

405 missing_coords = [ 

406 crd for crd in coords if crd not in inner_dict.keys()] 

407 for crd2 in missing_coords: 

408 results_df[crd1][crd2] = results_df[crd2][crd1] 

409 dfs = {k: pd.concat(v) for k, v in results_df.items()} 

410 complete_df = pd.concat(dfs, axis=1) 

411 results_df["complete"] = complete_df 

412 return results_df 

413 

414 def get_corr_method(self, corrtype1, corrtype2): 

415 if corrtype1 == "circular" and corrtype2 == "linear": 

416 method = self.circlineal 

417 elif corrtype1 == "linear" and corrtype2 == "circular": 

418 method = self.circlineal 

419 elif corrtype1 == "circular" and corrtype2 == "circular": 

420 method = self.circular 

421 else: 

422 method = "pearson" 

423 return method 

424 

425 @staticmethod 

426 def circular(x1, x2): 

427 x1 = x1 * np.pi / 180 

428 x2 = x2 * np.pi / 180 

429 diff_1 = np.sin(x1 - x1.mean()) 

430 diff_2 = np.sin(x2 - x2.mean()) 

431 num = (diff_1 * diff_2).sum() 

432 den = np.sqrt((diff_1 ** 2).sum() * (diff_2 ** 2).sum()) 

433 return num / den 

434 

435 @staticmethod 

436 def circlineal(x1, x2): 

437 x2 = x2 * np.pi / 180 

438 rc = np.corrcoef(x1, np.cos(x2))[1, 0] 

439 rs = np.corrcoef(x1, np.sin(x2))[1, 0] 

440 rcs = np.corrcoef(np.sin(x2), np.cos(x2))[1, 0] 

441 num = (rc ** 2) + (rs ** 2) - 2 * rc * rs * rcs 

442 den = 1 - (rcs ** 2) 

443 correlation = np.sqrt(num / den) 

444 if np.corrcoef(x1, x2)[1, 0] < 0: 

445 correlation *= -1 

446 return correlation 

447 

448 def make_tables(self, dataset): 

449 for traj, data in dataset.items(): 

450 # make a new directory to save data for each trajectory 

451 save_subpath = self.save_path / f"traj_{traj}" 

452 save_subpath.mkdir(exist_ok=True) 

453 # data.to_csv(save_subpath / "coordinate_corr.csv") 

454 # coords = data.columns.get_level_values(0).unique() 

455 for crd1, inner_dict in data.items(): 

456 if crd1 == "complete": 

457 inner_dict.to_csv(save_subpath / "coordinate_corr.csv") 

458 else: 

459 for crd2, df in inner_dict.items(): 

460 df.to_csv(save_subpath / f"{crd1}_{crd2}.csv") 

461 

462 def make_plots(self, dataset): 

463 complete_dataset = pd.concat( 

464 [value['complete'] for value in dataset.values()], 

465 axis=1) 

466 save_subpath = self.save_path / "complete" 

467 save_subpath.mkdir(exist_ok=True) 

468 correlation_plot( 

469 complete_dataset, 

470 "coordcorr", 

471 save_subpath) 

472 
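# Hedged sketch of how CoordinateCorrelation.iterate_trajectory builds one block of the
# correlation matrix with DataFrame.corrwith and a custom correlation callable.
# Column names and data are illustrative only.
import numpy as np
import pandas as pd

def circular(x1, x2):
    x1 = np.asarray(x1) * np.pi / 180
    x2 = np.asarray(x2) * np.pi / 180
    diff_1 = np.sin(x1 - x1.mean())
    diff_2 = np.sin(x2 - x2.mean())
    return (diff_1 * diff_2).sum() / np.sqrt((diff_1 ** 2).sum() * (diff_2 ** 2).sum())

rng = np.random.default_rng(0)
subunits = ["AAAAAA", "AAAAAC", "AAAACG"]  # hypothetical hexamer labels
crd1_series = pd.DataFrame(rng.normal(35, 3, size=(100, 3)), columns=subunits)  # e.g. twist
crd2_series = pd.DataFrame(rng.normal(2, 1, size=(100, 3)), columns=subunits)   # e.g. roll
corr_matrix = pd.DataFrame({
    col: crd1_series.corrwith(crd2_series[col], method=circular)
    for col in crd2_series.columns})
print(corr_matrix)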

473# COORDINATE DISTRIBUTION ANALYSIS 

474class CoordinateDistributions(Base): 

475 """ 

476 Execution plan and methods for coordinate distributions analysis pipeline 

477 """ 

478 

479 def __init__( 

480 self, 

481 max_iter=400, 

482 tol=1e-5, 

483 **kwargs): 

484 super().__init__(**kwargs) 

485 self.max_iter = max_iter 

486 self.tol = tol 

487 self.logger.debug(f"max_iter: {self.max_iter}") 

488 self.logger.debug(f"tol: {self.tol}") 

489 

490 def extract(self): 

491 extracted = {} 

492 sequences = [] 

493 for seq_file in self.sequence_files: 

494 seq = load_sequence( 

495 seq_file, 

496 unit_name=self.unit_name, 

497 unit_len=self.unit_len) 

498 sequences.append(seq) 

499 self.Ac = seq.Ac 

500 self.logger.debug(f"Adenine complement set to: <{self.Ac}>") 

501 extracted["sequences"] = sequences 

502 self.logger.info(f"loaded {len(sequences)} sequences") 

503 

504 coordist_coordinates = NASSA_ANALYSES_CANALS['coordist'] 

505 for coordist_coord in coordist_coordinates: 

506 for coord in self.coordinate_info.keys(): 

507 if coord == coordist_coord: 

508 crd_data = [] 

509 for ser_file in self.coordinate_info[coord]: 

510 ser = load_serfile( 

511 ser_file, 

512 self.tail, 

513 self.n_lines) 

514 crd_data.append(ser) 

515 extracted[coord.lower()] = crd_data 

516 self.logger.info( 

517 f"loaded {len(crd_data)} files for coordinate <{coord}>") 

518 return extracted 

519 

520 def transform(self, data): 

521 sequences = data.pop("sequences") 

522 subunits_classification = {} 

523 # get dataframe for each coordinate 

524 for coordinate, coord_dataset in data.items(): 

525 coordinate_df = self.coordinate_iteration( 

526 sequences, 

527 coordinate, 

528 coord_dataset) 

529 coordinate_df = coordinate_df.drop_duplicates( 

530 subset=[self.unit_name]) 

531 coordinate_df, global_mean, global_std = self.add_modality_labels( 

532 coordinate_df) 

533 subunits_classification[coordinate] = [ 

534 coordinate_df, global_mean, global_std] 

535 return subunits_classification 

536 

537 def coordinate_iteration( 

538 self, 

539 sequences, 

540 coordinate, 

541 coord_dataset): 

542 coordinate_df = [] 

543 if self.duplicates: 

544 trajectory_df = self.duplicate_trajectory_iteration( 

545 coord_dataset, 

546 sequences, 

547 coordinate) 

548 coordinate_df.append(trajectory_df) 

549 else: 

550 for seq, dataseries in zip(sequences, coord_dataset): 

551 trajectory_df = self.trajectory_iteration( 

552 dataseries, 

553 seq, 

554 coordinate) 

555 coordinate_df.append(trajectory_df) 

556 # concatenate all dataframes 

557 coordinate_df = pd.concat( 

558 coordinate_df, 

559 axis=0) 

560 return coordinate_df 

561 

562 def duplicate_trajectory_iteration( 

563 self, 

564 coord_dataset, 

565 sequences, 

566 coordinate): 

567 trajectory_info = [] 

568 dict_all_subunits_ser = {} 

569 # iterate over datasets and sequences 

570 for dataseries, sequence in zip(coord_dataset, sequences): 

571 # iterate over subunits 

572 start = 2 + sequence.flanksize 

573 end = sequence.size - (2 + sequence.baselen + sequence.flanksize - 1) 

574 

575 for idx in range(start, end): 

576 subunit = sequence.get_subunit(idx) 

577 ic_subunit = sequence.inverse_complement(subunit) 

578 ser = dataseries[idx + 1] 

579 ser = ser[~np.isnan(ser)].to_numpy() 

580 

581 if subunit in dict_all_subunits_ser: 

582 dict_all_subunits_ser[subunit] = np.concatenate((dict_all_subunits_ser[subunit], ser)) 

583 else: 

584 dict_all_subunits_ser[subunit] = ser 

585 

586 if ic_subunit not in dict_all_subunits_ser: 

587 dict_all_subunits_ser[ic_subunit] = ser 

588 

589 for subunit, ser in dict_all_subunits_ser.items(): 

590 ser = ser.reshape(-1, 1) 

591 subunit_information = self.subunit_iteration( 

592 ser, 

593 subunit, 

594 coordinate) 

595 if not self.bimod: 

596 subunit_information["unimodal"] = True 

597 trajectory_info.append(subunit_information) 

598 

599 # create dataframe from list of dictionaries 

600 trajectory_df = pd.DataFrame.from_dict(trajectory_info) 

601 return trajectory_df 

602 

603 def trajectory_iteration( 

604 self, 

605 dataseries, 

606 sequence, 

607 coordinate): 

608 trajectory_info = [] 

609 # iterate over subunits 

610 start = 2 + sequence.flanksize 

611 end = sequence.size - (2 + sequence.baselen + sequence.flanksize - 1) 

612 for idx in range(start, end): 

613 # get unit and inverse-complement unit 

614 subunit = sequence.get_subunit(idx) 

615 ic_subunit = sequence.inverse_complement(subunit) 

616 self.logger.info( 

617 f"analyzing {self.unit_name} {subunit}/{ic_subunit}...") 

618 # add 1 to idx since .ser table includes an index 

619 ser = dataseries[idx + 1] 

620 ser = ser[~np.isnan(ser)].to_numpy() 

621 if ser.shape[0] < 2: 

622 self.logger.info("skipping because of insufficient data!") 

623 subunit_information = dict( 

624 coordinate=coordinate, 

625 binormal=False, 

626 uninormal=True, 

627 insuf_ev=False, 

628 unimodal=True, 

629 bics=[np.nan, np.nan], 

630 mean1=np.nan, 

631 mean2=np.nan, 

632 var1=np.nan, 

633 var2=np.nan, 

634 w1=np.nan, 

635 w2=np.nan) 

636 subunit_information[self.unit_name] = subunit 

637 else: 

638 # reshape dataset 

639 ser = ser.reshape(-1, 1) 

640 subunit_information = self.subunit_iteration( 

641 ser, 

642 subunit, 

643 coordinate) 

644 if not self.bimod: 

645 subunit_information["unimodal"] = True 

646 trajectory_info.append(subunit_information) 

647 # add inverse-complement 

648 ic_subunit_information = subunit_information.copy() 

649 ic_subunit_information[self.unit_name] = ic_subunit 

650 trajectory_info.append(ic_subunit_information) 

651 # create dataframe from list of dictionaries 

652 trajectory_df = pd.DataFrame.from_dict(trajectory_info) 

653 return trajectory_df 

654 

655 def subunit_iteration( 

656 self, 

657 ser, 

658 subunit, 

659 coordinate): 

660 # model subunit's series 

661 bibi = BiBiTransformer(max_iter=self.max_iter, tol=self.tol) 

662 subunit_info = bibi.fit_transform(ser) 

663 # add subunit name and coordinate to info dictionary 

664 subunit_info[self.unit_name] = subunit 

665 subunit_info["coordinate"] = coordinate 

666 return subunit_info 

667 

668 def add_modality_labels(self, data): 

669 df = data.set_index(self.unit_name) 

670 

671 # Series with global mean value for each tetramer 

672 weighted_mean = df.apply( 

673 lambda t: t["mean1"] if ( 

674 t["uninormal"] or t["insuf_ev"] or np.isnan(t["mean2"]) 

675 ) else (t["mean1"]*t["w1"]+t["mean2"]*t["w2"]), 

676 axis=1) 

677 

678 # Series with comparison values 

679 comparison_1 = df.apply( 

680 lambda t: t["mean1"] if ( 

681 t["uninormal"] or t["insuf_ev"] or not t["unimodal"] 

682 ) else t["mean1"]*t["w1"]+t["mean2"]*t["w2"], 

683 axis=1) 

684 

685 # mean1: uninormal+unimodal 

686 # u1*w1+u2*w2: binormal+unimodal 

687 # mean2: binormal+bimodal 

688 comparison_2 = df.apply( 

689 lambda t: np.nan if t["unimodal"] else t["mean2"], 

690 axis=1) 

691 comparison_2 = comparison_2.fillna(comparison_1) 

692 

693 # get limiting values for col1 and col2 

694 global_mean = weighted_mean.mean() 

695 global_std = weighted_mean.std() 

696 l1 = (global_mean + global_std) 

697 l2 = (global_mean - global_std) 

698 

699 # 1: above mean+std (blue) 

700 # 0: between mean-std and mean+std (white) 

701 # -1: below mean-std (red) 

702 col1 = comparison_1.apply( 

703 lambda t: -1 if (t < l2) else (1 if (t > l1) else 0)) 

704 col1 = col1.rename("col1") 

705 col2 = comparison_2.apply( 

706 lambda t: -1 if (t < l2) else (1 if (t > l1) else 0)) 

707 col2 = col2.rename("col2") 

708 

709 df = pd.concat( 

710 [df, col1, col2], 

711 axis=1).reset_index() 

712 

713 return df, global_mean, global_std 

714 

715 def make_tables(self, dataset, index=True): 

716 for coordinate, data in dataset.items(): 

717 data[0].to_csv( 

718 self.save_path / f"{coordinate}.csv", index=index) 

719 

720 def make_plots(self, dataset): 

721 for coordinate, data_dict in dataset.items(): 

722 dataframes = data_dict[0] 

723 global_mean = data_dict[1] 

724 global_std = data_dict[2] 

725 arlequin_plot( 

726 dataframes, 

727 global_mean, 

728 global_std, 

729 coordinate, 

730 self.save_path, 

731 base=self.Ac, 

732 unit_name=self.unit_name, 

733 unit_len=self.unit_len) 

734 

735 
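# Hedged sketch of the three-way labelling used by add_modality_labels above: each
# subunit mean is compared against the global mean +/- one standard deviation.
# The values are made up.
import pandas as pd

means = pd.Series({"AAAAAA": 33.1, "AAAAAC": 36.9, "AAAACG": 35.0, "AAACGT": 31.8})
global_mean, global_std = means.mean(), means.std()
l1, l2 = global_mean + global_std, global_mean - global_std
labels = means.apply(lambda t: -1 if t < l2 else (1 if t > l1 else 0))
# -1: below mean-std (red), 0: within one std (white), 1: above mean+std (blue)
print(labels)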

736# STIFFNESS ANALYSIS 

737class StiffnessDistributions(Base): 

738 

739 def __init__( 

740 self, 

741 *args, 

742 **kwargs): 

743 super().__init__( 

744 *args, 

745 **kwargs) 

746 

747 def extract(self): 

748 extracted = {} 

749 sequences = [] 

750 for seq_file in self.sequence_files: 

751 seq = load_sequence( 

752 seq_file, 

753 unit_name=self.unit_name, 

754 unit_len=self.unit_len) 

755 sequences.append(seq) 

756 self.Ac = seq.Ac 

757 self.logger.debug(f"Adenine complement set to: <{self.Ac}>") 

758 extracted["sequences"] = sequences 

759 self.logger.info(f"loaded {len(sequences)} sequences") 

760 # Here the selection of coordinates is different: it depends on whether the unit length is even or odd 

761 if (self.unit_len % 2) == 0: 

762 stiff_coordinates = NASSA_ANALYSES_CANALS['bpcorr'] 

763 elif (self.unit_len % 2) == 1: 

764 stiff_coordinates = NASSA_ANALYSES_CANALS['stiff'] 

765 for stiff_coord in stiff_coordinates: 

766 for coord in self.coordinate_info.keys(): 

767 if coord == stiff_coord: 

768 crd_data = [] 

769 for ser_file in self.coordinate_info[coord]: 

770 ser = load_serfile( 

771 ser_file, 

772 self.tail, 

773 self.n_lines) 

774 crd_data.append(ser) 

775 extracted[coord.lower()] = crd_data 

776 self.logger.info( 

777 f"loaded {len(crd_data)} files for coordinate <{coord}>") 

778 return extracted 

779 

780 def transform(self, data): 

781 sequences = data.pop("sequences") 

782 results = {"stiffness": [], "covariances": {}, "constants": {}} 

783 for traj, seq in enumerate(sequences): 

784 traj_series = {coord.lower(): data[coord][traj] 

785 for coord in data.keys()} 

786 traj_results = self.get_stiffness( 

787 seq, 

788 traj_series) 

789 results["stiffness"].append(traj_results["stiffness"]) 

790 results["covariances"].update(traj_results["covariances"]) 

791 results["constants"].update(traj_results["constants"]) 

792 stiffness_df = pd.concat(results["stiffness"], axis=0) 

793 stiffness_df = stiffness_df.drop_duplicates(subset=[self.unit_name]) 

794 stiffness_df = stiffness_df.set_index(self.unit_name) 

795 results["stiffness"] = stiffness_df 

796 return results 

797 

798 def get_stiffness( 

799 self, 

800 sequence, 

801 series_dict): 

802 # get stiffness table for a given trajectory 

803 coordinates = list(series_dict.keys()) 

804 results = {"stiffness": {}, "covariances": {}, "constants": {}} 

805 diagonals = {} 

806 start = 2 + sequence.flanksize 

807 end = sequence.size - (2 + sequence.baselen + sequence.flanksize - 1) 

808 for i in range(start, end): 

809 tetramer = sequence.get_subunit(i) 

810 ic_tetramer = sequence.inverse_complement(tetramer) 

811 cols_dict = {coord: series_dict[coord][i+1] 

812 for coord in series_dict.keys()} 

813 stiffness_diag, cte, cov_df = self.get_subunit_stiffness( 

814 cols_dict, 

815 coordinates) 

816 diagonals[tetramer] = np.append( 

817 stiffness_diag, 

818 [np.prod(stiffness_diag), np.sum(stiffness_diag)]) 

819 diagonals[ic_tetramer] = np.append( 

820 stiffness_diag, 

821 [np.prod(stiffness_diag), np.sum(stiffness_diag)]) 

822 #results["covariances"][tetramer] = cov_df 

823 results["covariances"][ic_tetramer] = cov_df 

824 #results["constants"][tetramer] = cte 

825 results["constants"][ic_tetramer] = cte 

826 # build stiffness table 

827 columns = [sequence.unit_name] + coordinates + ["product", "sum"] 

828 results["stiffness"] = pd.DataFrame.from_dict( 

829 diagonals, 

830 orient="index").reset_index() 

831 results["stiffness"].columns = columns 

832 return results 

833 

834 def get_subunit_stiffness( 

835 self, 

836 cols_dict, 

837 coordinates, 

838 scaling=[1, 1, 1, 10.6, 10.6, 10.6], 

839 KT=0.592186827): 

840 if (self.unit_len % 2) == 0: 

841 SH_av = cols_dict["shift"].mean() 

842 SL_av = cols_dict["slide"].mean() 

843 RS_av = cols_dict["rise"].mean() 

844 TL_av = self.circ_avg(cols_dict["tilt"]) 

845 RL_av = self.circ_avg(cols_dict["roll"]) 

846 TW_av = self.circ_avg(cols_dict["twist"]) 

847 elif (self.unit_len % 2) == 1: 

848 SH_av = cols_dict["shear"].mean() 

849 SL_av = cols_dict["stretch"].mean() 

850 RS_av = cols_dict["stagger"].mean() 

851 CW_av = cols_dict["chiw"].mean() 

852 CC_av = cols_dict["chic"].mean() 

853 TL_av = self.circ_avg(cols_dict["buckle"]) 

854 RL_av = self.circ_avg(cols_dict["propel"]) 

855 TW_av = self.circ_avg(cols_dict["opening"]) 

856 cols_arr = [cols_dict[coord] for coord in coordinates] 

857 cols_arr = np.array(cols_arr).T 

858 

859 cv = ma.cov(ma.masked_invalid(cols_arr), rowvar=False) 

860 cv = cv.filled(np.nan) # assign the filled copy: filled() does not modify in place, so masked entries become NaN 

861 

862 cov_df = pd.DataFrame(cv, columns=coordinates, index=coordinates) 

863 stiff = np.linalg.inv(cv) * KT 

864 # Added two new variables: ChiC and ChiW -> 8 (for PENTAMERS) 

865 if (self.unit_len % 2) == 0: 

866 last_row = [SH_av, SL_av, RS_av, TL_av, RL_av, TW_av] #, CW_av, CC_av] 

867 stiff = np.append(stiff, last_row).reshape(7, 6) 

868 elif (self.unit_len % 2) == 1: 

869 last_row = [SH_av, SL_av, RS_av, TL_av, RL_av, TW_av, CW_av, CC_av] 

870 stiff = np.append(stiff, last_row).reshape(9, 8) 

871 scaling=[1, 1, 1, 10.6, 10.6, 10.6, 1, 1] 

872 

873 stiff = stiff.round(6) 

874 stiff_diag = np.diagonal(stiff) * np.array(scaling) 

875 

876 cte = pd.DataFrame(stiff) 

877 cte.columns = coordinates 

878 cte.index = coordinates + ["avg"] 

879 return stiff_diag, cte, cov_df 

880 

881 

882 

883 @staticmethod 

884 def circ_avg(xarr, degrees=True): 

885 n = len(xarr) 

886 if degrees: 

887 # convert to radians 

888 xarr = xarr * np.pi / 180 

889 av = np.arctan2( 

890 (np.sum(np.sin(xarr)))/n, 

891 (np.sum(np.cos(xarr)))/n) * 180 / np.pi 

892 return av 

893 

894 def unimod_labels(self, df): 

895 # get limiting values for col1 and col2 

896 global_mean = df.mean(axis=0) 

897 global_std = df.std(axis=0) 

898 l1 = global_mean + global_std 

899 l2 = global_mean - global_std 

900 

901 # 1: above mean+std (blue) 

902 # 0: between mean-std and mean+std (white) 

903 # -1: below mean-std (red) 

904 labeled_df = (df < l2) * -1 + (df > l1) 

905 labeled_df.loc["g_mean"] = global_mean 

906 labeled_df.loc["g_std"] = global_std 

907 

908 return labeled_df 

909 

910 def make_tables(self, dataset): 

911 # stiffness 

912 stiffness_data = dataset["stiffness"] 

913 stiffness_data.to_csv(self.save_path / "stiffness.csv") 

914 # covariances 

915 covariances_path = self.save_path / "covariances" 

916 covariances_path.mkdir(exist_ok=True) 

917 for key, val in dataset["covariances"].items(): 

918 val.to_csv(covariances_path / f"{key}.csv") 

919 # constants 

920 constants_path = self.save_path / "constants" 

921 constants_path.mkdir(exist_ok=True) 

922 for key, val in dataset["constants"].items(): 

923 val.to_csv(constants_path / f"{key}.csv") 

924 

925 def make_plots(self, dataset): 

926 stiffness_data = dataset["stiffness"] 

927 labeled_df = self.unimod_labels(stiffness_data) 

928 for col in labeled_df.columns: 

929 df = labeled_df[col] 

930 g_mean = df.loc["g_mean"] 

931 g_std = df.loc["g_std"] 

932 df = df.iloc[:-2] 

933 df = df.rename("col1") 

934 df = df.reset_index() 

935 df["col2"] = df["col1"].copy() 

936 arlequin_plot( 

937 df, 

938 g_mean, 

939 g_std, 

940 col, 

941 self.save_path, 

942 base=self.Ac, 

943 unit_name=self.unit_name, 

944 unit_len=self.unit_len) 

945 
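# Hedged sketch of the core of get_subunit_stiffness above: invert the coordinate
# covariance matrix, scale by kT, and keep the (scaled) diagonal as the stiffness
# profile of the subunit. The trajectory data below is synthetic.
import numpy as np
import numpy.ma as ma

KT = 0.592186827                                  # same kT constant as the default above
scaling = np.array([1, 1, 1, 10.6, 10.6, 10.6])   # per-coordinate scaling, as used above

rng = np.random.default_rng(1)
frames = rng.normal(size=(500, 6))                # 500 frames x 6 helical coordinates
cv = ma.cov(ma.masked_invalid(frames), rowvar=False)
cv = cv.filled(np.nan)                            # masked entries (if any) become NaN
stiff = np.linalg.inv(cv) * KT                    # force-constant matrix
stiff_diag = np.diagonal(stiff) * scaling
print(stiff_diag)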

946# Run the NASSA analysis pipeline given the name of the analysis and the configuration file. 

947# The overwrite flag can be used to overwrite the output folder if it already exists. 

948def run_nassa(analysis_name: str, 

949 config_archive: dict, 

950 overwrite_nassa: bool = False): 

951 """Run the NASSA analysis pipeline""" 

952 

953 # Dictionary with the available NASSA analyses and their corresponding classes 

954 analyses = { 

955 "bpcorr": BasePairCorrelation, 

956 "bconf": BConformations, 

957 "crdcorr": CoordinateCorrelation, 

958 "coordist": CoordinateDistributions, 

959 "stiff": StiffnessDistributions 

960 } 

961 

962 # We must check if the analysis name is valid 

963 if analysis_name in analyses: 

964 # A folder for each analysis has to be created 

965 analyses_folder = os.path.join(config_archive["save_path"], analysis_name) 

966 if not os.path.exists(analyses_folder): 

967 os.mkdir(analyses_folder) 

968 # The output folder is added to the configuration archive 

969 config_archive["save_path"] = analyses_folder 

970 else: 

971 # If the output folder already exists, it is only reused if it is empty or the overwrite_nassa flag is set 

972 if os.listdir(analyses_folder) == [] or overwrite_nassa: 

973 config_archive["save_path"] = analyses_folder 

974 print(f'WARNING: Output folder {analyses_folder} already exists. Overwriting it.') 

975 else: 

976 print(f'WARNING: Output folder {analyses_folder} already exists and is not empty. Skipping NASSA analysis. \nSet the overwrite flag to overwrite the output folder (-own).') 

977 return 

978 # The analysis is run 

979 analysis_class = analyses[analysis_name] 

980 # The stiffness analysis is a special case, since the coordinate files are different depending on the unit length (whether it is even or odd) 

981 if analysis_name == "stiff": 

982 unit_len = config_archive["unit_len"] # Obtain the unit length from the configuration archive 

983 if (unit_len % 2) == 0: 

984 coordinate_files = NASSA_ANALYSES_CANALS["bpcorr"] 

985 elif (unit_len % 2) == 1: 

986 coordinate_files = NASSA_ANALYSES_CANALS["stiff"] 

987 # The rest of the analyses have their corresponding coordinate files  

988 else: 

989 coordinate_files = NASSA_ANALYSES_CANALS[analysis_name] 

990 # The coordinate files are filtered from the configuration archive because we only need the ones that correspond to the analysis 

991 config_archive["coordinate_info"] = { 

992 coord: config_archive["coordinate_info"][coord] for coord in coordinate_files} 

993 # Call the analysis class with the configuration archive and run NASSA software  

994 analysis_instance = analysis_class(**config_archive) 

995 analysis_instance.run() 

996 # If the analysis name is not valid, an error is raised 

997 else: 

998 raise ValueError( 

999 f"{analysis_name} is not a valid analysis! " 

1000 "Please choose from: " 

1001 f"{list(analyses.keys())}") 

1002 

1003 
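# Hedged usage sketch for run_nassa. All paths are placeholders; in the real workflow
# config_archive is loaded from the nassa.yml produced by generate_nassa_config, and it
# also carries the remaining Base settings (e.g. tail, n_lines) omitted here.
example_config = {
    "save_path": "/tmp/nassa_analysis",
    "unit_name": "hexamer",
    "unit_len": 6,
    "sequence_files": ["/tmp/seqs/seq1.fasta"],
    "coordinate_info": {
        "shift": ["/tmp/canal/shift_seq1.ser"],
        "slide": ["/tmp/canal/slide_seq1.ser"],
        "rise": ["/tmp/canal/rise_seq1.ser"],
        "tilt": ["/tmp/canal/tilt_seq1.ser"],
        "roll": ["/tmp/canal/roll_seq1.ser"],
        "twist": ["/tmp/canal/twist_seq1.ser"],
    },
}
# run_nassa("bpcorr", example_config, overwrite_nassa=True)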

1004def workflow_nassa( 

1005 config_file_path: Optional[str], 

1006 analysis_names: Optional[List[str]], 

1007 make_config: bool = False, 

1008 output: Optional[str] = None, 

1009 working_directory: str = '.', 

1010 overwrite: bool = False, 

1011 overwrite_nassa: bool = False, 

1012 helical_par: bool = False, 

1013 proj_dirs: Optional[List[str]] = None, 

1014 input_structure_file: Optional[str] = None, 

1015 input_trajectory_file: Optional[str] = None, 

1016 input_top_file: Optional[str] = None, 

1017 all: bool = False, 

1018 unit_len: int = 6, 

1019 n_sequences: Optional[int] = '*', 

1020 seq_path: Optional[str] = None, 

1021 md_directories: Optional[List[str]] = None, 

1022 trust: bool = False, 

1023 mercy: bool = False, 

1024 ): 

1025 

1026 # Change to the working directory and print the name of the directory to inform the user 

1027 chdir(working_directory) 

1028 current_directory_name = getcwd().split('/')[-1] 

1029 print(f'\n{BLUE_HEADER}Running NASSA at {current_directory_name}{COLOR_END}') 

1030 # If the user selects the helical parameter analysis, we need to call the workflow function to run it 

1031 if helical_par: 

1032 # AGUS: This option with glob is commented out because it is not working properly; if it is needed in the future, it should be fixed 

1033 # AGUS: the objective of using glob is to run this part (over the project directories) by calling this function from Python instead of from the command line 

1034 #seq_paths = glob.glob(f'{proj_dirs}') 

1035 #seq_paths = [path for path in glob.glob(f'{proj_dirs}*') if os.path.isdir(path)] 

1036 

1037 actual_path = getcwd() 

1038 # The workflow function is imported from model_workflow to run the helical parameters analysis and do all the checks in each project directory 

1039 # In this way, we assume that each project directory is an independent project or MD simulation 

1040 from model_workflow.mwf import workflow 

1041 # Iterate over the project directories 

1042 for proj_path in proj_dirs: 

1043 # Obtain the path of the project directory and change to it 

1044 md_path = os.path.join(actual_path, proj_path) 

1045 os.chdir(md_path) 

1046 print(f'\n{CYAN_HEADER}Running Helical Parameters at {proj_path}{COLOR_END}') 

1047 # Call the workflow function to run the helical parameters analysis with the include flag set to helical and the overwrite flag (if provided) 

1048 workflow(project_parameters={#'directory':os.getcwd(), 

1049 #'md_directories':[os.getcwd()], 

1050 #'input_structure_filepath':input_structure_file,  

1051 'input_topology_filepath':input_top_file, 

1052 'input_trajectory_filepaths':input_trajectory_file, 

1053 'md_directories': md_directories, 

1054 'trust': trust, 

1055 'mercy': mercy 

1056 }, 

1057 include=['helical'], 

1058 overwrite=overwrite 

1059 ) 

1060 os.chdir('..') 

1061 # If the all flag is selected, the NASSA analysis will be run after the helical parameters analysis 

1062 # Reminder: the NASSA analysis needs the helical parameters files to be generated before running it.  

1063 if all: 

1064 # The sequences path is needed to run the NASSA analysis so it has to be defined 

1065 if seq_path is None: 

1066 raise InputError('No sequence path defined. Please define it as an argument (--seq_path)') 

1067 output_nassa_analysis = os.path.join(actual_path, 'nassa_analysis') 

1068 # If the output folder already exists, it can only be overwritten when the overwrite_nassa flag is set 

1069 if not os.path.exists(output_nassa_analysis): 

1070 os.mkdir(output_nassa_analysis) 

1071 else: 

1072 if overwrite_nassa: 

1073 print(f'WARNING: Output folder {output_nassa_analysis} already exists. Overwriting it.') 

1074 else: 

1075 print(f'WARNING: Output folder {output_nassa_analysis} already exists. Skipping NASSA analysis. \nSet the overwrite flag to overwrite the output folder (--overwrite_nassa).') 

1076 return 

1077 # Generate the NASSA configuration file 

1078 generate_nassa_config( 

1079 folder_path=proj_dirs, 

1080 seq_path=seq_path, 

1081 output_path=output_nassa_analysis, 

1082 unit_len=unit_len, 

1083 n_sequences=n_sequences, 

1084 ) 

1085 print(f'NASSA configuration file generated at {output_nassa_analysis}') 

1086 config_file_path = os.path.join(output_nassa_analysis, 'nassa.yml') 

1087 # The user can select the analysis to run 

1088 if analysis_names is not None: 

1089 for analysis_name in analysis_names: 

1090 print(f' Running analysis {analysis_name}') 

1091 # Read the configuration file 

1092 try: 

1093 with Path(config_file_path).open("r") as ymlfile: 

1094 config_archive = yaml.load( 

1095 ymlfile, Loader=yaml.FullLoader) 

1096 except FileNotFoundError: 

1097 raise FileNotFoundError( 

1098 f"Configuration file {config_file_path} not found!") 

1099 # Run the NASSA analysis with the selected analysis 

1100 run_nassa(analysis_name=analysis_name, 

1101 config_archive=config_archive, 

1102 overwrite_nassa=overwrite_nassa) 

1103 print(f'NASSA analysis completed at {output_nassa_analysis}') 

1104 return 

1105 # If the user does not select any analysis, all the analyses will be run 

1106 for analysis_name in NASSA_ANALYSES_CANALS.keys(): 

1107 print(f' Running analysis {analysis_name}') 

1108 # Read the configuration file 

1109 try: 

1110 with Path(config_file_path).open("r") as ymlfile: 

1111 config_archive = yaml.load( 

1112 ymlfile, Loader=yaml.FullLoader) 

1113 except FileNotFoundError: 

1114 raise FileNotFoundError( 

1115 f"Configuration file {config_file_path} not found!") 

1116 # Run the NASSA analysis with the selected analysis  

1117 run_nassa(analysis_name=analysis_name, 

1118 config_archive=config_archive, 

1119 overwrite_nassa=overwrite_nassa) 

1120 print(f'NASSA analysis completed at {output_nassa_analysis}') 

1121 return 

1122 # If the helical parameter analysis is not selected, the NASSA analysis will be run 

1123 # To run this, the user has to provide the configuration file with the information needed 

1124 if config_file_path and analysis_names is not None: 

1125 config_file_path = Path(config_file_path) 

1126 print(f' Using config file {config_file_path}') 

1127 # Load the configuration file 

1128 try: 

1129 with Path(config_file_path).open("r") as ymlfile: 

1130 config_archive = yaml.load( 

1131 ymlfile, Loader=yaml.FullLoader) 

1132 except FileNotFoundError: 

1133 raise FileNotFoundError( 

1134 f"Configuration file {config_file_path} not found!") 

1135 # The user can select the analysis to run 

1136 for analysis_name in analysis_names: 

1137 # The output folder comes from the --output argument if provided, otherwise from the configuration file 

1138 if output is not None: 

1139 output_path = os.path.join(output, analysis_name) 

1140 config_archive['save_path'] = output_path 

1141 else: 

1142 if 'save_path' in config_archive: 

1143 output_path = os.path.join(config_archive['save_path'], analysis_name) 

1144 config_archive['save_path'] = output_path 

1145 else: 

1146 raise InputError('No output path defined. Please define it in the configuration file or pass it as an argument (--output)') 

1147 

1148 # To verify that the user created the configuration file with the needed information, 

1149 # check that the files required by the selected analysis are defined 

1150 canal_files_config = config_archive['coordinate_info'] 

1151 needed_files = NASSA_ANALYSES_CANALS[analysis_name] 

1152 for file in needed_files: 

1153 # If some of the files needed are not in the config file, raise an error because it is needed 

1154 if file not in canal_files_config: 

1155 raise InputError(f'Analysis {analysis_name} requires the {file} coordinate files to be defined in the configuration file') 

1156 

1157 # The check for the output folder is done here 

1158 if os.path.exists(output_path): 

1159 if os.listdir(output_path) == [] or overwrite_nassa: 

1160 print(f' Output folder {output_path} already exists. Overwriting analysis {analysis_name}') 

1161 run_nassa(analysis_name, config_archive, overwrite_nassa) 

1162 else: 

1163 print(f' Output folder {output_path} already exists. Skipping analysis. \nSet the overwrite flag to overwrite the output folder (--overwrite_nassa).') 

1164 continue 

1165 else: 

1166 os.mkdir(output_path) 

1167 print(f' Running analysis {analysis_name} and saving results in {output_path}') 

1168 run_nassa(analysis_name, config_archive, overwrite_nassa) 

1169 if all: 

1170 for analysis_name in NASSA_ANALYSES_CANALS.keys(): 

1171 print(f"{GREEN_HEADER} |-----> Running analysis {analysis_name}{COLOR_END}") 

1172 # Read the configuration file 

1173 try: 

1174 with Path(config_file_path).open("r") as ymlfile: 

1175 config_archive = yaml.load( 

1176 ymlfile, Loader=yaml.FullLoader) 

1177 except FileNotFoundError: 

1178 raise FileNotFoundError( 

1179 f"Configuration file {config_file_path} not found!") 

1180 # Check if the NASSA output folder exists 

1181 output_path = os.path.join(config_archive['save_path'], 'nassa_analysis') 

1182 config_archive['save_path'] = output_path 

1183 if not os.path.exists(output_path): 

1184 os.makedirs(output_path) 

1185 # Check if the analysis output folder exists in NASSA output folder 

1186 if not os.path.exists(os.path.join(output_path, analysis_name)): 

1187 os.makedirs(os.path.join(output_path, analysis_name)) 

1188 # config_archive['save_path'] = os.path.join(output_path, analysis_name) 

1189 # Run the NASSA analysis with the selected analysis 

1190 run_nassa(analysis_name=analysis_name, 

1191 config_archive=config_archive, 

1192 overwrite_nassa=overwrite_nassa) 

1193 print(f'NASSA analysis completed at {current_directory_name}')