Coverage for model_workflow/tools/nassa_base.py: 80%
75 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
1import time
2import pathlib
3import logging
4from abc import ABC, abstractmethod
class Base(ABC):
    """Base class for nucleic acid analysis workflow.

    Subclasses implement :meth:`extract`, :meth:`transform`,
    :meth:`make_tables` and :meth:`make_plots`; :meth:`run` chains them.

    :param list sequence_files: paths to sequence files.
    :param dict coordinate_info: dictionary with coordinates as keys, and coordinate series files as values.
    :param str save_path: path to save directory. If it doesn't exist, it is created when the ``save_path`` property is read. If falsy, the current working directory is used (see ``save_path``).
    :param int unit_len: length of subunit to analyze.
    :param str unit_name: name of subunit to analyze, used to name data table columns.
    :param int n_lines: number of lines to read from coordinate series files.
    :param bool tail: read last ``n_lines`` from coordinate series files.
    :param bool bimod: stored as ``self.bimod``; not used by this base class (meaning is defined by subclasses).
    :param bool save_tables: save data tables as .csv files.
    :param bool save_plots: save data visualizations as .pdf files.
    :param bool verbose: print verbose output.
    :param bool duplicates: stored as ``self.duplicates``; not used by this base class (meaning is defined by subclasses).

    :raises ValueError: If number of sequence files doesn't match number of coordinate files.
    """

    def __init__(
            self,
            sequence_files,
            coordinate_info,
            save_path,
            unit_len,
            unit_name="subunit",
            n_lines=1000,
            tail=True,
            bimod=True,
            save_tables=True,
            save_plots=True,
            verbose=True,
            duplicates=False):

        # Every coordinate must provide exactly one series file per sequence
        # file. An empty ``coordinate_info`` passes this check trivially.
        for coordinate_files in coordinate_info.values():
            if len(sequence_files) != len(coordinate_files):
                raise ValueError(
                    "number of sequence files must match "
                    "number of coordinate files")
        self.sequence_files = sequence_files
        self.coordinate_info = coordinate_info

        # variables
        self.n_lines = n_lines
        self.tail = tail
        self.unit_name = unit_name
        self.unit_len = unit_len
        self.bimod = bimod

        # create logger
        self.logger = self.create_logger(verbose)

        # Keep the raw value; it is converted to ``pathlib.Path`` (and the
        # directory created) lazily by the ``save_path`` property.
        self._save_path = save_path

        # flags
        self.save_tables = save_tables
        self.save_plots = save_plots
        self.duplicates = duplicates

        # log information
        self.logger.info(
            "number of sequence files: "
            f"{len(self.sequence_files)}")
        self.logger.info(
            "number of files for each coordinate: "
            f"{[(k, len(v)) for k, v in self.coordinate_info.items()]}")
        self.logger.debug(f"sequence files: {self.sequence_files}")
        self.logger.debug(f"coordinate files: {self.coordinate_info}")
        self.logger.debug(
            f"reading {n_lines} lines from "
            f"{'tail' if self.tail else 'head'} of input files...")
        self.logger.debug(
            f"analyzing units of length {self.unit_len}...")

    # The abstract signatures below mirror how ``run`` and ``load`` invoke
    # the hooks, so that subclasses may safely delegate through ``super()``.

    @abstractmethod
    def extract(self, **kwargs):
        """Extract data to be analyzed"""

    @abstractmethod
    def transform(self, data, **kwargs):
        """Perform data transformations, cleaning and analyses"""

    @abstractmethod
    def make_tables(self, data, **kwargs):
        """Save data in tables"""

    @abstractmethod
    def make_plots(self, data, **kwargs):
        """Save data visualizations"""

    def load(self, data, **kwargs):
        """Save data in table and visualization formats.

        Unlike :meth:`run`, this calls both hooks unconditionally; the
        ``save_tables`` / ``save_plots`` flags are not consulted here.

        :param data: processed datasets
        """
        self.make_tables(data, **kwargs)
        self.make_plots(data, **kwargs)

    def run(self, **kwargs):
        """Run complete data analysis.

        Calls ``extract`` then ``transform``, and afterwards ``make_tables``
        and/or ``make_plots`` depending on the ``save_tables`` and
        ``save_plots`` flags. ``kwargs`` are forwarded to every step.

        :return: the value returned by ``transform``.
        """
        start = time.time()

        data = self.extract(**kwargs)
        data = self.transform(data, **kwargs)

        if self.save_tables:
            self.make_tables(data, **kwargs)
        if self.save_plots:
            self.make_plots(data, **kwargs)

        elapsed = time.time() - start
        self.logger.info(
            f"Execution took {elapsed:.2f} seconds "
            f"({elapsed//60:.0f} minutes).")

        return data

    @property
    def save_path(self):
        """
        Parse ``save_path`` directory. If it doesn't exist, directory is created along with parent directories.
        If not provided, current directory is used when tables or plots are to be saved.

        :return pathlib.Path: Save directory path, or ``None`` when no path was
            provided and both ``save_tables`` and ``save_plots`` are false.
        """
        if self._save_path:
            new_path = pathlib.Path(self._save_path)
            if not new_path.exists():
                self.logger.info("creating directory to save output files...")
                new_path.mkdir(parents=True, exist_ok=True)
                # Logged after mkdir so the message is only emitted on success.
                self.logger.debug(f"created path {new_path}")
            return new_path

        if self.save_plots or self.save_tables:
            self.logger.info(
                "save path not provided, using current directory")
            return pathlib.Path.cwd()

        # Nothing will be written, so there is no directory to report.
        return None

    @staticmethod
    def create_logger(verbose):
        """Create logger.

        :param bool verbose: if True, logging level is DEBUG. Else, it's set to INFO.
        :return logging.Logger: the logger named ``'NASSA'``
        """
        logging_level = logging.DEBUG if verbose else logging.INFO
        logging_format = "[%(filename)s:%(lineno)s] %(levelname)s: %(message)s"
        # basicConfig is a no-op when the root logger already has handlers
        # (e.g. a second instance, or a host application that configured
        # logging first), so on its own it cannot guarantee the level.
        logging.basicConfig(level=logging_level, format=logging_format)
        logger = logging.getLogger('NASSA')
        # Set the level on the named logger so ``verbose`` is always honored.
        logger.setLevel(logging_level)
        return logger