Coverage for model_workflow/tools/nassa_base.py: 80%

75 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-23 10:54 +0000

1import time 

2import pathlib 

3import logging 

4from abc import ABC, abstractmethod 

5 

6 

class Base(ABC):
    """Base class for nucleic acid analysis workflow.

    Subclasses implement :meth:`extract`, :meth:`transform`,
    :meth:`make_tables` and :meth:`make_plots`; :meth:`run` chains them.

    :param list sequence_files: paths to sequence files.
    :param dict coordinate_info: dictionary with coordinates as keys, and coordinate series files as values.
    :param str save_path: path to save directory. If it doesn't exist, it is created when the ``save_path`` property is read. If a falsy value is given, the current working directory is used when tables or plots are saved.
    :param int unit_len: length of subunit to analyze.
    :param str unit_name: name of subunit to analyze, used to name data table columns.
    :param int n_lines: number of lines to read from coordinate series files.
    :param bool tail: read last ``n_lines`` from coordinate series files.
    :param bool bimod: stored on the instance; not used by this base class (presumably consumed by subclasses -- confirm).
    :param bool save_tables: save data tables as .csv files.
    :param bool save_plots: save data visualizations as .pdf files.
    :param bool verbose: print verbose output.
    :param bool duplicates: stored on the instance; not used by this base class (presumably consumed by subclasses -- confirm).

    :raises ValueError: If number of sequence files doesn't match number of coordinate files.
    """

    def __init__(
            self,
            sequence_files,
            coordinate_info,
            save_path,
            unit_len,
            unit_name="subunit",
            n_lines=1000,
            tail=True,
            bimod=True,
            save_tables=True,
            save_plots=True,
            verbose=True,
            duplicates=False):

        # every coordinate must provide exactly one series file per sequence
        for coordinate_files in coordinate_info.values():
            if len(sequence_files) != len(coordinate_files):
                raise ValueError(
                    "number of sequence files must match "
                    "number of coordinate files")
        self.sequence_files = sequence_files
        self.coordinate_info = coordinate_info

        # variables
        self.n_lines = n_lines
        self.tail = tail
        self.unit_name = unit_name
        self.unit_len = unit_len
        self.bimod = bimod

        # create logger
        self.logger = self.create_logger(verbose)

        # raw value as given; converted to pathlib.Path lazily by the
        # ``save_path`` property
        self._save_path = save_path

        # flags
        self.save_tables = save_tables
        self.save_plots = save_plots
        self.duplicates = duplicates

        # log information
        self.logger.info(
            "number of sequence files: "
            f"{len(self.sequence_files)}")
        self.logger.info(
            "number of files for each coordinate: "
            f"{[(k, len(v)) for k, v in self.coordinate_info.items()]}")
        self.logger.debug(f"sequence files: {self.sequence_files}")
        self.logger.debug(f"coordinate files: {self.coordinate_info}")
        self.logger.debug(
            f"reading {n_lines} lines from "
            f"{'tail' if self.tail else 'head'} of input files...")
        self.logger.debug(
            f"analyzing units of length {self.unit_len}...")

    @abstractmethod
    def extract(self, **kwargs):
        """Extract data to be analyzed"""
        pass

    @abstractmethod
    def transform(self, data, **kwargs):
        """Perform data transformations, cleaning and analyses"""
        pass

    @abstractmethod
    def make_tables(self, data, **kwargs):
        """Save data in tables"""
        pass

    @abstractmethod
    def make_plots(self, data, **kwargs):
        """Save data visualizations"""
        pass

    def load(self, data, **kwargs):
        """Save data in table and visualization formats.

        Unlike :meth:`run`, this always writes both outputs; the
        ``save_tables`` / ``save_plots`` flags are not consulted here.

        :param data: processed datasets
        """
        self.make_tables(data, **kwargs)
        self.make_plots(data, **kwargs)

    def run(self, **kwargs):
        """Run complete data analysis.

        Calls :meth:`extract`, then :meth:`transform`, then writes tables
        and/or plots according to the ``save_tables`` / ``save_plots`` flags.
        All keyword arguments are forwarded to every step.

        :return: the transformed data.
        """
        # perf_counter is monotonic, so the elapsed time cannot be skewed
        # by system clock adjustments
        start = time.perf_counter()

        data = self.extract(**kwargs)
        data = self.transform(data, **kwargs)

        if self.save_tables:
            self.make_tables(data, **kwargs)
        if self.save_plots:
            self.make_plots(data, **kwargs)
        end = time.perf_counter()
        self.logger.info(
            f"Execution took {end-start:.2f} seconds "
            f"({(end-start)//60:.0f} minutes).")

        return data

    @property
    def save_path(self):
        """
        Parse ``save_path`` directory. If it doesn't exist, directory is created along with parent directories.
        If not provided, current directory is used when tables or plots are to be saved.

        :return pathlib.Path: Save directory path, or ``None`` when no path was given and nothing is saved.
        """
        if self._save_path:
            new_path = pathlib.Path(self._save_path)
            if not new_path.exists():
                self.logger.info("creating directory to save output files...")
                new_path.mkdir(parents=True, exist_ok=True)
                # logged after mkdir so the message is only emitted once
                # the directory really exists
                self.logger.debug(f"created path {new_path}")
            return new_path

        if self.save_plots or self.save_tables:
            self.logger.info(
                "save path not provided, using current directory")
            return pathlib.Path.cwd()
        return None

    @staticmethod
    def create_logger(verbose):
        """Create logger.

        :param bool verbose: if True, logging level is DEBUG. Else, it's set to INFO.
        :return logging.Logger: the logger named ``NASSA``.
        """
        logging_level = logging.DEBUG if verbose else logging.INFO
        logging_format = "[%(filename)s:%(lineno)s] %(levelname)s: %(message)s"
        # basicConfig does nothing when the root logger already has handlers,
        # so the level is also set on the named logger to honour ``verbose``
        logging.basicConfig(level=logging_level, format=logging_format)
        logger = logging.getLogger('NASSA')
        logger.setLevel(logging_level)
        return logger

164 return logger