Coverage for model_workflow/tools/nassa_base.py: 80%

1import time

2import pathlib

3import logging

4from abc import ABC, abstractmethod

class Base(ABC):
    """Base class for nucleic acid analysis workflow.

    Subclasses implement :meth:`extract`, :meth:`transform`,
    :meth:`make_tables` and :meth:`make_plots`; :meth:`run` chains them
    together and times the execution.

    :param list sequence_files: paths to sequence files.
    :param dict coordinate_info: dictionary with coordinates as keys, and coordinate series files as values.
    :param str save_path: path to save directory. If it doesn't exist, it is created when the ``save_path`` property is accessed. If falsy, the current working directory is used when tables or plots are saved.
    :param int unit_len: length of subunit to analyze.
    :param str unit_name: name of subunit to analyze, used to name data table columns.
    :param int n_lines: number of lines to read from coordinate series files.
    :param bool tail: read last ``n_lines`` from coordinate series files.
    :param bool bimod: stored as ``self.bimod``; not used by this base class (presumably toggles a bimodality analysis in subclasses -- confirm against subclasses).
    :param bool save_tables: save data tables as .csv files.
    :param bool save_plots: save data visualizations as .pdf files.
    :param bool verbose: print verbose output (DEBUG logging level instead of INFO).
    :param bool duplicates: stored as ``self.duplicates``; not used by this base class (meaning defined by subclasses -- confirm).

    :raises ValueError: If number of sequence files doesn't match number of coordinate files.
    """

    def __init__(
            self,
            sequence_files,
            coordinate_info,
            save_path,
            unit_len,
            unit_name="subunit",
            n_lines=1000,
            tail=True,
            bimod=True,
            save_tables=True,
            save_plots=True,
            verbose=True,
            duplicates=False):

        # sequence and coordinate paths and files from config file:
        # every coordinate must provide exactly one file per sequence file
        for coordinate_files in coordinate_info.values():
            if len(sequence_files) != len(coordinate_files):
                raise ValueError(
                    "number of sequence files must match "
                    "number of coordinate files")
        self.sequence_files = sequence_files
        self.coordinate_info = coordinate_info

        # variables
        self.n_lines = n_lines
        self.tail = tail
        self.unit_name = unit_name
        self.unit_len = unit_len
        self.bimod = bimod

        # create logger
        self.logger = self.create_logger(verbose)

        # raw save path; converted to pathlib.Path lazily by the
        # ``save_path`` property
        self._save_path = save_path

        # flags
        self.save_tables = save_tables
        self.save_plots = save_plots
        self.duplicates = duplicates

        # log information
        self.logger.info(
            "number of sequence files: "
            f"{len(self.sequence_files)}")
        self.logger.info(
            "number of files for each coordinate: "
            f"{[(k, len(v)) for k, v in self.coordinate_info.items()]}")
        self.logger.debug(f"sequence files: {self.sequence_files}")
        self.logger.debug(f"coordinate files: {self.coordinate_info}")
        self.logger.debug(
            f"reading {n_lines} lines from "
            f"{'tail' if self.tail else 'head'} of input files...")
        self.logger.debug(
            f"analyzing units of length {self.unit_len}...")

    # NOTE: the abstract signatures below mirror how ``run`` and ``load``
    # invoke the methods, so subclasses can safely delegate via ``super()``.

    @abstractmethod
    def extract(self, **kwargs):
        """Extract data to be analyzed"""

    @abstractmethod
    def transform(self, data, **kwargs):
        """Perform data transformations, cleaning and analyses"""

    @abstractmethod
    def make_tables(self, data, **kwargs):
        """Save data in tables"""

    @abstractmethod
    def make_plots(self, data, **kwargs):
        """Save data visualizations"""

    def load(self, data, **kwargs):
        """Save data in table and visualization formats.

        Both outputs are always produced here; the ``save_tables`` and
        ``save_plots`` flags are only honoured by :meth:`run`.

        :param data: processed datasets
        """
        self.make_tables(data, **kwargs)
        self.make_plots(data, **kwargs)

    def run(self, **kwargs):
        """Run complete data analysis.

        Calls :meth:`extract` and :meth:`transform`, then
        :meth:`make_tables` / :meth:`make_plots` when the corresponding
        flags are set. ``kwargs`` are forwarded to every step.

        :return: the transformed data.
        """
        start = time.time()

        data = self.extract(**kwargs)
        data = self.transform(data, **kwargs)

        if self.save_tables:
            self.make_tables(data, **kwargs)
        if self.save_plots:
            self.make_plots(data, **kwargs)
        end = time.time()
        self.logger.info(
            f"Execution took {end-start:.2f} seconds "
            f"({(end-start)//60:.0f} minutes).")

        return data

    @property
    def save_path(self):
        """
        Parse ``save_path`` directory. If it doesn't exist, directory is created along with parent directories.
        If not provided, current directory is used when tables or plots are to be saved.

        :return pathlib.Path: Save directory path, or ``None`` when no path was given and nothing is saved.
        """
        if self._save_path:
            new_path = pathlib.Path(self._save_path)
            if not new_path.exists():
                self.logger.info("creating directory to save output files...")
                new_path.mkdir(parents=True, exist_ok=True)
                # logged only once the directory really exists
                self.logger.debug(f"created path {new_path}")
            return new_path
        if self.save_plots or self.save_tables:
            self.logger.info(
                "save path not provided, using current directory")
            return pathlib.Path.cwd()
        return None

    @staticmethod
    def create_logger(verbose):
        """Create logger.

        :param bool verbose: if True, logging level is DEBUG. Else, it's set to INFO.
        :return logging.Logger: the logger named ``NASSA``.
        """
        logging_level = logging.DEBUG if verbose else logging.INFO
        logging_format = "[%(filename)s:%(lineno)s] %(levelname)s: %(message)s"
        # basicConfig is a no-op when the root logger already has handlers
        # (e.g. a previous instance or the host application configured it),
        # so the level is also set on the named logger to honour ``verbose``.
        logging.basicConfig(level=logging_level, format=logging_format)
        logger = logging.getLogger('NASSA')
        logger.setLevel(logging_level)
        return logger

Coverage for model_workflow/tools/nassa_base.py: 80%

75 statements