Coverage for model_workflow/utils/nucleicacid.py: 64%

73 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-23 10:54 +0000

1 

2class NucleicAcid: 

3 """Attributes and methods for operations on nucleic acid sequences. 

4 

5 :param sequence: nucleic acid sequence 

6 :type sequence: str 

7 :param unit_len: length of sequence subunit to be analysed 

8 :type unit_len: int 

9 :param unit_name: name of subunit to be analysed. If not given, it is inferred from `unit_len`. 

10 :type unit_name: str, optional 

11 :param ic_sequence: inverse-complement of nucleic acid sequence. If not given, it is computed from sequence. 

12 :type ic_sequence: str, optional 

13 :param Ac: complement to Adenine, defaults to "X" 

14 :type Ac: str, optional 

15 :raises ValueError: if nucleic acid sequence is empty 

16 :raises ValueError: if both U and T are present in sequence 

17 :raises ValueError: if unit length is negative or zero 

18 """ 

19 

20 def __init__( 

21 self, 

22 sequence, 

23 unit_len, 

24 unit_name="subunit", 

25 ic_sequence=None, 

26 Ac="X"): 

27 # sequence quality checks 

28 if unit_len <= 0: 

29 raise ValueError( 

30 "unit length must be an integer " 

31 "and greater than or equal to 1") 

32 if len(sequence) < unit_len: 

33 raise ValueError( 

34 "length of sequence can't be smaller than " 

35 "the specified subunit length") 

36 if "U" in sequence and "T" in sequence: 

37 raise ValueError("Both T and U bases present in sequence") 

38 self.sequence = sequence.upper() 

39 self.unit_len = unit_len 

40 self.unit_name = unit_name 

41 self._ic_sequence = ic_sequence 

42 self.Ac = self._adenine_complement(Ac) 

43 

44 self.all_subunits = self.get_all_subunits() 

45 self.all_ic_subunits = self.get_all_ic_subunits() 

46 

47 def __len__(self): 

48 return len(self.sequence) 

49 

50 def __repr__(self): 

51 representation = "NucleicAcid(" 

52 for k, v in self.__dict__.items(): 

53 representation += f"{k}={v}, " 

54 representation = representation[:-2] + ")" 

55 return representation 

56 

57 @property 

58 def size(self): 

59 """get sequence length""" 

60 return len(self.sequence) 

61 

62 @property 

63 def bases(self): 

64 """get nucleic acid sequence bases""" 

65 return set(self.sequence) 

66 

67 @property 

68 def baselen(self): 

69 """get length of subunit base""" 

70 if self.unit_len % 2 == 0: 

71 baselen = 2 

72 else: 

73 baselen = 1 

74 return baselen 

75 

76 @property 

77 def flanksize(self): 

78 """get size of subunit flanks""" 

79 return (self.unit_len - self.baselen) // 2 

80 

81 @property 

82 def ic_sequence(self): 

83 """get inverse complement sequence""" 

84 if self._ic_sequence is None: 

85 self._ic_sequence = self.inverse_complement(self.sequence) 

86 return self._ic_sequence 

87 

88 def _adenine_complement(self, Ac): 

89 """get complement base for Adenine (A)""" 

90 Ac = Ac.upper() 

91 if Ac != "T" and Ac != "U": 

92 # try to look for Adenine complement in sequence 

93 if "T" in self.sequence or "T" in self._ic_sequence: 

94 Ac = "T" 

95 elif "U" in self.sequence or "U" in self._ic_sequence: 

96 Ac = "U" 

97 else: 

98 Ac = "X" 

99 raise UserWarning( 

100 "Adenine complement not U or T! Using X as placeholder " 

101 "(this could lead to errors. " 

102 "It is recommended to state this variable explicitly).") 

103 return Ac 

104 

105 def inverse_complement(self, sequence): 

106 """compute inverse complement subunit.""" 

107 complement = { 

108 "A": self.Ac, 

109 self.Ac: "A", 

110 "G": "C", 

111 "C": "G" 

112 } 

113 return "".join([complement[b] for b in sequence])[::-1] 

114 

115 def get_subunit(self, idx): 

116 """ 

117 Get complete subunit given the index of the first base, excluding flanks. 

118 For example, if you have the sequence AGTCTGA, and you want the tetramer at index 3, it will be TCTG. 

119 """ 

120 subunit = self.sequence[ 

121 idx - self.flanksize: 

122 idx + self.baselen + self.flanksize] 

123 return subunit 

124 

125 def get_all_subunits(self): 

126 """ 

127 Get a list of all subunits of length `unit_len` contained sequence. 

128 Returns a list of Subunit instances. 

129 """ 

130 sequence_range = range( 

131 self.flanksize, 

132 self.size - self.baselen - self.flanksize + 1) 

133 return [self.get_subunit(idx) for idx in sequence_range] 

134 

135 def get_all_ic_subunits(self): 

136 """Get a list of all subunits in sequence.""" 

137 return [self.inverse_complement(unit) for unit in self.all_subunits] 

138 

139 def search_subunit(self, subunit): 

140 """ 

141 Returns index of the first base of subunit, excluding flanks (compatible with get_subunit method). 

142 For example, if you search for TCTG in sequence AGTCTGA, it will return index 3. 

143 """ 

144 try: 

145 seq_index = self.sequence.index(subunit) 

146 except ValueError: 

147 try: 

148 seq_index = self.sequence.index( 

149 self.inverse_complement(subunit)) 

150 except ValueError: 

151 return None 

152 return seq_index + 1