Coverage for model_workflow/utils/nucleicacid.py: 64%
73 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
2class NucleicAcid:
3 """Attributes and methods for operations on nucleic acid sequences.
5 :param sequence: nucleic acid sequence
6 :type sequence: str
7 :param unit_len: length of sequence subunit to be analysed
8 :type unit_len: int
9 :param unit_name: name of subunit to be analysed. If not given, it is inferred from `unit_len`.
10 :type unit_name: str, optional
11 :param ic_sequence: inverse-complement of nucleic acid sequence. If not given, it is computed from sequence.
12 :type ic_sequence: str, optional
13 :param Ac: complement to Adenine, defaults to "X"
14 :type Ac: str, optional
15 :raises ValueError: if nucleic acid sequence is empty
16 :raises ValueError: if both U and T are present in sequence
17 :raises ValueError: if unit length is negative or zero
18 """
20 def __init__(
21 self,
22 sequence,
23 unit_len,
24 unit_name="subunit",
25 ic_sequence=None,
26 Ac="X"):
27 # sequence quality checks
28 if unit_len <= 0:
29 raise ValueError(
30 "unit length must be an integer "
31 "and greater than or equal to 1")
32 if len(sequence) < unit_len:
33 raise ValueError(
34 "length of sequence can't be smaller than "
35 "the specified subunit length")
36 if "U" in sequence and "T" in sequence:
37 raise ValueError("Both T and U bases present in sequence")
38 self.sequence = sequence.upper()
39 self.unit_len = unit_len
40 self.unit_name = unit_name
41 self._ic_sequence = ic_sequence
42 self.Ac = self._adenine_complement(Ac)
44 self.all_subunits = self.get_all_subunits()
45 self.all_ic_subunits = self.get_all_ic_subunits()
47 def __len__(self):
48 return len(self.sequence)
50 def __repr__(self):
51 representation = "NucleicAcid("
52 for k, v in self.__dict__.items():
53 representation += f"{k}={v}, "
54 representation = representation[:-2] + ")"
55 return representation
57 @property
58 def size(self):
59 """get sequence length"""
60 return len(self.sequence)
62 @property
63 def bases(self):
64 """get nucleic acid sequence bases"""
65 return set(self.sequence)
67 @property
68 def baselen(self):
69 """get length of subunit base"""
70 if self.unit_len % 2 == 0:
71 baselen = 2
72 else:
73 baselen = 1
74 return baselen
76 @property
77 def flanksize(self):
78 """get size of subunit flanks"""
79 return (self.unit_len - self.baselen) // 2
81 @property
82 def ic_sequence(self):
83 """get inverse complement sequence"""
84 if self._ic_sequence is None:
85 self._ic_sequence = self.inverse_complement(self.sequence)
86 return self._ic_sequence
88 def _adenine_complement(self, Ac):
89 """get complement base for Adenine (A)"""
90 Ac = Ac.upper()
91 if Ac != "T" and Ac != "U":
92 # try to look for Adenine complement in sequence
93 if "T" in self.sequence or "T" in self._ic_sequence:
94 Ac = "T"
95 elif "U" in self.sequence or "U" in self._ic_sequence:
96 Ac = "U"
97 else:
98 Ac = "X"
99 raise UserWarning(
100 "Adenine complement not U or T! Using X as placeholder "
101 "(this could lead to errors. "
102 "It is recommended to state this variable explicitly).")
103 return Ac
105 def inverse_complement(self, sequence):
106 """compute inverse complement subunit."""
107 complement = {
108 "A": self.Ac,
109 self.Ac: "A",
110 "G": "C",
111 "C": "G"
112 }
113 return "".join([complement[b] for b in sequence])[::-1]
115 def get_subunit(self, idx):
116 """
117 Get complete subunit given the index of the first base, excluding flanks.
118 For example, if you have the sequence AGTCTGA, and you want the tetramer at index 3, it will be TCTG.
119 """
120 subunit = self.sequence[
121 idx - self.flanksize:
122 idx + self.baselen + self.flanksize]
123 return subunit
125 def get_all_subunits(self):
126 """
127 Get a list of all subunits of length `unit_len` contained sequence.
128 Returns a list of Subunit instances.
129 """
130 sequence_range = range(
131 self.flanksize,
132 self.size - self.baselen - self.flanksize + 1)
133 return [self.get_subunit(idx) for idx in sequence_range]
135 def get_all_ic_subunits(self):
136 """Get a list of all subunits in sequence."""
137 return [self.inverse_complement(unit) for unit in self.all_subunits]
139 def search_subunit(self, subunit):
140 """
141 Returns index of the first base of subunit, excluding flanks (compatible with get_subunit method).
142 For example, if you search for TCTG in sequence AGTCTGA, it will return index 3.
143 """
144 try:
145 seq_index = self.sequence.index(subunit)
146 except ValueError:
147 try:
148 seq_index = self.sequence.index(
149 self.inverse_complement(subunit))
150 except ValueError:
151 return None
152 return seq_index + 1