Coverage for model_workflow/utils/bibitransformer_nassa.py: 92%

66 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-23 10:54 +0000

1import numpy as np 

2from sklearn.base import TransformerMixin 

3from sklearn import mixture 

4 

5 

6class BiBiTransformer(TransformerMixin): 

7 """ 

8 Obtain binormality/bimodality information for a given a distribution. 

9 This is accomplished by fitting the dataset to two Gaussian Mixture models with one and two components, respectively. 

10 Then the BIC score together with Bayes Factor is used to assert binormality, and a modification of Helguero's theorem to assert bimodality. 

11 Other parameters such as means, variances and weights are given for both distributions. 

12 

13 :param float confidence_level: confidence level to use in Bayes Factor when determining binormality. (Default 5.0) 

14 :param **kwargs: other arguments to be passed to sklearn.mixture.GaussianMixture 

15 

16 :raises ValueError: if ``confidence_level`` is not between 0 and 100. 

17 """ 

18 

19 def __init__(self, confidence_level=5.0, **kwargs): 

20 self.models = self.GMM_models(**kwargs) 

21 

22 if confidence_level < 0 and confidence_level > 100: 

23 raise ValueError("confidence_level must be between 0 and 100") 

24 self.confidence_level = confidence_level 

25 

26 def GMM_models(self, **kwargs): 

27 """ 

28 Create two GaussianMixture models with the same hyperparameters, 

29 one with one component and the other with two components. 

30 

31 :return: tuple of both Gaussian Mixture models 

32 """ 

33 gmm_1 = mixture.GaussianMixture( 

34 n_components=1, 

35 **kwargs) 

36 gmm_2 = mixture.GaussianMixture( 

37 n_components=2, 

38 **kwargs) 

39 return gmm_1, gmm_2 

40 

41 def fit(self, X): 

42 """Fits both models to the same data. 

43 

44 :param X: array of shape (n_samples, n_features)""" 

45 self.X = X 

46 self.models = tuple(map( 

47 lambda model: model.fit(X), 

48 self.models)) 

49 return self 

50 

51 def transform(self, X, y=None): 

52 """Get parameters describing the distribution fitted to model with lowest BIC value. 

53 The information returned is:  

54 bimodality 

55 binormality/uninormality/insuficient evidence 

56 BIC values 

57 mean(s) 

58 variance(s) 

59 weight(s) 

60 

61 :param X: array of shape (n_samples, n_features) 

62 

63 :return: Dictionary with distribution information.""" 

64 bics = [] 

65 means = [] 

66 variances = [] 

67 weights = [] 

68 for gmm in self.models: 

69 m = gmm.means_.flatten() 

70 v = gmm.covariances_.flatten() 

71 b = gmm.bic(X) 

72 w = gmm.weights_.flatten() 

73 means.append(m) 

74 variances.append(v) 

75 bics.append(b) 

76 weights.append(w) 

77 

78 # bayes factor criteria for normality 

79 uninormal, binormal, insuf_ev, p = self.bayes_factor_criteria(bics[0],bics[1]) 

80 if binormal: 

81 maxm = np.argmax(means[1]) 

82 minm = np.argmin(means[1]) 

83 mean1 = means[1][minm] 

84 var1 = variances[1][minm] 

85 w1 = weights[1][minm] 

86 mean2 = means[1][maxm] 

87 var2 = variances[1][maxm] 

88 w2 = weights[1][maxm] 

89 # Helgueros theorem for unimodality 

90 unimodal = self.is_unimodal( 

91 [mean1, mean2], [var1, var2]) 

92 else: 

93 mean1 = means[0][0] 

94 var1 = variances[0][0] 

95 w1 = weights[0][0] 

96 mean2, var2, w2 = np.nan, np.nan, 0 

97 unimodal = True 

98 info = dict( 

99 binormal=binormal, 

100 uninormal=uninormal, 

101 insuf_ev=insuf_ev, 

102 unimodal=unimodal, 

103 bics=bics, 

104 p=p, 

105 mean1=mean1, 

106 mean2=mean2, 

107 var1=var1, 

108 var2=var2, 

109 w1=w1, 

110 w2=w2) 

111 return info 

112 

113 def bayes_factor_criteria(self, bics0, bics1): 

114 """ 

115 Determine uninormality, bimodality and insufficient evidence parameters from Bayes Factor. 

116 

117 :param bics0: BIC value for 1-component model 

118 :param bics1: BIC value for 2-component model 

119 

120 :return: Tuple with uninormal, binormal and insuficient evidence boolean values. 

121 """ 

122 diff_bic = bics1 - bics0 

123 # probability of a two-component model 

124 p = 1 / (1+np.exp(0.5*diff_bic)) 

125 if p == np.nan: 

126 if bics0 == np.nan: 

127 p = 1 

128 elif bics1 == np.nan: 

129 p = 0 

130 

131 uninormal = p < (self.confidence_level / 100) 

132 binormal = p > (1 - (self.confidence_level / 100)) 

133 insuf_ev = True if (not uninormal and not binormal) else False 

134 return uninormal, binormal, insuf_ev, p 

135 

136 def is_unimodal(self, means, vars): 

137 """implements P.Dans version of Helguero's theorem in order to detect unimodality. 

138 

139 :param means: array with values of means for both fitted models. 

140 :param vars: array with values of variances for both fitted models. 

141 

142 :return: unimodal boolean value.""" 

143 r = vars[0] / vars[1] 

144 

145 # separation factor 

146 s = np.sqrt( 

147 -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5 

148 ) / ( 

149 np.sqrt(r)*(1+np.sqrt(r)) 

150 ) 

151 

152 unimodal = abs(means[1]-means[0]) <= s * \ 

153 (np.sqrt(vars[0]) + np.sqrt(vars[1])) 

154 return unimodal