Coverage for mddb_workflow / utils / bibitransformer_nassa.py: 15%

66 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-03 18:45 +0000

1import numpy as np 

2from sklearn.base import TransformerMixin 

3from sklearn import mixture 

4 

5 

6class BiBiTransformer(TransformerMixin): 

7 """Obtain binormality/bimodality information for a given a distribution. 

8 This is accomplished by fitting the dataset to two Gaussian Mixture models with one and two components, respectively. 

9 Then the BIC score together with Bayes Factor is used to assert binormality, and a modification of Helguero's theorem to assert bimodality. 

10 Other parameters such as means, variances and weights are given for both distributions. 

11 

12 :param float confidence_level: confidence level to use in Bayes Factor when determining binormality. (Default 5.0) 

13 :param **kwargs: other arguments to be passed to sklearn.mixture.GaussianMixture 

14 

15 :raises ValueError: if ``confidence_level`` is not between 0 and 100. 

16 """ 

17 

18 def __init__(self, confidence_level=5.0, **kwargs): 

19 self.models = self.GMM_models(**kwargs) 

20 

21 if confidence_level < 0 and confidence_level > 100: 

22 raise ValueError("confidence_level must be between 0 and 100") 

23 self.confidence_level = confidence_level 

24 

25 def GMM_models(self, **kwargs): 

26 """Create two GaussianMixture models with the same hyperparameters, 

27 one with one component and the other with two components. 

28 

29 :return: tuple of both Gaussian Mixture models 

30 """ 

31 gmm_1 = mixture.GaussianMixture( 

32 n_components=1, 

33 **kwargs) 

34 gmm_2 = mixture.GaussianMixture( 

35 n_components=2, 

36 **kwargs) 

37 return gmm_1, gmm_2 

38 

39 def fit(self, X): 

40 """Fits both models to the same data. 

41 

42 :param X: array of shape (n_samples, n_features)""" 

43 self.X = X 

44 self.models = tuple(map( 

45 lambda model: model.fit(X), 

46 self.models)) 

47 return self 

48 

49 def transform(self, X, y=None): 

50 """Get parameters describing the distribution fitted to model with lowest BIC value. 

51 The information returned is:  

52 bimodality 

53 binormality/uninormality/insuficient evidence 

54 BIC values 

55 mean(s) 

56 variance(s) 

57 weight(s) 

58 

59 :param X: array of shape (n_samples, n_features) 

60 

61 :return: Dictionary with distribution information.""" 

62 bics = [] 

63 means = [] 

64 variances = [] 

65 weights = [] 

66 for gmm in self.models: 

67 m = gmm.means_.flatten() 

68 v = gmm.covariances_.flatten() 

69 b = gmm.bic(X) 

70 w = gmm.weights_.flatten() 

71 means.append(m) 

72 variances.append(v) 

73 bics.append(b) 

74 weights.append(w) 

75 

76 # bayes factor criteria for normality 

77 uninormal, binormal, insuf_ev, p = self.bayes_factor_criteria(bics[0],bics[1]) 

78 if binormal: 

79 maxm = np.argmax(means[1]) 

80 minm = np.argmin(means[1]) 

81 mean1 = means[1][minm] 

82 var1 = variances[1][minm] 

83 w1 = weights[1][minm] 

84 mean2 = means[1][maxm] 

85 var2 = variances[1][maxm] 

86 w2 = weights[1][maxm] 

87 # Helgueros theorem for unimodality 

88 unimodal = self.is_unimodal( 

89 [mean1, mean2], [var1, var2]) 

90 else: 

91 mean1 = means[0][0] 

92 var1 = variances[0][0] 

93 w1 = weights[0][0] 

94 mean2, var2, w2 = np.nan, np.nan, 0 

95 unimodal = True 

96 info = dict( 

97 binormal=binormal, 

98 uninormal=uninormal, 

99 insuf_ev=insuf_ev, 

100 unimodal=unimodal, 

101 bics=bics, 

102 p=p, 

103 mean1=mean1, 

104 mean2=mean2, 

105 var1=var1, 

106 var2=var2, 

107 w1=w1, 

108 w2=w2) 

109 return info 

110 

111 def bayes_factor_criteria(self, bics0, bics1): 

112 """ 

113 Determine uninormality, bimodality and insufficient evidence parameters from Bayes Factor. 

114 

115 :param bics0: BIC value for 1-component model 

116 :param bics1: BIC value for 2-component model 

117 

118 :return: tuple with uninormal, binormal and insuficient evidence boolean values. 

119 """ 

120 diff_bic = bics1 - bics0 

121 # probability of a two-component model 

122 p = 1 / (1+np.exp(0.5*diff_bic)) 

123 if p == np.nan: 

124 if bics0 == np.nan: 

125 p = 1 

126 elif bics1 == np.nan: 

127 p = 0 

128 

129 uninormal = p < (self.confidence_level / 100) 

130 binormal = p > (1 - (self.confidence_level / 100)) 

131 insuf_ev = True if (not uninormal and not binormal) else False 

132 return uninormal, binormal, insuf_ev, p 

133 

134 def is_unimodal(self, means, vars): 

135 """implements P.Dans version of Helguero's theorem in order to detect unimodality. 

136 

137 :param means: array with values of means for both fitted models. 

138 :param vars: array with values of variances for both fitted models. 

139 

140 :return: unimodal boolean value.""" 

141 r = vars[0] / vars[1] 

142 

143 # separation factor 

144 s = np.sqrt( 

145 -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5 

146 ) / ( 

147 np.sqrt(r)*(1+np.sqrt(r)) 

148 ) 

149 

150 unimodal = abs(means[1]-means[0]) <= s * \ 

151 (np.sqrt(vars[0]) + np.sqrt(vars[1])) 

152 return unimodal