Coverage for model_workflow/utils/bibitransformer_nassa.py: 92%
66 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-23 10:54 +0000
1import numpy as np
2from sklearn.base import TransformerMixin
3from sklearn import mixture
6class BiBiTransformer(TransformerMixin):
7 """
8 Obtain binormality/bimodality information for a given a distribution.
9 This is accomplished by fitting the dataset to two Gaussian Mixture models with one and two components, respectively.
10 Then the BIC score together with Bayes Factor is used to assert binormality, and a modification of Helguero's theorem to assert bimodality.
11 Other parameters such as means, variances and weights are given for both distributions.
13 :param float confidence_level: confidence level to use in Bayes Factor when determining binormality. (Default 5.0)
14 :param **kwargs: other arguments to be passed to sklearn.mixture.GaussianMixture
16 :raises ValueError: if ``confidence_level`` is not between 0 and 100.
17 """
19 def __init__(self, confidence_level=5.0, **kwargs):
20 self.models = self.GMM_models(**kwargs)
22 if confidence_level < 0 and confidence_level > 100:
23 raise ValueError("confidence_level must be between 0 and 100")
24 self.confidence_level = confidence_level
26 def GMM_models(self, **kwargs):
27 """
28 Create two GaussianMixture models with the same hyperparameters,
29 one with one component and the other with two components.
31 :return: tuple of both Gaussian Mixture models
32 """
33 gmm_1 = mixture.GaussianMixture(
34 n_components=1,
35 **kwargs)
36 gmm_2 = mixture.GaussianMixture(
37 n_components=2,
38 **kwargs)
39 return gmm_1, gmm_2
41 def fit(self, X):
42 """Fits both models to the same data.
44 :param X: array of shape (n_samples, n_features)"""
45 self.X = X
46 self.models = tuple(map(
47 lambda model: model.fit(X),
48 self.models))
49 return self
51 def transform(self, X, y=None):
52 """Get parameters describing the distribution fitted to model with lowest BIC value.
53 The information returned is:
54 bimodality
55 binormality/uninormality/insuficient evidence
56 BIC values
57 mean(s)
58 variance(s)
59 weight(s)
61 :param X: array of shape (n_samples, n_features)
63 :return: Dictionary with distribution information."""
64 bics = []
65 means = []
66 variances = []
67 weights = []
68 for gmm in self.models:
69 m = gmm.means_.flatten()
70 v = gmm.covariances_.flatten()
71 b = gmm.bic(X)
72 w = gmm.weights_.flatten()
73 means.append(m)
74 variances.append(v)
75 bics.append(b)
76 weights.append(w)
78 # bayes factor criteria for normality
79 uninormal, binormal, insuf_ev, p = self.bayes_factor_criteria(bics[0],bics[1])
80 if binormal:
81 maxm = np.argmax(means[1])
82 minm = np.argmin(means[1])
83 mean1 = means[1][minm]
84 var1 = variances[1][minm]
85 w1 = weights[1][minm]
86 mean2 = means[1][maxm]
87 var2 = variances[1][maxm]
88 w2 = weights[1][maxm]
89 # Helgueros theorem for unimodality
90 unimodal = self.is_unimodal(
91 [mean1, mean2], [var1, var2])
92 else:
93 mean1 = means[0][0]
94 var1 = variances[0][0]
95 w1 = weights[0][0]
96 mean2, var2, w2 = np.nan, np.nan, 0
97 unimodal = True
98 info = dict(
99 binormal=binormal,
100 uninormal=uninormal,
101 insuf_ev=insuf_ev,
102 unimodal=unimodal,
103 bics=bics,
104 p=p,
105 mean1=mean1,
106 mean2=mean2,
107 var1=var1,
108 var2=var2,
109 w1=w1,
110 w2=w2)
111 return info
113 def bayes_factor_criteria(self, bics0, bics1):
114 """
115 Determine uninormality, bimodality and insufficient evidence parameters from Bayes Factor.
117 :param bics0: BIC value for 1-component model
118 :param bics1: BIC value for 2-component model
120 :return: Tuple with uninormal, binormal and insuficient evidence boolean values.
121 """
122 diff_bic = bics1 - bics0
123 # probability of a two-component model
124 p = 1 / (1+np.exp(0.5*diff_bic))
125 if p == np.nan:
126 if bics0 == np.nan:
127 p = 1
128 elif bics1 == np.nan:
129 p = 0
131 uninormal = p < (self.confidence_level / 100)
132 binormal = p > (1 - (self.confidence_level / 100))
133 insuf_ev = True if (not uninormal and not binormal) else False
134 return uninormal, binormal, insuf_ev, p
136 def is_unimodal(self, means, vars):
137 """implements P.Dans version of Helguero's theorem in order to detect unimodality.
139 :param means: array with values of means for both fitted models.
140 :param vars: array with values of variances for both fitted models.
142 :return: unimodal boolean value."""
143 r = vars[0] / vars[1]
145 # separation factor
146 s = np.sqrt(
147 -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5
148 ) / (
149 np.sqrt(r)*(1+np.sqrt(r))
150 )
152 unimodal = abs(means[1]-means[0]) <= s * \
153 (np.sqrt(vars[0]) + np.sqrt(vars[1]))
154 return unimodal