Coverage for mddb_workflow / utils / bibitransformer_nassa.py: 15%
66 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 18:45 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 18:45 +0000
1import numpy as np
2from sklearn.base import TransformerMixin
3from sklearn import mixture
6class BiBiTransformer(TransformerMixin):
7 """Obtain binormality/bimodality information for a given a distribution.
8 This is accomplished by fitting the dataset to two Gaussian Mixture models with one and two components, respectively.
9 Then the BIC score together with Bayes Factor is used to assert binormality, and a modification of Helguero's theorem to assert bimodality.
10 Other parameters such as means, variances and weights are given for both distributions.
12 :param float confidence_level: confidence level to use in Bayes Factor when determining binormality. (Default 5.0)
13 :param **kwargs: other arguments to be passed to sklearn.mixture.GaussianMixture
15 :raises ValueError: if ``confidence_level`` is not between 0 and 100.
16 """
18 def __init__(self, confidence_level=5.0, **kwargs):
19 self.models = self.GMM_models(**kwargs)
21 if confidence_level < 0 and confidence_level > 100:
22 raise ValueError("confidence_level must be between 0 and 100")
23 self.confidence_level = confidence_level
25 def GMM_models(self, **kwargs):
26 """Create two GaussianMixture models with the same hyperparameters,
27 one with one component and the other with two components.
29 :return: tuple of both Gaussian Mixture models
30 """
31 gmm_1 = mixture.GaussianMixture(
32 n_components=1,
33 **kwargs)
34 gmm_2 = mixture.GaussianMixture(
35 n_components=2,
36 **kwargs)
37 return gmm_1, gmm_2
39 def fit(self, X):
40 """Fits both models to the same data.
42 :param X: array of shape (n_samples, n_features)"""
43 self.X = X
44 self.models = tuple(map(
45 lambda model: model.fit(X),
46 self.models))
47 return self
49 def transform(self, X, y=None):
50 """Get parameters describing the distribution fitted to model with lowest BIC value.
51 The information returned is:
52 bimodality
53 binormality/uninormality/insuficient evidence
54 BIC values
55 mean(s)
56 variance(s)
57 weight(s)
59 :param X: array of shape (n_samples, n_features)
61 :return: Dictionary with distribution information."""
62 bics = []
63 means = []
64 variances = []
65 weights = []
66 for gmm in self.models:
67 m = gmm.means_.flatten()
68 v = gmm.covariances_.flatten()
69 b = gmm.bic(X)
70 w = gmm.weights_.flatten()
71 means.append(m)
72 variances.append(v)
73 bics.append(b)
74 weights.append(w)
76 # bayes factor criteria for normality
77 uninormal, binormal, insuf_ev, p = self.bayes_factor_criteria(bics[0],bics[1])
78 if binormal:
79 maxm = np.argmax(means[1])
80 minm = np.argmin(means[1])
81 mean1 = means[1][minm]
82 var1 = variances[1][minm]
83 w1 = weights[1][minm]
84 mean2 = means[1][maxm]
85 var2 = variances[1][maxm]
86 w2 = weights[1][maxm]
87 # Helgueros theorem for unimodality
88 unimodal = self.is_unimodal(
89 [mean1, mean2], [var1, var2])
90 else:
91 mean1 = means[0][0]
92 var1 = variances[0][0]
93 w1 = weights[0][0]
94 mean2, var2, w2 = np.nan, np.nan, 0
95 unimodal = True
96 info = dict(
97 binormal=binormal,
98 uninormal=uninormal,
99 insuf_ev=insuf_ev,
100 unimodal=unimodal,
101 bics=bics,
102 p=p,
103 mean1=mean1,
104 mean2=mean2,
105 var1=var1,
106 var2=var2,
107 w1=w1,
108 w2=w2)
109 return info
111 def bayes_factor_criteria(self, bics0, bics1):
112 """
113 Determine uninormality, bimodality and insufficient evidence parameters from Bayes Factor.
115 :param bics0: BIC value for 1-component model
116 :param bics1: BIC value for 2-component model
118 :return: tuple with uninormal, binormal and insuficient evidence boolean values.
119 """
120 diff_bic = bics1 - bics0
121 # probability of a two-component model
122 p = 1 / (1+np.exp(0.5*diff_bic))
123 if p == np.nan:
124 if bics0 == np.nan:
125 p = 1
126 elif bics1 == np.nan:
127 p = 0
129 uninormal = p < (self.confidence_level / 100)
130 binormal = p > (1 - (self.confidence_level / 100))
131 insuf_ev = True if (not uninormal and not binormal) else False
132 return uninormal, binormal, insuf_ev, p
134 def is_unimodal(self, means, vars):
135 """implements P.Dans version of Helguero's theorem in order to detect unimodality.
137 :param means: array with values of means for both fitted models.
138 :param vars: array with values of variances for both fitted models.
140 :return: unimodal boolean value."""
141 r = vars[0] / vars[1]
143 # separation factor
144 s = np.sqrt(
145 -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5
146 ) / (
147 np.sqrt(r)*(1+np.sqrt(r))
148 )
150 unimodal = abs(means[1]-means[0]) <= s * \
151 (np.sqrt(vars[0]) + np.sqrt(vars[1]))
152 return unimodal