Coverage for mddb_workflow/utils/bibitransformer

1import numpy as np

2from sklearn.base import TransformerMixin

3from sklearn import mixture

6class BiBiTransformer(TransformerMixin):

7 """Obtain binormality/bimodality information for a given a distribution.

8 This is accomplished by fitting the dataset to two Gaussian Mixture models with one and two components, respectively.

9 Then the BIC score together with Bayes Factor is used to assert binormality, and a modification of Helguero's theorem to assert bimodality.

10 Other parameters such as means, variances and weights are given for both distributions.

12 :param float confidence_level: confidence level to use in Bayes Factor when determining binormality. (Default 5.0)

13 :param **kwargs: other arguments to be passed to sklearn.mixture.GaussianMixture

15 :raises ValueError: if ``confidence_level`` is not between 0 and 100.

16 """

18 def __init__(self, confidence_level=5.0, **kwargs):

19 self.models = self.GMM_models(**kwargs)

21 if confidence_level < 0 and confidence_level > 100:

22 raise ValueError("confidence_level must be between 0 and 100")

23 self.confidence_level = confidence_level

25 def GMM_models(self, **kwargs):

26 """Create two GaussianMixture models with the same hyperparameters,

27 one with one component and the other with two components.

29 :return: tuple of both Gaussian Mixture models

30 """

31 gmm_1 = mixture.GaussianMixture(

32 n_components=1,

33 **kwargs)

34 gmm_2 = mixture.GaussianMixture(

35 n_components=2,

36 **kwargs)

37 return gmm_1, gmm_2

39 def fit(self, X):

40 """Fits both models to the same data.

42 :param X: array of shape (n_samples, n_features)"""

43 self.X = X

44 self.models = tuple(map(

45 lambda model: model.fit(X),

46 self.models))

47 return self

49 def transform(self, X, y=None):

50 """Get parameters describing the distribution fitted to model with lowest BIC value.

51 The information returned is:

52 bimodality

53 binormality/uninormality/insuficient evidence

54 BIC values

55 mean(s)

56 variance(s)

57 weight(s)

59 :param X: array of shape (n_samples, n_features)

61 :return: Dictionary with distribution information."""

62 bics = []

63 means = []

64 variances = []

65 weights = []

66 for gmm in self.models:

67 m = gmm.means_.flatten()

68 v = gmm.covariances_.flatten()

69 b = gmm.bic(X)

70 w = gmm.weights_.flatten()

71 means.append(m)

72 variances.append(v)

73 bics.append(b)

74 weights.append(w)

76 # bayes factor criteria for normality

77 uninormal, binormal, insuf_ev, p = self.bayes_factor_criteria(bics[0],bics[1])

78 if binormal:

79 maxm = np.argmax(means[1])

80 minm = np.argmin(means[1])

81 mean1 = means[1][minm]

82 var1 = variances[1][minm]

83 w1 = weights[1][minm]

84 mean2 = means[1][maxm]

85 var2 = variances[1][maxm]

86 w2 = weights[1][maxm]

87 # Helgueros theorem for unimodality

88 unimodal = self.is_unimodal(

89 [mean1, mean2], [var1, var2])

90 else:

91 mean1 = means[0][0]

92 var1 = variances[0][0]

93 w1 = weights[0][0]

94 mean2, var2, w2 = np.nan, np.nan, 0

95 unimodal = True

96 info = dict(

97 binormal=binormal,

98 uninormal=uninormal,

99 insuf_ev=insuf_ev,

100 unimodal=unimodal,

101 bics=bics,

102 p=p,

103 mean1=mean1,

104 mean2=mean2,

105 var1=var1,

106 var2=var2,

107 w1=w1,

108 w2=w2)

109 return info

110

111 def bayes_factor_criteria(self, bics0, bics1):

112 """

113 Determine uninormality, bimodality and insufficient evidence parameters from Bayes Factor.

114

115 :param bics0: BIC value for 1-component model

116 :param bics1: BIC value for 2-component model

117

118 :return: tuple with uninormal, binormal and insuficient evidence boolean values.

119 """

120 diff_bic = bics1 - bics0

121 # probability of a two-component model

122 p = 1 / (1+np.exp(0.5*diff_bic))

123 if p == np.nan:

124 if bics0 == np.nan:

125 p = 1

126 elif bics1 == np.nan:

127 p = 0

128

129 uninormal = p < (self.confidence_level / 100)

130 binormal = p > (1 - (self.confidence_level / 100))

131 insuf_ev = True if (not uninormal and not binormal) else False

132 return uninormal, binormal, insuf_ev, p

133

134 def is_unimodal(self, means, vars):

135 """implements P.Dans version of Helguero's theorem in order to detect unimodality.

136

137 :param means: array with values of means for both fitted models.

138 :param vars: array with values of variances for both fitted models.

139

140 :return: unimodal boolean value."""

141 r = vars[0] / vars[1]

142

143 # separation factor

144 s = np.sqrt(

145 -2 + 3*r + 3*r**2 - 2*r**3 + 2*(1 - r + r**2)**1.5

146 ) / (

147 np.sqrt(r)*(1+np.sqrt(r))

148 )

149

150 unimodal = abs(means[1]-means[0]) <= s * \

151 (np.sqrt(vars[0]) + np.sqrt(vars[1]))

152 return unimodal

Coverage for mddb_workflow / utils / bibitransformer_nassa.py: 15%

66 statements