@@ -1,6 +1,6 @@
 import numpy as np
 import scipy.stats as stats
-from scipy.special import erfc
+from sklearn.preprocessing import StandardScaler

 from .base import BaseThresholder
 from .thresh_utility import cut
@@ -17,12 +17,11 @@ class CHAU(BaseThresholder):
     Parameters
     ----------

-    method : {'mean', 'median', 'gmean'}, optional (default='mean')
-        Calculate the area normal to distance using a scaler
+    method : {'classic', 'effective'}, optional (default='effective')
+        Determines how the threshold is computed:

-        - 'mean': Construct a scaler with the mean of the scores
-        - 'median: Construct a scaler with the median of the scores
-        - 'gmean': Construct a scaler with the geometric mean of the scores
+        - 'classic': Uses the classical Chauvenet's criterion based on all samples.
+        - 'effective': Uses an entropy-based effective sample size to adjust the threshold.

     random_state : int, optional (default=1234)
         Random seed for the random number generators of the thresholders. Can also
@@ -37,52 +36,57 @@ class CHAU(BaseThresholder):

     Notes
     -----
-
-    The Chauvenet's criterion for a one tail of a distribution is defined
-    as follows:
+    The classical Chauvenet's criterion identifies outliers in a dataset
+    by computing a threshold based on the z-score of each observation:

     .. math::

-        D_{\mathrm{max}} > Z \mathrm{,}
+        Z = \frac{x - \bar{x}}{\sigma} \mathrm{,}

-    where :math:`D_{\mathrm{max}}` is the bounds of the probability band
-    around the mean given by,
+    where :math:`\bar{x}` is the mean and :math:`\sigma` the standard deviation
+    of the dataset. An observation is considered an outlier if the probability
+    of obtaining a value at least as extreme is less than

     .. math::

-        D_{\mathrm{max}} = \lvert norm.ppf(Pz) \rvert \mathrm{,}
+        P_z = \frac{1}{2N} \mathrm{,}

-    where this bounds is equal to the inverse of a cumulative distribution function
-    for a probability of one of the tails of the normal distribution, and :math:`P_z`
-    is therefore defined as,
+    with :math:`N` being the total number of samples. The corresponding z-score
+    threshold is then given by the inverse survival function of the standard
+    normal distribution:

     .. math::

-        P_z = \frac{1}{4n} \mathrm{,}
+        Z_\mathrm{crit} = \mathrm{norm.isf}(P_z)
+
+    Any observation with :math:`|Z| > Z_\mathrm{crit}` is flagged as an outlier.

-    with :math:`n` being the number of samples in the decision scores. Finally the z-score
-    can be calculated as follows:
+    In the 'effective' method, the classical threshold is adjusted by an
+    entropy-based effective sample size. This accounts for situations where
+    the dataset may contain correlated or redundant samples, reducing the
+    effective number of independent observations. The effective sample size
+    :math:`N_\mathrm{eff}` is estimated as

     .. math::

-        Z = \frac{x-\bar{x}}{\sigma} \mathrm{,}
+        N_\mathrm{eff} = \min(N, \exp(H)) \mathrm{,}

-    with :math:`\bar{x}` as the mean and :math:`\sigma` the standard deviation
-    of the decision scores.
+    where :math:`H` is the entropy of the histogram of standardized scores.
+    The threshold probability is then

-    CHAU employs variants of the classical Chauvenet's criterion as the mean can be
-    replaced with the geometric mean or the median.
+    .. math::

-    Any z-score greater than the Chauvenet's criterion is considered an outlier.
+        P_z = \frac{1}{2 N_\mathrm{eff}}

+    which yields a threshold that is never higher than the classical one and
+    adapts to the actual variability and redundancy in the data.

     """

-    def __init__(self, method='mean', random_state=1234):
+    def __init__(self, method='effective', random_state=1234):

         super().__init__()
-        stat = {'mean': np.mean, 'median': np.median, 'gmean': stats.gmean}
-        self.method = stat[method]
+        self.method = method
         self.random_state = random_state
         np.random.seed(random_state)

@@ -106,14 +110,46 @@ def eval(self, decision):

         decision = self._data_setup(decision)

-        # Calculate Chauvenet's criterion for one tail
-        Pz = 1 / (4 * len(decision))
-        criterion = 1 / abs(stats.norm.ppf(Pz))
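+        # standardize the decision scores so the criterion operates on z-scores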
+        scaler = StandardScaler()
+        z = scaler.fit_transform(decision.reshape(-1, 1))
+
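+        # sample size used in the criterion: all points ('classic') or an
+        # entropy-based effective count ('effective')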
+        if self.method == 'classic':
+            N = len(z)
+        elif self.method == 'effective':
+            N = self._effective_sample_size_entropy(z)
+
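+        # Chauvenet's criterion: the critical z-score is the point whose
+        # normal tail probability equals 1/(2N)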
+        Pz = 1 / (2 * N)
+        zcrit = stats.norm.isf(Pz)
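+        # map the critical z-score back onto the original score scale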
+        zcrit = scaler.inverse_transform(np.array([[zcrit]]))[0, 0]
+
+        self.thresh_ = zcrit
+
+        return cut(decision, zcrit)
+
+    def _effective_sample_size_entropy(self, x):
+        """
+        Entropy-based effective sample size.
+
+        Parameters
+        ----------
+        x : np.ndarray
+            Array of standardized outlier likelihood scores
+
+        Returns
+        -------
+        Neff : float
+            Entropy-based effective sample size
+        """
+        x = np.asarray(x)
+        N = len(x)
+
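+        # bin the scores; the bin occupancies approximate their distribution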
+        hist, _ = np.histogram(x, bins='auto', density=False)
+        hist = hist.astype(float)

-        # Get area normal to distance
-        prob = erfc(np.abs(decision - self.method(decision)) /
-                    decision.std()/2.0**0.5)
+        p = hist / hist.sum()
+        p = p[p > 0]  # avoid log(0)

-        self.thresh_ = criterion*(1 - np.min(prob))/np.max(prob)
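+        # exp of the Shannon entropy gives a perplexity-style count of
+        # effectively distinct score values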
+        H = -np.sum(p * np.log(p))
+        Neff = np.exp(H)

-        return 1 - cut(prob, criterion)
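+        # the effective count can never exceed the true sample size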
+        return min(N, Neff)
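
A minimal usage sketch of the updated thresholder, assuming the class is importable from pythresh.thresholds.chau as in the upstream package and that cut() returns binary inlier/outlier labels; the synthetic scores below are purely illustrative:

    import numpy as np
    from pythresh.thresholds.chau import CHAU  # assumed import path

    # synthetic outlier likelihood scores: a dense bulk plus a few extreme values
    scores = np.concatenate([np.random.rand(500), 5 + 5 * np.random.rand(10)])

    thres = CHAU(method='effective')  # 'classic' reproduces the unadjusted criterion
    labels = thres.eval(scores)       # assumed to return 0/1 labels via cut()

    print(thres.thresh_)              # threshold on the original score scale
    print(int(labels.sum()), 'scores flagged as outliers')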