Skip to content

Commit cb53ed3

Browse files
committed
Aligning CHAU to classical Chauvenet criterion
1 parent dead0b8 commit cb53ed3

4 files changed

Lines changed: 75 additions & 38 deletions

File tree

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,4 @@ v<1.0.3>, <01/19/2026> -- Updated docs with thumbnails
106106
v<1.0.3>, <01/27/2026> -- Added fallback verbosity action arg (when threshold not found)
107107
v<1.0.3>, <01/27/2026> -- Fixed refitting thresholders during predict in COMB
108108
v<1.0.3>, <01/27/2026> -- Added DUMMY thresholder for benchmarking, thanks @NicohMP
109+
v<1.0.4>, <02/22/2026> -- Aligning CHAU to classical Chauvenet criterion

docs/pythresh.thresholds.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030

3131
.. automodule:: pythresh.thresholds.chau
3232
:members:
33-
:exclude-members:
33+
:exclude-members: _effective_sample_size_entropy
3434
:undoc-members:
3535
:show-inheritance:
3636
:inherited-members:

pythresh/test/test_chau.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def setUpClass(cls):
3939
]).T
4040
cls.all_scores = [cls.single_score, cls.multiple_scores]
4141

42-
cls.methods = ['mean', 'median', 'gmean']
42+
cls.methods = ['classic', 'effective']
4343

4444
cls.params = list(product(cls.all_scores, cls.methods))
4545

pythresh/thresholds/chau.py

Lines changed: 72 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import numpy as np
22
import scipy.stats as stats
3-
from scipy.special import erfc
3+
from sklearn.preprocessing import StandardScaler
44

55
from .base import BaseThresholder
66
from .thresh_utility import cut
@@ -17,12 +17,11 @@ class CHAU(BaseThresholder):
1717
Parameters
1818
----------
1919
20-
method : {'mean', 'median', 'gmean'}, optional (default='mean')
21-
Calculate the area normal to distance using a scaler
20+
method : {'classic', 'effective'}, optional (default='effective')
21+
Determines how the threshold is computed:
2222
23-
- 'mean': Construct a scaler with the mean of the scores
24-
- 'median': Construct a scaler with the median of the scores
25-
- 'gmean': Construct a scaler with the geometric mean of the scores
23+
- 'classic': Uses the classical Chauvenet's criterion based on all samples.
24+
- 'effective': Uses an entropy-based effective sample size to adjust the threshold.
2625
2726
random_state : int, optional (default=1234)
2827
Random seed for the random number generators of the thresholders. Can also
@@ -37,52 +36,57 @@ class CHAU(BaseThresholder):
3736
3837
Notes
3938
-----
40-
41-
The Chauvenet's criterion for a one tail of a distribution is defined
42-
as follows:
39+
The classical Chauvenet's criterion identifies outliers in a dataset
40+
by computing a threshold based on the z-score of each observation:
4341
4442
.. math::
4543
46-
D_{\mathrm{max}}>Z \mathrm{,}
44+
Z = \frac{x - \bar{x}}{\sigma} \mathrm{,}
4745
48-
where :math:`D_{\mathrm{max}}` is the bounds of the probability band
49-
around the mean given by,
46+
where :math:`\bar{x}` is the mean and :math:`\sigma` the standard deviation
47+
of the dataset. An observation is considered an outlier if the probability
48+
of obtaining a value at least as extreme is less than
5049
5150
.. math::
5251
53-
D_{\mathrm{max}} = \lvert norm.ppf(Pz) \rvert \mathrm{,}
52+
P_z = \frac{1}{2N} \mathrm{,}
5453
55-
where this bounds is equal to the inverse of a cumulative distribution function
56-
for a probability of one of the tails of the normal distribution, and :math:`P_z`
57-
is therefore defined as,
54+
with :math:`N` being the total number of samples. The corresponding z-score
55+
threshold is then given by the inverse survival function of the standard
56+
normal distribution:
5857
5958
.. math::
6059
61-
P_z = \frac{1}{4n} \mathrm{,}
60+
Z_\mathrm{crit} = \mathrm{norm.isf}(P_z)
61+
62+
Any observation with :math:`|Z| > Z_\mathrm{crit}` is flagged as an outlier.
6263
63-
with :math:`n` being the number of samples in the decision scores. Finally the z-score
64-
can be calculated as follows:
64+
In the 'effective' method, the classical threshold is adjusted by an
65+
entropy-based effective sample size. This accounts for situations where
66+
the dataset may contain correlated or redundant samples, reducing the
67+
effective number of independent observations. The effective sample size
68+
:math:`N_\mathrm{eff}` is estimated as
6569
6670
.. math::
6771
68-
Z = \frac{x-\bar{x}}{\sigma} \mathrm{,}
72+
N_\mathrm{eff} = \min(N, \exp(H)) \mathrm{,}
6973
70-
with :math:`\bar{x}` as the mean and :math:`\sigma` the standard deviation
71-
of the decision scores.
74+
where :math:`H` is the entropy of the histogram of standardized scores.
75+
The threshold probability is then
7276
73-
CHAU employs variants of the classical Chauvenet's criterion as the mean can be
74-
replaced with the geometric mean or the median.
77+
.. math::
7578
76-
Any z-score greater than the Chauvenet's criterion is considered an outlier.
79+
P_z = \frac{1}{2 N_\mathrm{eff}}
7780
81+
which typically lowers the z-score threshold (since
:math:`N_\mathrm{eff} \le N` implies a larger :math:`P_z`), flagging more
observations as outliers, and adapts the criterion to
the actual variability and redundancy in the data.
7883
7984
"""
8085

81-
def __init__(self, method='mean', random_state=1234):
86+
def __init__(self, method='effective', random_state=1234):
8287

8388
super().__init__()
84-
stat = {'mean': np.mean, 'median': np.median, 'gmean': stats.gmean}
85-
self.method = stat[method]
89+
self.method = method
8690
self.random_state = random_state
8791
np.random.seed(random_state)
8892

@@ -106,14 +110,46 @@ def eval(self, decision):
106110

107111
decision = self._data_setup(decision)
108112

109-
# Calculate Chauvenet's criterion for one tail
110-
Pz = 1/(4*len(decision))
111-
criterion = 1/abs(stats.norm.ppf(Pz))
113+
scaler = StandardScaler()
114+
z = scaler.fit_transform(decision.reshape(-1, 1))
115+
116+
if self.method == 'classic':
117+
N = len(z)
118+
elif self.method == 'effective':
119+
N = self._effective_sample_size_entropy(z)
120+
121+
Pz = 1 / (2 * N)
122+
zcrit = stats.norm.isf(Pz)
123+
zcrit = scaler.inverse_transform(np.array([[zcrit]]))[0, 0]
124+
125+
self.thresh_ = zcrit
126+
127+
return cut(decision, zcrit)
128+
129+
def _effective_sample_size_entropy(self, x):
130+
"""
131+
Entropy-based effective sample size.
132+
133+
Parameters
134+
----------
135+
x : np.ndarray
136+
1D array of outlier likelihood scores
137+
138+
Returns
139+
-------
140+
Neff : float
141+
Entropy-based effective sample size
142+
"""
143+
x = np.asarray(x)
144+
N = len(x)
145+
146+
hist, _ = np.histogram(x, bins='auto', density=False)
147+
hist = hist.astype(float)
112148

113-
# Get area normal to distance
114-
prob = erfc(np.abs(decision-self.method(decision)) /
115-
decision.std()/2.0**0.5)
149+
p = hist / hist.sum()
150+
p = p[p > 0] # avoid log(0)
116151

117-
self.thresh_ = criterion * (1-np.min(prob))/np.max(prob)
152+
H = -np.sum(p * np.log(p))
153+
Neff = np.exp(H)
118154

119-
return 1-cut(prob, criterion)
155+
return min(N, Neff)

0 commit comments

Comments
 (0)