Here are examples of the Python API scipy.stats.chi2.ppf taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
14 Examples
Example 1
def _make_ellipse(mean, cov, ax, level=0.95, color=None):
"""Support function for scatter_ellipse."""
from matplotlib.patches import Ellipse
v, w = np.linalg.eigh(cov)
u = w[0] / np.linalg.norm(w[0])
angle = np.arctan(u[1]/u[0])
angle = 180 * angle / np.pi # convert to degrees
v = 2 * np.sqrt(v * stats.chi2.ppf(level, 2)) #get size corresponding to level
ell = Ellipse(mean[:2], v[0], v[1], 180 + angle, facecolor='none',
edgecolor=color,
#ls='dashed', #for debugging
lw=1.5)
ell.set_clip_box(ax.bbox)
ell.set_alpha(0.5)
ax.add_artist(ell)
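A minimal self-contained sketch of the same idea, assuming only NumPy and SciPy and a hypothetical covariance matrix: chi2.ppf(level, 2) is the squared Mahalanobis radius that encloses `level` of a bivariate Gaussian, and scaling the eigenvalues by it gives the ellipse axis lengths.

import numpy as np
from scipy import stats

cov = np.array([[2.0, 0.6],
                [0.6, 1.0]])                   # hypothetical 2-D covariance matrix
level = 0.95

eigvals, eigvecs = np.linalg.eigh(cov)         # orientation and axis scales
radius2 = stats.chi2.ppf(level, 2)             # ~5.991 for a 95% region with 2 dof
axis_lengths = 2 * np.sqrt(eigvals * radius2)  # full axis widths, like v above
print(radius2, axis_lengths)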
Example 2
def genomic_control(pvals):
"""
calculate genomic control factor, lambda
>>> genomic_control([0.25, 0.5, 0.75])
1.0000800684096998
>>> genomic_control([0.025, 0.005, 0.0075])
15.715846578113579
"""
from scipy import stats
import numpy as np
pvals = np.asarray(pvals)
return np.median(stats.chi2.ppf(1 - pvals, 1)) / 0.4549
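A quick check of the formula, assuming the same imports: 0.4549 is approximately the median of a chi-square distribution with 1 degree of freedom, so uniform-looking p-values give a lambda close to 1.

import numpy as np
from scipy import stats

pvals = np.array([0.25, 0.5, 0.75])
lam = np.median(stats.chi2.ppf(1 - pvals, 1)) / 0.4549
print(lam)   # ~1.00008, matching the doctest above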
Example 3
def genome_control_adjust(pvals):
"""
adjust p-values by the genomic control factor, lambda
>>> genome_control_adjust([0.4, 0.01, 0.02])
array([ 0.8072264 , 0.45518836, 0.50001716])
"""
import numpy as np
from scipy import stats
pvals = np.asarray(pvals)
qchi = stats.chi2.ppf(1 - pvals, 1)
gc = np.median(qchi) / 0.4549
return 1 - stats.chi2.cdf(qchi / gc, 1)
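A sanity check with the same imports: when the genomic-control factor works out to roughly 1, the adjustment leaves the p-values essentially unchanged.

import numpy as np
from scipy import stats

pvals = np.array([0.3, 0.5, 0.7])
qchi = stats.chi2.ppf(1 - pvals, 1)
gc = np.median(qchi) / 0.4549              # ~1.0 because the median p-value is 0.5
print(1 - stats.chi2.cdf(qchi / gc, 1))    # approximately [0.3, 0.5, 0.7]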
Example 4
def __init__(self, lower, upper, insert_dist, ci=0.9):
self.lower = lower
self.upper = upper
#self.smooth_mat = np.tile(signal.gaussian(151,25),(upper-lower,1))
nuc_probs = insert_dist.nuc_fit.get(lower,upper)
self.nuc_probs = nuc_probs /np.sum(nuc_probs)
nfr_probs = insert_dist.nfr_fit.get(lower,upper)
self.nfr_probs = nfr_probs /np.sum(nfr_probs)
self.alphas = np.linspace(0, 1, 101)
#self.x = map(lambda alpha: np.log(alpha * self.nuc_probs + (1 - alpha) * self.nfr_probs), self.alphas)
self.l = len(self.alphas)
self.cutoff = stats.chi2.ppf(ci,1)
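The stored cutoff is simply the chi-square critical value with 1 degree of freedom at the requested confidence level (a likelihood-ratio style bound), for example:

from scipy import stats
print(stats.chi2.ppf(0.9, 1))    # ~2.706, the cutoff for ci = 0.9
print(stats.chi2.ppf(0.95, 1))   # ~3.841 for ci = 0.95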
Example 5
def significance(signal, dt, scales, sigma_test=0, alpha=None,
significance_level=0.95, dof=-1, wavelet='morlet'):
"""
Significance testing for the one dimensional wavelet transform.
Parameters
----------
signal : array like, float
Input signal array. If a float number is given, then the
variance is assumed to have this value. If an array is
given, then its variance is automatically computed.
dt : float
Sample spacing.
scales : array like
Vector of scale indices returned by the cwt function.
sigma_test : int, optional
Sets the type of significance test to be performed.
Accepted values are 0, 1 or 2. If omitted, 0 is assumed.
Notes
-----
If sigma_test is set to 0, performs a regular chi-square test, according
to Torrence and Compo (1998) equation 18.
If set to 1, performs a time-average test (equation 23). In
this case, dof should be set to the number of local wavelet
spectra that were averaged together. For the global
wavelet spectrum it would be dof=N, the number of points in
the time-series.
If set to 2, performs a scale-average test (equations 25 to
28). In this case dof should be set to a two-element vector
[s1, s2], which gives the scale range that was averaged
together. If, for example, the average between scales 2 and
8 was taken, then dof=[2, 8].
alpha : float, optional
Lag-1 autocorrelation, used for the significance levels.
Default is 0.0.
significance_level : float, optional
Significance level to use. Default is 0.95.
dof : variant, optional
Degrees of freedom for significance test to be set
according to the type set in sigma_test.
wavelet : instance of a wavelet class, optional
Mother wavelet class. Default is Morlet().
Returns
-------
signif : array like
Significance levels as a function of scale.
fft_theor (array like):
Theoretical red-noise spectrum as a function of period.
"""
if isinstance(wavelet, unicode) or isinstance(wavelet, str):
wavelet = mothers[wavelet]()
try:
n0 = len(signal)
except:
n0 = 1
J = len(scales) - 1
dj = np.log2(scales[1] / scales[0])
if n0 == 1:
variance = signal
else:
variance = signal.std() ** 2
if alpha is None:
alpha, _, _ = ar1(signal)
period = scales * wavelet.flambda() # Fourier equivalent periods
freq = dt / period # Normalized frequency
dofmin = wavelet.dofmin # Degrees of freedom with no smoothing
Cdelta = wavelet.cdelta # Reconstruction factor
gamma_fac = wavelet.gamma # Time-decorrelation factor
dj0 = wavelet.deltaj0 # Scale-decorrelation factor
# Theoretical discrete Fourier power spectrum of the noise signal following
# Gilman et al. (1963) and Torrence and Compo (1998), equation 16.
pk = lambda k, a, N: (1 - a ** 2) / (1 + a ** 2 - 2 * a *
np.cos(2 * np.pi * k / N))
fft_theor = pk(freq, alpha, n0)
fft_theor = variance * fft_theor # Including time-series variance
signif = fft_theor
try:
if dof == -1:
dof = dofmin
except:
pass
if sigma_test == 0: # No smoothing, dof=dofmin, TC98 sec. 4
dof = dofmin
# As in Torrence and Compo (1998), equation 18
chisquare = chi2.ppf(significance_level, dof) / dof
signif = fft_theor * chisquare
elif sigma_test == 1: # Time-averaged significance
if len(dof) == 1:
dof = np.zeros(J + 1) + dof  # replicate the scalar dof across all scales
sel = find(dof < 1)
dof[sel] = 1
# As in Torrence and Compo (1998), equation 23:
dof = dofmin * (1 + (dof * dt / gamma_fac / scales) ** 2 ) ** 0.5
sel = find(dof < dofmin)
dof[sel] = dofmin # Minimum dof is dofmin
for n, d in enumerate(dof):
chisquare = chi2.ppf(significance_level, d) / d
signif[n] = fft_theor[n] * chisquare
elif sigma_test == 2: # Scale-averaged significance
if len(dof) != 2:
raise Exception('DOF must be set to [s1, s2], '
'the range of scale-averages')
if Cdelta == -1:
raise Exception('Cdelta and dj0 not defined for %s with f0=%f' %
(wavelet.name, wavelet.f0))
s1, s2 = dof
sel = find((scales >= s1) & (scales <= s2))
navg = sel.size
if navg == 0:
raise Exception('No valid scales between %d and %d.' % (s1, s2))
# As in Torrence and Compo (1998), equation 25
Savg = 1 / sum(1. / scales[sel])
# Power-of-two mid point:
Smid = np.exp((np.log(s1) + np.log(s2)) / 2.)
# As in Torrence and Compo (1998), equation 28
dof = (dofmin * navg * Savg / Smid) * (
(1 + (navg * dj / dj0) ** 2) ** 0.5)
# As in Torrence and Compo (1998), equation 27
fft_theor = Savg * sum(fft_theor[sel] / scales[sel])
chisquare = chi2.ppf(significance_level, dof) / dof
# As in Torrence and Compo (1998), equation 26
signif = (dj * dt / Cdelta / Savg) * fft_theor * chisquare
else:
raise Exception('sigma_test must be either 0, 1, or 2.')
return signif, fft_theor
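A minimal sketch of the sigma_test == 0 branch with hypothetical inputs; fft_theor and dofmin stand in for the values the function derives from the signal and the mother wavelet. The significance level at each scale is the theoretical red-noise spectrum scaled by chi2.ppf(p, dof) / dof (Torrence and Compo 1998, equation 18).

import numpy as np
from scipy.stats import chi2

significance_level = 0.95
dofmin = 2                               # e.g. the Morlet wavelet has 2 dof
fft_theor = np.array([1.0, 0.8, 0.5])    # hypothetical red-noise spectrum per scale
signif = fft_theor * chi2.ppf(significance_level, dofmin) / dofmin
print(signif)                            # ~[3.0, 2.4, 1.5]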
Example 6
def xwt(signal, signal2, dt, dj=1/12, s0=-1, J=-1, significance_level=0.95,
wavelet='morlet', normalize=True):
"""
Calculate the cross wavelet transform (XWT). The XWT finds regions in
time-frequency space where the time series show high common power. Torrence
and Compo (1998) state that the percent point function -- PPF (the inverse
of the cumulative distribution function) of a chi-square distribution at 95%
confidence and two degrees of freedom is Z2(95%)=3.999. However, calculating
the PPF using chi2.ppf gives Z2(95%)=5.991. To obtain significance intervals
similar to those in Grinsted et al. (2004), one has to use a confidence
level of 86.46%.
Parameters
----------
signal, signal2 : numpy.ndarray, list
Input signal array to calculate cross wavelet transform.
dt : float
Sample spacing.
dj : float, optional
Spacing between discrete scales. Default value is 1/12.
Smaller values will result in better scale resolution, but
slower calculation and plot.
s0 : float, optional
Smallest scale of the wavelet. Default value is 2*dt.
J : float, optional
Number of scales less one. Scales range from s0 up to
s0 * 2**(J * dj), which gives a total of (J + 1) scales.
Default is J = (log2(N*dt/s0))/dj.
wavelet : instance of a wavelet class, optional
Mother wavelet class. Default is Morlet wavelet.
significance_level : float, optional
Significance level to use. Default is 0.95.
normalize : bool, optional
If set to true, normalizes CWT by the standard deviation of
the signals.
Returns
-------
xwt (array like):
Cross wavelet transform according to the selected mother wavelet.
x (array like):
Intersected independent variable.
coi (array like):
Cone of influence, which is a vector of N points containing
the maximum Fourier period of useful information at that
particular time. Periods greater than those are subject to
edge effects.
freqs (array like):
Vector of Fourier equivalent frequencies (in 1 / time units)
that correspond to the wavelet scales.
signif (array like):
Significance levels as a function of scale.
"""
if isinstance(wavelet, unicode) or isinstance(wavelet, str):
wavelet = mothers[wavelet]()
# Defines some parameters like length of both time-series, time step
# and calculates the standard deviation for normalization and statistical
# significance tests.
signal = np.asarray(signal)
signal2 = np.asarray(signal2)
if normalize:
std1 = signal.std()
std2 = signal2.std()
else:
std1 = std2 = 1
# Calculates the CWT of the time-series making sure the same parameters
# are used in both calculations.
cwt_kwargs = dict(dt=dt, dj=dj, s0=s0, J=J, wavelet=wavelet)
W1, sj, freq, coi, _, _ = cwt(signal/std1, **cwt_kwargs)
W2, sj, freq, coi, _, _ = cwt(signal2/std2, **cwt_kwargs)
# Now the cross correlation of y1 and y2
W12 = W1 * W2.conj()
# And the significance tests. Note that the confidence level is calculated
# using the percent point function (PPF) of the chi-squared cumulative
# distribution function (CDF) instead of using Z1(95%) = 2.182 and
# Z2(95%)=3.999 as suggested by Torrence & Compo (1998) and Grinsted et
# al. (2004). If the CWT has been normalized, then std1 and std2 should
# be reset to unity, otherwise the standard deviation of both series have
# to be calculated.
if not normalize:
std1 = signal.std()
std2 = signal2.std()
else:
std1 = std2 = 1.
a1, _, _ = ar1(signal)
a2, _, _ = ar1(signal2)
Pk1 = ar1_spectrum(freq * dt, a1)
Pk2 = ar1_spectrum(freq * dt, a2)
dof = wavelet.dofmin
PPF = chi2.ppf(significance_level, dof)
signif = (std1 * std2 * (Pk1 * Pk2) ** 0.5 * PPF / dof)
# The results:
return W12, coi, freq, signif
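The figures quoted in the docstring can be checked directly:

from scipy.stats import chi2
print(chi2.ppf(0.95, 2))     # ~5.991
print(chi2.ppf(0.8646, 2))   # ~3.999, the Z2(95%) value from Torrence & Compo (1998)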
Example 7
def ci_beta(self, param_num, beta_high, beta_low, sig=.05):
"""
Returns the confidence interval for a regression
parameter in the AFT model.
Parameters
---------
param_num: int
Parameter number of interest
beta_high: float
Upper bound for the confidence interval
beta_low : float
Lower bound for the confidence interval
sig: float, optional
Significance level. Default is .05
Notes
----
If the function returns the error 'f(a) and f(b) must have different
signs', consider widening the search area by adjusting beta_low and
beta_high.
Also note that this process is computationally intensive. There
are 4 levels of optimization/solving. From outer to inner:
1) Solving so that llr - critical value = 0
2) Maximizing over nuisance parameters
3) Using EM at each value of nuisance parameters
4) Using the _modified_Newton optimizer at each iteration
of the EM algorithm.
Also, for very unlikely nuisance parameters, it is possible for
the EM algorithm to not converge. This is not an indicator
that the solver did not find the correct solution. It just means
for a specific iteration of the nuisance parameters, the optimizer
was unable to converge.
If the user desires to verify the success of the optimization,
it is recommended to test the limits using test_beta.
"""
params = self.params()
self.r0 = chi2.ppf(1 - sig, 1)
ll = optimize.brentq(self._ci_limits_beta, beta_low,
params[param_num], (param_num))
ul = optimize.brentq(self._ci_limits_beta,
params[param_num], beta_high, (param_num))
return ll, ul
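Examples 7 through 13 all follow the same recipe: fix the critical value r0 = chi2.ppf(1 - sig, 1) once, then bracket each confidence limit with a scalar root finder applied to (log-likelihood ratio - r0). A self-contained sketch of that recipe, using a hypothetical quadratic log-likelihood-ratio profile in place of methods such as _ci_limits_beta:

from scipy.stats import chi2
from scipy import optimize

sig = 0.05
r0 = chi2.ppf(1 - sig, 1)           # ~3.841, the 1-dof critical value

def llr_minus_r0(beta):
    # hypothetical profile with its minimum at beta = 2.0
    return (beta - 2.0) ** 2 - r0

lower = optimize.brentq(llr_minus_r0, -10.0, 2.0)
upper = optimize.brentq(llr_minus_r0, 2.0, 10.0)
print(lower, upper)                 # ~0.04 and ~3.96, bracketing the estimate at 2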
Example 8
def ci_mean(self, sig=.05, method='gamma', epsilon=10 ** -8,
gamma_low=-10 ** 10, gamma_high=10 ** 10):
"""
Returns the confidence interval for the mean.
Parameters
----------
sig : float
significance level. Default is .05
method : str
Root finding method, Can be 'nested-brent' or
'gamma'. Default is 'gamma'
'gamma' Tries to solve for the gamma parameter in the
Lagrange (see Owen pg 22) and then determine the weights.
'nested-brent' uses Brent's method to find the confidence
intervals but must maximize the likelihood ratio on every
iteration.
'gamma' is generally much faster. If the optimization does not
converge, try expanding the gamma_high and gamma_low
variables.
gamma_low : float
Lower bound for gamma when finding lower limit.
If the function returns the error 'f(a) and f(b) must have different
signs', consider lowering gamma_low.
gamma_high : float
Upper bound for gamma when finding upper limit.
If the function returns the error 'f(a) and f(b) must have different
signs', consider raising gamma_high.
epsilon : float
When using 'nested-brent', amount to decrease (increase)
from the maximum (minimum) of the data when
starting the search. This is to protect against the
likelihood ratio being zero at the maximum (minimum)
value of the data. If data is very small in absolute value
(<10 ``**`` -6) consider shrinking epsilon
When using 'gamma', amount to decrease (increase) the
minimum (maximum) by to start the search for gamma.
If the function returns the error 'f(a) and f(b) must have different
signs', consider lowering epsilon.
Returns
-------
Interval : tuple
Confidence interval for the mean
"""
endog = self.endog
sig = 1 - sig
if method == 'nested-brent':
self.r0 = chi2.ppf(sig, 1)
middle = np.mean(endog)
epsilon_u = (max(endog) - np.mean(endog)) * epsilon
epsilon_l = (np.mean(endog) - min(endog)) * epsilon
ulim = optimize.brentq(self._ci_limits_mu, middle,
max(endog) - epsilon_u)
llim = optimize.brentq(self._ci_limits_mu, middle,
min(endog) + epsilon_l)
return llim, ulim
if method == 'gamma':
self.r0 = chi2.ppf(sig, 1)
gamma_star_l = optimize.brentq(self._find_gamma, gamma_low,
min(endog) - epsilon)
gamma_star_u = optimize.brentq(self._find_gamma, \
max(endog) + epsilon, gamma_high)
weights_low = ((endog - gamma_star_l) ** -1) / \
np.sum((endog - gamma_star_l) ** -1)
weights_high = ((endog - gamma_star_u) ** -1) / \
np.sum((endog - gamma_star_u) ** -1)
mu_low = np.sum(weights_low * endog)
mu_high = np.sum(weights_high * endog)
return mu_low, mu_high
Example 9
def ci_var(self, lower_bound=None, upper_bound=None, sig=.05):
"""
Returns the confidence interval for the variance.
Parameters
----------
lower_bound : float
The minimum value the lower confidence interval can
take. The p-value from test_var(lower_bound) must be lower
than 1 - significance level. Default is .99 confidence
limit assuming normality
upper_bound : float
The maximum value the upper confidence interval
can take. The p-value from test_var(upper_bound) must be lower
than 1 - significance level. Default is .99 confidence
limit assuming normality
sig : float
The significance level. Default is .05
Returns
--------
Interval : tuple
Confidence interval for the variance
Examples
--------
>>> import numpy as np
>>> import statsmodels.api as sm
>>> random_numbers = np.random.standard_normal(100)
>>> el_analysis = sm.emplike.DescStat(random_numbers)
>>> el_analysis.ci_var()
(0.7539322567470305, 1.229998852496268)
>>> el_analysis.ci_var(.5, 2)
(0.7539322567469926, 1.2299988524962664)
Notes
-----
If the function returns the error f(a) and f(b) must have
different signs, consider lowering lower_bound and raising
upper_bound.
"""
endog = self.endog
if upper_bound is None:
upper_bound = ((self.nobs - 1) * endog.var()) / \
(chi2.ppf(.0001, self.nobs - 1))
if lower_bound is None:
lower_bound = ((self.nobs - 1) * endog.var()) / \
(chi2.ppf(.9999, self.nobs - 1))
self.r0 = chi2.ppf(1 - sig, 1)
llim = optimize.brentq(self._ci_limits_var, lower_bound, endog.var())
ulim = optimize.brentq(self._ci_limits_var, endog.var(), upper_bound)
return llim, ulim
Example 10
def ci_skew(self, sig=.05, upper_bound=None, lower_bound=None):
"""
Returns the confidence interval for skewness.
Parameters
----------
sig : float
The significance level. Default is .05
upper_bound : float
Maximum value of skewness the upper limit can be.
Default is .99 confidence limit assuming normality.
lower_bound : float
Minimum value of skewness the lower limit can be.
Default is .99 confidence level assuming normality.
Returns
-------
Interval : tuple
Confidence interval for the skewness
Notes
-----
If the function returns the error 'f(a) and f(b) must have different
signs', consider expanding the lower and upper bounds.
"""
nobs = self.nobs
endog = self.endog
if upper_bound is None:
upper_bound = skew(endog) + \
2.5 * ((6. * nobs * (nobs - 1.)) / \
((nobs - 2.) * (nobs + 1.) * \
(nobs + 3.))) ** .5
if lower_bound is None:
lower_bound = skew(endog) - \
2.5 * ((6. * nobs * (nobs - 1.)) / \
((nobs - 2.) * (nobs + 1.) * \
(nobs + 3.))) ** .5
self.r0 = chi2.ppf(1 - sig, 1)
llim = optimize.brentq(self._ci_limits_skew, lower_bound, skew(endog))
ulim = optimize.brentq(self._ci_limits_skew, skew(endog), upper_bound)
return llim, ulim
Example 11
def ci_kurt(self, sig=.05, upper_bound=None, lower_bound=None):
"""
Returns the confidence interval for kurtosis.
Parameters
----------
sig : float
The significance level. Default is .05
upper_bound : float
Maximum value of kurtosis the upper limit can be.
Default is .99 confidence limit assuming normality.
lower_bound : float
Minimum value of kurtosis the lower limit can be.
Default is .99 confidence limit assuming normality.
Returns
--------
Interval : tuple
Lower and upper confidence limit
Notes
-----
For small n, upper_bound and lower_bound may have to be
provided by the user. Consider using test_kurt to find
values close to the desired significance level.
If the function returns the error 'f(a) and f(b) must have different
signs', consider expanding the bounds.
"""
endog = self.endog
nobs = self.nobs
if upper_bound is None:
upper_bound = kurtosis(endog) + \
(2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \
((nobs - 2.) * (nobs + 1.) * \
(nobs + 3.))) ** .5) * \
(((nobs ** 2.) - 1.) / ((nobs - 3.) *\
(nobs + 5.))) ** .5)
if lower_bound is None:
lower_bound = kurtosis(endog) - \
(2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \
((nobs - 2.) * (nobs + 1.) * \
(nobs + 3.))) ** .5) * \
(((nobs ** 2.) - 1.) / ((nobs - 3.) *\
(nobs + 5.))) ** .5)
self.r0 = chi2.ppf(1 - sig, 1)
llim = optimize.brentq(self._ci_limits_kurt, lower_bound, \
kurtosis(endog))
ulim = optimize.brentq(self._ci_limits_kurt, kurtosis(endog), \
upper_bound)
return llim, ulim
Example 12
def ci_corr(self, sig=.05, upper_bound=None, lower_bound=None):
"""
Returns the confidence intervals for the correlation coefficient
Parameters
----------
sig : float
The significance level. Default is .05
upper_bound : float
Maximum value the upper confidence limit can be.
Default is 99% confidence limit assuming normality.
lower_bound : float
Minimum value the lower confidence limit can be.
Default is 99% confidence limit assuming normality.
Returns
-------
interval : tuple
Confidence interval for the correlation
"""
endog = self.endog
nobs = self.nobs
self.r0 = chi2.ppf(1 - sig, 1)
point_est = np.corrcoef(endog[:, 0], endog[:, 1])[0, 1]
if upper_bound is None:
upper_bound = min(.999, point_est + \
2.5 * ((1. - point_est ** 2.) / \
(nobs - 2.)) ** .5)
if lower_bound is None:
lower_bound = max(- .999, point_est - \
2.5 * (np.sqrt((1. - point_est ** 2.) / \
(nobs - 2.))))
llim = optimize.brenth(self._ci_limits_corr, lower_bound, point_est)
ulim = optimize.brenth(self._ci_limits_corr, point_est, upper_bound)
return llim, ulim
Example 13
def conf_int_el(self, param_num, upper_bound=None,
lower_bound=None, sig=.05, method='nm',
stochastic_exog=1):
"""
Returns the confidence interval for a regression parameter when the
regression is forced through the origin
Parameters
----------
param_num : int
The parameter number to be tested. Note this uses python
indexing but the '0' parameter refers to the intercept term
upper_bound : float
The maximum value the upper confidence limit can be. The
closer this is to the confidence limit, the quicker the
computation. Default is .00001 confidence limit under normality
lower_bound : float
The minimum value the lower confidence limit can be.
Default is .00001 confidence limit under normality
sig : float, optional
The significance level. Default .05
method : str, optional
Algorithm used to optimize over the nuisance parameters. Can be 'nm' or
'powell'. Default is 'nm'.
Returns
-------
ci: tuple
The confidence interval for the parameter 'param_num'
"""
r0 = chi2.ppf(1 - sig, 1)
param_num = np.array([param_num])
if upper_bound is None:
upper_bound = (np.squeeze(self.model.fit().
conf_int(.0001)[param_num])[1])
if lower_bound is None:
lower_bound = (np.squeeze(self.model.fit().conf_int(.00001)
[param_num])[0])
f = lambda b0: self.el_test(np.array([b0]), param_num,
method=method,
stochastic_exog=stochastic_exog)[0] - r0
lowerl = optimize.brentq(f, lower_bound, self.params[param_num])
upperl = optimize.brentq(f, self.params[param_num], upper_bound)
return (lowerl, upperl)
Example 14
def multinomial_proportions_confint(counts, alpha=0.05, method='goodman'):
'''Confidence intervals for multinomial proportions.
Parameters
----------
counts : array_like of int, 1-D
Number of observations in each category.
alpha : float in (0, 1), optional
Significance level, defaults to 0.05.
method : {'goodman', 'sison-glaz'}, optional
Method to use to compute the confidence intervals; available methods
are:
- `goodman`: based on a chi-squared approximation, valid if all
values in `counts` are greater or equal to 5 [2]_
- `sison-glaz`: less conservative than `goodman`, but only valid if
`counts` has 7 or more categories (``len(counts) >= 7``) [3]_
Returns
-------
confint : ndarray, 2-D
Array of [lower, upper] confidence levels for each category, such that
overall coverage is (approximately) `1-alpha`.
Raises
------
ValueError
If `alpha` is not in `(0, 1)` (bounds excluded), or if the values in
`counts` are not all non-negative.
NotImplementedError
If `method` is not known.
Exception
When ``method == 'sison-glaz'``, if for some reason `c` cannot be
computed; this signals a bug and should be reported.
Notes
-----
The `goodman` method [2]_ is based on approximating a statistic based on
the multinomial as a chi-squared random variable. The usual recommendation
is that this is valid if all the values in `counts` are greater than or
equal to 5. There is no condition on the number of categories for this
method.
The `sison-glaz` method [3]_ approximates the multinomial probabilities,
and evaluates that with a maximum-likelihood estimator. The first
approximation is an Edgeworth expansion that converges when the number of
categories goes to infinity, and the maximum-likelihood estimator converges
when the number of observations (``sum(counts)``) goes to infinity. In
their paper, Sison & Glaz demonstrate their method with at least 7 categories, so
``len(counts) >= 7`` with all values in `counts` at or above 5 can be used
as a rule of thumb for the validity of this method. This method is less
conservative than the `goodman` method (i.e. it will yield confidence
intervals closer to the desired significance level), but produces
confidence intervals of uniform width over all categories (except when the
intervals reach 0 or 1, in which case they are truncated), which makes it
most useful when proportions are of similar magnitude.
Aside from the original sources ([1]_, [2]_, and [3]_), the implementation
uses the formulas (though not the code) presented in [4]_ and [5]_.
References
----------
.. [1] Levin, Bruce, "A representation for multinomial cumulative
distribution functions," The Annals of Statistics, Vol. 9, No. 5,
1981, pp. 1123-1126.
.. [2] Goodman, L.A., "On simultaneous confidence intervals for multinomial
proportions," Technometrics, Vol. 7, No. 2, 1965, pp. 247-254.
.. [3] Sison, Cristina P., and Joseph Glaz, "Simultaneous Confidence
Intervals and Sample Size Determination for Multinomial
Proportions," Journal of the American Statistical Association,
Vol. 90, No. 429, 1995, pp. 366-369.
.. [4] May, Warren L., and William D. Johnson, "A SAS® macro for
constructing simultaneous confidence intervals for multinomial
proportions," Computer methods and programs in Biomedicine, Vol. 53,
No. 3, 1997, pp. 153-162.
.. [5] May, Warren L., and William D. Johnson, "Constructing two-sided
simultaneous confidence intervals for multinomial proportions for
small counts in a large number of cells," Journal of Statistical
Software, Vol. 5, No. 6, 2000, pp. 1-24.
'''
if alpha <= 0 or alpha >= 1:
raise ValueError('alpha must be in (0, 1), bounds excluded')
counts = np.array(counts, dtype=float)
if (counts < 0).any():
raise ValueError('counts must be >= 0')
n = counts.sum()
k = len(counts)
proportions = counts / n
if method == 'goodman':
chi2 = stats.chi2.ppf(1 - alpha / k, 1)
delta = chi2 ** 2 + (4 * n * proportions * chi2 * (1 - proportions))
region = ((2 * n * proportions + chi2 +
np.array([- np.sqrt(delta), np.sqrt(delta)])) /
(2 * (chi2 + n))).T
elif method[:5] == 'sison': # We accept any name starting with 'sison'
# Define a few functions we'll use a lot.
def poisson_interval(interval, p):
"""Compute P(b <= Z <= a) where Z ~ Poisson(p) and
`interval = (b, a)`."""
b, a = interval
prob = stats.poisson.cdf(a, p) - stats.poisson.cdf(b - 1, p)
if p == 0 and np.isnan(prob):
# hack for older scipy <=0.16.1
return int(b - 1 < 0)
return prob
def truncated_poisson_factorial_moment(interval, r, p):
"""Compute mu_r, the r-th factorial moment of a poisson random
variable of parameter `p` truncated to `interval = (b, a)`."""
b, a = interval
return p ** r * (1 - ((poisson_interval((a - r + 1, a), p) -
poisson_interval((b - r, b - 1), p)) /
poisson_interval((b, a), p)))
def edgeworth(intervals):
"""Compute the Edgeworth expansion term of Sison & Glaz's formula
(1) (approximated probability for multinomial proportions in a
given box)."""
# Compute means and central moments of the truncated poisson
# variables.
mu_r1, mu_r2, mu_r3, mu_r4 = [
np.array([truncated_poisson_factorial_moment(interval, r, p)
for (interval, p) in zip(intervals, counts)])
for r in range(1, 5)
]
mu = mu_r1
mu2 = mu_r2 + mu - mu ** 2
mu3 = mu_r3 + mu_r2 * (3 - 3 * mu) + mu - 3 * mu ** 2 + 2 * mu ** 3
mu4 = (mu_r4 + mu_r3 * (6 - 4 * mu) +
mu_r2 * (7 - 12 * mu + 6 * mu ** 2) +
mu - 4 * mu ** 2 + 6 * mu ** 3 - 3 * mu ** 4)
# Compute expansion factors, gamma_1 and gamma_2.
g1 = mu3.sum() / mu2.sum() ** 1.5
g2 = (mu4.sum() - 3 * (mu2 ** 2).sum()) / mu2.sum() ** 2
# Compute the expansion itself.
x = (n - mu.sum()) / np.sqrt(mu2.sum())
phi = np.exp(- x ** 2 / 2) / np.sqrt(2 * np.pi)
H3 = x ** 3 - 3 * x
H4 = x ** 4 - 6 * x ** 2 + 3
H6 = x ** 6 - 15 * x ** 4 + 45 * x ** 2 - 15
f = phi * (1 + g1 * H3 / 6 + g2 * H4 / 24 + g1 ** 2 * H6 / 72)
return f / np.sqrt(mu2.sum())
def approximated_multinomial_interval(intervals):
"""Compute approximated probability for Multinomial(n, proportions)
to be in `intervals` (Sison & Glaz's formula (1))."""
return np.exp(
np.sum(np.log([poisson_interval(interval, p)
for (interval, p) in zip(intervals, counts)])) +
np.log(edgeworth(intervals)) -
np.log(stats.poisson._pmf(n, n))
)
def nu(c):
"""Compute interval coverage for a given `c` (Sison & Glaz's
formula (7))."""
return approximated_multinomial_interval(
[(np.maximum(count - c, 0), np.minimum(count + c, n))
for count in counts])
# Find the value of `c` that will give us the confidence intervals
# (solving nu(c) <= 1 - alpha < nu(c + 1)).
c = 1.0
nuc = nu(c)
nucp1 = nu(c + 1)
while not (nuc <= (1 - alpha) < nucp1):
if c > n:
raise Exception("Couldn't find a value for `c` that "
"solves nu(c) <= 1 - alpha < nu(c + 1)")
c += 1
nuc = nucp1
nucp1 = nu(c + 1)
# Compute gamma and the corresponding confidence intervals.
g = (1 - alpha - nuc) / (nucp1 - nuc)
ci_lower = np.maximum(proportions - c / n, 0)
ci_upper = np.minimum(proportions + (c + 2 * g) / n, 1)
region = np.array([ci_lower, ci_upper]).T
else:
raise NotImplementedError('method "%s" is not available' % method)
return region
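A short walk-through of the 'goodman' branch with hypothetical counts, using only the formulas above:

import numpy as np
from scipy import stats

counts = np.array([20., 30., 50.])       # hypothetical category counts
alpha = 0.05
n, k = counts.sum(), len(counts)
proportions = counts / n

crit = stats.chi2.ppf(1 - alpha / k, 1)  # per-category (Bonferroni-adjusted) critical value
delta = crit ** 2 + 4 * n * proportions * crit * (1 - proportions)
region = ((2 * n * proportions + crit +
           np.array([-np.sqrt(delta), np.sqrt(delta)])) /
          (2 * (crit + n))).T
print(region)                            # one [lower, upper] row per category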