scipy.stats.chi2.ppf

Here are the examples of the python api scipy.stats.chi2.ppf taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.

14 Examples 7

Example 1

Project: statsmodels
Source File: plot_grids.py
View license
def _make_ellipse(mean, cov, ax, level=0.95, color=None):
    """Support function for scatter_ellipse."""
    from matplotlib.patches import Ellipse

    v, w = np.linalg.eigh(cov)
    u = w[0] / np.linalg.norm(w[0])
    angle = np.arctan(u[1]/u[0])
    angle = 180 * angle / np.pi # convert to degrees
    v = 2 * np.sqrt(v * stats.chi2.ppf(level, 2)) #get size corresponding to level
    ell = Ellipse(mean[:2], v[0], v[1], 180 + angle, facecolor='none',
                  edgecolor=color,
                  #ls='dashed',  #for debugging
                  lw=1.5)
    ell.set_clip_box(ax.bbox)
    ell.set_alpha(0.5)
    ax.add_artist(ell)

Example 2

Project: combined-pvalues
Source File: _common.py
View license
def genomic_control(pvals):
    """
    calculate genomic control factor, lambda
    >>> genomic_control([0.25, 0.5, 0.75])
    1.0000800684096998
    >>> genomic_control([0.025, 0.005, 0.0075])
    15.715846578113579
    """
    from scipy import stats
    import numpy as np
    pvals = np.asarray(pvals)
    return np.median(stats.chi2.ppf(1 - pvals, 1)) / 0.4549

Example 3

Project: combined-pvalues
Source File: _common.py
View license
def genome_control_adjust(pvals):
    """
    adjust p-values by the genomic control factor, lambda
    >>> genome_control_adjust([0.4, 0.01, 0.02])
    array([ 0.8072264 ,  0.45518836,  0.50001716])
    """
    import numpy as np
    from scipy import stats
    pvals = np.asarray(pvals)
    qchi = stats.chi2.ppf(1 - pvals, 1)
    gc = np.median(qchi) / 0.4549
    return 1 - stats.chi2.cdf(qchi / gc, 1)

Example 4

Project: NucleoATAC
Source File: Occupancy.py
View license
    def __init__(self, lower, upper , insert_dist, ci = 0.9):
        self.lower = lower
        self.upper = upper
        #self.smooth_mat = np.tile(signal.gaussian(151,25),(upper-lower,1))
        nuc_probs = insert_dist.nuc_fit.get(lower,upper)
        self.nuc_probs = nuc_probs /np.sum(nuc_probs)
        nfr_probs = insert_dist.nfr_fit.get(lower,upper)
        self.nfr_probs = nfr_probs /np.sum(nfr_probs)
        self.alphas = np.linspace(0, 1, 101)
        #self.x = map(lambda alpha: np.log(alpha * self.nuc_probs + (1 - alpha) * self.nfr_probs), self.alphas)
        self.l = len(self.alphas)
        self.cutoff = stats.chi2.ppf(ci,1)

Example 5

Project: pycwt
Source File: wavelet.py
View license
def significance(signal, dt, scales, sigma_test=0, alpha=None,
                 significance_level=0.95, dof=-1, wavelet='morlet'):
    """
    Significance testing for the one dimensional wavelet transform.

    Parameters
    ----------
    signal : array like, float
        Input signal array. If a float number is given, then the
        variance is assumed to have this value. If an array is
        given, then its variance is automatically computed.
    dt : float, optional
        Sample spacing. Default is 1.0.
    scales : array like
        Vector of scale indices given returned by cwt function.
    sigma_test : int, optional
        Sets the type of significance test to be performed.
        Accepted values are 0, 1 or 2. If omitted assume 0.

    Notes
    -----
    If sigma_test is set to 0, performs a regular chi-square test, according
    to Torrence and Compo (1998) equation 18.

    If set to 1, performs a time-average test (equation 23). In
    this case, dof should be set to the number of local wavelet
    spectra that where averaged together. For the global
    wavelet spectra it would be dof=N, the number of points in
    the time-series.

    If set to 2, performs a scale-average test (equations 25 to
    28). In this case dof should be set to a two element vector
    [s1, s2], which gives the scale range that were averaged
    together. If, for example, the average between scales 2 and
    8 was taken, then dof=[2, 8].

    alpha : float, optional
        Lag-1 autocorrelation, used for the significance levels.
        Default is 0.0.
    significance_level : float, optional
        Significance level to use. Default is 0.95.
    dof : variant, optional
        Degrees of freedom for significance test to be set
        according to the type set in sigma_test.
    wavelet : instance of a wavelet class, optional
        Mother wavelet class. Default is Morlet().

    Returns
    -------
    signif : array like
        Significance levels as a function of scale.
    fft_theor (array like):
        Theoretical red-noise spectrum as a function of period.

    """
    if isinstance(wavelet, unicode) or isinstance(wavelet, str):
        wavelet = mothers[wavelet]()

    try:
      n0 = len(signal)
    except:
      n0 = 1
    J = len(scales) - 1
    dj = np.log2(scales[1] / scales[0])

    if n0 == 1:
        variance = signal
    else:
        variance = signal.std() ** 2

    if alpha == None:
        alpha, _, _ = ar1(signal)

    period = scales * wavelet.flambda()  # Fourier equivalent periods
    freq = dt / period                   # Normalized frequency
    dofmin = wavelet.dofmin              # Degrees of freedom with no smoothing
    Cdelta = wavelet.cdelta              # Reconstruction factor
    gamma_fac = wavelet.gamma            # Time-decorrelation factor
    dj0 = wavelet.deltaj0                # Scale-decorrelation factor

    # Theoretical discrete Fourier power spectrum of the noise signal following
    # Gilman et al. (1963) and Torrence and Compo (1998), equation 16.
    pk = lambda k, a, N: (1 - a ** 2) / (1 + a ** 2 - 2 * a *
        np.cos(2 * np.pi * k / N))
    fft_theor = pk(freq, alpha, n0)
    fft_theor = variance * fft_theor     # Including time-series variance
    signif = fft_theor

    try:
        if dof == -1:
            dof = dofmin
    except:
        pass

    if sigma_test == 0:  # No smoothing, dof=dofmin, TC98 sec. 4
        dof = dofmin
        # As in Torrence and Compo (1998), equation 18
        chisquare = chi2.ppf(significance_level, dof) / dof
        signif = fft_theor * chisquare
    elif sigma_test == 1:  # Time-averaged significance
        if len(dof) == 1:
            dof = np.zeros(1, J+1) + dof
        sel = find(dof < 1)
        dof[sel] = 1
        # As in Torrence and Compo (1998), equation 23:
        dof = dofmin * (1 + (dof * dt / gamma_fac / scales) ** 2 ) ** 0.5
        sel = find(dof < dofmin)
        dof[sel] = dofmin  # Minimum dof is dofmin
        for n, d in enumerate(dof):
            chisquare = chi2.ppf(significance_level, d) / d;
            signif[n] = fft_theor[n] * chisquare
    elif sigma_test == 2:  # Time-averaged significance
        if len(dof) != 2:
            raise Exception('DOF must be set to [s1, s2], '
                              'the range of scale-averages')
        if Cdelta == -1:
            raise Exception('Cdelta and dj0 not defined for %s with f0=%f' %
                             (wavelet.name, wavelet.f0))

        s1, s2 = dof
        sel = find((scales >= s1) & (scales <= s2));
        navg = sel.size
        if navg == 0:
            raise Exception('No valid scales between %d and %d.' % (s1, s2))

        # As in Torrence and Compo (1998), equation 25
        Savg = 1 / sum(1. / scales[sel])
        # Power-of-two mid point:
        Smid = np.exp((np.log(s1) + np.log(s2)) / 2.)
        # As in Torrence and Compo (1998), equation 28
        dof = (dofmin * navg * Savg / Smid) * (
            (1 + (navg * dj / dj0) ** 2) ** 0.5)
        # As in Torrence and Compo (1998), equation 27
        fft_theor = Savg * sum(fft_theor[sel] / scales[sel])
        chisquare = chi2.ppf(significance_level, dof) / dof;
        # As in Torrence and Compo (1998), equation 26
        signif = (dj * dt / Cdelta / Savg) * fft_theor * chisquare
    else:
        raise Exception('sigma_test must be either 0, 1, or 2.')

    return signif, fft_theor

Example 6

Project: pycwt
Source File: wavelet.py
View license
def xwt(signal, signal2, dt, dj=1/12, s0=-1, J=-1, significance_level=0.95,
        wavelet='morlet', normalize=True):
    """
    Calculate the cross wavelet transform (XWT). The XWT finds regions in time
    frequency space where the time series show high common power. Torrence and
    Compo (1998) state that the percent point function -- PPF (inverse of the
    cumulative distribution function) of a chi-square distribution at 95%
    confidence and two degrees of freedom is Z2(95%)=3.999. However, calculating
    the PPF using chi2.ppf gives Z2(95%)=5.991. To ensure similar significance
    intervals as in Grinsted et al. (2004), one has to use confidence of 86.46%.

    Parameters
    ----------
    signal, signal2 : numpy.ndarray, list
        Input signal array to calculate cross wavelet transform.
    dt : float
        Sample spacing.
    dj : float, optional
        Spacing between discrete scales. Default value is 0.25.
        Smaller values will result in better scale resolution, but
        slower calculation and plot.
    s0 : float, optional
        Smallest scale of the wavelet. Default value is 2*dt.
    J : float, optional
        Number of scales less one. Scales range from s0 up to
        s0 * 2**(J * dj), which gives a total of (J + 1) scales.
        Default is J = (log2(N*dt/so))/dj.
    wavelet : instance of a wavelet class, optional
        Mother wavelet class. Default is Morlet wavelet.
        significance_level : float, optional
        Significance level to use. Default is 0.95.
    normalize : bool, optional
        If set to true, normalizes CWT by the standard deviation of
        the signals.

    Returns
    -------
    xwt (array like):
        Cross wavelet transform according to the selected mother wavelet.
    x (array like):
        Intersected independent variable.
    coi (array like):
        Cone of influence, which is a vector of N points containing
        the maximum Fourier period of useful information at that
        particular time. Periods greater than those are subject to
        edge effects.
    freqs (array like):
        Vector of Fourier equivalent frequencies (in 1 / time units)
        that correspond to the wavelet scales.
    signif (array like):
        Significance levels as a function of scale.

    """
    if isinstance(wavelet, unicode) or isinstance(wavelet, str):
        wavelet = mothers[wavelet]()

    # Defines some parameters like length of both time-series, time step
    # and calculates the standard deviation for normalization and statistical
    # significance tests.
    signal = np.asarray(signal)
    signal2 = np.asarray(signal2)

    if normalize:
        std1 = signal.std()
        std2 = signal2.std()
    else:
        std1 = std2 = 1

    # Calculates the CWT of the time-series making sure the same parameters
    # are used in both calculations.

    cwt_kwargs = dict(dt=dt, dj=dj, s0=s0, J=J, wavelet=wavelet)
    W1, sj, freq, coi, _, _ = cwt(signal/std1, **cwt_kwargs)
    W2, sj, freq, coi, _, _ = cwt(signal2/std2, **cwt_kwargs)

    # Now the cross correlation of y1 and y2
    W12 = W1 * W2.conj()

    # And the significance tests. Note that the confidence level is calculated
    # using the percent point function (PPF) of the chi-squared cumulative
    # distribution function (CDF) instead of using Z1(95%) = 2.182 and
    # Z2(95%)=3.999 as suggested by Torrence & Compo (1998) and Grinsted et
    # al. (2004). If the CWT has been normalized, then std1 and std2 should
    # be reset to unity, otherwise the standard deviation of both series have
    # to be calculated.
    if not normalize:
        std1 = signal.std()
        std2 = signal2.std()
    else:
        std1 = std2 = 1.
    a1, _, _ = ar1(signal)
    a2, _, _ = ar1(signal2)
    Pk1 = ar1_spectrum(freq * dt, a1)
    Pk2 = ar1_spectrum(freq * dt, a2)
    dof = wavelet.dofmin
    PPF = chi2.ppf(significance_level, dof)
    signif = (std1 * std2 * (Pk1 * Pk2) ** 0.5 * PPF / dof)

    # The resuts:
    return W12, coi, freq, signif

Example 7

Project: statsmodels
Source File: aft_el.py
View license
    def ci_beta(self, param_num, beta_high, beta_low, sig=.05):
        """
        Returns the confidence interval for a regression
        parameter in the AFT model.

        Parameters
        ---------

        param_num: int
            Parameter number of interest

        beta_high: float
            Upper bound for the confidence interval

        beta_low:
            Lower bound for the confidence interval

        sig: float, optional
            Significance level.  Default is .05

        Notes
        ----
        If the function returns f(a) and f(b) must have different signs,
        consider widening the search area by adjusting beta_low and
        beta_high.

        Also note that this process is computational intensive.  There
        are 4 levels of optimization/solving.  From outer to inner:

        1) Solving so that llr-critical value = 0
        2) maximizing over nuisance parameters
        3) Using  EM at each value of nuisamce parameters
        4) Using the _modified_Newton optimizer at each iteration
           of the EM algorithm.

        Also, for very unlikely nuisance parameters, it is possible for
        the EM algorithm to not converge.  This is not an indicator
        that the solver did not find the correct solution.  It just means
        for a specific iteration of the nuisance parameters, the optimizer
        was unable to converge.

        If the user desires to verify the success of the optimization,
        it is recommended to test the limits using test_beta.

        """
        params = self.params()
        self.r0 = chi2.ppf(1 - sig, 1)
        ll = optimize.brentq(self._ci_limits_beta, beta_low,
                             params[param_num], (param_num))
        ul = optimize.brentq(self._ci_limits_beta,
                             params[param_num], beta_high, (param_num))
        return ll, ul

Example 8

Project: statsmodels
Source File: descriptive.py
View license
    def ci_mean(self, sig=.05, method='gamma', epsilon=10 ** -8,
                 gamma_low=-10 ** 10, gamma_high=10 ** 10):
        """
        Returns the confidence interval for the mean.

        Parameters
        ----------
        sig : float
            significance level. Default is .05

        method : str
            Root finding method,  Can be 'nested-brent' or
            'gamma'.  Default is 'gamma'

            'gamma' Tries to solve for the gamma parameter in the
            Lagrange (see Owen pg 22) and then determine the weights.

            'nested brent' uses brents method to find the confidence
            intervals but must maximize the likelihhod ratio on every
            iteration.

            gamma is generally much faster.  If the optimizations does not
            converge, try expanding the gamma_high and gamma_low
            variable.

        gamma_low : float
            Lower bound for gamma when finding lower limit.
            If function returns f(a) and f(b) must have different signs,
            consider lowering gamma_low.

        gamma_high : float
            Upper bound for gamma when finding upper limit.
            If function returns f(a) and f(b) must have different signs,
            consider raising gamma_high.

        epsilon : float
            When using 'nested-brent', amount to decrease (increase)
            from the maximum (minimum) of the data when
            starting the search.  This is to protect against the
            likelihood ratio being zero at the maximum (minimum)
            value of the data.  If data is very small in absolute value
            (<10 ``**`` -6) consider shrinking epsilon

            When using 'gamma', amount to decrease (increase) the
            minimum (maximum) by to start the search for gamma.
            If fucntion returns f(a) and f(b) must have differnt signs,
            consider lowering epsilon.

        Returns
        -------
        Interval : tuple
            Confidence interval for the mean
        """
        endog = self.endog
        sig = 1 - sig
        if method == 'nested-brent':
            self.r0 = chi2.ppf(sig, 1)
            middle = np.mean(endog)
            epsilon_u = (max(endog) - np.mean(endog)) * epsilon
            epsilon_l = (np.mean(endog) - min(endog)) * epsilon
            ulim = optimize.brentq(self._ci_limits_mu, middle,
                max(endog) - epsilon_u)
            llim = optimize.brentq(self._ci_limits_mu, middle,
                min(endog) + epsilon_l)
            return  llim, ulim

        if method == 'gamma':
            self.r0 = chi2.ppf(sig, 1)
            gamma_star_l = optimize.brentq(self._find_gamma, gamma_low,
                min(endog) - epsilon)
            gamma_star_u = optimize.brentq(self._find_gamma, \
                         max(endog) + epsilon, gamma_high)
            weights_low = ((endog - gamma_star_l) ** -1) / \
                np.sum((endog - gamma_star_l) ** -1)
            weights_high = ((endog - gamma_star_u) ** -1) / \
                np.sum((endog - gamma_star_u) ** -1)
            mu_low = np.sum(weights_low * endog)
            mu_high = np.sum(weights_high * endog)
            return mu_low,  mu_high

Example 9

Project: statsmodels
Source File: descriptive.py
View license
    def ci_var(self, lower_bound=None, upper_bound=None, sig=.05):
        """
        Returns the confidence interval for the variance.

        Parameters
        ----------
        lower_bound : float
            The minimum value the lower confidence interval can
            take. The p-value from test_var(lower_bound) must be lower
            than 1 - significance level. Default is .99 confidence
            limit assuming normality

        upper_bound : float
            The maximum value the upper confidence interval
            can take. The p-value from test_var(upper_bound) must be lower
            than 1 - significance level.  Default is .99 confidence
            limit assuming normality

        sig : float
            The significance level. Default is .05

        Returns
        --------
        Interval : tuple
            Confidence interval for the variance

        Examples
        --------
        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> random_numbers = np.random.standard_normal(100)
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> el_analysis.ci_var()
        (0.7539322567470305, 1.229998852496268)
        >>> el_analysis.ci_var(.5, 2)
        (0.7539322567469926, 1.2299988524962664)

        Notes
        -----
        If the function returns the error f(a) and f(b) must have
        different signs, consider lowering lower_bound and raising
        upper_bound.
        """
        endog = self.endog
        if upper_bound is None:
            upper_bound = ((self.nobs - 1) * endog.var()) / \
              (chi2.ppf(.0001, self.nobs - 1))
        if lower_bound is None:
            lower_bound = ((self.nobs - 1) * endog.var()) / \
              (chi2.ppf(.9999, self.nobs - 1))
        self.r0 = chi2.ppf(1 - sig, 1)
        llim = optimize.brentq(self._ci_limits_var, lower_bound, endog.var())
        ulim = optimize.brentq(self._ci_limits_var, endog.var(), upper_bound)
        return llim, ulim

Example 10

Project: statsmodels
Source File: descriptive.py
View license
    def ci_skew(self, sig=.05, upper_bound=None, lower_bound=None):
        """
        Returns the confidence interval for skewness.

        Parameters
        ----------
        sig : float
            The significance level.  Default is .05

        upper_bound : float
            Maximum value of skewness the upper limit can be.
            Default is .99 confidence limit assuming normality.

        lower_bound : float
            Minimum value of skewness the lower limit can be.
            Default is .99 confidence level assuming normality.

        Returns
        -------
        Interval : tuple
            Confidence interval for the skewness

        Notes
        -----
        If function returns f(a) and f(b) must have different signs, consider
        expanding lower and upper bounds
        """
        nobs = self.nobs
        endog = self.endog
        if upper_bound is None:
            upper_bound = skew(endog) + \
            2.5 * ((6. * nobs * (nobs - 1.)) / \
              ((nobs - 2.) * (nobs + 1.) * \
               (nobs + 3.))) ** .5
        if lower_bound is None:
            lower_bound = skew(endog) - \
            2.5 * ((6. * nobs * (nobs - 1.)) / \
              ((nobs - 2.) * (nobs + 1.) * \
               (nobs + 3.))) ** .5
        self.r0 = chi2.ppf(1 - sig, 1)
        llim = optimize.brentq(self._ci_limits_skew, lower_bound, skew(endog))
        ulim = optimize.brentq(self._ci_limits_skew, skew(endog), upper_bound)
        return   llim, ulim

Example 11

Project: statsmodels
Source File: descriptive.py
View license
    def ci_kurt(self, sig=.05, upper_bound=None, lower_bound=None):
        """
        Returns the confidence interval for kurtosis.

        Parameters
        ----------

        sig : float
            The significance level.  Default is .05

        upper_bound : float
            Maximum value of kurtosis the upper limit can be.
            Default is .99 confidence limit assuming normality.

        lower_bound : float
            Minimum value of kurtosis the lower limit can be.
            Default is .99 confidence limit assuming normality.

        Returns
        --------
        Interval : tuple
            Lower and upper confidence limit

        Notes
        -----
        For small n, upper_bound and lower_bound may have to be
        provided by the user.  Consider using test_kurt to find
        values close to the desired significance level.

        If function returns f(a) and f(b) must have different signs, consider
        expanding the bounds.
        """
        endog = self.endog
        nobs = self.nobs
        if upper_bound is None:
            upper_bound = kurtosis(endog) + \
            (2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \
              ((nobs - 2.) * (nobs + 1.) * \
               (nobs + 3.))) ** .5) * \
               (((nobs ** 2.) - 1.) / ((nobs - 3.) *\
                 (nobs + 5.))) ** .5)
        if lower_bound is None:
            lower_bound = kurtosis(endog) - \
            (2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \
              ((nobs - 2.) * (nobs + 1.) * \
               (nobs + 3.))) ** .5) * \
               (((nobs ** 2.) - 1.) / ((nobs - 3.) *\
                 (nobs + 5.))) ** .5)
        self.r0 = chi2.ppf(1 - sig, 1)
        llim = optimize.brentq(self._ci_limits_kurt, lower_bound, \
                             kurtosis(endog))
        ulim = optimize.brentq(self._ci_limits_kurt, kurtosis(endog), \
                             upper_bound)
        return   llim, ulim

Example 12

Project: statsmodels
Source File: descriptive.py
View license
    def ci_corr(self, sig=.05, upper_bound=None, lower_bound=None):
        """
        Returns the confidence intervals for the correlation coefficient

        Parameters
        ----------
        sig : float
            The significance level.  Default is .05

        upper_bound : float
            Maximum value the upper confidence limit can be.
            Default is  99% confidence limit assuming normality.

        lower_bound : float
            Minimum value the lower condidence limit can be.
            Default is 99% confidence limit assuming normality.

        Returns
        -------
        interval : tuple
            Confidence interval for the correlation

        """
        endog = self.endog
        nobs = self.nobs
        self.r0 = chi2.ppf(1 - sig, 1)
        point_est = np.corrcoef(endog[:, 0], endog[:, 1])[0, 1]
        if upper_bound is None:
            upper_bound = min(.999, point_est + \
                          2.5 * ((1. - point_est ** 2.) / \
                          (nobs - 2.)) ** .5)

        if lower_bound is None:
            lower_bound = max(- .999, point_est - \
                          2.5 * (np.sqrt((1. - point_est ** 2.) / \
                          (nobs - 2.))))

        llim = optimize.brenth(self._ci_limits_corr, lower_bound, point_est)
        ulim = optimize.brenth(self._ci_limits_corr, point_est, upper_bound)
        return llim, ulim

Example 13

Project: statsmodels
Source File: originregress.py
View license
    def conf_int_el(self, param_num, upper_bound=None,
                       lower_bound=None, sig=.05, method='nm',
                       stochastic_exog=1):
        """
        Returns the confidence interval for a regression parameter when the
        regression is forced through the origin

        Parameters
        ----------
        param_num : int
            The parameter number to be tested.  Note this uses python
            indexing but the '0' parameter refers to the intercept term

        upper_bound : float
            The maximum value the upper confidence limit can be.  The
            closer this is to the confidence limit, the quicker the
            computation.  Default is .00001 confidence limit under normality

        lower_bound : float
            The minimum value the lower confidence limit can be.
            Default is .00001 confidence limit under normality

        sig : float, optional
            The significance level.  Default .05

        method : str, optional
             Algorithm to optimize of nuisance params.  Can be 'nm' or
            'powell'.  Default is 'nm'.

        Returns
        -------
        ci: tuple
            The confidence interval for the parameter 'param_num'
        """
        r0 = chi2.ppf(1 - sig, 1)
        param_num = np.array([param_num])
        if upper_bound is None:
            upper_bound = (np.squeeze(self.model.fit().
                                      conf_int(.0001)[param_num])[1])
        if lower_bound is None:
            lower_bound = (np.squeeze(self.model.fit().conf_int(.00001)
                                      [param_num])[0])
        f = lambda b0:  self.el_test(np.array([b0]), param_num,
                                     method=method,
                                 stochastic_exog=stochastic_exog)[0] - r0
        lowerl = optimize.brentq(f, lower_bound, self.params[param_num])
        upperl = optimize.brentq(f, self.params[param_num], upper_bound)
        return (lowerl, upperl)

Example 14

Project: statsmodels
Source File: proportion.py
View license
def multinomial_proportions_confint(counts, alpha=0.05, method='goodman'):
    '''Confidence intervals for multinomial proportions.

    Parameters
    ----------
    counts : array_like of int, 1-D
        Number of observations in each category.
    alpha : float in (0, 1), optional
        Significance level, defaults to 0.05.
    method : {'goodman', 'sison-glaz'}, optional
        Method to use to compute the confidence intervals; available methods
        are:

         - `goodman`: based on a chi-squared approximation, valid if all
           values in `counts` are greater or equal to 5 [2]_
         - `sison-glaz`: less conservative than `goodman`, but only valid if
           `counts` has 7 or more categories (``len(counts) >= 7``) [3]_

    Returns
    -------
    confint : ndarray, 2-D
        Array of [lower, upper] confidence levels for each category, such that
        overall coverage is (approximately) `1-alpha`.

    Raises
    ------
    ValueError
        If `alpha` is not in `(0, 1)` (bounds excluded), or if the values in
        `counts` are not all positive or null.
    NotImplementedError
        If `method` is not kown.
    Exception
        When ``method == 'sison-glaz'``, if for some reason `c` cannot be
        computed; this signals a bug and should be reported.

    Notes
    -----
    The `goodman` method [2]_ is based on approximating a statistic based on
    the multinomial as a chi-squared random variable. The usual recommendation
    is that this is valid if all the values in `counts` are greater than or
    equal to 5. There is no condition on the number of categories for this
    method.

    The `sison-glaz` method [3]_ approximates the multinomial probabilities,
    and evaluates that with a maximum-likelihood estimator. The first
    approximation is an Edgeworth expansion that converges when the number of
    categories goes to infinity, and the maximum-likelihood estimator converges
    when the number of observations (``sum(counts)``) goes to infinity. In
    their paper, Sison & Glaz demo their method with at least 7 categories, so
    ``len(counts) >= 7`` with all values in `counts` at or above 5 can be used
    as a rule of thumb for the validity of this method. This method is less
    conservative than the `goodman` method (i.e. it will yield confidence
    intervals closer to the desired significance level), but produces
    confidence intervals of uniform width over all categories (except when the
    intervals reach 0 or 1, in which case they are truncated), which makes it
    most useful when proportions are of similar magnitude.

    Aside from the original sources ([1]_, [2]_, and [3]_), the implementation
    uses the formulas (though not the code) presented in [4]_ and [5]_.

    References
    ----------
    .. [1] Levin, Bruce, "A representation for multinomial cumulative
           distribution functions," The Annals of Statistics, Vol. 9, No. 5,
           1981, pp. 1123-1126.

    .. [2] Goodman, L.A., "On simultaneous confidence intervals for multinomial
           proportions," Technometrics, Vol. 7, No. 2, 1965, pp. 247-254.

    .. [3] Sison, Cristina P., and Joseph Glaz, "Simultaneous Confidence
           Intervals and Sample Size Determination for Multinomial
           Proportions," Journal of the American Statistical Association,
           Vol. 90, No. 429, 1995, pp. 366-369.

    .. [4] May, Warren L., and William D. Johnson, "A SAS® macro for
           constructing simultaneous confidence intervals  for multinomial
           proportions," Computer methods and programs in Biomedicine, Vol. 53,
           No. 3, 1997, pp. 153-162.

    .. [5] May, Warren L., and William D. Johnson, "Constructing two-sided
           simultaneous confidence intervals for multinomial proportions for
           small counts in a large number of cells," Journal of Statistical
           Software, Vol. 5, No. 6, 2000, pp. 1-24.
    '''
    if alpha <= 0 or alpha >= 1:
        raise ValueError('alpha must be in (0, 1), bounds excluded')
    counts = np.array(counts, dtype=np.float)
    if (counts < 0).any():
        raise ValueError('counts must be >= 0')

    n = counts.sum()
    k = len(counts)
    proportions = counts / n
    if method == 'goodman':
        chi2 = stats.chi2.ppf(1 - alpha / k, 1)
        delta = chi2 ** 2 + (4 * n * proportions * chi2 * (1 - proportions))
        region = ((2 * n * proportions + chi2 +
                   np.array([- np.sqrt(delta), np.sqrt(delta)])) /
                  (2 * (chi2 + n))).T
    elif method[:5] == 'sison':  # We accept any name starting with 'sison'
        # Define a few functions we'll use a lot.
        def poisson_interval(interval, p):
            """Compute P(b <= Z <= a) where Z ~ Poisson(p) and
            `interval = (b, a)`."""
            b, a = interval
            prob = stats.poisson.cdf(a, p) - stats.poisson.cdf(b - 1, p)
            if p == 0 and np.isnan(prob):
                # hack for older scipy <=0.16.1
                return int(b - 1 < 0)
            return prob

        def truncated_poisson_factorial_moment(interval, r, p):
            """Compute mu_r, the r-th factorial moment of a poisson random
            variable of parameter `p` truncated to `interval = (b, a)`."""
            b, a = interval
            return p ** r * (1 - ((poisson_interval((a - r + 1, a), p) -
                                   poisson_interval((b - r, b - 1), p)) /
                                  poisson_interval((b, a), p)))

        def edgeworth(intervals):
            """Compute the Edgeworth expansion term of Sison & Glaz's formula
            (1) (approximated probability for multinomial proportions in a
            given box)."""
            # Compute means and central moments of the truncated poisson
            # variables.
            mu_r1, mu_r2, mu_r3, mu_r4 = [
                np.array([truncated_poisson_factorial_moment(interval, r, p)
                          for (interval, p) in zip(intervals, counts)])
                for r in range(1, 5)
            ]
            mu = mu_r1
            mu2 = mu_r2 + mu - mu ** 2
            mu3 = mu_r3 + mu_r2 * (3 - 3 * mu) + mu - 3 * mu ** 2 + 2 * mu ** 3
            mu4 = (mu_r4 + mu_r3 * (6 - 4 * mu) +
                   mu_r2 * (7 - 12 * mu + 6 * mu ** 2) +
                   mu - 4 * mu ** 2 + 6 * mu ** 3 - 3 * mu ** 4)

            # Compute expansion factors, gamma_1 and gamma_2.
            g1 = mu3.sum() / mu2.sum() ** 1.5
            g2 = (mu4.sum() - 3 * (mu2 ** 2).sum()) / mu2.sum() ** 2

            # Compute the expansion itself.
            x = (n - mu.sum()) / np.sqrt(mu2.sum())
            phi = np.exp(- x ** 2 / 2) / np.sqrt(2 * np.pi)
            H3 = x ** 3 - 3 * x
            H4 = x ** 4 - 6 * x ** 2 + 3
            H6 = x ** 6 - 15 * x ** 4 + 45 * x ** 2 - 15
            f = phi * (1 + g1 * H3 / 6 + g2 * H4 / 24 + g1 ** 2 * H6 / 72)
            return f / np.sqrt(mu2.sum())


        def approximated_multinomial_interval(intervals):
            """Compute approximated probability for Multinomial(n, proportions)
            to be in `intervals` (Sison & Glaz's formula (1))."""
            return np.exp(
                np.sum(np.log([poisson_interval(interval, p)
                               for (interval, p) in zip(intervals, counts)])) +
                np.log(edgeworth(intervals)) -
                np.log(stats.poisson._pmf(n, n))
            )

        def nu(c):
            """Compute interval coverage for a given `c` (Sison & Glaz's
            formula (7))."""
            return approximated_multinomial_interval(
                [(np.maximum(count - c, 0), np.minimum(count + c, n))
                 for count in counts])

        # Find the value of `c` that will give us the confidence intervals
        # (solving nu(c) <= 1 - alpha < nu(c + 1).
        c = 1.0
        nuc = nu(c)
        nucp1 = nu(c + 1)
        while not (nuc <= (1 - alpha) < nucp1):
            if c > n:
                raise Exception("Couldn't find a value for `c` that "
                                "solves nu(c) <= 1 - alpha < nu(c + 1)")
            c += 1
            nuc = nucp1
            nucp1 = nu(c + 1)

        # Compute gamma and the corresponding confidence intervals.
        g = (1 - alpha - nuc) / (nucp1 - nuc)
        ci_lower = np.maximum(proportions - c / n, 0)
        ci_upper = np.minimum(proportions + (c + 2 * g) / n, 1)
        region = np.array([ci_lower, ci_upper]).T
    else:
        raise NotImplementedError('method "%s" is not available' % method)
    return region