numpy.bincount

The following are examples of the Python API numpy.bincount, taken from open-source projects.
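
For orientation before the project code, here is a minimal standalone sketch of what numpy.bincount does: it counts how many times each non-negative integer occurs in an array, can sum per-element weights instead of plain counts, and can pad the result to a minimum number of bins. The values below are illustrative.

import numpy as np

x = np.array([0, 1, 1, 3, 2, 1, 7])

# Plain counts: out[i] is the number of occurrences of i in x.
print(np.bincount(x))                               # [1 3 1 1 0 0 0 1]

# Weighted counts: out[i] is the sum of weights where x == i.
w = np.array([0.5, 1.0, 1.0, 2.0, 0.5, 1.0, 0.25])
print(np.bincount(x, weights=w))                    # [0.5  3.   0.5  2.   0.   0.   0.   0.25]

# minlength pads the result with zeros to at least that many bins.
print(np.bincount(np.array([1, 1]), minlength=5))   # [0 2 0 0 0]

Most of the examples below are variations on these three forms.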

Example 1

Project: hyperopt Source File: test_pchoice.py
    def test_random(self):
        # test that that a space with a pchoice in it is
        # (a) accepted by tpe.suggest and
        # (b) handled correctly.
        N = 150
        fmin(self.objective,
            space=self.space,
            trials=self.trials,
            algo=rand.suggest,
            max_evals=N)

        a_vals = [t['misc']['vals']['a'][0] for t in self.trials.trials]
        counts = np.bincount(a_vals)
        print counts
        assert counts[3] > N * .35
        assert counts[3] < N * .60

Example 2

Project: horizont Source File: test_utils.py
    def test_matrix_to_lists(self):
        dtm, D, N_WORDS_PER_DOC = self.dtm, self.D, self.N_WORDS_PER_DOC
        N_BY_D, N_BY_W = self.N_BY_D, self.N_BY_W
        WS, DS = utils.matrix_to_lists(dtm)
        self.assertEqual(len(WS), D * N_WORDS_PER_DOC)
        self.assertEqual(len(WS), len(DS))
        self.assertEqual(dtm.shape, (max(DS) + 1, max(WS) + 1))
        self.assertTrue(all(DS == sorted(DS)))
        self.assertTrue(np.all(np.bincount(DS) == N_BY_D))
        self.assertTrue(np.all(np.bincount(WS) == N_BY_W))

Example 3

Project: info-flow-experiments Source File: permutation_test.py
def perm_unique(elements):
    bins = np.bincount(elements)
    listunique = []
    for i in range(0, len(bins)):
        listunique.append(UniqueElement(i, bins[i]))
    u=len(elements)
    return perm_unique_helper(listunique,[0]*u,u-1)

Example 4

Project: kaggle-right-whale Source File: train_model.py
def filter_by_min_occ(X, y, min_occ):
    occs = np.bincount(y)
    mask = np.zeros_like(y).astype(bool)

    for i, occ in enumerate(occs):
        if occ == min_occ:
            mask[y == i] = True

    return X[mask], y[mask]

Example 5

Project: mondrianforest Source File: mondrianforest_utils.py
def update_posterior_node_incremental(tree, data, param, settings, cache, node_id, train_ids_new):
    y_train_new = data['y_train'][train_ids_new]
    if settings.optype == 'class':
        tree.counts[node_id] += np.bincount(y_train_new, minlength=data['n_class'])
    else:
        sum_y_new, sum_y2_new, n_points_new = get_reg_stats(y_train_new)
        tree.sum_y[node_id] += sum_y_new
        tree.sum_y2[node_id] += sum_y2_new
        tree.n_points[node_id] += n_points_new

Example 6

Project: scikit-learn Source File: test_dummy.py
def test_uniform_strategy():
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]
    clf = DummyClassifier(strategy="uniform", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))
    assert_almost_equal(p[1], 0.5, decimal=1)
    assert_almost_equal(p[2], 0.5, decimal=1)
    _check_predict_proba(clf, X, y)
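
The pattern in this test, dividing a bincount by the number of samples, gives the empirical class distribution of the predictions. A small self-contained sketch with a made-up label vector (y_pred below is illustrative, not output of scikit-learn):

import numpy as np

y_pred = np.array([1, 2, 1, 1, 2, 2, 1, 1])   # hypothetical predicted labels

# Fraction of predictions assigned to each class.
p = np.bincount(y_pred) / float(len(y_pred))
print(p)                                      # [0.    0.625 0.375]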

Example 7

Project: klustaviewa Source File: ccg.py
def _increment(arr, indices):
    """Increment some indices in a 1D vector of non-negative integers.
    Repeated indices are taken into account."""
    arr = _as_array(arr)
    indices = _as_array(indices)
    bbins = np.bincount(indices)
    arr[:len(bbins)] += bbins
    return arr
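
The _increment helper above is essentially a scatter-add: bincount collapses repeated indices into per-index counts, which are then added to a slice of the target array. A minimal sketch of the same idea with illustrative values:

import numpy as np

arr = np.zeros(6, dtype=int)
indices = np.array([0, 2, 2, 5, 2])

bbins = np.bincount(indices)       # [1 0 3 0 0 1]
arr[:len(bbins)] += bbins
print(arr)                         # [1 0 3 0 0 1]

# np.add.at(arr, indices, 1) performs an equivalent in-place scatter-add.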

Example 8

Project: statsmodels Source File: grouputils.py
    def count_categories(self, level=0):
        """
        Sets the attribute counts to equal the bincount of the (integer-valued)
        labels.
        """
        # TODO: refactor this not to set an attribute. Why would we do this?
        self.counts = np.bincount(self.labels[level])

Example 9

Project: implicit Source File: lastfm.py
def bm25_weight(X, K1=100, B=0.8):
    """ Weighs each row of the sparse matrix of the data by BM25 weighting """
    # calculate idf per term (user)
    X = coo_matrix(X)
    N = X.shape[0]
    idf = numpy.log(float(N) / (1 + numpy.bincount(X.col)))

    # calculate length_norm per document (artist)
    row_sums = numpy.ravel(X.sum(axis=1))
    average_length = row_sums.mean()
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25
    X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
    return X
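
In bm25_weight, numpy.bincount(X.col) counts how many nonzero entries fall in each column of the COO matrix, i.e. the document frequency of each term. A sketch on a toy sparse matrix (the values are illustrative):

import numpy as np
from scipy.sparse import coo_matrix

X = coo_matrix(np.array([[1, 0, 2],
                         [0, 3, 4],
                         [5, 0, 6]]))

# Number of nonzero entries per column (bincount over the column indices).
col_counts = np.bincount(X.col, minlength=X.shape[1])
print(col_counts)                                   # [2 1 3]

idf = np.log(float(X.shape[0]) / (1 + col_counts))
print(idf)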

Example 10

Project: BuildingMachineLearningSystemsWithPython Source File: chapter.py
Function: chist
def chist(im):
    im = im // 64
    r,g,b = im.transpose((2,0,1))
    pixels = 1 * r + 4 * b + 16 * g
    hist = np.bincount(pixels.ravel(), minlength=64)
    hist = hist.astype(float)
    hist = np.log1p(hist)
    return hist
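
chist quantizes each RGB channel to four levels, encodes each pixel as a single integer in [0, 64), and uses bincount with minlength=64 so every bin is present even when empty. A sketch on a random array standing in for a real image:

import numpy as np

im = np.random.randint(0, 256, size=(32, 32, 3))   # fake RGB image

im = im // 64                        # quantize each channel to 0..3
r, g, b = im.transpose((2, 0, 1))
pixels = 1 * r + 4 * b + 16 * g      # encode each pixel as one value in 0..63

hist = np.bincount(pixels.ravel(), minlength=64)
print(hist.shape, hist.sum())        # (64,) 1024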

Example 11

Project: scikit-learn Source File: test_dummy.py
def test_stratified_strategy():
    X = [[0]] * 5  # ignored
    y = [1, 2, 1, 1, 2]
    clf = DummyClassifier(strategy="stratified", random_state=0)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))
    assert_almost_equal(p[1], 3. / 5, decimal=1)
    assert_almost_equal(p[2], 2. / 5, decimal=1)
    _check_predict_proba(clf, X, y)

Example 12

Project: pyhawkes Source File: network.py
    def resample_m(self):
        """
        Resample m given c and pi
        """
        pi = self.pi + np.bincount(self.c, minlength=self.C)
        self.m = np.random.dirichlet(pi)
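
resample_m tallies how many elements are assigned to each of C blocks with bincount, adds the counts to the Dirichlet prior, and draws new block weights. A self-contained sketch with illustrative prior and assignments:

import numpy as np

C = 4                                   # number of blocks
pi_prior = np.ones(C)                   # symmetric Dirichlet prior
c = np.array([0, 2, 2, 1, 2, 0])        # current block assignments

posterior = pi_prior + np.bincount(c, minlength=C)
m = np.random.dirichlet(posterior)      # new block probabilities
print(posterior, m.sum())               # [3. 2. 4. 1.]  ~1.0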

Example 13

Project: polara Source File: data.py
    @staticmethod
    def is_not_uniform(idx, nbins=10, allowed_gap=0.75):
        idx_bins = pd.cut(idx, bins=nbins, labels=False)
        idx_bin_size = np.bincount(idx_bins)

        diff = idx_bin_size[:-1] - idx_bin_size[1:]
        monotonic = (diff < 0).all() or (diff > 0).all()
        huge_gap = (idx_bin_size.min()*1.0 / idx_bin_size.max()) < allowed_gap
        return monotonic or huge_gap

Example 14

Project: discrete_sieve Source File: plot_ica.py
Function: discretize
def discretize(data, k=2):
    ranks = rankdata(data, method='dense').astype(int) - 1
    j = 1
    while len(np.bincount(ranks / j)) > k:
        j += 1
    return ranks / j

Example 15

Project: attention-lvcsr Source File: extra_ops.py
    def perform(self, node, inputs, output_storage):
        x = inputs[0]
        weights = inputs[1]
        z = output_storage[0]

        if weights is not None and weights.shape != x.shape:
            raise TypeError("All inputs must have the same shape.")

        # Needed for numpy 1.4.1 compatibility
        if self.minlength:
            out = np.bincount(x, weights=weights, minlength=self.minlength)
        else:
            out = np.bincount(x, weights=weights)

        z[0] = theano._asarray(out, dtype=node.outputs[0].dtype)

Example 16

Project: hyperopt Source File: test_pchoice.py
    def test_anneal(self):
        N = 100
        fmin(self.objective,
            space=self.space,
            trials=self.trials,
            algo=partial(anneal.suggest),
            max_evals=N)

        a_vals = [t['misc']['vals']['a'][0] for t in self.trials.trials]
        counts = np.bincount(a_vals)
        print counts
        assert counts[3] > N * .6

Example 17

Project: pyspatial Source File: test_raster_query.py
Function: compute_stats
def compute_stats(values, weights):
    # values = np.array(row, dtype=np.uint8)
    counts = np.bincount(values, weights=weights, minlength=256)
    zeros = np.zeros(counts.shape)
    total = 1. * np.sum(counts)
    return counts/total if total > 0 else zeros

Example 18

Project: discrete_sieve Source File: sieve.py
Function: fit
    def fit(self, x):
        n_samples, self.n_variables = x.shape

        while len(self.layers) < self.max_layers:
            next_layer = SieveLayer(x, **self.kwargs)
            x = next_layer.transform(x)
            if self.verbose:
                print 'tc: %0.3f, (+) %0.3f, (-) %0.3f' % (next_layer.corex.tc, next_layer.ub, next_layer.lb)
            #if next_layer.corex.tc - 2 * next_layer.ub - next_layer.lb > 1. / n_samples:  # Lower bound still increasing
            if next_layer.corex.tc - next_layer.lb > 1. / n_samples:  # Lower bound still increasing
                self.layers.append(next_layer)
                self.x_stats = [np.bincount(x[x[:, i] >= 0, i]) for i in range(self.n_variables)]
            else:
                break

        if self.verbose:
            print ['tc: %0.3f (-) %0.3f (+) %0.3f' % (layer.corex.tc, layer.lb, layer.ub) for layer in self.layers]
        return self

Example 19

Project: peas Source File: checkers.py
def gamefitness(game):
    """ Returns the fitness of
        the black player. (according to {gauci2008case}) """
    counts = np.bincount(game.board.flat)
    return (100 + 2 * counts[BLACK|MAN] + 3 * counts[BLACK|KING] + 
            2 * (12 - counts[WHITE|MAN] + 3 * (12 - counts[WHITE|KING])))

Example 20

Project: lhcb_trigger_ml Source File: uboost.py
def generate_mask(n_samples, bagging=True, random_generator=np.random):
    """bagging: float or bool (default=True), bagging usually
        speeds up the convergence and prevents overfitting
        (see http://en.wikipedia.org/wiki/Bootstrap_aggregating)
        if True, usual bootstrap aggregating is used
           (sampling with replacement at each iteration, size=len(X))
        if float, used sampling without replacement, the size of generated
           set is bagging * len(X)
        if False, returns ones for all events."""
    if bagging is True:
        indices = random_generator.randint(0, n_samples, size=n_samples)
        mask = np.bincount(indices, minlength=n_samples)
    elif isinstance(bagging, float):
        mask = random_generator.uniform(size=n_samples) > 1. - bagging
    elif bagging is False:
        mask = np.ones(n_samples, dtype='float')
    else:
        raise ValueError("something wrong was passed as bagging")
    return mask
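
The bagging branch above draws n_samples indices with replacement and turns them into per-sample multiplicities with bincount(minlength=n_samples); the result then serves as sample weights. A minimal sketch of that mask:

import numpy as np

n_samples = 8
rng = np.random.RandomState(0)

indices = rng.randint(0, n_samples, size=n_samples)   # bootstrap draw with replacement
mask = np.bincount(indices, minlength=n_samples)

print(indices)
print(mask)        # how many times each sample was drawn; mask.sum() == n_samples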

Example 21

Project: scikit-learn Source File: fixes.py
Function: bin_count
    def bincount(x, weights=None, minlength=None):
        if len(x) > 0:
            return np.bincount(x, weights, minlength)
        else:
            if minlength is None:
                minlength = 0
            minlength = np.asscalar(np.asarray(minlength, dtype=np.intp))
            return np.zeros(minlength, dtype=np.intp)

Example 22

Project: bhmm Source File: generic_hmm.py
    def count_init(self):
        """Compute the counts at the first time step

        Returns
        -------
        n : ndarray(nstates)
            n[i] is the number of trajectories starting in state i

        """
        if self.hidden_state_trajectories is None:
            raise RuntimeError('HMM model does not have a hidden state trajectory.')

        n = [traj[0] for traj in self.hidden_state_trajectories]
        return np.bincount(n, minlength=self.nstates)

Example 23

Project: hyperopt Source File: test_pchoice.py
Function: test_tpe
    def test_tpe(self):
        N = 100
        fmin(self.objective,
            space=self.space,
            trials=self.trials,
            algo=partial(tpe.suggest, n_startup_jobs=10),
            max_evals=N)

        a_vals = [t['misc']['vals']['a'][0] for t in self.trials.trials]
        counts = np.bincount(a_vals)
        print counts
        assert counts[3] > N * .6

Example 24

Project: scipy Source File: compressed.py
Function: getnnz
    def getnnz(self, axis=None):
        if axis is None:
            return int(self.indptr[-1])
        else:
            if axis < 0:
                axis += 2
            axis, _ = self._swap((axis, 1 - axis))
            _, N = self._swap(self.shape)
            if axis == 0:
                return np.bincount(downcast_intp_index(self.indices),
                                   minlength=N)
            elif axis == 1:
                return np.diff(self.indptr)
            raise ValueError('axis out of bounds')

Example 25

Project: pybasicbayes Source File: mixture.py
    @property
    def used_labels(self):
        if len(self.labels_list) > 0:
            label_usages = sum(np.bincount(l.z,minlength=self.N) for l in self.labels_list)
            used_labels, = np.where(label_usages > 0)
        else:
            used_labels = np.argsort(self.weights.weights)[-1:-11:-1]
        return used_labels

Example 26

Project: PyEMMA Source File: test_msm.py
    @classmethod
    def setUpClass(cls):
        import pyemma.datasets
        cls.dtraj = pyemma.datasets.load_2well_discrete().dtraj_T100K_dt10
        nu = 1.*np.bincount(cls.dtraj)        
        cls.statdist = nu/nu.sum()
        
        cls.tau = 10
        cls.msmrev = estimate_markov_model(cls.dtraj, cls.tau)
        cls.msmrevpi = estimate_markov_model(cls.dtraj, cls.tau,
                                             statdist=cls.statdist)
        cls.msm = estimate_markov_model(cls.dtraj, cls.tau, reversible=False)

        """Sparse"""
        cls.msmrev_sparse = estimate_markov_model(cls.dtraj, cls.tau, sparse=True)
        cls.msmrevpi_sparse = estimate_markov_model(cls.dtraj, cls.tau,
                                                    statdist=cls.statdist,
                                                    sparse=True)
        cls.msm_sparse = estimate_markov_model(cls.dtraj, cls.tau, reversible=False, sparse=True)

Example 27

Project: msmbuilder-legacy Source File: test_wrappers.py
    def test(self):
        args, metric = Cluster.parser.parse_args([
            '-p', get('points_on_cube/ProjectInfo.yaml', just_filename=True),
            '-o', self.td,
            'rmsd', '-a', get('points_on_cube/AtomIndices.dat', just_filename=True),
            'kcenters', '-k', '4'], print_banner=False)
        Cluster.main(args, metric)

        assignments = load(pjoin(self.td, 'Assignments.h5'))["arr_0"]
        assignment_counts = np.bincount(assignments.flatten())
        eq(assignment_counts, np.array([2, 2, 2, 2]))

        distances = load(pjoin(self.td, 'Assignments.h5.distances'))["arr_0"]
        eq(distances, np.zeros((1,8)))

Example 28

Project: peas Source File: checkers.py
Function: evaluate
    def evaluate(self, game):
        counts = np.bincount(game.board.flat)

        nwm = counts[WHITE|MAN]
        nwk = counts[WHITE|KING]
        nbm = counts[BLACK|MAN]
        nbk = counts[BLACK|KING]

        vb = (100 * nbm + 130 * nbk)
        vw = (100 * nwm + 130 * nwk)

        return vb - vw

Example 29

Project: APGL Source File: GraphStatistics.py
    def vectorStatistics(self, graph, treeStats=False, eigenStats=True):
        """
        Find a series of statistics for the given input graph which can be represented 
        as vector values.
        """
        Parameter.checkClass(graph, AbstractMatrixGraph)
        Parameter.checkBoolean(treeStats)
        statsDict = {}

        statsDict["inDegreeDist"] = graph.inDegreeDistribution()
        statsDict["outDegreeDist"] = graph.degreeDistribution()
        logging.debug("Computing hop counts")
        P = graph.findAllDistances(False)
        statsDict["hopCount"] = graph.hopCount(P)
        logging.debug("Computing triangle count")
        if graph.getNumVertices() != 0:
            statsDict["triangleDist"] = numpy.bincount(graph.triangleSequence())
        else:
            statsDict["triangleDist"] = numpy.array([])
        
        #Get the distribution of component sizes 
        logging.debug("Finding distribution of component sizes")
        
        if graph.isUndirected(): 
            components = graph.findConnectedComponents()
            if len(components) != 0: 
                statsDict["componentsDist"] = numpy.bincount(numpy.array([len(c) for c in components], numpy.int))

        #Make sure weight matrix is symmetric
        
        if graph.getNumVertices()!=0 and eigenStats:
            logging.debug("Computing eigenvalues/vectors")
            W = graph.getWeightMatrix()
            W = (W + W.T)/2
            eigenDistribution, V = numpy.linalg.eig(W)
            i = numpy.argmax(eigenDistribution)
            statsDict["maxEigVector"] = V[:, i]
            statsDict["eigenDist"] = numpy.flipud(numpy.sort(eigenDistribution[eigenDistribution>0]))
            gc.collect() 
        else:
            statsDict["maxEigVector"] = numpy.array([])
            statsDict["eigenDist"] = numpy.array([])

        if treeStats:
            logging.debug("Computing statistics on trees")
            trees = graph.findTrees()
            statsDict["treeSizesDist"] = numpy.bincount([len(x) for x in trees])
            treeDepths = [GraphUtils.treeDepth((graph.subgraph(x))) for x in trees]
            statsDict["treeDepthsDist"] = numpy.bincount(treeDepths)

        return statsDict

Example 30

Project: statsmodels Source File: grouputils.py
Function: counts
    def counts(self):
        return np.bincount(self.group_int)

Example 31

Project: variational-text-tensorflow Source File: reader.py
Function: one_hot
  def onehot(self, data, min_length=None):
    if min_length == None:
      min_length = self.vocab_size
    return np.bincount(data, minlength=min_length)
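
onehot here is really a bag-of-words count: bincount over a sequence of word ids, padded to the vocabulary size, yields one count per vocabulary entry. A sketch with a toy vocabulary size and word-id sequence (both illustrative):

import numpy as np

vocab_size = 6
doc = np.array([0, 3, 3, 1, 5, 3])      # word ids of one document

bow = np.bincount(doc, minlength=vocab_size)
print(bow)                              # [1 1 0 3 0 1]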

Example 32

Project: selective_search_py Source File: features.py
Function: init_size
    def __init_size(self, n_region):
        bincnt = numpy.bincount(self.label.ravel(), minlength = n_region)
        return {i : bincnt[i] for i in range(n_region)}

Example 33

Project: tvb-library Source File: monitors.py
Function: config_for_sim
    def config_for_sim(self, simulator):
        "Configure projection matrix monitor for given simulation."

        super(Projection, self).config_for_sim(simulator)
        self._sim = simulator
        if hasattr(self, 'sensors'):
            self.sensors.configure()

        # handle region vs simulation, analytic vs numerical proj, cortical vs subcortical.
        # setup convenient locals
        surf = simulator.surface
        conn = simulator.connectivity
        using_cortical_surface = surf is not None
        if using_cortical_surface:
            non_cortical_indices, = numpy.where(numpy.bincount(surf.region_mapping) == 1)
            self.rmap = surf.region_mapping
        else:
            # assume all cortical if no info
            if conn.cortical.size == 0:
                conn.cortical = numpy.array([True] * conn.weights.shape[0])
            non_cortical_indices, = numpy.where(~conn.cortical)
            if self.region_mapping is None:
                raise Exception("Please specify a region mapping on the EEG/MEG/iEEG monitor when "
                                "performing a region simulation.")
            else:
                self.rmap = self.region_mapping

            LOG.debug('Projection used in region sim has %d non-cortical regions', non_cortical_indices.size)

        have_subcortical = len(non_cortical_indices) > 0

        # determine source space
        if using_cortical_surface:
            sources = {'loc': surf.vertices, 'ori': surf.vertex_normals}
        else:
            sources = {'loc': conn.centres[conn.cortical], 'ori': conn.orientations[conn.cortical]}

        # compute analytic if not provided
        if self.projection is None:
            LOG.debug('Precomputed projection not available, using analytic approximation.')
            self.gain = self.analytic(**sources)

        # reduce to region lead field if region sim
        if not using_cortical_surface and self.gain.shape[1] == self.rmap.size:
            gain = numpy.zeros((self.gain.shape[0], conn.number_of_regions))
            numpy_add_at(gain.T, self.rmap, self.gain.T)
            LOG.debug('Region mapping gain shape %s to %s', self.gain.shape, gain.shape)
            self.gain = gain

        # append analytic sub-cortical to lead field
        if have_subcortical:
            # need matrix of shape (proj.shape[0], len(sc_ind))
            src = conn.centres[non_cortical_indices], conn.orientations[non_cortical_indices]
            self.gain = numpy.hstack((self.gain, self.analytic(*src)))
            LOG.debug('Added subcortical analytic gain, for final shape %s', self.gain.shape)

        if self.sensors.usable is not None and not self.sensors.usable.all():
            mask_unusable = ~self.sensors.usable
            self.gain[mask_unusable] = 0.0
            LOG.debug('Zeroed gain coefficients for %d unusable sensors', mask_unusable.sum())

        # unconditionally zero NaN elements; framework not prepared for NaNs.
        nan_mask = numpy.isfinite(self.gain).all(axis=1)
        self.gain[~nan_mask] = 0.0
        LOG.debug('Zeroed %d NaN gain coefficients', nan_mask.sum())

        # attrs used for recording
        self._state = numpy.zeros((self.gain.shape[0], len(self.voi)))
        self._period_in_steps = int(self.period / self.dt)
        LOG.debug('State shape %s, period in steps %s', self._state.shape, self._period_in_steps)

        LOG.info('Projection configured gain shape %s', self.gain.shape)

Example 34

Project: crosscat Source File: MultinomialComponentModel.py
    @staticmethod
    def log_likelihood(X, params):
        """
        Calculates the log likelihood of the data X given mean mu and precision
        rho.
        Inputs:
            X: a column of data (numpy)
            params: a dict with the following keys
                weights: a list of categories weights (should sum to 1)
        """
        check_data_type_column_data(X)
        check_model_parameters_dict(params)

        N = len(X)
        K = len(params['weights'])
        check_data_vs_k(X,K)
        counts= numpy.bincount(X,minlength=K)

        weights = numpy.array(params['weights'])

        A = gammaln(N+1)-numpy.sum(gammaln(counts+1))
        B = numpy.sum(counts*numpy.log(weights));

        log_likelihood = A+B

        return log_likelihood

Example 35

Project: mondrianforest Source File: mondrianforest_utils.py
def compute_left_right_statistics(data, param, cache, train_ids, feat_id_chosen, \
        split_chosen, settings):
    cond = data['x_train'][train_ids, feat_id_chosen] <= split_chosen
    train_ids_left = train_ids[cond]
    train_ids_right = train_ids[~cond]
    cache_tmp = {}
    if settings.optype == 'class':
        range_n_class = cache['range_n_class']
        cnt_left_chosen = np.bincount(data['y_train'][train_ids_left], minlength=data['n_class'])
        cnt_right_chosen = np.bincount(data['y_train'][train_ids_right], minlength=data['n_class'])
        cache_tmp['cnt_left_chosen'] = cnt_left_chosen
        cache_tmp['cnt_right_chosen'] = cnt_right_chosen
    else:
        cache_tmp['sum_y_left'] = np.sum(data['y_train'][train_ids_left])
        cache_tmp['sum_y2_left'] = np.sum(data['y_train'][train_ids_left] ** 2)
        cache_tmp['n_points_left'] = len(train_ids_left)
        cache_tmp['sum_y_right'] = np.sum(data['y_train'][train_ids_right])
        cache_tmp['sum_y2_right'] = np.sum(data['y_train'][train_ids_right] ** 2)
        cache_tmp['n_points_right'] = len(train_ids_right)
    if settings.verbose >= 2:
        print 'feat_id_chosen = %s, split_chosen = %s' % (feat_id_chosen, split_chosen)
        print 'y (left) = %s\ny (right) = %s' % (data['y_train'][train_ids_left], \
                                                    data['y_train'][train_ids_right])
    return(train_ids_left, train_ids_right, cache_tmp)

Example 36

Project: gplearn Source File: test_genetic.py
def test_program_init_depth():
    """'full' should create constant depth programs for single depth limit"""

    params = {'function_set': [add2, sub2, mul2, div2, sqrt1, log1, abs1, max2,
                               min2],
              'arities': {1: [sqrt1, log1, abs1],
                          2: [add2, sub2, mul2, div2, max2, min2]},
              'init_depth': (6, 6),
              'n_features': 10,
              'const_range': (-1.0, 1.0),
              'metric': 'mean absolute error',
              'p_point_replace': 0.05,
              'parsimony_coefficient': 0.1}
    random_state = check_random_state(415)
    programs = []
    for i in range(20):
        programs.append(_Program(init_method='full',
                                 random_state=random_state, **params))
    full_depth = np.bincount([gp.depth_ for gp in programs])
    programs = []
    for i in range(20):
        programs.append(_Program(init_method='half and half',
                                 random_state=random_state, **params))
    hnh_depth = np.bincount([gp.depth_ for gp in programs])
    programs = []
    for i in range(20):
        programs.append(_Program(init_method='grow',
                                 random_state=random_state, **params))
    grow_depth = np.bincount([gp.depth_ for gp in programs])

    assert_true(full_depth[-1] == 20)
    assert_false(hnh_depth[-1] == 20)
    assert_false(grow_depth[-1] == 20)

Example 37

Project: gplearn Source File: _program.py
Function: get_all_indices
    def get_all_indices(self, n_samples=None, max_samples=None,
                        random_state=None):
        """Get the indices on which to evaluate the fitness of a program.

        Parameters
        ----------
        n_samples : int
            The number of samples.

        max_samples : int
            The maximum number of samples to use.

        random_state : RandomState instance
            The random number generator.

        Returns
        -------
        indices : array-like, shape = [n_samples]
            The in-sample indices.

        not_indices : array-like, shape = [n_samples]
            The out-of-sample indices.
        """
        if self._indices_state is None and random_state is None:
            raise ValueError('The program has not been evaluated for fitness '
                             'yet, indices not available.')

        if n_samples is not None and self._n_samples is None:
            self._n_samples = n_samples
        if max_samples is not None and self._max_samples is None:
            self._max_samples = max_samples
        if random_state is not None and self._indices_state is None:
            self._indices_state = random_state.get_state()

        indices_state = check_random_state(None)
        indices_state.set_state(self._indices_state)

        not_indices = sample_without_replacement(
            self._n_samples,
            self._n_samples - self._max_samples,
            random_state=indices_state)
        sample_counts = np.bincount(not_indices, minlength=self._n_samples)
        indices = np.where(sample_counts == 0)[0]

        return indices, not_indices
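
The trick at the end of get_all_indices generalizes: draw the out-of-sample indices, bincount them with minlength equal to the population size, and the zero bins are exactly the in-sample indices. A sketch using numpy only (np.random.choice with replace=False stands in for sample_without_replacement):

import numpy as np

n_samples, max_samples = 10, 7
rng = np.random.RandomState(1)

# Indices left out of the sample.
not_indices = rng.choice(n_samples, size=n_samples - max_samples, replace=False)

sample_counts = np.bincount(not_indices, minlength=n_samples)
indices = np.where(sample_counts == 0)[0]      # the in-sample indices

print(sorted(not_indices), indices)            # the two sets partition range(10)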

Example 38

Project: msaf Source File: segmenter.py
Function: most_frequent
def most_frequent(x):
    """Returns the most frequent value in x."""
    return np.argmax(np.bincount(x))
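
argmax over a bincount is a compact way to take the mode of a vector of small non-negative integers (ties resolve to the smallest value). For example:

import numpy as np

x = np.array([3, 1, 3, 2, 3, 1])
print(np.argmax(np.bincount(x)))   # 3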

Example 39

Project: scipy Source File: coo.py
Function: getnnz
    def getnnz(self, axis=None):
        if axis is None:
            nnz = len(self.data)
            if nnz != len(self.row) or nnz != len(self.col):
                raise ValueError('row, column, and data array must all be the '
                                 'same length')

            if self.data.ndim != 1 or self.row.ndim != 1 or \
                    self.col.ndim != 1:
                raise ValueError('row, column, and data arrays must be 1-D')

            return int(nnz)

        if axis < 0:
            axis += 2
        if axis == 0:
            return np.bincount(downcast_intp_index(self.col),
                               minlength=self.shape[1])
        elif axis == 1:
            return np.bincount(downcast_intp_index(self.row),
                               minlength=self.shape[0])
        else:
            raise ValueError('axis out of bounds')

Example 40

Project: brew Source File: smote_bagging.py
    def smote_bootstrap_sample(self, X, y, b, k):

        count = np.bincount(y)  # number of instances of each class

        majority_class = count.argmax()  # majority class
        majority_count = count.max()  # majority class

        data = np.empty((0, X.shape[1]))
        target = np.empty((0,))

        class_data = X[(y == majority_class), :]
        idx = np.random.choice(majority_count, (majority_count,))
        data = np.concatenate((data, class_data[idx, :]))
        target = np.concatenate(
            (target, majority_class * np.ones((majority_count,))))

        minority_class = count.argmin()
        minority_count = count.min()

        # print majority_count
        N_syn = int((majority_count) * (b / 100))
        # print N_syn
        N_res = majority_count - N_syn
        # print N_res
        N_syn, N_res = N_res, N_syn

        class_data = X[(y == minority_class), :]
        idx = np.random.choice(class_data.shape[0], (N_res,))
        sampled_min_data = class_data[idx, :]
        # print sampled_min_data.shape
        if N_syn > 0:
            N_smote = np.ceil(N_syn / minority_count) * 100
            N_smote = 100 if N_smote < 100 else int(N_smote - N_smote % 100)
            synthetic = smote(X[y == minority_class], N=int(N_smote), k=self.k)

            idx = np.random.choice(synthetic.shape[0], (N_syn,))
            new_class_data = np.concatenate(
                (sampled_min_data, synthetic[idx, :]))
            data = np.concatenate((data, new_class_data))
            target = np.concatenate(
                (target, minority_class * np.ones((new_class_data.shape[0],))))
        else:
            data = np.concatenate((data, sampled_min_data))
            target = np.concatenate(
                (target, minority_class * np.ones((sampled_min_data.shape[0],))))  # noqa

        return data, target

Example 41

Project: intrinsic Source File: solver.py
    def remove_unused_intensities(self):
        """ Remove any intensities that are not currently assigned to a pixel,
        and then re-number all labels so they are contiguous again. """

        if self.params.logging:
            prev_r_s = self.decomposition.get_r_s()

        labels_nz = self.decomposition.labels_nz
        intensities = self.decomposition.intensities
        chromaticities = self.decomposition.chromaticities
        nlabels = intensities.shape[0]

        new_to_old = np.nonzero(np.bincount(
            labels_nz, minlength=nlabels))[0]
        old_to_new = np.empty(nlabels, dtype=np.int32)
        old_to_new.fill(-1)
        for new, old in enumerate(new_to_old):
            old_to_new[old] = new

        self.decomposition.labels_nz = old_to_new[labels_nz]
        self.decomposition.intensities = intensities[new_to_old]
        self.decomposition.chromaticities = chromaticities[new_to_old]

        if self.params.logging:
            print ('remove_unused_intensities: %s/%s labels kept' % (
                   len(self.decomposition.intensities), len(intensities)))

        if self.params.logging:
            np.testing.assert_equal(self.decomposition.get_r_s(), prev_r_s)
            assert (self.decomposition.chromaticities.shape[0] ==
                    self.decomposition.intensities.shape[0])
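
The relabeling step above is a common idiom: bincount with minlength reveals which labels are actually used, np.nonzero turns that into a new-to-old mapping, and indexing with the inverse mapping renumbers the labels contiguously. A minimal sketch with an illustrative label vector:

import numpy as np

labels = np.array([0, 4, 4, 2, 0, 4])
nlabels = 6                                     # labels 0..5 were allocated

new_to_old = np.nonzero(np.bincount(labels, minlength=nlabels))[0]   # [0 2 4]
old_to_new = np.full(nlabels, -1, dtype=np.int32)
old_to_new[new_to_old] = np.arange(len(new_to_old))

print(old_to_new[labels])                       # [0 2 2 1 0 2]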

Example 42

Project: scikit-bio Source File: _ace.py
@experimental(as_of="0.4.0")
def ace(counts, rare_threshold=10):
    r"""Calculate the ACE metric (Abundance-based Coverage Estimator).

    The ACE metric is defined as:

    .. math::

       S_{ace}=S_{abund}+\frac{S_{rare}}{C_{ace}}+
       \frac{F_1}{C_{ace}}\gamma^2_{ace}

    where :math:`S_{abund}` is the number of abundant OTUs (with more than
    `rare_threshold`  individuals) when all samples are pooled,
    :math:`S_{rare}` is the number of rare OTUs (with less than or equal to
    `rare_threshold` individuals) when all samples are pooled, :math:`C_{ace}`
    is the sample abundance coverage estimator, :math:`F_1` is the frequency of
    singletons, and :math:`\gamma^2_{ace}` is the estimated coefficient of
    variation for rare OTUs.

    The estimated coefficient of variation is defined as (assuming
    `rare_threshold` is 10, the default):

    .. math::

       \gamma^2_{ace}=max\left[\frac{S_{rare}}{C_{ace}}
       \frac{\sum^{10}_{i=1}{{i\left(i-1\right)}}F_i}
       {\left(N_{rare}\right)\left(N_{rare}-1\right)} -1,0\right]

    Parameters
    ----------
    counts : 1-D array_like, int
        Vector of counts.
    rare_threshold : int, optional
        Threshold at which an OTU containing as many or fewer individuals will
        be considered rare.

    Returns
    -------
    double
        Computed ACE metric.

    Raises
    ------
    ValueError
        If every rare OTU is a singleton.

    Notes
    -----
    ACE was first introduced in [1]_ and [2]_. The implementation here is based
    on the description given in the EstimateS manual [3]_.

    If no rare OTUs exist, returns the number of abundant OTUs. The default
    value of 10 for `rare_threshold` is based on [4]_.

    If `counts` contains zeros, indicating OTUs which are known to exist in the
    environment but did not appear in the sample, they will be ignored for the
    purpose of calculating the number of rare OTUs.

    References
    ----------
    .. [1] Chao, A. & S.-M Lee. 1992 Estimating the number of classes via
       sample coverage. Journal of the American Statistical Association 87,
       210-217.
    .. [2] Chao, A., M.-C. Ma, & M. C. K. Yang. 1993. Stopping rules and
       estimation for recapture debugging with unequal failure rates.
       Biometrika 80, 193-201.
    .. [3] http://viceroy.eeb.uconn.edu/estimates/
    .. [4] Chao, A., W.-H. Hwang, Y.-C. Chen, and C.-Y. Kuo. 2000. Estimating
       the number of shared species in two communities. Statistica Sinica
       10:227-246.

    """
    counts = _validate_counts_vector(counts)
    freq_counts = np.bincount(counts)
    s_rare = _otus_rare(freq_counts, rare_threshold)
    singles = freq_counts[1]

    if singles > 0 and singles == s_rare:
        raise ValueError("The only rare OTUs are singletons, so the ACE "
                         "metric is undefined. EstimateS suggests using "
                         "bias-corrected Chao1 instead.")

    s_abun = _otus_abundant(freq_counts, rare_threshold)
    if s_rare == 0:
        return s_abun

    n_rare = _number_rare(freq_counts, rare_threshold)
    c_ace = 1 - singles / n_rare

    top = s_rare * _number_rare(freq_counts, rare_threshold, gamma=True)
    bottom = c_ace * n_rare * (n_rare - 1)
    gamma_ace = (top / bottom) - 1

    if gamma_ace < 0:
        gamma_ace = 0

    return s_abun + (s_rare / c_ace) + ((singles / c_ace) * gamma_ace)

Example 43

Project: scikit-bio Source File: _grammared_sequence.py
Function: validate
    def _validate(self):
        # This is the fastest way that we have found to identify the
        # presence or absence of certain characters (numbers).
        # It works by multiplying a mask where the numbers which are
        # permitted have a zero at their index, and all others have a one.
        # The result is a vector which will propagate counts of invalid
        # numbers and remove counts of valid numbers, so that we need only
        # see if the array is empty to determine validity.
        invalid_characters = np.bincount(
            self._bytes, minlength=self._number_of_extended_ascii_codes
        ) * self._validation_mask
        if np.any(invalid_characters):
            bad = list(np.where(
                invalid_characters > 0)[0].astype(np.uint8).view('|S1'))
            raise ValueError(
                "Invalid character%s in sequence: %r. \n"
                "Valid characters: %r\n"
                "Note: Use `lowercase` if your sequence contains lowercase "
                "characters not in the sequence's alphabet."
                % ('s' if len(bad) > 1 else '',
                   [str(b.tostring().decode("ascii")) for b in bad] if
                   len(bad) > 1 else bad[0],
                   list(self.alphabet)))

Example 44

Project: scikit-bio Source File: _nucleotide_mixin.py
    @stable(as_of='0.4.0')
    def gc_frequency(self, relative=False):
        """Calculate frequency of G's and C's in the sequence.

        This calculates the minimum GC frequency, which corresponds to IUPAC
        characters G, C, and S (which stands for G or C).

        Parameters
        ----------
        relative : bool, optional
            If False return the frequency of G, C, and S characters (ie the
            count). If True return the relative frequency, ie the proportion
            of G, C, and S characters in the sequence. In this case the
            sequence will also be degapped before the operation, so gap
            characters will not be included when calculating the length of the
            sequence.

        Returns
        -------
        int or float
            Either frequency (count) or relative frequency (proportion),
            depending on `relative`.

        See Also
        --------
        gc_content

        Examples
        --------
        >>> from skbio import DNA
        >>> DNA('ACGT').gc_frequency()
        2
        >>> DNA('ACGT').gc_frequency(relative=True)
        0.5
        >>> DNA('ACGT--..').gc_frequency(relative=True)
        0.5
        >>> DNA('--..').gc_frequency(relative=True)
        0

        `S` means `G` or `C`, so it counts:

        >>> DNA('ASST').gc_frequency()
        2

        Other degenerates don't count:

        >>> DNA('RYKMBDHVN').gc_frequency()
        0

        """

        counts = np.bincount(self._bytes,
                             minlength=self._number_of_extended_ascii_codes)
        gc = counts[self._gc_codes].sum()
        if relative:
            seq = self.degap()
            if len(seq) != 0:
                gc /= len(seq)
        return gc

Example 45

Project: scikit-bio Source File: _permanova.py
@experimental(as_of="0.4.0")
def permanova(distance_matrix, grouping, column=None, permutations=999):
    """Test for significant differences between groups using PERMANOVA.

    Permutational Multivariate Analysis of Variance (PERMANOVA) is a
    non-parametric method that tests whether two or more groups of objects
    (e.g., samples) are significantly different based on a categorical factor.
    It is conceptually similar to ANOVA except that it operates on a distance
    matrix, which allows for multivariate analysis. PERMANOVA computes a
    pseudo-F statistic.

    Statistical significance is assessed via a permutation test. The assignment
    of objects to groups (`grouping`) is randomly permuted a number of times
    (controlled via `permutations`). A pseudo-F statistic is computed for each
    permutation and the p-value is the proportion of permuted pseudo-F
    statisics that are equal to or greater than the original (unpermuted)
    pseudo-F statistic.

    Parameters
    ----------
    distance_matrix : DistanceMatrix
        Distance matrix containing distances between objects (e.g., distances
        between samples of microbial communities).
    grouping : 1-D array_like or pandas.DataFrame
        Vector indicating the assignment of objects to groups. For example,
        these could be strings or integers denoting which group an object
        belongs to. If `grouping` is 1-D ``array_like``, it must be the same
        length and in the same order as the objects in `distance_matrix`. If
        `grouping` is a ``DataFrame``, the column specified by `column` will be
        used as the grouping vector. The ``DataFrame`` must be indexed by the
        IDs in `distance_matrix` (i.e., the row labels must be distance matrix
        IDs), but the order of IDs between `distance_matrix` and the
        ``DataFrame`` need not be the same. All IDs in the distance matrix must
        be present in the ``DataFrame``. Extra IDs in the ``DataFrame`` are
        allowed (they are ignored in the calculations).
    column : str, optional
        Column name to use as the grouping vector if `grouping` is a
        ``DataFrame``. Must be provided if `grouping` is a ``DataFrame``.
        Cannot be provided if `grouping` is 1-D ``array_like``.
    permutations : int, optional
        Number of permutations to use when assessing statistical
        significance. Must be greater than or equal to zero. If zero,
        statistical significance calculations will be skipped and the p-value
        will be ``np.nan``.

    Returns
    -------
    pandas.Series
        Results of the statistical test, including ``test statistic`` and
        ``p-value``.

    See Also
    --------
    anosim

    Notes
    -----
    See [1]_ for the original method reference, as well as ``vegan::adonis``,
    available in R's vegan package [2]_.

    The p-value will be ``np.nan`` if `permutations` is zero.

    References
    ----------
    .. [1] Anderson, Marti J. "A new method for non-parametric multivariate
       analysis of variance." Austral Ecology 26.1 (2001): 32-46.

    .. [2] http://cran.r-project.org/web/packages/vegan/index.html

    Examples
    --------
    See :mod:`skbio.stats.distance.anosim` for usage examples (both functions
    provide similar interfaces).

    """
    sample_size, num_groups, grouping, tri_idxs, distances = _preprocess_input(
        distance_matrix, grouping, column)

    # Calculate number of objects in each group.
    group_sizes = np.bincount(grouping)
    s_T = (distances ** 2).sum() / sample_size

    test_stat_function = partial(_compute_f_stat, sample_size, num_groups,
                                 tri_idxs, distances, group_sizes, s_T)
    stat, p_value = _run_monte_carlo_stats(test_stat_function, grouping,
                                           permutations)

    return _build_results('PERMANOVA', 'pseudo-F', sample_size, num_groups,
                          stat, p_value, permutations)

Example 46

Project: quality-assessment-protocol Source File: spatial_qc.py
Function: artifacts
def artifacts(anat_data, fg_mask_data, calculate_qi2=False):

    # Detect artifacts in the anatomical image using the method described in
    # Mortamet et al. 2009 (MRM)
    # Calculates QI1, the fraction of total voxels that lie within artifacts.
    
    # Optionally, also calculates QI2, the distance between the distribution 
    # of noise voxel (non-artifact background voxels) intensities, and a 
    # Ricean distribution.

    import numpy as np

    background, bg_mask = get_background(anat_data, fg_mask_data)
    
    # make sure the datatype is an int
    background = check_datatype(background)
       
    # Find the background threshold (the most frequently occurring value 
    # excluding 0)
    bg_counts       = np.bincount(background.flatten())
    bg_threshold    = np.argmax(bg_counts[1:]) + 1

    # Apply this threshold to the background voxels to identify voxels
    # contributing artifacts. 
    background[background <= bg_threshold] = 0
    background[background != 0] = 1

    # Create a structural element to be used in an opening operation.
    struct_elmnt    = np.zeros((3,3,3))
    struct_elmnt[0,1,1] = 1
    struct_elmnt[1,1,:] = 1
    struct_elmnt[1,:,1] = 1
    struct_elmnt[2,1,1] = 1

    # Perform an opening operation on the background data.
    background      = nd.binary_opening(background, structure=struct_elmnt)

    # Count the number of voxels that remain after the opening operation. 
    # These are artifacts.
    QI1             = background.sum() / float(bg_mask.sum())
    
    ''' "bg" in code below not defined- need to ascertain what that should '''
    '''      be, and correct it- unit test for this part disabled for now  '''
    if calculate_qi2:
        # Now lets focus on the noise, which is everything in the background
        # that was not identified as artifact
        bgNoise     = anat_data[(fg_mask_data-bg)==1]

        # calculate the histogram of the noise and its derivative
        H           = np.bincount(bgNoise)
        H           = 1.0*H/H.sum()
        dH          = H[1:]-H[:-1]

        # find the first value on the right tail, i.e. tail with negative
        # slope, i.e. dH < 0 that is less than or equal to half of the
        # histograms max
        firstNegSlope = np.nonzero(dH<0)[0][0]
        halfMaxRightTail = np.nonzero(H[firstNegSlope:]<(H.max()/2))[0][0]

        # divide by the standard deviation
        bgNoiseZ    = bgNoise / bgNoise.std()
        bgChiParams = ss.chi.fit(bgNoiseZ)
        #print bgChiParams
    
        # now generate values that are consistent with the histogram
        yx          = range(0,H.size)/bgNoise.std()
        rvs         = ss.chi.pdf(yx,bgChiParams[0],loc=bgChiParams[1],scale=bgChiParams[2])

        # now we can calculate the goodness of fit
        gof         = np.average(np.absolute(H[halfMaxRightTail:]-rvs[halfMaxRightTail:]))
        QI2         = QI1+gof
    else:
        QI2         = None

    return (QI1,QI2)

Example 47

Project: auto-sklearn Source File: evaluation_util.py
def get_500_classes_datamanager():
    weights = ([0.002] * 475) + ([0.001] * 25)
    X, Y = sklearn.datasets.make_classification(n_samples=1000,
                                                n_features=20,
                                                n_classes=500,
                                                n_clusters_per_class=1,
                                                n_informative=15,
                                                n_redundant=5,
                                                n_repeated=0,
                                                weights=weights,
                                                flip_y=0,
                                                class_sep=1.0,
                                                hypercube=True,
                                                shift=None,
                                                scale=1.0,
                                                shuffle=True,
                                                random_state=1)

    assert 25 == np.sum(np.bincount(Y) == 1), np.sum(np.bincount(Y) == 1)
    D = Dummy()
    D.info = {
        'metric': ACC_METRIC,
        'task': MULTICLASS_CLASSIFICATION,
        'is_sparse': False,
        'label_num': 500
    }
    D.data = {'X_train': X[:700], 'Y_train': Y[:700],
              'X_valid': X[700:710], 'Y_valid': Y[700:710],
              'X_test': X[710:], 'Y_test': Y[710:]
              }
    D.feat_type = ['numerical'] * 20
    return D

Example 48

Project: scikit-beam Source File: correlation.py
def _two_time_process(buf, g2, label_array, num_bufs, num_pixels,
                      img_per_level, lag_steps, current_img_time,
                      level, buf_no):
    """
    Parameters
    ----------
    buf: array
        image data array to use for two time correlation
    g2: array
        two time correlation matrix
        shape (number of labels(ROI), number of frames, number of frames)
    label_array: array
        Elements not inside any ROI are zero; elements inside each
        ROI are 1, 2, 3, etc. corresponding to the order they are specified
        in edges and segments
    num_bufs: int, even
        number of buffers(channels)
    num_pixels : array
        number of pixels in certain ROI's
        ROI's, dimensions are len(np.unique(label_array))
    img_per_level: array
        to track how many images processed in each level
    lag_steps : array
        delay or lag steps for the multiple tau analysis
        shape num_levels
    current_img_time : int
        the current image number
    level : int
        the current multi-tau level
    buf_no : int
        the current buffer number
    """
    img_per_level[level] += 1

    # in multi-tau correlation other than first level all other levels
    #  have to do the half of the correlation
    if level == 0:
        i_min = 0
    else:
        i_min = num_bufs//2

    for i in range(i_min, min(img_per_level[level], num_bufs)):
        t_index = level*num_bufs/2 + i

        delay_no = (buf_no - i) % num_bufs

        past_img = buf[level, delay_no]
        future_img = buf[level, buf_no]

        #  get the matrix of correlation function without normalizations
        tmp_binned = (np.bincount(label_array,
                                  weights=past_img*future_img)[1:])
        # get the matrix of past intensity normalizations
        pi_binned = (np.bincount(label_array,
                                 weights=past_img)[1:])

        # get the matrix of future intensity normalizations
        fi_binned = (np.bincount(label_array,
                                 weights=future_img)[1:])

        tind1 = (current_img_time - 1)

        tind2 = (current_img_time - lag_steps[t_index] - 1)

        if not isinstance(current_img_time, int):
            nshift = 2**(level-1)
            for i in range(-nshift+1, nshift+1):
                g2[:, int(tind1+i),
                   int(tind2+i)] = (tmp_binned/(pi_binned *
                                                fi_binned))*num_pixels
        else:
            g2[:, tind1, tind2] = tmp_binned/(pi_binned * fi_binned)*num_pixels
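
The three bincount calls in _two_time_process all use the same trick: with a label array as the first argument and pixel values as weights, bincount returns the per-label sum of the values, and the [1:] slice drops label 0 (pixels outside any ROI). A small sketch with a toy label array and image (values illustrative):

import numpy as np

label_array = np.array([0, 1, 1, 2, 2, 2, 0])   # 0 = outside any ROI
img = np.array([10., 1., 2., 3., 4., 5., 20.])

# Sum of pixel values inside each ROI (label 0 dropped by the [1:] slice).
roi_sums = np.bincount(label_array, weights=img)[1:]
print(roi_sums)                                 # [ 3. 12.]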

Example 49

Project: statsmodels Source File: survival2.py
    def fitting_proc(self, group):
        """
        For internal use
        """
        t = ((group[:,self.endog]).astype(float)).astype(int)
        if self.censoring == None:
            events = np.bincount(t)
            t = np.unique(t)
            events = events[:,list(t)]
            events = events.astype(float)
            eventsSum = np.cumsum(events)
            eventsSum = np.r_[0,eventsSum]
            n = len(group) - eventsSum[:-1]
        else:
            censoring = ((group[:,self.censoring]).astype(float)).astype(int)
            reverseCensoring = -1*(censoring - 1)
            events = np.bincount(t,censoring)
            censored = np.bincount(t,reverseCensoring)
            t = np.unique(t)
            censored = censored[:,list(t)]
            censored = censored.astype(float)
            censoredSum = np.cumsum(censored)
            censoredSum = np.r_[0,censoredSum]
            events = events[:,list(t)]
            events = events.astype(float)
            eventsSum = np.cumsum(events)
            eventsSum = np.r_[0,eventsSum]
            n = len(group) - eventsSum[:-1] - censoredSum[:-1]
            (self.censorings).append(censored)
        survival = np.cumprod(1-events/n)
        var = ((survival*survival) *
               np.cumsum(events/(n*(n-events))))
        se = np.sqrt(var)
        (self.results).append(np.array([survival,se]))
        (self.ts).append(t)
        (self.event).append(events)

Example 50

Project: WASP Source File: update_total_depth.py
def get_at_gc_count(seq_h5, chrm, start, end):
    # seq HDF5 file contains ascii values for nucleotides
    # e.g. A = 65
    node = seq_h5.getNode("/%s" % chrm)
    vals = node[start-1:end]

    counts = np.bincount(vals)

    at_count = 0
    gc_count = 0
    
    if len(counts) >= ord("A"):
        at_count += counts[ord("A")]
    if len(counts) >= ord("T"):
        at_count += counts[ord("T")]
    if len(counts) >= ord("G"):
        gc_count += counts[ord("G")]
    if len(counts) >= ord("C"):
        gc_count += counts[ord("C")]
    
    return at_count, gc_count
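
Counting nucleotides this way relies on bincount over the ASCII codes of the sequence, then indexing the result with ord(). The same pattern works on any byte string; a sketch (the sequence below is made up):

import numpy as np

seq = b"ACGTACGGTA"
vals = np.frombuffer(seq, dtype=np.uint8)       # ASCII codes, e.g. ord("A") == 65

counts = np.bincount(vals, minlength=256)       # minlength avoids the length checks above
at_count = counts[ord("A")] + counts[ord("T")]
gc_count = counts[ord("G")] + counts[ord("C")]
print(at_count, gc_count)                       # 5 5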