Here are examples of the Python API numpy.bincount, taken from open-source projects.
83 Examples
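For orientation, np.bincount counts how many times each non-negative integer appears in an array; the optional weights argument sums weights instead of counting, and minlength pads the result with trailing zeros. A minimal standalone sketch (not taken from any of the projects below):

import numpy as np

x = np.array([0, 1, 1, 3, 2, 1, 7])
print(np.bincount(x))                 # [1 3 1 1 0 0 0 1]
print(np.bincount(x, minlength=10))   # same counts, zero-padded to length 10
w = np.array([0.3, 0.5, 0.2, 0.7, 1.0, 0.5, 1.0])
print(np.bincount(x, weights=w))      # per-value sums of w instead of counts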
Example 1
Project: hyperopt Source File: test_pchoice.py
def test_random(self):
# test that a space with a pchoice in it is
# (a) accepted by tpe.suggest and
# (b) handled correctly.
N = 150
fmin(self.objective,
space=self.space,
trials=self.trials,
algo=rand.suggest,
max_evals=N)
a_vals = [t['misc']['vals']['a'][0] for t in self.trials.trials]
counts = np.bincount(a_vals)
print(counts)
assert counts[3] > N * .35
assert counts[3] < N * .60
Example 2
Project: horizont Source File: test_utils.py
def test_matrix_to_lists(self):
dtm, D, N_WORDS_PER_DOC = self.dtm, self.D, self.N_WORDS_PER_DOC
N_BY_D, N_BY_W = self.N_BY_D, self.N_BY_W
WS, DS = utils.matrix_to_lists(dtm)
self.assertEqual(len(WS), D * N_WORDS_PER_DOC)
self.assertEqual(len(WS), len(DS))
self.assertEqual(dtm.shape, (max(DS) + 1, max(WS) + 1))
self.assertTrue(all(DS == sorted(DS)))
self.assertTrue(np.all(np.bincount(DS) == N_BY_D))
self.assertTrue(np.all(np.bincount(WS) == N_BY_W))
Example 3
Project: info-flow-experiments Source File: permutation_test.py
def perm_unique(elements):
bins = np.bincount(elements)
listunique = []
for i in range(0,len(bins)):
listunique.append(UniqueElement(i, bins[i]))
u=len(elements)
return perm_unique_helper(listunique,[0]*u,u-1)
Example 4
Project: kaggle-right-whale Source File: train_model.py
def filter_by_min_occ(X, y, min_occ):
occs = np.bincount(y)
mask = np.zeros_like(y).astype(bool)
for i, occ in enumerate(occs):
if occ == min_occ:
mask[y == i] = True
return X[mask], y[mask]
Example 5
Project: mondrianforest Source File: mondrianforest_utils.py
def update_posterior_node_incremental(tree, data, param, settings, cache, node_id, train_ids_new):
y_train_new = data['y_train'][train_ids_new]
if settings.optype == 'class':
tree.counts[node_id] += np.bincount(y_train_new, minlength=data['n_class'])
else:
sum_y_new, sum_y2_new, n_points_new = get_reg_stats(y_train_new)
tree.sum_y[node_id] += sum_y_new
tree.sum_y2[node_id] += sum_y2_new
tree.n_points[node_id] += n_points_new
Example 6
Project: scikit-learn Source File: test_dummy.py
def test_uniform_strategy():
X = [[0]] * 4 # ignored
y = [1, 2, 1, 1]
clf = DummyClassifier(strategy="uniform", random_state=0)
clf.fit(X, y)
X = [[0]] * 500
y_pred = clf.predict(X)
p = np.bincount(y_pred) / float(len(X))
assert_almost_equal(p[1], 0.5, decimal=1)
assert_almost_equal(p[2], 0.5, decimal=1)
_check_predict_proba(clf, X, y)
Example 7
Project: klustaviewa Source File: ccg.py
def _increment(arr, indices):
"""Increment some indices in a 1D vector of non-negative integers.
Repeated indices are taken into account."""
arr = _as_array(arr)
indices = _as_array(indices)
bbins = np.bincount(indices)
arr[:len(bbins)] += bbins
return arr
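This is a common bincount idiom for a scatter-add with repeated indices: the per-index counts are computed once and added in bulk. A small standalone illustration (made-up arrays, not from klustaviewa):

import numpy as np

arr = np.zeros(5, dtype=int)
indices = np.array([1, 1, 3, 1, 4])
bbins = np.bincount(indices)   # [0, 3, 0, 1, 1]
arr[:len(bbins)] += bbins      # arr becomes [0, 3, 0, 1, 1]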
Example 8
Project: statsmodels Source File: grouputils.py
def count_categories(self, level=0):
"""
Sets the attribute counts to equal the bincount of the (integer-valued)
labels.
"""
# TODO: refactor this not to set an attribute. Why would we do this?
self.counts = np.bincount(self.labels[level])
Example 9
Project: implicit Source File: lastfm.py
def bm25_weight(X, K1=100, B=0.8):
""" Weighs each row of the sparse matrix of the data by BM25 weighting """
# calculate idf per term (user)
X = coo_matrix(X)
N = X.shape[0]
idf = numpy.log(float(N) / (1 + numpy.bincount(X.col)))
# calculate length_norm per document (artist)
row_sums = numpy.ravel(X.sum(axis=1))
average_length = row_sums.mean()
length_norm = (1.0 - B) + B * row_sums / average_length
# weight matrix rows by bm25
X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
return X
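Here numpy.bincount(X.col) counts the nonzero entries per column of the COO matrix, i.e. the per-item document frequency that feeds the IDF term. A rough standalone sketch with a hypothetical 3x4 matrix (minlength is added here only as a safeguard for trailing all-zero columns; the original omits it):

import numpy as np
from scipy.sparse import coo_matrix

X = coo_matrix(np.array([[1, 0, 2, 0],
                         [0, 3, 1, 0],
                         [4, 0, 0, 0]]))
df = np.bincount(X.col, minlength=X.shape[1])   # nonzeros per column: [2, 1, 2, 0]
idf = np.log(X.shape[0] / (1.0 + df))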
Example 10
def chist(im):
im = im // 64
r,g,b = im.transpose((2,0,1))
pixels = 1 * r + 4 * b + 16 * g
hist = np.bincount(pixels.ravel(), minlength=64)
hist = hist.astype(float)
hist = np.log1p(hist)
return hist
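The pixels array packs the three quantized channels into a single code in [0, 63], so one bincount call produces the full joint color histogram. A quick way to exercise it on a synthetic image (assumed shape, not from the original project):

import numpy as np

im = np.random.randint(0, 256, size=(32, 32, 3))   # hypothetical RGB image
im = im // 64                                      # 4 levels per channel
r, g, b = im.transpose((2, 0, 1))
pixels = 1 * r + 4 * b + 16 * g                    # 64 possible color codes
hist = np.bincount(pixels.ravel(), minlength=64)   # one bin per quantized color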
Example 11
Project: scikit-learn Source File: test_dummy.py
def test_stratified_strategy():
X = [[0]] * 5 # ignored
y = [1, 2, 1, 1, 2]
clf = DummyClassifier(strategy="stratified", random_state=0)
clf.fit(X, y)
X = [[0]] * 500
y_pred = clf.predict(X)
p = np.bincount(y_pred) / float(len(X))
assert_almost_equal(p[1], 3. / 5, decimal=1)
assert_almost_equal(p[2], 2. / 5, decimal=1)
_check_predict_proba(clf, X, y)
Example 12
Project: pyhawkes Source File: network.py
def resample_m(self):
"""
Resample m given c and pi
"""
pi = self.pi + np.bincount(self.c, minlength=self.C)
self.m = np.random.dirichlet(pi)
Example 13
Project: polara Source File: data.py
@staticmethod
def is_not_uniform(idx, nbins=10, allowed_gap=0.75):
idx_bins = pd.cut(idx, bins=nbins, labels=False)
idx_bin_size = np.bincount(idx_bins)
diff = idx_bin_size[:-1] - idx_bin_size[1:]
monotonic = (diff < 0).all() or (diff > 0).all()
huge_gap = (idx_bin_size.min()*1.0 / idx_bin_size.max()) < allowed_gap
return monotonic or huge_gap
Example 14
def discretize(data, k=2):
ranks = rankdata(data, method='dense').astype(int) - 1
j = 1
while len(np.bincount(ranks // j)) > k:
j += 1
return ranks // j
Example 15
Project: attention-lvcsr Source File: extra_ops.py
def perform(self, node, inputs, output_storage):
x = inputs[0]
weights = inputs[1]
z = output_storage[0]
if weights is not None and weights.shape != x.shape:
raise TypeError("All inputs must have the same shape.")
# Needed for numpy 1.4.1 compatibility
if self.minlength:
out = np.bincount(x, weights=weights, minlength=self.minlength)
else:
out = np.bincount(x, weights=weights)
z[0] = theano._asarray(out, dtype=node.outputs[0].dtype)
Example 16
Project: hyperopt Source File: test_pchoice.py
def test_anneal(self):
N = 100
fmin(self.objective,
space=self.space,
trials=self.trials,
algo=partial(anneal.suggest),
max_evals=N)
a_vals = [t['misc']['vals']['a'][0] for t in self.trials.trials]
counts = np.bincount(a_vals)
print(counts)
assert counts[3] > N * .6
Example 17
def compute_stats(values, weights):
# values = np.array(row, dtype=np.uint8)
counts = np.bincount(values, weights=weights, minlength=256)
zeros = np.zeros(counts.shape)
total = 1. * np.sum(counts)
return counts/total if total > 0 else zeros
Example 18
def fit(self, x):
n_samples, self.n_variables = x.shape
while len(self.layers) < self.max_layers:
next_layer = SieveLayer(x, **self.kwargs)
x = next_layer.transform(x)
if self.verbose:
print('tc: %0.3f, (+) %0.3f, (-) %0.3f' % (next_layer.corex.tc, next_layer.ub, next_layer.lb))
#if next_layer.corex.tc - 2 * next_layer.ub - next_layer.lb > 1. / n_samples: # Lower bound still increasing
if next_layer.corex.tc - next_layer.lb > 1. / n_samples: # Lower bound still increasing
self.layers.append(next_layer)
self.x_stats = [np.bincount(x[x[:, i] >= 0, i]) for i in range(self.n_variables)]
else:
break
if self.verbose:
print(['tc: %0.3f (-) %0.3f (+) %0.3f' % (layer.corex.tc, layer.lb, layer.ub) for layer in self.layers])
return self
Example 19
Project: peas Source File: checkers.py
def gamefitness(game):
""" Returns the fitness of
the black player. (according to {gauci2008case}) """
counts = np.bincount(game.board.flat)
return (100 + 2 * counts[BLACK|MAN] + 3 * counts[BLACK|KING] +
2 * (12 - counts[WHITE|MAN] + 3 * (12 - counts[WHITE|KING])))
Example 20
Project: lhcb_trigger_ml Source File: uboost.py
def generate_mask(n_samples, bagging=True, random_generator=np.random):
"""bagging: float or bool (default=True), bagging usually
speeds up the convergence and prevents overfitting
(see http://en.wikipedia.org/wiki/Bootstrap_aggregating)
if True, usual bootstrap aggregating is used
(sampling with replacement at each iteration, size=len(X))
if float, used sampling without replacement, the size of generated
set is bagging * len(X)
if False, returns ones for all events."""
if bagging is True:
indices = random_generator.randint(0, n_samples, size=n_samples)
mask = np.bincount(indices, minlength=n_samples)
elif isinstance(bagging, float):
mask = random_generator.uniform(size=n_samples) > 1. - bagging
elif bagging is False:
mask = np.ones(n_samples, dtype='float')
else:
raise ValueError("something wrong was passed as bagging")
return mask
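With bagging=True the returned mask holds integer multiplicities (how many times each event was drawn with replacement), not booleans, which is exactly what np.bincount with minlength=n_samples produces. A small standalone check under assumed inputs:

import numpy as np

rng = np.random.RandomState(0)
n_samples = 6
indices = rng.randint(0, n_samples, size=n_samples)
mask = np.bincount(indices, minlength=n_samples)
assert mask.sum() == n_samples   # multiplicities always sum to the number of draws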
Example 21
def bincount(x, weights=None, minlength=None):
if len(x) > 0:
return np.bincount(x, weights, minlength)
else:
if minlength is None:
minlength = 0
minlength = np.asscalar(np.asarray(minlength, dtype=np.intp))
return np.zeros(minlength, dtype=np.intp)
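This wrapper guards against empty input, which older NumPy versions reject outright, and still honors minlength by returning zeros of the requested length (note that np.asscalar has since been deprecated in newer NumPy releases in favor of .item()). A sketch of the intended behavior, assuming the wrapper above is in scope as bincount:

import numpy as np

print(bincount(np.array([2, 2, 5], dtype=np.intp), minlength=6))   # [0 0 2 0 0 1]
print(bincount(np.array([], dtype=np.intp), minlength=4))          # [0 0 0 0]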
Example 22
Project: bhmm Source File: generic_hmm.py
def count_init(self):
"""Compute the counts at the first time step
Returns
-------
n : ndarray(nstates)
n[i] is the number of trajectories starting in state i
"""
if self.hidden_state_trajectories is None:
raise RuntimeError('HMM model does not have a hidden state trajectory.')
n = [traj[0] for traj in self.hidden_state_trajectories]
return np.bincount(n, minlength=self.nstates)
Example 23
def test_tpe(self):
N = 100
fmin(self.objective,
space=self.space,
trials=self.trials,
algo=partial(tpe.suggest, n_startup_jobs=10),
max_evals=N)
a_vals = [t['misc']['vals']['a'][0] for t in self.trials.trials]
counts = np.bincount(a_vals)
print(counts)
assert counts[3] > N * .6
Example 24
def getnnz(self, axis=None):
if axis is None:
return int(self.indptr[-1])
else:
if axis < 0:
axis += 2
axis, _ = self._swap((axis, 1 - axis))
_, N = self._swap(self.shape)
if axis == 0:
return np.bincount(downcast_intp_index(self.indices),
minlength=N)
elif axis == 1:
return np.diff(self.indptr)
raise ValueError('axis out of bounds')
Example 25
Project: pybasicbayes Source File: mixture.py
@property
def used_labels(self):
if len(self.labels_list) > 0:
label_usages = sum(np.bincount(l.z,minlength=self.N) for l in self.labels_list)
used_labels, = np.where(label_usages > 0)
else:
used_labels = np.argsort(self.weights.weights)[-1:-11:-1]
return used_labels
Example 26
Project: PyEMMA Source File: test_msm.py
@classmethod
def setUpClass(cls):
import pyemma.datasets
cls.dtraj = pyemma.datasets.load_2well_discrete().dtraj_T100K_dt10
nu = 1.*np.bincount(cls.dtraj)
cls.statdist = nu/nu.sum()
cls.tau = 10
cls.msmrev = estimate_markov_model(cls.dtraj, cls.tau)
cls.msmrevpi = estimate_markov_model(cls.dtraj, cls.tau,
statdist=cls.statdist)
cls.msm = estimate_markov_model(cls.dtraj, cls.tau, reversible=False)
"""Sparse"""
cls.msmrev_sparse = estimate_markov_model(cls.dtraj, cls.tau, sparse=True)
cls.msmrevpi_sparse = estimate_markov_model(cls.dtraj, cls.tau,
statdist=cls.statdist,
sparse=True)
cls.msm_sparse = estimate_markov_model(cls.dtraj, cls.tau, reversible=False, sparse=True)
Example 27
Project: msmbuilder-legacy Source File: test_wrappers.py
def test(self):
args, metric = Cluster.parser.parse_args([
'-p', get('points_on_cube/ProjectInfo.yaml', just_filename=True),
'-o', self.td,
'rmsd', '-a', get('points_on_cube/AtomIndices.dat', just_filename=True),
'kcenters', '-k', '4'], print_banner=False)
Cluster.main(args, metric)
assignments = load(pjoin(self.td, 'Assignments.h5'))["arr_0"]
assignment_counts = np.bincount(assignments.flatten())
eq(assignment_counts, np.array([2, 2, 2, 2]))
distances = load(pjoin(self.td, 'Assignments.h5.distances'))["arr_0"]
eq(distances, np.zeros((1,8)))
Example 28
def evaluate(self, game):
counts = np.bincount(game.board.flat)
nwm = counts[WHITE|MAN]
nwk = counts[WHITE|KING]
nbm = counts[BLACK|MAN]
nbk = counts[BLACK|KING]
vb = (100 * nbm + 130 * nbk)
vw = (100 * nwm + 130 * nwk)
return vb - vw
Example 29
Project: APGL Source File: GraphStatistics.py
def vectorStatistics(self, graph, treeStats=False, eigenStats=True):
"""
Find a series of statistics for the given input graph which can be represented
as vector values.
"""
Parameter.checkClass(graph, AbstractMatrixGraph)
Parameter.checkBoolean(treeStats)
statsDict = {}
statsDict["inDegreeDist"] = graph.inDegreeDistribution()
statsDict["outDegreeDist"] = graph.degreeDistribution()
logging.debug("Computing hop counts")
P = graph.findAllDistances(False)
statsDict["hopCount"] = graph.hopCount(P)
logging.debug("Computing triangle count")
if graph.getNumVertices() != 0:
statsDict["triangleDist"] = numpy.bincount(graph.triangleSequence())
else:
statsDict["triangleDist"] = numpy.array([])
#Get the distribution of component sizes
logging.debug("Finding distribution of component sizes")
if graph.isUndirected():
components = graph.findConnectedComponents()
if len(components) != 0:
statsDict["componentsDist"] = numpy.bincount(numpy.array([len(c) for c in components], dtype=int))
#Make sure weight matrix is symmetric
if graph.getNumVertices()!=0 and eigenStats:
logging.debug("Computing eigenvalues/vectors")
W = graph.getWeightMatrix()
W = (W + W.T)/2
eigenDistribution, V = numpy.linalg.eig(W)
i = numpy.argmax(eigenDistribution)
statsDict["maxEigVector"] = V[:, i]
statsDict["eigenDist"] = numpy.flipud(numpy.sort(eigenDistribution[eigenDistribution>0]))
gc.collect()
else:
statsDict["maxEigVector"] = numpy.array([])
statsDict["eigenDist"] = numpy.array([])
if treeStats:
logging.debug("Computing statistics on trees")
trees = graph.findTrees()
statsDict["treeSizesDist"] = numpy.bincount([len(x) for x in trees])
treeDepths = [GraphUtils.treeDepth((graph.subgraph(x))) for x in trees]
statsDict["treeDepthsDist"] = numpy.bincount(treeDepths)
return statsDict
Example 30
def counts(self):
return np.bincount(self.group_int)
Example 31
def onehot(self, data, min_length=None):
if min_length is None:
min_length = self.vocab_size
return np.bincount(data, minlength=min_length)
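Despite its name, this produces a bag-of-words count vector (one count per vocabulary id) rather than a strict one-hot encoding. A small illustration with made-up token ids and vocabulary size:

import numpy as np

tokens = np.array([0, 3, 3, 1])          # hypothetical word ids in one document
vec = np.bincount(tokens, minlength=6)   # [1, 1, 0, 2, 0, 0] for a vocabulary of 6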
Example 32
def __init_size(self, n_region):
bincnt = numpy.bincount(self.label.ravel(), minlength = n_region)
return {i : bincnt[i] for i in range(n_region)}
Example 33
def config_for_sim(self, simulator):
"Configure projection matrix monitor for given simulation."
super(Projection, self).config_for_sim(simulator)
self._sim = simulator
if hasattr(self, 'sensors'):
self.sensors.configure()
# handle region vs simulation, analytic vs numerical proj, cortical vs subcortical.
# setup convenient locals
surf = simulator.surface
conn = simulator.connectivity
using_cortical_surface = surf is not None
if using_cortical_surface:
non_cortical_indices, = numpy.where(numpy.bincount(surf.region_mapping) == 1)
self.rmap = surf.region_mapping
else:
# assume all cortical if no info
if conn.cortical.size == 0:
conn.cortical = numpy.array([True] * conn.weights.shape[0])
non_cortical_indices, = numpy.where(~conn.cortical)
if self.region_mapping is None:
raise Exception("Please specify a region mapping on the EEG/MEG/iEEG monitor when "
"performing a region simulation.")
else:
self.rmap = self.region_mapping
LOG.debug('Projection used in region sim has %d non-cortical regions', non_cortical_indices.size)
have_subcortical = len(non_cortical_indices) > 0
# determine source space
if using_cortical_surface:
sources = {'loc': surf.vertices, 'ori': surf.vertex_normals}
else:
sources = {'loc': conn.centres[conn.cortical], 'ori': conn.orientations[conn.cortical]}
# compute analytic if not provided
if self.projection is None:
LOG.debug('Precomputed projection not available; using analytic approximation.')
self.gain = self.analytic(**sources)
# reduce to region lead field if region sim
if not using_cortical_surface and self.gain.shape[1] == self.rmap.size:
gain = numpy.zeros((self.gain.shape[0], conn.number_of_regions))
numpy_add_at(gain.T, self.rmap, self.gain.T)
LOG.debug('Region mapping gain shape %s to %s', self.gain.shape, gain.shape)
self.gain = gain
# append analytic sub-cortical to lead field
if have_subcortical:
# need matrix of shape (proj.shape[0], len(sc_ind))
src = conn.centres[non_cortical_indices], conn.orientations[non_cortical_indices]
self.gain = numpy.hstack((self.gain, self.analytic(*src)))
LOG.debug('Added subcortical analytic gain, for final shape %s', self.gain.shape)
if self.sensors.usable is not None and not self.sensors.usable.all():
mask_unusable = ~self.sensors.usable
self.gain[mask_unusable] = 0.0
LOG.debug('Zeroed gain coefficients for %d unusable sensors', mask_unusable.sum())
# unconditionally zero NaN elements; framework not prepared for NaNs.
nan_mask = numpy.isfinite(self.gain).all(axis=1)
self.gain[~nan_mask] = 0.0
LOG.debug('Zeroed %d NaN gain coefficients', nan_mask.sum())
# attrs used for recording
self._state = numpy.zeros((self.gain.shape[0], len(self.voi)))
self._period_in_steps = int(self.period / self.dt)
LOG.debug('State shape %s, period in steps %s', self._state.shape, self._period_in_steps)
LOG.info('Projection configured gain shape %s', self.gain.shape)
Example 34
Project: crosscat Source File: MultinomialComponentModel.py
@staticmethod
def log_likelihood(X, params):
"""
Calculates the log likelihood of the data X given mean mu and precision
rho.
Inputs:
X: a column of data (numpy)
params: a dict with the following keys
weights: a list of categories weights (should sum to 1)
"""
check_data_type_column_data(X)
check_model_parameters_dict(params)
N = len(X)
K = len(params['weights'])
check_data_vs_k(X,K)
counts= numpy.bincount(X,minlength=K)
weights = numpy.array(params['weights'])
A = gammaln(N+1)-numpy.sum(gammaln(counts+1))
B = numpy.sum(counts*numpy.log(weights));
log_likelihood = A+B
return log_likelihood
Example 35
Project: mondrianforest Source File: mondrianforest_utils.py
def compute_left_right_statistics(data, param, cache, train_ids, feat_id_chosen, \
split_chosen, settings):
cond = data['x_train'][train_ids, feat_id_chosen] <= split_chosen
train_ids_left = train_ids[cond]
train_ids_right = train_ids[~cond]
cache_tmp = {}
if settings.optype == 'class':
range_n_class = cache['range_n_class']
cnt_left_chosen = np.bincount(data['y_train'][train_ids_left], minlength=data['n_class'])
cnt_right_chosen = np.bincount(data['y_train'][train_ids_right], minlength=data['n_class'])
cache_tmp['cnt_left_chosen'] = cnt_left_chosen
cache_tmp['cnt_right_chosen'] = cnt_right_chosen
else:
cache_tmp['sum_y_left'] = np.sum(data['y_train'][train_ids_left])
cache_tmp['sum_y2_left'] = np.sum(data['y_train'][train_ids_left] ** 2)
cache_tmp['n_points_left'] = len(train_ids_left)
cache_tmp['sum_y_right'] = np.sum(data['y_train'][train_ids_right])
cache_tmp['sum_y2_right'] = np.sum(data['y_train'][train_ids_right] ** 2)
cache_tmp['n_points_right'] = len(train_ids_right)
if settings.verbose >= 2:
print('feat_id_chosen = %s, split_chosen = %s' % (feat_id_chosen, split_chosen))
print('y (left) = %s\ny (right) = %s' % (data['y_train'][train_ids_left], data['y_train'][train_ids_right]))
return(train_ids_left, train_ids_right, cache_tmp)
Example 36
Project: gplearn Source File: test_genetic.py
def test_program_init_depth():
"""'full' should create constant depth programs for single depth limit"""
params = {'function_set': [add2, sub2, mul2, div2, sqrt1, log1, abs1, max2,
min2],
'arities': {1: [sqrt1, log1, abs1],
2: [add2, sub2, mul2, div2, max2, min2]},
'init_depth': (6, 6),
'n_features': 10,
'const_range': (-1.0, 1.0),
'metric': 'mean absolute error',
'p_point_replace': 0.05,
'parsimony_coefficient': 0.1}
random_state = check_random_state(415)
programs = []
for i in range(20):
programs.append(_Program(init_method='full',
random_state=random_state, **params))
full_depth = np.bincount([gp.depth_ for gp in programs])
programs = []
for i in range(20):
programs.append(_Program(init_method='half and half',
random_state=random_state, **params))
hnh_depth = np.bincount([gp.depth_ for gp in programs])
programs = []
for i in range(20):
programs.append(_Program(init_method='grow',
random_state=random_state, **params))
grow_depth = np.bincount([gp.depth_ for gp in programs])
assert_true(full_depth[-1] == 20)
assert_false(hnh_depth[-1] == 20)
assert_false(grow_depth[-1] == 20)
Example 37
def get_all_indices(self, n_samples=None, max_samples=None,
random_state=None):
"""Get the indices on which to evaluate the fitness of a program.
Parameters
----------
n_samples : int
The number of samples.
max_samples : int
The maximum number of samples to use.
random_state : RandomState instance
The random number generator.
Returns
-------
indices : array-like, shape = [n_samples]
The in-sample indices.
not_indices : array-like, shape = [n_samples]
The out-of-sample indices.
"""
if self._indices_state is None and random_state is None:
raise ValueError('The program has not been evaluated for fitness '
'yet, indices not available.')
if n_samples is not None and self._n_samples is None:
self._n_samples = n_samples
if max_samples is not None and self._max_samples is None:
self._max_samples = max_samples
if random_state is not None and self._indices_state is None:
self._indices_state = random_state.get_state()
indices_state = check_random_state(None)
indices_state.set_state(self._indices_state)
not_indices = sample_without_replacement(
self._n_samples,
self._n_samples - self._max_samples,
random_state=indices_state)
sample_counts = np.bincount(not_indices, minlength=self._n_samples)
indices = np.where(sample_counts == 0)[0]
return indices, not_indices
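The closing lines turn the out-of-sample index list into per-sample counts with bincount and keep the samples that were never drawn as the in-sample set. A standalone sketch of just that step, with made-up numbers:

import numpy as np

n_samples = 6
not_indices = np.array([1, 4])                           # out-of-sample indices
sample_counts = np.bincount(not_indices, minlength=n_samples)
indices = np.where(sample_counts == 0)[0]                # [0, 2, 3, 5]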
Example 38
def most_frequent(x):
"""Returns the most frequent value in x."""
return np.argmax(np.bincount(x))
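This is the standard bincount/argmax idiom for the mode of small non-negative integers; ties resolve to the smallest value. For example:

import numpy as np

print(np.argmax(np.bincount(np.array([2, 3, 3, 5, 3]))))   # 3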
Example 39
def getnnz(self, axis=None):
if axis is None:
nnz = len(self.data)
if nnz != len(self.row) or nnz != len(self.col):
raise ValueError('row, column, and data array must all be the '
'same length')
if self.data.ndim != 1 or self.row.ndim != 1 or \
self.col.ndim != 1:
raise ValueError('row, column, and data arrays must be 1-D')
return int(nnz)
if axis < 0:
axis += 2
if axis == 0:
return np.bincount(downcast_intp_index(self.col),
minlength=self.shape[1])
elif axis == 1:
return np.bincount(downcast_intp_index(self.row),
minlength=self.shape[0])
else:
raise ValueError('axis out of bounds')
Example 40
Project: brew Source File: smote_bagging.py
def smote_bootstrap_sample(self, X, y, b, k):
count = np.bincount(y) # number of instances of each class
majority_class = count.argmax() # majority class
majority_count = count.max() # majority class
data = np.empty((0, X.shape[1]))
target = np.empty((0,))
class_data = X[(y == majority_class), :]
idx = np.random.choice(majority_count, (majority_count,))
data = np.concatenate((data, class_data[idx, :]))
target = np.concatenate(
(target, majority_class * np.ones((majority_count,))))
minority_class = count.argmin()
minority_count = count.min()
# print majority_count
N_syn = int((majority_count) * (b / 100))
# print N_syn
N_res = majority_count - N_syn
# print N_res
N_syn, N_res = N_res, N_syn
class_data = X[(y == minority_class), :]
idx = np.random.choice(class_data.shape[0], (N_res,))
sampled_min_data = class_data[idx, :]
# print sampled_min_data.shape
if N_syn > 0:
N_smote = np.ceil(N_syn / minority_count) * 100
N_smote = 100 if N_smote < 100 else int(N_smote - N_smote % 100)
synthetic = smote(X[y == minority_class], N=int(N_smote), k=self.k)
idx = np.random.choice(synthetic.shape[0], (N_syn,))
new_class_data = np.concatenate(
(sampled_min_data, synthetic[idx, :]))
data = np.concatenate((data, new_class_data))
target = np.concatenate(
(target, minority_class * np.ones((new_class_data.shape[0],))))
else:
data = np.concatenate((data, sampled_min_data))
target = np.concatenate(
(target, minority_class * np.ones((sampled_min_data.shape[0],)))) # noqa
return data, target
Example 41
Project: intrinsic Source File: solver.py
def remove_unused_intensities(self):
""" Remove any intensities that are not currently assigned to a pixel,
and then re-number all labels so they are contiguous again. """
if self.params.logging:
prev_r_s = self.decomposition.get_r_s()
labels_nz = self.decomposition.labels_nz
intensities = self.decomposition.intensities
chromaticities = self.decomposition.chromaticities
nlabels = intensities.shape[0]
new_to_old = np.nonzero(np.bincount(
labels_nz, minlength=nlabels))[0]
old_to_new = np.empty(nlabels, dtype=np.int32)
old_to_new.fill(-1)
for new, old in enumerate(new_to_old):
old_to_new[old] = new
self.decomposition.labels_nz = old_to_new[labels_nz]
self.decomposition.intensities = intensities[new_to_old]
self.decomposition.chromaticities = chromaticities[new_to_old]
if self.params.logging:
print ('remove_unused_intensities: %s/%s labels kept' % (
len(self.decomposition.intensities), len(intensities)))
if self.params.logging:
np.testing.assert_equal(self.decomposition.get_r_s(), prev_r_s)
assert (self.decomposition.chromaticities.shape[0] ==
self.decomposition.intensities.shape[0])
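The core relabeling trick: bincount marks which labels are actually used, np.nonzero turns that into a new-to-old mapping, and an inverse lookup table renumbers the labels contiguously. A minimal sketch with assumed labels:

import numpy as np

labels = np.array([0, 2, 2, 5])
nlabels = 7
new_to_old = np.nonzero(np.bincount(labels, minlength=nlabels))[0]   # [0, 2, 5]
old_to_new = np.full(nlabels, -1, dtype=np.int32)
old_to_new[new_to_old] = np.arange(len(new_to_old))
relabeled = old_to_new[labels]                                       # [0, 1, 1, 2]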
Example 42
Project: scikit-bio Source File: _ace.py
@experimental(as_of="0.4.0")
def ace(counts, rare_threshold=10):
r"""Calculate the ACE metric (Abundance-based Coverage Estimator).
The ACE metric is defined as:
.. math::
S_{ace}=S_{abund}+\frac{S_{rare}}{C_{ace}}+
\frac{F_1}{C_{ace}}\gamma^2_{ace}
where :math:`S_{abund}` is the number of abundant OTUs (with more than
`rare_threshold` individuals) when all samples are pooled,
:math:`S_{rare}` is the number of rare OTUs (with less than or equal to
`rare_threshold` individuals) when all samples are pooled, :math:`C_{ace}`
is the sample abundance coverage estimator, :math:`F_1` is the frequency of
singletons, and :math:`\gamma^2_{ace}` is the estimated coefficient of
variation for rare OTUs.
The estimated coefficient of variation is defined as (assuming
`rare_threshold` is 10, the default):
.. math::
\gamma^2_{ace}=max\left[\frac{S_{rare}}{C_{ace}}
\frac{\sum^{10}_{i=1}{{i\left(i-1\right)}}F_i}
{\left(N_{rare}\right)\left(N_{rare}-1\right)} -1,0\right]
Parameters
----------
counts : 1-D array_like, int
Vector of counts.
rare_threshold : int, optional
Threshold at which an OTU containing as many or fewer individuals will
be considered rare.
Returns
-------
double
Computed ACE metric.
Raises
------
ValueError
If every rare OTU is a singleton.
Notes
-----
ACE was first introduced in [1]_ and [2]_. The implementation here is based
on the description given in the EstimateS manual [3]_.
If no rare OTUs exist, returns the number of abundant OTUs. The default
value of 10 for `rare_threshold` is based on [4]_.
If `counts` contains zeros, indicating OTUs which are known to exist in the
environment but did not appear in the sample, they will be ignored for the
purpose of calculating the number of rare OTUs.
References
----------
.. [1] Chao, A. & S.-M Lee. 1992 Estimating the number of classes via
sample coverage. Journal of the American Statistical Association 87,
210-217.
.. [2] Chao, A., M.-C. Ma, & M. C. K. Yang. 1993. Stopping rules and
estimation for recapture debugging with unequal failure rates.
Biometrika 80, 193-201.
.. [3] http://viceroy.eeb.uconn.edu/estimates/
.. [4] Chao, A., W.-H. Hwang, Y.-C. Chen, and C.-Y. Kuo. 2000. Estimating
the number of shared species in two communities. Statistica Sinica
10:227-246.
"""
counts = _validate_counts_vector(counts)
freq_counts = np.bincount(counts)
s_rare = _otus_rare(freq_counts, rare_threshold)
singles = freq_counts[1]
if singles > 0 and singles == s_rare:
raise ValueError("The only rare OTUs are singletons, so the ACE "
"metric is undefined. EstimateS suggests using "
"bias-corrected Chao1 instead.")
s_abun = _otus_abundant(freq_counts, rare_threshold)
if s_rare == 0:
return s_abun
n_rare = _number_rare(freq_counts, rare_threshold)
c_ace = 1 - singles / n_rare
top = s_rare * _number_rare(freq_counts, rare_threshold, gamma=True)
bottom = c_ace * n_rare * (n_rare - 1)
gamma_ace = (top / bottom) - 1
if gamma_ace < 0:
gamma_ace = 0
return s_abun + (s_rare / c_ace) + ((singles / c_ace) * gamma_ace)
Example 43
def _validate(self):
# This is the fastest way that we have found to identify the
# presence or absence of certain characters (numbers).
# It works by multiplying a mask where the numbers which are
# permitted have a zero at their index, and all others have a one.
# The result is a vector which will propagate counts of invalid
# numbers and remove counts of valid numbers, so that we need only
# see if the array is empty to determine validity.
invalid_characters = np.bincount(
self._bytes, minlength=self._number_of_extended_ascii_codes
) * self._validation_mask
if np.any(invalid_characters):
bad = list(np.where(
invalid_characters > 0)[0].astype(np.uint8).view('|S1'))
raise ValueError(
"Invalid character%s in sequence: %r. \n"
"Valid characters: %r\n"
"Note: Use `lowercase` if your sequence contains lowercase "
"characters not in the sequence's alphabet."
% ('s' if len(bad) > 1 else '',
[str(b.tostring().decode("ascii")) for b in bad] if
len(bad) > 1 else bad[0],
list(self.alphabet)))
Example 44
Project: scikit-bio Source File: _nucleotide_mixin.py
@stable(as_of='0.4.0')
def gc_frequency(self, relative=False):
"""Calculate frequency of G's and C's in the sequence.
This calculates the minimum GC frequency, which corresponds to IUPAC
characters G, C, and S (which stands for G or C).
Parameters
----------
relative : bool, optional
If False, return the frequency of G, C, and S characters (i.e. the
count). If True, return the relative frequency, i.e. the proportion
of G, C, and S characters in the sequence. In this case the
sequence will also be degapped before the operation, so gap
characters will not be included when calculating the length of the
sequence.
Returns
-------
int or float
Either frequency (count) or relative frequency (proportion),
depending on `relative`.
See Also
--------
gc_content
Examples
--------
>>> from skbio import DNA
>>> DNA('ACGT').gc_frequency()
2
>>> DNA('ACGT').gc_frequency(relative=True)
0.5
>>> DNA('ACGT--..').gc_frequency(relative=True)
0.5
>>> DNA('--..').gc_frequency(relative=True)
0
`S` means `G` or `C`, so it counts:
>>> DNA('ASST').gc_frequency()
2
Other degenerates don't count:
>>> DNA('RYKMBDHVN').gc_frequency()
0
"""
counts = np.bincount(self._bytes,
minlength=self._number_of_extended_ascii_codes)
gc = counts[self._gc_codes].sum()
if relative:
seq = self.degap()
if len(seq) != 0:
gc /= len(seq)
return gc
Example 45
Project: scikit-bio Source File: _permanova.py
@experimental(as_of="0.4.0")
def permanova(distance_matrix, grouping, column=None, permutations=999):
"""Test for significant differences between groups using PERMANOVA.
Permutational Multivariate Analysis of Variance (PERMANOVA) is a
non-parametric method that tests whether two or more groups of objects
(e.g., samples) are significantly different based on a categorical factor.
It is conceptually similar to ANOVA except that it operates on a distance
matrix, which allows for multivariate analysis. PERMANOVA computes a
pseudo-F statistic.
Statistical significance is assessed via a permutation test. The assignment
of objects to groups (`grouping`) is randomly permuted a number of times
(controlled via `permutations`). A pseudo-F statistic is computed for each
permutation and the p-value is the proportion of permuted pseudo-F
statistics that are equal to or greater than the original (unpermuted)
pseudo-F statistic.
Parameters
----------
distance_matrix : DistanceMatrix
Distance matrix containing distances between objects (e.g., distances
between samples of microbial communities).
grouping : 1-D array_like or pandas.DataFrame
Vector indicating the assignment of objects to groups. For example,
these could be strings or integers denoting which group an object
belongs to. If `grouping` is 1-D ``array_like``, it must be the same
length and in the same order as the objects in `distance_matrix`. If
`grouping` is a ``DataFrame``, the column specified by `column` will be
used as the grouping vector. The ``DataFrame`` must be indexed by the
IDs in `distance_matrix` (i.e., the row labels must be distance matrix
IDs), but the order of IDs between `distance_matrix` and the
``DataFrame`` need not be the same. All IDs in the distance matrix must
be present in the ``DataFrame``. Extra IDs in the ``DataFrame`` are
allowed (they are ignored in the calculations).
column : str, optional
Column name to use as the grouping vector if `grouping` is a
``DataFrame``. Must be provided if `grouping` is a ``DataFrame``.
Cannot be provided if `grouping` is 1-D ``array_like``.
permutations : int, optional
Number of permutations to use when assessing statistical
significance. Must be greater than or equal to zero. If zero,
statistical significance calculations will be skipped and the p-value
will be ``np.nan``.
Returns
-------
pandas.Series
Results of the statistical test, including ``test statistic`` and
``p-value``.
See Also
--------
anosim
Notes
-----
See [1]_ for the original method reference, as well as ``vegan::adonis``,
available in R's vegan package [2]_.
The p-value will be ``np.nan`` if `permutations` is zero.
References
----------
.. [1] Anderson, Marti J. "A new method for non-parametric multivariate
analysis of variance." Austral Ecology 26.1 (2001): 32-46.
.. [2] http://cran.r-project.org/web/packages/vegan/index.html
Examples
--------
See :mod:`skbio.stats.distance.anosim` for usage examples (both functions
provide similar interfaces).
"""
sample_size, num_groups, grouping, tri_idxs, distances = _preprocess_input(
distance_matrix, grouping, column)
# Calculate number of objects in each group.
group_sizes = np.bincount(grouping)
s_T = (distances ** 2).sum() / sample_size
test_stat_function = partial(_compute_f_stat, sample_size, num_groups,
tri_idxs, distances, group_sizes, s_T)
stat, p_value = _run_monte_carlo_stats(test_stat_function, grouping,
permutations)
return _build_results('PERMANOVA', 'pseudo-F', sample_size, num_groups,
stat, p_value, permutations)
Example 46
def artifacts(anat_data, fg_mask_data, calculate_qi2=False):
# Detect artifacts in the anatomical image using the method described in
# Mortamet et al. 2009 (MRM)
# Calculates QI1, the fraction of total voxels that within artifacts.
# Optionally, also calculates QI2, the distance between the distribution
# of noise voxel (non-artifact background voxels) intensities, and a
# Ricean distribution.
import numpy as np
background, bg_mask = get_background(anat_data, fg_mask_data)
# make sure the datatype is an int
background = check_datatype(background)
# Find the background threshold (the most frequently occurring value
# excluding 0)
bg_counts = np.bincount(background.flatten())
bg_threshold = np.argmax(bg_counts[1:]) + 1
# Apply this threshold to the background voxels to identify voxels
# contributing artifacts.
background[background <= bg_threshold] = 0
background[background != 0] = 1
# Create a structural element to be used in an opening operation.
struct_elmnt = np.zeros((3,3,3))
struct_elmnt[0,1,1] = 1
struct_elmnt[1,1,:] = 1
struct_elmnt[1,:,1] = 1
struct_elmnt[2,1,1] = 1
# Perform an opening operation on the background data.
background = nd.binary_opening(background, structure=struct_elmnt)
# Count the number of voxels that remain after the opening operation.
# These are artifacts.
QI1 = background.sum() / float(bg_mask.sum())
''' "bg" in code below not defined- need to ascertain what that should '''
''' be, and correct it- unit test for this part disabled for now '''
if calculate_qi2:
# Now lets focus on the noise, which is everything in the background
# that was not identified as artifact
bgNoise = anat_data[(fg_mask_data-bg)==1]
# calculate the histogram of the noise and its derivative
H = np.bincount(bgNoise)
H = 1.0*H/H.sum()
dH = H[1:]-H[:-1]
# find the first value on the right tail, i.e. tail with negative
# slope, i.e. dH < 0 that is less than or equal to half of the
# histograms max
firstNegSlope = np.nonzero(dH<0)[0][0]
halfMaxRightTail = np.nonzero(H[firstNegSlope:]<(H.max()/2))[0][0]
# divide by the standard deviation
bgNoiseZ = bgNoise / bgNoise.std()
bgChiParams = ss.chi.fit(bgNoiseZ)
#print bgChiParams
# now generate values that are consistent with the histogram
yx = np.arange(H.size) / bgNoise.std()
rvs = ss.chi.pdf(yx,bgChiParams[0],loc=bgChiParams[1],scale=bgChiParams[2])
# now we can calculate the goodness of fit
gof = np.average(np.absolute(H[halfMaxRightTail:]-rvs[halfMaxRightTail:]))
QI2 = QI1+gof
else:
QI2 = None
return (QI1,QI2)
Example 47
Project: auto-sklearn Source File: evaluation_util.py
def get_500_classes_datamanager():
weights = ([0.002] * 475) + ([0.001] * 25)
X, Y = sklearn.datasets.make_classification(n_samples=1000,
n_features=20,
n_classes=500,
n_clusters_per_class=1,
n_informative=15,
n_redundant=5,
n_repeated=0,
weights=weights,
flip_y=0,
class_sep=1.0,
hypercube=True,
shift=None,
scale=1.0,
shuffle=True,
random_state=1)
assert 25 == np.sum(np.bincount(Y) == 1), np.sum(np.bincount(Y) == 1)
D = Dummy()
D.info = {
'metric': ACC_METRIC,
'task': MULTICLASS_CLASSIFICATION,
'is_sparse': False,
'label_num': 500
}
D.data = {'X_train': X[:700], 'Y_train': Y[:700],
'X_valid': X[700:710], 'Y_valid': Y[700:710],
'X_test': X[710:], 'Y_test': Y[710:]
}
D.feat_type = ['numerical'] * 20
return D
Example 48
Project: scikit-beam Source File: correlation.py
def _two_time_process(buf, g2, label_array, num_bufs, num_pixels,
img_per_level, lag_steps, current_img_time,
level, buf_no):
"""
Parameters
----------
buf: array
image data array to use for two time correlation
g2: array
two time correlation matrix
shape (number of labels(ROI), number of frames, number of frames)
label_array: array
Elements not inside any ROI are zero; elements inside each
ROI are 1, 2, 3, etc. corresponding to the order they are specified
in edges and segments
num_bufs: int, even
number of buffers(channels)
num_pixels : array
number of pixels in certain ROI's
ROI's, dimensions are len(np.unique(label_array))
img_per_level: array
to track how many images processed in each level
lag_steps : array
delay or lag steps for the multiple tau analysis
shape num_levels
current_img_time : int
the current image number
level : int
the current multi-tau level
buf_no : int
the current buffer number
"""
img_per_level[level] += 1
# in multi-tau correlation other than first level all other levels
# have to do the half of the correlation
if level == 0:
i_min = 0
else:
i_min = num_bufs//2
for i in range(i_min, min(img_per_level[level], num_bufs)):
t_index = level*num_bufs//2 + i
delay_no = (buf_no - i) % num_bufs
past_img = buf[level, delay_no]
future_img = buf[level, buf_no]
# get the matrix of correlation function without normalizations
tmp_binned = (np.bincount(label_array,
weights=past_img*future_img)[1:])
# get the matrix of past intensity normalizations
pi_binned = (np.bincount(label_array,
weights=past_img)[1:])
# get the matrix of future intensity normalizations
fi_binned = (np.bincount(label_array,
weights=future_img)[1:])
tind1 = (current_img_time - 1)
tind2 = (current_img_time - lag_steps[t_index] - 1)
if not isinstance(current_img_time, int):
nshift = 2**(level-1)
for i in range(-nshift+1, nshift+1):
g2[:, int(tind1+i),
int(tind2+i)] = (tmp_binned/(pi_binned *
fi_binned))*num_pixels
else:
g2[:, tind1, tind2] = tmp_binned/(pi_binned * fi_binned)*num_pixels
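The weights argument turns bincount into a per-label sum: each ROI entry accumulates the pixel-wise products for that label, and the [1:] slice drops the background bin (label 0). A tiny standalone sketch with an assumed label map and image values:

import numpy as np

label_array = np.array([0, 1, 1, 2, 2, 2])   # 0 = outside any ROI
past_img = np.array([1., 2., 3., 4., 5., 6.])
future_img = np.full(6, 2.)
per_roi = np.bincount(label_array, weights=past_img * future_img)[1:]
# per_roi -> [10., 30.]   (ROI 1: 2*2 + 3*2, ROI 2: 4*2 + 5*2 + 6*2)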
Example 49
Project: statsmodels Source File: survival2.py
def fitting_proc(self, group):
"""
For internal use
"""
t = ((group[:,self.endog]).astype(float)).astype(int)
if self.censoring is None:
events = np.bincount(t)
t = np.unique(t)
events = events[list(t)]
events = events.astype(float)
eventsSum = np.cumsum(events)
eventsSum = np.r_[0,eventsSum]
n = len(group) - eventsSum[:-1]
else:
censoring = ((group[:,self.censoring]).astype(float)).astype(int)
reverseCensoring = -1*(censoring - 1)
events = np.bincount(t,censoring)
censored = np.bincount(t,reverseCensoring)
t = np.unique(t)
censored = censored[list(t)]
censored = censored.astype(float)
censoredSum = np.cumsum(censored)
censoredSum = np.r_[0,censoredSum]
events = events[list(t)]
events = events.astype(float)
eventsSum = np.cumsum(events)
eventsSum = np.r_[0,eventsSum]
n = len(group) - eventsSum[:-1] - censoredSum[:-1]
(self.censorings).append(censored)
survival = np.cumprod(1-events/n)
var = ((survival*survival) *
np.cumsum(events/(n*(n-events))))
se = np.sqrt(var)
(self.results).append(np.array([survival,se]))
(self.ts).append(t)
(self.event).append(events)
Example 50
Project: WASP Source File: update_total_depth.py
def get_at_gc_count(seq_h5, chrm, start, end):
# seq HDF5 file contains ascii values for nucleotides
# e.g. A = 65
node = seq_h5.getNode("/%s" % chrm)
vals = node[start-1:end]
counts = np.bincount(vals)
at_count = 0
gc_count = 0
if len(counts) > ord("A"):
at_count += counts[ord("A")]
if len(counts) > ord("T"):
at_count += counts[ord("T")]
if len(counts) > ord("G"):
gc_count += counts[ord("G")]
if len(counts) > ord("C"):
gc_count += counts[ord("C")]
return at_count, gc_count
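Because the HDF5 node stores ASCII codes, bincount gives a per-character tally and counts is indexed with ord(...). A short standalone sketch with an assumed byte sequence:

import numpy as np

seq = np.frombuffer(b"GATTACA", dtype=np.uint8)   # ASCII codes of the bases
counts = np.bincount(seq)
a_count = counts[ord("A")] if len(counts) > ord("A") else 0   # 3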