Here are examples of the Python API sklearn.feature_selection.chi2, taken from open source projects.
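For orientation before the examples, a minimal sketch of the call itself: chi2(X, y) scores non-negative features against a class label and returns a (scores, pvalues) pair. The iris dataset below is just an illustration.

from sklearn.datasets import load_iris
from sklearn.feature_selection import chi2

X, y = load_iris(return_X_y=True)   # iris features are non-negative, as chi2 requires
scores, pvalues = chi2(X, y)        # one chi-square score and p-value per feature
print(scores.shape, pvalues.shape)  # (4,) (4,)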
9 Examples
Example 1
def __init__(self, percentile, score_func="chi2", random_state=None):
    """Parameters
    ----------
    random_state : ignored

    score_func : callable
        Function taking two arrays X and y, and returning a pair of
        arrays (scores, pvalues).
    """
    import sklearn.feature_selection

    self.random_state = random_state  # We don't use this
    self.percentile = int(float(percentile))
    if score_func == "chi2":
        self.score_func = sklearn.feature_selection.chi2
    elif score_func == "f_classif":
        self.score_func = sklearn.feature_selection.f_classif
    else:
        raise ValueError("score_func must be in ('chi2', 'f_classif'), "
                         "but is: %s" % score_func)
Example 2
def transform(self, X):
    import scipy.sparse
    import sklearn.feature_selection

    # chi2 requires non-negative features; the pipeline should already
    # produce non-negative values, so clip any residual negatives to zero.
    if self.score_func == sklearn.feature_selection.chi2:
        if scipy.sparse.issparse(X):
            X.data[X.data < 0] = 0.0
        else:
            X[X < 0] = 0.0
    if self.preprocessor is None:
        raise NotImplementedError()
    Xt = self.preprocessor.transform(X)
    if Xt.shape[1] == 0:
        raise ValueError(
            "%s removed all features." % self.__class__.__name__)
    return Xt
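The clipping matters because chi2 itself rejects negative values with a ValueError; a quick standalone demonstration:

import numpy as np
from sklearn.feature_selection import chi2

X = np.array([[1.0, -0.5], [2.0, 0.5]])
y = np.array([0, 1])
try:
    chi2(X, y)
except ValueError as e:
    print(e)          # roughly: "Input X must be non-negative."
X[X < 0] = 0.0        # the same clipping as in transform() above
scores, pvalues = chi2(X, y)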
Example 3
def __init__(self, alpha, mode='fpr',
             score_func="chi2", random_state=None):
    import sklearn.feature_selection

    self.random_state = random_state  # We don't use this
    self.alpha = float(alpha)
    if score_func == "chi2":
        self.score_func = sklearn.feature_selection.chi2
    elif score_func == "f_classif":
        self.score_func = sklearn.feature_selection.f_classif
    else:
        raise ValueError("score_func must be in ('chi2', 'f_classif'), "
                         "but is: %s" % score_func)
    self.mode = mode
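The alpha/mode pair suggests the fit step wraps sklearn.feature_selection.GenericUnivariateSelect, which dispatches on mode ('percentile', 'k_best', 'fpr', 'fdr', 'fwe'); since that method is not shown here, this sketch is an assumption:

import numpy as np
import sklearn.feature_selection

# Assumed fit step: mode selects the strategy, alpha is its parameter.
rng = np.random.RandomState(0)
X = np.abs(rng.randn(30, 6))    # non-negative, as chi2 requires
y = rng.randint(0, 2, size=30)
selector = sklearn.feature_selection.GenericUnivariateSelect(
    score_func=sklearn.feature_selection.chi2, mode='fpr', param=0.05)
selector.fit(X, y)
print(selector.get_support())   # boolean mask of the retained features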
Example 4
Project: scikit-learn Source File: test_chi2.py
def test_chi2_unused_feature():
    # Unused feature should evaluate to NaN
    # and should issue no runtime warning
    clean_warning_registry()
    with warnings.catch_warnings(record=True) as warned:
        warnings.simplefilter('always')
        chi, p = chi2([[1, 0], [0, 0]], [1, 0])
        for w in warned:
            # w.message is a Warning instance, not a string, so convert first
            if 'divide by zero' in str(w.message):
                raise AssertionError('Found unexpected warning %s' % w)
    assert_array_equal(chi, [1, np.nan])
    assert_array_equal(p[1], np.nan)
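For intuition, the asserted score of 1 for the first feature can be reproduced by hand: chi2 compares per-class feature totals against the totals expected under the class priors (scikit-learn uses a vectorized equivalent of scipy.stats.chisquare). The second feature is zero everywhere, so both observed and expected counts are zero, the statistic is 0/0, and the result is NaN.

import numpy as np
from scipy.stats import chisquare

# Feature 1: total count 1, split across two equally likely classes.
observed = np.array([0.0, 1.0])   # per-class sums of the first feature
expected = np.array([0.5, 0.5])   # total of 1 split by class priors 1/2, 1/2
stat, pval = chisquare(observed, expected)
print(stat)  # 1.0, matching the asserted chi[0]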
Example 5
from sklearn.feature_selection import chi2


def chi_square(X, y):
    """
    This function implements chi-square feature selection
    (the existing method for classification in scikit-learn).

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    F: {numpy array}, shape (n_features,)
        chi-square score for each feature
    """
    F, pval = chi2(X, y)
    return F
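A quick usage check, with iris standing in for arbitrary non-negative data:

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
F = chi_square(X, y)
print(F)  # one chi-square score per feature; larger means stronger dependence on y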
Example 6
Project: pandas-ml Source File: test_feature_selection.py
def test_chi2(self):
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)
    result = df.feature_selection.chi2()
    expected = fs.chi2(iris.data, iris.target)
    self.assertEqual(len(result), 2)
    self.assert_numpy_array_almost_equal(result[0], expected[0])
    self.assert_numpy_array_almost_equal(result[1], expected[1])
Example 7
def transform(self, X):
    import scipy.sparse
    import sklearn.feature_selection

    # chi2 requires non-negative features; the pipeline should already
    # produce non-negative values, so clip any residual negatives to zero.
    if self.score_func == sklearn.feature_selection.chi2:
        if scipy.sparse.issparse(X):
            X.data[X.data < 0] = 0.0
        else:
            X[X < 0] = 0.0
    if self.preprocessor is None:
        raise NotImplementedError()
    try:
        Xt = self.preprocessor.transform(X)
    except ValueError as e:
        # e.message is Python 2 only; str(e) works on both 2 and 3.
        if ("zero-size array to reduction operation maximum which has no "
                "identity") in str(e):
            raise ValueError(
                "%s removed all features." % self.__class__.__name__)
        else:
            raise
    if Xt.shape[1] == 0:
        raise ValueError(
            "%s removed all features." % self.__class__.__name__)
    return Xt
Example 8
Project: scikit-learn Source File: test_feature_select.py
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
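All five selectors agree here because the decision reduces to the same p-values asserted above: SelectFpr, for example, simply keeps features whose p-value is below alpha, and only the first feature (p ≈ 0.0455) clears alpha=0.1:

import numpy as np

# SelectFpr's rule, applied to the p-values asserted in the test:
pvalues = np.array([0.04550026, 0.39802472])
print(pvalues < 0.1)  # [ True False], the same mask as support_fpr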
Example 9
Project: ProFET Source File: Model_Parameters_CV.py
def plot_BestKFeatures(X_train, y_train):
    '''
    Find the best percentile of features to use,
    using cross-validation on the training set.
    Based on:
    http://nbviewer.ipython.org/github/gmonce/scikit-learn-book/blob/master/Chapter%204%20-%20Advanced%20Features%20-%20Feature%20Engineering%20and%20Selection.ipynb
    '''
    import numpy as np
    import matplotlib.pylab as pl
    from sklearn import feature_selection
    from sklearn.ensemble import RandomForestClassifier
    # sklearn.cross_validation was removed; model_selection is its replacement.
    from sklearn.model_selection import cross_val_score

    # The original also built a DecisionTreeClassifier that was immediately
    # overridden by this random forest, so only the forest is kept.
    dt = RandomForestClassifier(n_jobs=2, bootstrap=True,
                                n_estimators=250, criterion='gini')

    percentiles = list(range(1, 95, 5))
    results = []
    for i in percentiles:
        # f_classif was the effective scorer in the original (it overrode a
        # chi2 variant); swap in feature_selection.chi2 for chi-square
        # scoring (requires non-negative features).
        fs = feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=i)
        X_train_fs = fs.fit_transform(X_train, y_train)
        scores = cross_val_score(dt, X_train_fs, y_train, cv=4)
        results.append(scores.mean())
    results = np.array(results)

    optimal_percentile = percentiles[int(results.argmax())]
    print("Optimal percentile of features: {0}".format(optimal_percentile))

    # Plot percentile of features selected vs. cross-validation score
    pl.figure()
    pl.xlabel("Percentile of features selected")
    pl.ylabel("Cross-validation accuracy")
    pl.plot(percentiles, results)
    print("Mean scores:", results)
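A hypothetical invocation, with the scikit-learn digits dataset standing in for ProFET's protein feature matrix:

from sklearn.datasets import load_digits

X_train, y_train = load_digits(return_X_y=True)  # stand-in data; any (X, y) works
plot_BestKFeatures(X_train, y_train)  # prints the best percentile and plots the curve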