Here are examples of the Python API sklearn.feature_selection.SelectKBest, taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.
32 Examples
5
Example 1
def test_classes_property():
    """Check when a pipeline exposes the ``classes_`` attribute."""
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Pipeline ending in a regressor: no ``classes_`` even after fitting.
    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    # Pipeline ending in a classifier: ``classes_`` is missing before fit
    # and equals the unique labels afterwards.
    clf = make_pipeline(SelectKBest(k=1), LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
3
Example 2
def test_select_kbest_zero():
    # Test whether k=0 correctly returns no features.
    X, y = make_classification(n_samples=20, n_features=10,
                               shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_classif, k=0)
    univariate_filter.fit(X, y)

    # With k=0 the support mask must be all False.
    support = univariate_filter.get_support()
    gtruth = np.zeros(10, dtype=bool)
    assert_array_equal(support, gtruth)

    # transform() is expected to warn and hand back a matrix with 0 columns.
    X_selected = assert_warns_message(UserWarning, 'No features were selected',
                                      univariate_filter.transform, X)
    assert_equal(X_selected.shape, (20, 0))
3
Example 3
def test_select_kbest_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the k best heuristic
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)

    # The generic selector in 'k_best' mode must give the same output.
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='k_best', param=5).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    # The test expects exactly the first five columns to be selected.
    support = univariate_filter.get_support()
    gtruth = np.zeros(20, dtype=bool)
    gtruth[:5] = True
    assert_array_equal(support, gtruth)
3
Example 4
def test_selectkbest_tiebreaking():
    # Test whether SelectKBest actually selects k features in case of ties.
    # Prior to 0.11, SelectKBest would return more features than requested.
    Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]]
    y = [1]

    def dummy_score(X, y):
        # Return the first row of X as both the scores and the p-values.
        return X[0], X[0]

    for X in Xs:
        # Each row contains tied values; exactly k columns must still come out.
        sel = SelectKBest(dummy_score, k=1)
        X1 = ignore_warnings(sel.fit_transform)([X], y)
        assert_equal(X1.shape[1], 1)
        assert_best_scores_kept(sel)

        sel = SelectKBest(dummy_score, k=2)
        X2 = ignore_warnings(sel.fit_transform)([X], y)
        assert_equal(X2.shape[1], 2)
        assert_best_scores_kept(sel)
3
Example 5
def test_nans():
    # Assert that SelectKBest and SelectPercentile can handle NaNs.
    # First feature has zero variance to confuse f_classif (ANOVA) and
    # make it return a NaN.
    X = [[0, 1, 0], [0, -1, -1], [0, .5, .5]]
    y = [1, 0, 1]

    for select in (SelectKBest(f_classif, 2),
                   SelectPercentile(f_classif, percentile=67)):
        ignore_warnings(select.fit)(X, y)
        # Both selectors are expected to keep columns 1 and 2 only.
        assert_array_equal(select.get_support(indices=True), np.array([1, 2]))
3
Example 6
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])

    # Smoke test: each call only has to complete without raising.
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
3
Example 7
def test_pipeline_methods_anova_rus():
    # Test the various methods of the pipeline (anova).
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    # Test with RandomUnderSampling + Anova + LogisticRegression
    clf = LogisticRegression()
    rus = RandomUnderSampler(random_state=0)
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('rus', rus), ('anova', filter1), ('logistic', clf)])

    # Smoke test: each call only has to complete without raising.
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
3
Example 8
def test_pipeline_with_step_that_it_is_pipeline():
    # Test that building a Pipeline whose first step is itself a Pipeline
    # (sampler + anova) raises a TypeError.
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                               n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1,
                               n_samples=5000, random_state=0)

    # Inner pipeline: RandomUnderSampling + Anova
    clf = LogisticRegression()
    rus = RandomUnderSampler(random_state=0)
    filter1 = SelectKBest(f_classif, k=2)
    pipe1 = Pipeline([('rus', rus), ('anova', filter1)])

    # Nesting pipe1 inside another Pipeline must be rejected at construction.
    assert_raises(TypeError, Pipeline, [('pipe1', pipe1), ('logistic', clf)])
3
Example 9
def featureFitting(filename, X, y, featureNames, optimalFlag, kbest=20, alpha=0.05, model=None):
    '''
    Gets the K-best features (filtered by FDR, then select best ranked by t-test, more advanced options can be implemented).
    Save the data/matrix with the resulting/kept features to a new output file, "REDUCED_Feat.csv"
    Returns the reduced DataFrame, the fitted FDR selector, and the fitted K-best selector.

    Parameters:
        filename: path of the CSV that is re-read (first column as index) to build the reduced frame.
        X, y: feature matrix and labels used to fit both selectors.
        featureNames: names of the columns of X; must support array indexing.
        optimalFlag, model: accepted for interface compatibility; not used here.
        kbest: number of features kept by SelectKBest.
        alpha: alpha passed to SelectFdr.
    '''
    FD = SelectFdr(alpha=alpha)
    X = FD.fit_transform(X, y)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)

    # selectK was fitted on the FDR-reduced matrix, so its boolean mask refers
    # to the columns that survived FD, not to the original columns. Map it
    # back to original column positions before indexing names / CSV columns.
    fdr_kept_idx = FD.get_support(indices=True)
    kept_idx = fdr_kept_idx[selectK.get_support()]

    K_featnames = featureNames[kept_idx]
    print("K_featnames: %s" %(K_featnames))

    # NOTE(review): assumes the CSV columns (after index_col=0) are ordered
    # like featureNames / the columns of the original X -- confirm with caller.
    Reduced_df = pd.read_csv(filename, index_col=0)
    Reduced_df = Reduced_df[Reduced_df.columns[kept_idx]]
    Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df, FD, selectK
3
Example 10
def test_select_kbest_2():
    """Ensure that the TPOT select kbest outputs the same result as sklearn select kbest when k<0"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']

    # Only the rows tagged as 'training' are used to fit the reference selector.
    training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[training_testing_data['group'] == 'training', 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        # Reference selector uses k=1 while TPOT is called with k=-1 below.
        selector = SelectKBest(f_classif, k=1)
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)

    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns
    assert np.array_equal(tpot_obj._select_kbest(training_testing_data, -1), training_testing_data[mask_cols])
3
Example 11
def test_select_kbest_3():
    """Ensure that the TPOT select kbest outputs the same result as sklearn select kbest when k> no. of features"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']

    # Only the rows tagged as 'training' are used to fit the reference selector.
    training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[training_testing_data['group'] == 'training', 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        # Reference selector uses k=64 while TPOT is called with k=100 below.
        selector = SelectKBest(f_classif, k=64)
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)

    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns
    assert np.array_equal(tpot_obj._select_kbest(training_testing_data, 100), training_testing_data[mask_cols])
3
Example 12
def test_select_kbest_4():
    """Ensure that the TPOT select kbest outputs the same result as sklearn select kbest when 0< k< features"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']

    # Only the rows tagged as 'training' are used to fit the reference selector.
    training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[training_testing_data['group'] == 'training', 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        # Same k (42) for the reference selector and for TPOT.
        selector = SelectKBest(f_classif, k=42)
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)

    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns
    assert np.array_equal(tpot_obj._select_kbest(training_testing_data, 42), training_testing_data[mask_cols])
3
Example 13
def select_best_features(dataset, train_labels, num_best, verbose=True):
    """Keep the ``num_best`` chi2-ranked features of a train/test dataset.

    Parameters:
        dataset: ``((X_train, Y_train), (X_test, Y_test))``.
        train_labels: targets used to fit the selector on ``X_train``.
        num_best: value passed as ``k`` to ``SelectKBest``.
        verbose: when true, print a progress message.

    Returns:
        ``(((X_train, Y_train), (X_test, Y_test)), scores)`` where both X
        matrices have been reduced by the selector fitted on the training
        data, and ``scores`` is the selector's ``scores_`` attribute.
    """
    (X_train, Y_train), (X_test, Y_test) = dataset

    if verbose:
        print('\nSelecting %d best features\n'%num_best)

    selector = SelectKBest(chi2, k=num_best)
    # Fit on the training split only, then apply the same columns to test.
    X_train = selector.fit_transform(X_train, train_labels)
    X_test = selector.transform(X_test)

    return ((X_train, Y_train), (X_test, Y_test)), selector.scores_
3
Example 14
def test_grid_search():
    """Fit a dask-learn grid search over a PCA/SelectKBest/LinearSVC pipeline and check the score is a float."""
    pipeline = dl.Pipeline([("pca", PCA()),
                            ("select_k", SelectKBest()),
                            ("svm", LinearSVC())])
    param_grid = {'select_k__k': [1, 2, 3, 4],
                  'svm__C': np.logspace(-3, 2, 3)}
    grid = dl.GridSearchCV(pipeline, param_grid)

    # Run with dask.get as the scheduler for the duration of the fit.
    with dask.set_options(get=dask.get):
        result = grid.fit(X_train, y_train).score(X_test, y_test)

    assert isinstance(result, float)
2
Example 15
def test_bagging_with_pipeline():
    """Fit a BaggingClassifier over a pipeline and check the inner estimator's random_state."""
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)

    # The last step of the first fitted pipeline must carry an int seed.
    assert_true(isinstance(estimator[0].steps[-1][1].random_state,
                           int))
2
Example 16
def test_select_kbest_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the k best heuristic
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)

    # The generic selector in 'k_best' mode must give the same output.
    X_r2 = GenericUnivariateSelect(
        f_classif, mode='k_best', param=5).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    # The test expects exactly the first five columns to be selected.
    support = univariate_filter.get_support()
    gtruth = np.zeros(20, dtype=bool)
    gtruth[:5] = True
    assert_array_equal(support, gtruth)
2
Example 17
def test_select_kbest_all():
    # Test whether k="all" correctly returns all features.
    X, y = make_classification(n_samples=20, n_features=10,
                               shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_classif, k='all')
    X_r = univariate_filter.fit(X, y).transform(X)

    # Nothing may be dropped: output equals input.
    assert_array_equal(X, X_r)
0
Example 18
def mkchi2(k):
    """Make k-best chi2 selector"""
    return SelectKBest(chi2, k=k)
0
Example 19
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    # Pin the raw chi2 statistics first; every selector below works off them.
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    # Each selector flavour must keep the first feature only.
    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
0
Example 20
def test_mutual_info_classif():
    """Check SelectKBest and SelectPercentile with mutual_info_classif as the score function."""
    X, y = make_classification(n_samples=100, n_features=5,
                               n_informative=1, n_redundant=1,
                               n_repeated=0, n_classes=2,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_classif, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        mutual_info_classif, mode='k_best', param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    # The test expects exactly the first two columns to be selected.
    support = univariate_filter.get_support()
    gtruth = np.zeros(5, dtype=bool)
    gtruth[:2] = True
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_classif, percentile=40)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        mutual_info_classif, mode='percentile', param=40).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5, dtype=bool)
    gtruth[:2] = True
    assert_array_equal(support, gtruth)
0
Example 21
def test_mutual_info_regression():
    """Check SelectKBest and SelectPercentile with mutual_info_regression as the score function."""
    X, y = make_regression(n_samples=100, n_features=10, n_informative=2,
                           shuffle=False, random_state=0, noise=10)

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_regression, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        mutual_info_regression, mode='k_best', param=2).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    # The test expects exactly the first two columns to be selected.
    support = univariate_filter.get_support()
    gtruth = np.zeros(10, dtype=bool)
    gtruth[:2] = True
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_regression, percentile=20)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(mutual_info_regression, mode='percentile',
                                   param=20).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10, dtype=bool)
    gtruth[:2] = True
    assert_array_equal(support, gtruth)
0
Example 22
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, Pipeline)

    # Check that we can't instantiate pipelines with objects without fit
    # method
    assert_raises_regex(TypeError,
                        'Last step of Pipeline should implement fit. '
                        '.*NoFit.*',
                        Pipeline, [('clf', NoFit())])

    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([('svc', clf)])
    assert_equal(pipe.get_params(deep=True),
                 dict(svc__a=None, svc__b=None, svc=clf,
                      **pipe.get_params(deep=False)))

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert_equal(clf.a, 0.1)
    assert_equal(clf.b, None)
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    assert_raises_regex(TypeError,
                        'All intermediate steps should be transformers'
                        '.*\\bNoTrans\\b.*',
                        Pipeline, [('t', NoTrans()), ('svc', clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert_equal(clf.C, 0.1)
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc'])

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert_equal(params, params2)
0
Example 23
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    # Centre the columns in place.
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # 2 SVD components + 1 selected feature
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))

    # test error if some elements do not support transform
    assert_raises_regex(TypeError,
                        'All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b',
                        FeatureUnion,
                        [("transform", Transf()), ("no_transform", NoTrans())])
0
Example 24
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = PCA(n_components=2, svd_solver='randomized', random_state=0)
    select = SelectKBest(k=1)

    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)

    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)

    # check against expected result
    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
0
Example 25
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, Pipeline)

    # Check that we can't instantiate pipelines with objects without fit
    # method
    assert_raises(TypeError, Pipeline, [('svc', IncorrectT)])

    # Smoke test with only an estimator
    clf = T()
    pipe = Pipeline([('svc', clf)])
    assert_equal(pipe.get_params(deep=True),
                 dict(svc__a=None, svc__b=None, svc=clf,
                      **pipe.get_params(deep=False)))

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert_equal(clf.a, 0.1)
    assert_equal(clf.b, None)
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't use the same stage name twice
    assert_raises(ValueError, Pipeline, [('svc', SVC()), ('svc', SVC())])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert_equal(clf.C, 0.1)
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc'])

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert_equal(params, params2)
0
Example 26
def build_pipeline():
    """Train, evaluate and save a tf-idf + k-best + MultinomialNB model.

    Fits a feature union (item selector -> tf-idf -> SelectKBest) on the
    training data, grid-searches MultinomialNB's ``alpha`` on the resulting
    features, computes precision/recall/F-score/support and ROC AUC on the
    test split, and saves everything through ``PackagedPipeline``.
    """
    x_train, x_test, y_train, y_test = get_training_data()

    tfidf = TfidfVectorizer()
    feature_union = FeatureUnion(
        transformer_list=[
            ('x', Pipeline([
                ('selector', ItemSelector(key='x')),
                ('tfidf', tfidf),
                ('best', SelectKBest(k=1000))
            ]))
        ])
    X_features = feature_union.fit(x_train, y_train).transform(x_train)

    # The search runs over a bare MultinomialNB, so grid keys must be that
    # estimator's own parameter names. The previous keys ('univ_select__k',
    # 'mnb__alpha') address pipeline steps that do not exist in the searched
    # estimator and are rejected as invalid parameters. The k-best step lives
    # in feature_union, which is fitted above, outside the search.
    param_grid = dict(alpha=[0.01, 0.1, 1.0])
    grid = GridSearchCV(MultinomialNB(), param_grid=param_grid)
    grid.fit(X_features, y_train)
    c = grid.best_estimator_

    X_test = feature_union.transform(x_test)
    pred = np.array(c.predict(X_test))
    # Second probability column, as in the original per-row ``a[1]``.
    pred_proba = np.array(c.predict_proba(X_test))[:, 1]

    # y_test holds the labels returned alongside x_test; the original code
    # referenced ``actual``, a name never assigned in this function.
    precision, recall, fscore, support = precision_recall_fscore_support(y_test, pred)
    # ROC needs continuous scores, not hard class predictions.
    fpr, tpr, _ = roc_curve(y_test, pred_proba)
    auc_score = auc(fpr, tpr)

    now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    # Values are converted to plain Python numbers before being stored.
    metadata = {
        'pipeline': str(grid.best_estimator_),
        'created_at': now,
        'git_hash': 0,
        'precision': [float(p) for p in precision],
        'recall': [float(r) for r in recall],
        'fscore': [float(f) for f in fscore],
        'support': [int(s) for s in support],
        'auc': float(auc_score)
    }
    p = PackagedPipeline(pipeline=grid.best_estimator_, feature_union=feature_union, metadata=metadata,
                         x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)
    p.save()
0
Example 27
def GetKFeatures(filename, method='RFE', kbest=30, alpha=0.01, reduceMatrix=True):
    '''
    Gets the K best features of the data in `filename` with SelectKBest and
    prints their names (taken from the feature names returned by load_data).

    If reduceMatrix is true, the CSV is re-read (first column as index),
    reduced to the selected columns, saved to "REDUCED_Feat.csv" and returned.
    Otherwise nothing is written and None is returned.

    NOTE: only K-best selection is performed here. `method` and `alpha` are
    accepted for interface compatibility but are not used by this function
    (the other methods - RFE, RFECV, RandomizedLogisticRegression,
    ExtraTreesClassifier, mrmr - and FDR filtering are not implemented here).
    '''
    features, labels, _lb_encoder, featureNames = load_data(filename)
    X, y = features, labels

    print("Data and labels imported. PreFilter Feature matrix shape:")
    print(X.shape)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = featureNames[selectK_mask]

    # Report the shape of the filtered matrix (X itself is left untouched,
    # so printing X.shape here would repeat the pre-filter shape).
    print('X After K filter:', selectK.transform(X).shape)
    print("K_featnames: %s" %(K_featnames))

    if reduceMatrix:
        # NOTE(review): assumes the CSV columns (after index_col=0) line up
        # with the columns of X returned by load_data -- confirm.
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df

    return None
0
Example 28
Project: AWS-Lambda-ML-Microservice-Skeleton
License: View license
Source File: test_bagging.py
Function: test_bagging_with_pipeline
License: View license
Source File: test_bagging.py
Function: test_bagging_with_pipeline
def test_bagging_with_pipeline():
    """Smoke test: a BaggingClassifier over a SelectKBest/DecisionTree pipeline must fit."""
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
0
Example 29
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    # Centre the columns in place.
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # 2 SVD components + 1 selected feature
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
0
Example 30
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)

    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)

    # check against expected result
    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
0
Example 31
def test_best_parameters():
    """Check that the graph built by best_parameters stays below a size bound.

    The bound is half of (number of CV folds * number of parameter settings *
    size of the graph of a single pipeline score).
    """
    pipeline = dl.Pipeline([("pca", PCA()),
                            ("select_k", SelectKBest()),
                            ("svm", LinearSVC())])
    param_grid = {'select_k__k': [1, 2, 3, 4],
                  'svm__C': np.logspace(-3, 2, 3)}
    grid = dl.GridSearchCV(pipeline, param_grid)

    # Explicit list of the same settings that param_grid describes.
    parameter_iterable = [{'select_k__k': k, 'svm__C': x}
                          for k in [1, 2, 3, 4]
                          for x in np.logspace(-3, 2, 3)]
    cv = check_cv(grid.cv, X_train, y_train, classifier=is_classifier(pipeline))
    scorer = check_scoring(pipeline, scoring=grid.scoring)
    best = best_parameters(pipeline, cv, X_train, y_train,
                           parameter_iterable, scorer, grid.fit_params,
                           grid.iid)

    # Reference: the graph of one fit + score of the plain pipeline.
    pipeline = pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)

    assert (len(best.dask)
            < len(cv) * len(parameter_iterable) * len(score.dask) / 2)
0
Example 32
def test_pipeline(self):
    """Check that a SelectKBest + StatsModelsRegressor pipeline matches the manual computation."""
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    from sklearn.pipeline import Pipeline

    diabetes = datasets.load_diabetes()
    models = ['OLS', 'GLS', 'WLS', 'GLSAR', 'QuantReg', 'GLM', 'RLM']

    for model in models:
        # Look the model class up by name on the ``sm`` module.
        klass = getattr(sm, model)

        selector = SelectKBest(f_regression, k=5)
        estimator = Pipeline([('selector', selector),
                              ('reg', base.StatsModelsRegressor(klass))])
        estimator.fit(diabetes.data, diabetes.target)
        result = estimator.predict(diabetes.data)

        # Expected: same selection done by hand, then the model fitted directly.
        data = SelectKBest(f_regression, k=5).fit_transform(diabetes.data, diabetes.target)
        expected = klass(diabetes.target, data).fit().predict(data)
        self.assert_numpy_array_almost_equal(result, expected)