Here are examples of the Python API `sklearn.feature_selection.SelectKBest.fit`, taken from open-source projects. By voting up, you can indicate which examples are most useful and appropriate.
6 Examples
3
Example 1
def test_feature_selection():
    """Chi2 SelectKBest keeps only the informative features, and
    DictVectorizer.restrict drops the rest — in both mask and index form."""
    noise = {"useless%d" % i: 10 for i in range(20)}
    d1 = dict(noise, useful1=1, useful2=20)
    d2 = dict(noise, useful1=20, useful2=1)
    samples = [d1, d2]
    labels = [0, 1]
    for use_indices in (True, False):
        vec = DictVectorizer().fit(samples)
        X = vec.transform(samples)
        selector = SelectKBest(chi2, k=2).fit(X, labels)
        support = selector.get_support(indices=use_indices)
        vec.restrict(support, indices=use_indices)
        assert_equal(vec.get_feature_names(), ["useful1", "useful2"])
3
Example 2
Project: scikit-learn Source File: test_feature_select.py
def test_tied_scores():
    """With all-equal chi2 scores, SelectKBest must sort stably and keep
    the *last* k columns."""
    X_train = np.array([[0, 0, 0], [1, 1, 1]])
    y_train = [0, 1]
    for k in (1, 2, 3):
        selector = SelectKBest(chi2, k=k).fit(X_train, y_train)
        kept = selector.transform([[0, 1, 2]])
        assert_array_equal(kept[0], np.arange(3)[-k:])
3
Example 3
Project: scikit-learn Source File: test_feature_select.py
def test_invalid_k():
    """A k outside [0, n_features] must raise ValueError at fit time, for
    both SelectKBest and the GenericUnivariateSelect 'k_best' mode."""
    X = [[0, 1, 0], [0, -1, -1], [0, .5, .5]]
    y = [1, 0, 1]
    for bad_k in (-1, 4):  # below zero, and above the 3 available features
        assert_raises(ValueError, SelectKBest(k=bad_k).fit, X, y)
        assert_raises(ValueError,
                      GenericUnivariateSelect(mode='k_best', param=bad_k).fit,
                      X, y)
3
Example 4
Project: scikit-learn Source File: test_feature_select.py
def test_no_feature_selected():
    """On pure-noise data every strict selector should keep zero features,
    and transform() should warn while returning an (n_samples, 0) array."""
    rng = np.random.RandomState(0)
    # Random uncorrelated data: a strict univariate test should reject all
    # the features.
    X = rng.rand(40, 10)
    y = rng.randint(0, 4, size=40)
    make_strict = (
        lambda: SelectFwe(alpha=0.01),
        lambda: SelectFdr(alpha=0.01),
        lambda: SelectFpr(alpha=0.01),
        lambda: SelectPercentile(percentile=0),
        lambda: SelectKBest(k=0),
    )
    empty_support = np.zeros(10)
    for factory in make_strict:
        selector = factory().fit(X, y)
        assert_array_equal(selector.get_support(), empty_support)
        X_selected = assert_warns_message(
            UserWarning, 'No features were selected', selector.transform, X)
        assert_equal(X_selected.shape, (40, 0))
0
Example 5
Project: ProFET Source File: OutPutRes.py
def GetAllPerf (filePaths=None):
    """Evaluate classifier performance for every training-set CSV found.

    Pipeline per file: load features/labels, univariate feature filtering
    (SelectKBest then SelectFwe), optional heavier feature selection
    (randomized logistic regression / SVM / RFE(-CV) — all disabled by
    default via local flags), dummy-classifier baselines, grid-search of
    the best model separately for f1 and for accuracy, cross-validated
    scoring of both, and collection of all results into a pandas DataFrame
    written to 'OutputData.tsv'.

    Parameters
    ----------
    filePaths : list of str or None
        Paths to 'trainingSetFeatures.csv' files; when None they are
        discovered under './test_seq' via find_files.

    NOTE(review): this listing was recovered without indentation; the block
    structure below is a reconstruction — confirm against the original
    ProFET OutPutRes.py.
    """
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))
    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']
    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)
    # One row per input file; columns are filled in as each file is processed.
    resDict = pd.DataFrame(index=fileNames,
                           columns=['Accuracy','Accuracy_SD',
                                    'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
                                    'LargestClassPercent','Classes',
                                    # 'TopRFE-Features','Best (f1) Model parameters',
                                    '# Classes',
                                    'Array-Acc-Scores' ,'Array-f1-Scores'
                                    ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])
    #redDict holds results for each file/class, for saving to output-file
    i=-1  # manual index into fileNames, advanced once per file below
    for filePath in filePaths:
        i +=1
        'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/'
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i]) #Str added now 14.1
        print("fileName: %s" %(fileName))
        "resDict['Name']= fileName"
        # filePath = str(argv[1])
        # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels
        X, y, lb_encoder,featureNames = load_data(filePath) # X, y = features, labels
        print(X.shape,"= (samples, features)")
        # Class balance: percentage share of the most common class.
        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)
        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)
        KFilt=None
        KFilt=350 #This is just temporary for the outputs - saves computation time. Barely filters compared to the model itself.
        if KFilt is not None:
            # Keep only the KFilt best features by univariate score.
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]
        # Family-wise-error filter applied after the K-best cut.
        # NOTE(review): whether these three lines belong inside the
        # 'if KFilt' block is ambiguous in the flattened listing — confirm.
        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]
        print("X reduced to K best features: ",X.shape)
        # Optional model-based feature selection; both paths disabled by default.
        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False
        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
                sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)
        elif FeatSelection_SVM == True:
            # NOTE(review): penalty="l2" despite the 'L1 SVM' label printed
            # below — confirm which was intended.
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)
        # X=X_L1
        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''
        'EG - graph best features; feature selection using RF, ensemble classifiers..'
        'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb'
        # Optional recursive feature elimination (plain or cross-validated);
        # disabled by default via the two flags below.
        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False
        if (FeatSelection_RFE or FeatSelection_RFECV) == True:
            'RFE + - best feats'
            'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
            svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)
            if FeatSelection_RFECV==True:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
                # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3))
                #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV==True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            X_RFE = rfecv.fit_transform(X, y)
            print("X_RFE",X_RFE.shape)
            resDict['TopRFE-Features'][fileName]=str(rfe_featnames)
        'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
        GetRFEPerf=False
        # print("lb_encoder.classes_",lb_encoder.classes_)
        'Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb '
        'Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/'
        'http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators'
        "http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html"
        print()
        "Make custom F1 scorer. May not have fixed problem!"
        # NOTE(review): 'sklearn.metrics.score' is not a public sklearn
        # module — this import most likely should be
        # 'from sklearn.metrics import make_scorer'; confirm and fix.
        from sklearn.metrics.score import make_scorer
        f1_scorer = make_scorer(metrics.f1_score,
                                greater_is_better=True, average="micro") #Maybe another metric? May NOT be fixed!?. #weighted, micro, macro, none
        # print("Dummy classifiers output:")
        # Baseline: always predict the most frequent class.
        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted'))
        dummy_freq_f1_weighted = '{:.3}'.format(f1_scorer(y, y_dummyPred))
        #Get from ALL classes f1..
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        # print("Dummy, most frequent acc:",dummy_freq_acc)
        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()
        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
        ## resDict['dummy_freq:f1'][fileName]=dummy_freq_f1 dummy_freq_f1_mean
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean
        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2
        "We can get seperately the best model for Acc, and the best for f1!"
        "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')
        "Temporary workaround until next SKlearn update of F1 metric:"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer
        # Grid-search the best model twice, once per target metric.
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)
        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc)
        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)
        if GetRFEPerf==True:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')
        "Modified to get 2 estimators"
        # Cross-validated scores for each winning estimator.
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))
        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)
        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst
        print()
        # print(fileName," Done")
    # NOTE(review): the save could plausibly sit inside the loop (incremental
    # saves per file); placed after it here — confirm against the original.
    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep=',')
0
Example 6
Project: ProFET Source File: VisualizeBestFeatures.py
def main(args):
    """Visualize the most important features of a training set.

    Loads 'trainingSetFeatures.csv' from args.train_dir, label-encodes the
    class column, trims the features with SelectFwe then SelectKBest,
    cross-validates a random forest on the trimmed set, runs RFECV to find
    a compact feature subset, and plots the importances of the surviving
    features.

    Parameters
    ----------
    args : argparse-style namespace; only args.train_dir is read here.

    NOTE(review): indentation was reconstructed from a flattened listing —
    confirm block boundaries against the original VisualizeBestFeatures.py.
    """
    if args.train_dir is None:
        # args.train_dir = '/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/chap/train/'
        #args.train_dir = '/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SPCleaved_NP-70+NEG-30_Big-V3/'
        # args.train_dir = r'D:\SkyDrive\Dropbox\bioInf_lab\AA_info\CODE\feat_extract\test_seq\NP\SPCleaved_NP-70+NEG-30_Big-V3'
        # args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\NP\SP_Cleaved+NP+Neg_Big'
        args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles'
        print("Using default train_dir: %s" % args.train_dir)
    # Display / plotting tweaks.
    pandas.set_option('display.max_columns', 10)
    pandas.set_option('display.max_rows', 4)
    # mpl.rc('title', labelsize=6)
    mpl.rc('ytick', labelsize=7)
    mpl.rc('xtick', labelsize=4)
    os.chdir(args.train_dir)
    dataName = 'Neuropeptides'
    df = pandas.read_csv('trainingSetFeatures.csv')
    # Every column except label/id columns is a feature.
    feature_cols = [col for col in df.columns if col not in ['classname','Id','proteinname']]
    feature_cols=numpy.array(feature_cols)
    X = df[feature_cols].values
    y = df.classname.values
    le = LabelEncoder()
    y = le.fit_transform(y)
    "Initial feature selection trimming"
    print(X.shape)
    # Drop features failing a family-wise-error-controlled univariate test.
    Fwe = SelectFwe(alpha=0.01).fit(X,y)
    X=Fwe.transform(X)
    print("F-test -> ",X.shape)
    feature_cols=feature_cols[Fwe.get_support()]
    '''
    FeatSelection_SVM = True
    if FeatSelection_SVM == True:
        svc_L1 = LinearSVC(C=50, penalty="l1", dual=False,class_weight='auto').fit(X, y)
        X = svc_L1.transform(X, y)
        print ("L1 SVM Transformed X:",X_L1.shape)
        feature_cols=feature_cols[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
    '''
    # Keep the 255 best-scoring features.
    k = SelectKBest(k=255).fit(X,y)
    X=k.transform(X)
    feature_cols=feature_cols[k.get_support()]
    # NOTE(review): param_dist is defined but never used below — the forest's
    # hyperparameters are hard-coded; possibly a leftover from a grid search.
    param_dist = {"max_depth": [6,9, None],
                  "max_features": ['auto',0.4],
                  "min_samples_leaf": [1,2,3],
                  "bootstrap": [True, False],
                  'min_samples_split':[2,3],
                  "criterion": [ "gini"],
                  "n_estimators":[100],
                  "n_jobs":[-1]}
    rf = RandomForestClassifierWithCoef(max_depth= 7, min_samples_split= 1, min_samples_leaf= 2, n_estimators= 50, n_jobs= 2, max_features= "auto")
    "WARNING! F1 Score as implemented by Default in binary classification (two classes) gives the score for 1 class."
    # Baseline cross-validated performance on the trimmed feature set.
    scores = cross_validation.cross_val_score(rf,X,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2))
    print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))
    "Instead of scores_f1, we could also use precision, sensitivity, MCC (if binary), etc'."
    scores_f1 = cross_validation.cross_val_score(rf,X,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2),scoring='f1')
    print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
    # rfeSelect = RFE(estimator=rf,n_features_to_select=16, step=0.04)
    # Cross-validated recursive feature elimination, 20 features per step.
    rfeSelect = RFECV(estimator=rf,step=20, cv=2,scoring='f1') #average_precision , recall
    X_RFE = rfeSelect.fit_transform(X,y)
    print(X_RFE.shape)
    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print(RFE_FeatureNames)
    # Percentage of the full-set f1 retained by the reduced feature set.
    RFE_ScoreRatio = 100*(cross_validation.cross_val_score(rf,X_RFE,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2),scoring='f1').mean())/scores_f1.mean()
    print("Even with just",X_RFE.shape[1]," features, we have %f performance! (f1 score ratio)" %(RFE_ScoreRatio))
    # PlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
    print("Alt plot:")
    altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)