Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Univariate feature selection on the spreadsheet "dane_zawaly.xlsx".
#
# The script loads the workbook that sits next to this file, treats the
# 'Klasa' column as the class label and every other column as a candidate
# feature, keeps the 5 features with the highest ANOVA F-value and prints the
# feature count before and after the selection.
import os
import pprint as pp

import numpy as np
import pandas as pd
from scipy.stats import kstest, ks_2samp
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Column holding the class label; it is passed as the target to the selector.
TARGET_COLUMN = 'Klasa'

# features_dic is not used by the code visible here; results_dic is only
# referenced by the disabled Kolmogorov-Smirnov ranking further down.
features_dic = {}
results_dic = {}

# Resolve the workbook relative to this script so the current working
# directory does not matter.
script_dir = os.path.dirname(__file__)
rel_path = "dane_zawaly.xlsx"
abs_file_path = os.path.join(script_dir, rel_path)

# Only the first 901 data rows of the sheet are read.
dataExcel = pd.read_excel(abs_file_path, nrows=901)
df = pd.DataFrame(dataExcel)

# Drop the target by name instead of assuming it is the last column, so the
# label can never end up inside the feature matrix.
feature_data = df.drop(columns=[TARGET_COLUMN])
diagnose_classes = np.array(df[TARGET_COLUMN])

# Disabled alternative: rank every feature with a Kolmogorov-Smirnov test and
# sort by the test statistic, largest first.
# NOTE: DataFrame.iteritems() was removed in pandas 2.0, hence items() below.
# NOTE(review): as written the kstest line would overwrite the ks_2samp
# result; confirm which test is intended before re-enabling.
# for feature, values in feature_data.items():
#     result = ks_2samp(values, diagnose_classes)
#     result = kstest(values, 'norm')
#     results_dic[feature] = result
# ranking = sorted([(feature, result) for feature, result in results_dic.items()], key=lambda z: z[1][0], reverse=True)
# pp.pprint(ranking)

# Create a SelectKBest object that keeps the five features with the best
# ANOVA F-values.
fvalue_selector = SelectKBest(f_classif, k=5)

# Fit the selector on the features and target and keep only the selected
# feature columns.
feature_data_kbest = fvalue_selector.fit_transform(feature_data, diagnose_classes)

print('Original number of features:', feature_data.shape[1])
print('Reduced number of features:', feature_data_kbest.shape[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement