# https://github.com/cuplv/Discriminer
# Tip revision: 1d1fa58242c8ef28a7ae69157861161d391fad09 authored by Saeid Tizpaz Niari on 15 December 2018, 18:23:49 UTC
# Update README.md
# Update README.md
# Tip revision: 1d1fa58
# Classify.py
import numpy as np
from sklearn import preprocessing, cross_validation, neighbors, tree
import pandas as pd
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from sklearn import decomposition
import argparse
# Command-line interface.  Every option is optional; each row below is
# (flag, help text, default).  --filename has no default, so it is None when
# the flag is not given.
argparser = argparse.ArgumentParser()
_cli_options = (
    ("--filename", "input_file", None),
    ("--kfolds", "number of k", "20"),
    ("--depth", "depth of tree", ""),
    ("--output", "name of output", "tmp"),
)
for _flag, _help_text, _fallback in _cli_options:
    argparser.add_argument(_flag, help=_help_text, default=_fallback, required=False)
args = argparser.parse_args()
def _parse_optional_int(text, default, error_message):
    """Convert a user-supplied option to an int, or return ``default`` when blank.

    Args:
        text: Raw option value (typed at a prompt or given on the command line).
        default: Value returned when ``text`` is the empty string.
        error_message: Message reported when ``text`` is not a valid integer.

    Returns:
        ``default`` for an empty ``text``, otherwise ``int(text)``.

    Raises:
        SystemExit: If ``text`` is non-empty and ``int(text)`` raises ValueError.
    """
    if text == "":
        return default
    try:
        return int(text)
    except ValueError:
        # Stop here with the message: carrying on would leave the setting
        # unbound and fail later with an unrelated NameError.
        raise SystemExit(error_message)


# Python 2 names the line-reading prompt raw_input; Python 3 calls it input.
try:
    _prompt = raw_input
except NameError:
    _prompt = input

# Collect the data set name, the number of folds and the tree depth, then load
# the CSV (indexed by its 'id' column).  Without --filename the values are asked
# for interactively and the file is looked up under Classification_input/;
# with --filename the command-line values are used and the path is taken as-is.
if args.filename is None:
    filename = _prompt("Enter the name of your input data set (.csv) without file type: ")
    kfolds = _prompt("Please enter the number of random folds for cross-validation step (default is 20)? ")
    kfolds_numbers = _parse_optional_int(kfolds, 20, "K-fold should be an integer!")
    max_depth_tree = _prompt("Please enter the maximum depth of tree (do not specify any number if default value of algorithm is the best)? ")
    max_depth_tree_num = _parse_optional_int(max_depth_tree, None, "Max depth should be integer!")
    df = pd.read_csv("Classification_input/" + filename + ".csv", index_col='id')
else:
    filename = args.filename
    kfolds = args.kfolds
    kfolds_numbers = _parse_optional_int(kfolds, 20, "K-fold should be an integer!")
    max_depth_tree = args.depth
    max_depth_tree_num = _parse_optional_int(max_depth_tree, None, "Max depth should be integer!")
    df = pd.read_csv(filename, index_col='id')
# Feature names: every column except the class label and the sample weight.
# list.remove raises ValueError if either column is missing from the CSV.
header = list(df.columns.values)
header.remove('label')
header.remove('weight')

# Build X as the feature columns (in header order) followed by 'weight' as the
# LAST column.  The training loop reads the weights from column -1, so placing
# it explicitly keeps that correct wherever 'weight' sits in the CSV, and keeps
# the remaining columns aligned with the names in `header`.
X = np.array(df[header + ['weight']])
y = np.array(df['label'])

# Shuffle rows and labels together; the fixed seed makes the order repeatable.
X, y = shuffle(X, y, random_state=0)
# Train three rounds of k-fold cross-validation.  In each round the tree with
# the highest weighted test accuracy is kept, exported as a Graphviz .dot file
# and its accuracy printed.
#
# NOTE(review): sklearn.cross_validation (imported at the top of the file) was
# removed in scikit-learn 0.20 in favour of sklearn.model_selection; confirm
# the scikit-learn version this script is expected to run against.

# The output location does not change between rounds, so work it out once.
if args.filename is None:
    out_prefix = "Classification_results/" + filename
else:
    out_prefix = args.output

for i in range(3):
    # Start every round from scratch so a tree from an earlier round can never
    # be exported under this round's name.
    accuracy_max = 0
    clf = None
    kf = cross_validation.KFold(len(X), n_folds=kfolds_numbers, shuffle=True, random_state=None)
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # The last column of X holds the per-sample weight; split it off so
        # it is used as a weight and not as a feature.
        train_weights = X_train[:, -1]
        X_train = np.delete(X_train, -1, axis=1)
        test_weights = X_test[:, -1]
        X_test = np.delete(X_test, -1, axis=1)

        clf_temp = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=max_depth_tree_num)
        clf_temp.fit(X_train, y_train, sample_weight=train_weights)
        accuracy = clf_temp.score(X_test, y_test, sample_weight=test_weights)

        # Always accept the first fold so `clf` is bound even when no fold
        # scores above zero; afterwards keep only strict improvements.
        if clf is None or accuracy > accuracy_max:
            accuracy_max = accuracy
            clf = clf_temp

    out = out_prefix + '_tree' + str(i) + '.dot'
    tree.export_graphviz(clf, out_file=out, feature_names=header)
    print_out = 'accuracy ' + filename + '_tree' + str(i) + ': ' + str(accuracy_max)
    print(print_out)
# Tell the user how to render the exported trees.  The .dot files live under
# Classification_results/<filename> in interactive mode, or under the --output
# prefix when the script was driven from the command line.
dot_prefix = ("Classification_results/" + filename) if args.filename is None else args.output
print("\n The program generates three trees with highest accuracy. Please run: dot -Tpng " + dot_prefix + "_treen.dot" + " -o tree.png to see the final decision tree. Please note that treen is tree0, tree1, or tree2. \n")