In [1]:
import datetime as dt
import math
import random
import time
from operator import itemgetter
from subprocess import check_output

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy import genfromtxt

from sklearn import linear_model, preprocessing, svm, tree
from sklearn.datasets import make_blobs, make_friedman1, make_hastie_10_2
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              GradientBoostingRegressor, RandomForestClassifier,
                              VotingClassifier)
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (auc, average_precision_score, mean_squared_error,
                             precision_score, recall_score, roc_curve)
from sklearn.model_selection import (GridSearchCV, ShuffleSplit,
                                     cross_val_score, train_test_split)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
In [2]:
# Optional: download the feature file from S3 when running on SageMaker.
#bucket = 'XXX'
#prefix = 'XXX/XXX'
#import boto3
#import re
#from sagemaker import get_execution_role
#role = get_execution_role()
#s3 = boto3.client('s3')
#s3.download_file(bucket, 'features.csv', 'features.csv')

data = pd.read_csv('features.csv')
data.drop(data[data.Pid > 57516454].index, inplace=True)  # remove the last 15 days
data.drop(['Pid', 'Row'], axis=1, inplace=True)           # drop identifier columns
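
Before modeling, it helps to confirm the frame's shape and the base rate of the Resolved target, since the ROC AUC scores below are easier to read against it. A minimal sanity-check sketch, assuming Resolved is a binary 0/1 label (an assumption; adapt if it is encoded differently):

# Sanity check (sketch): shape and class balance of the target.
# Assumes 'Resolved' is binary 0/1; AUC ranks predictions against this base rate.
print(data.shape)
print(data['Resolved'].value_counts(normalize=True))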
In [3]:
# Models
features = list(data.columns.values)
features.remove('Resolved')
y_train = data['Resolved']  # full target; ShuffleSplit below makes the train/test splits

# Candidate models. Ridge, Lasso and BayesianRidge are regressors: when scored
# with ROC AUC, their continuous predictions are used directly as ranking scores.
clf1 = GaussianNB()
clf2 = linear_model.Ridge(alpha=0.3)
clf3 = linear_model.Lasso(alpha=0.6)
clf4 = linear_model.BayesianRidge()
clf5 = DecisionTreeClassifier(max_depth=10, min_samples_split=100000, random_state=0)

# Other candidates tried during experimentation:
#clf = linear_model.LassoLars(alpha=0.1)
#clf = SGDClassifier(loss="log", max_iter=5, n_jobs=-1)
#clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=1000, n_jobs=-1)
#clf = NearestCentroid()
#clf = MLPClassifier(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(3, 2), random_state=0)
#clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
#dtrain = xgb.DMatrix(train[features], y_train, missing=-1)
#clf = svm.SVC()
#clf = tree.DecisionTreeRegressor()
#clf = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
#clf = KNeighborsClassifier(n_neighbors=3)
#clf = QuadraticDiscriminantAnalysis()
#kernel = 1.0 * RBF(1.0)
#clf = GaussianProcessClassifier(kernel=kernel, random_state=0)
#clf = RandomForestClassifier(n_estimators=70, max_depth=10)
#clf = RandomForestClassifier(n_estimators=30, max_depth=10)
#clf = DecisionTreeClassifier(max_depth=10, min_samples_split=10000, random_state=0)
#clf = DecisionTreeClassifier(max_depth=10, min_samples_split=1000, random_state=0)
#clf = DecisionTreeClassifier(max_depth=7, min_samples_split=100000, random_state=0)
#clf = AdaBoostClassifier(n_estimators=100)
#clf = GradientBoostingClassifier(n_estimators=80, max_depth=7, min_samples_split=10000, random_state=0)
#clf = GradientBoostingClassifier(n_estimators=100, max_depth=200, min_samples_split=100, random_state=0)
#clf = GradientBoostingClassifier(n_estimators=100, max_depth=5, min_samples_split=100, random_state=0)
#clf = GradientBoostingRegressor(n_estimators=100, max_depth=7, min_samples_split=100, random_state=0, loss='ls')
#clf = VotingClassifier(estimators=[('lr', LogisticRegression(random_state=0)),
#                                   ('rf', RandomForestClassifier(n_estimators=70, max_depth=10)),
#                                   ('gnb', GaussianNB())],
#                       voting='hard')  # also tried voting='soft' with weights=[1, 2.5, 1]

cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
for name, clf in [('GaussianNB', clf1), ('Ridge', clf2), ('Lasso', clf3),
                  ('BayesianRidge', clf4), ('DecisionTreeClassifier', clf5)]:
    scores = cross_val_score(clf, data[features], y_train, cv=cv,
                             scoring='roc_auc', n_jobs=-1)
    print("AUC %s: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))
    print(scores)
AUC GaussianNB: 0.61 (+/- 0.00)
[0.60994334 0.61078714 0.61076934 0.61004258 0.61124516 0.60958641
 0.61012749 0.61031882 0.61028267 0.61044549]
AUC Ridge: 0.68 (+/- 0.00)
[0.67542747 0.67605748 0.67589748 0.67473962 0.67630616 0.67547069
 0.67532492 0.67560379 0.67561293 0.67574176]
AUC Lasso: 0.62 (+/- 0.00)
[0.62132539 0.62181082 0.62199835 0.62110897 0.62234805 0.62089516
 0.62127361 0.62157686 0.62180032 0.62154172]
AUC BayesianRidge: 0.68 (+/- 0.00)
[0.67542749 0.67605765 0.67589755 0.67473951 0.67630621 0.67547064
 0.67532504 0.6756037  0.67561286 0.67574182]
AUC DecisionTreeClassifier: 0.68 (+/- 0.00)
[0.68045745 0.68099503 0.68048889 0.67949462 0.68099736 0.68015547
 0.68006187 0.680643   0.68021852 0.68064268]
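
The decision tree edges out the linear baselines at roughly 0.68 AUC, which makes it the natural candidate for tuning. A minimal GridSearchCV sketch over depth and split size, reusing the ShuffleSplit defined above; the grid values are illustrative assumptions, not tuned choices from this run:

# Tuning sketch (hypothetical grid; values are illustrative only)
param_grid = {'max_depth': [5, 7, 10, 15],
              'min_samples_split': [1000, 10000, 100000]}
grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid,
                    scoring='roc_auc', cv=cv, n_jobs=-1)
grid.fit(data[features], y_train)
print(grid.best_params_, grid.best_score_)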
In [4]: