In [1]:
import numpy as np
import pandas as pd
import random
import time
import math
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from operator import itemgetter
from subprocess import check_output
from numpy import genfromtxt
from sklearn import preprocessing, tree, svm, linear_model
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     cross_val_score, ShuffleSplit)
from sklearn.metrics import (mean_squared_error, average_precision_score,
                             roc_curve, auc, recall_score, precision_score)
from sklearn.datasets import make_blobs, make_hastie_10_2, make_friedman1
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier,
                              ExtraTreesClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, GradientBoostingRegressor,
                              VotingClassifier)
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
In [2]:
# Download the feature file from S3 (disabled; assumes features.csv is already local).
#bucket = 'XXX'
#prefix = 'XXX/XXX'
#import boto3
#import re
#from sagemaker import get_execution_role
#role = get_execution_role()
#s3 = boto3.client('s3')
#s3.download_file(bucket, 'features.csv', 'features.csv')

data = pd.read_csv('features.csv')
data.drop(data[data.Pid > 57516454].index, inplace=True)  # drop posts from the last 15 days
data.drop(['Pid', 'Row'], axis=1, inplace=True)
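Before modelling, a quick sanity-check cell can confirm what was loaded. This is a minimal sketch; it assumes the frame has a binary `Resolved` label (as the later cells use it).

In [ ]:
# Sanity check on the loaded frame (assumes 'Resolved' is the binary target).
print(data.shape)
print(data['Resolved'].value_counts(normalize=True))
data.head()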
In [3]:
# Ablation: drop the newly engineered features, keeping only the baseline set.
data.drop(['WordCountBody', 'WordCountTitle', 'HasWhQuestion', 'CodeSnippets', 'isCapital',
           'isBodyCapital', 'Links', 'BodyAVGSentence', 'BodyAVGWord', 'TitleAVGWord', 'TagCount',
           'SOExperience', 'AskingWeekDay', 'AskingHour', 'OwnerQuestions',
           'OwnerAnswers', 'OwnerAcceptedAnswers'], axis=1, inplace=True)
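A one-liner to verify which columns survive the drop (a sketch; the exact baseline column names depend on features.csv):

In [ ]:
# Columns remaining after the ablation.
print(data.columns.tolist())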
In [4]:
# Models: score several baselines with cross-validation on the full feature set.
features = list(data.columns.values)
features.remove('Resolved')
y_train = data['Resolved']  # labels for the full dataset; the CV below does the splitting

clf1 = GaussianNB()
clf2 = linear_model.Ridge(alpha=0.3)
clf3 = linear_model.Lasso(alpha=0.6)
#clf4 = linear_model.LassoLars(alpha=0.1)
clf4 = linear_model.BayesianRidge()
#clf6 =SGDClassifier(loss="log", max_iter=5,n_jobs=-1)
#clf = NearestCentroid()
#clf7 = MLPClassifier(solver='lbfgs', alpha=1e-4,hidden_layer_sizes=(3, 2), random_state=0)
#dtrain = xgb.DMatrix(train[features], y_train, missing=-1)
#clf = svm.SVC()
#clf.score(data[features], y_test)
#clf = tree.DecisionTreeRegressor()
#clf8 = SGDClassifier(loss="hinge", penalty="l2", max_iter=1000,n_jobs=-1)
#clf = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5)
#clf = RandomForestClassifier(n_estimators=70,max_depth=10)
clf5 = DecisionTreeClassifier(max_depth=10, min_samples_split=100000, random_state=0)
#clf6=KNeighborsClassifier(n_neighbors=3)
#clf7 = QuadraticDiscriminantAnalysis()
#kernel = 1.0 * RBF(1.0)
#clf8 = GaussianProcessClassifier(kernel=kernel,random_state=0)
#clf9 = svm.SVC()
#clf10 = RandomForestClassifier(n_estimators=30,max_depth=10)
#clf10 = DecisionTreeClassifier(max_depth=10, min_samples_split=10000, random_state=0)
#clf11 = DecisionTreeClassifier(max_depth=10, min_samples_split=1000, random_state=0)
#clf12 = DecisionTreeClassifier(max_depth=7, min_samples_split=100000, random_state=0)
#clf = AdaBoostClassifier(n_estimators=100)
#clf = GradientBoostingClassifier(n_estimators=80,max_depth=7,min_samples_split=10000, random_state=0)
#clf = GradientBoostingRegressor(n_estimators=100, max_depth=7,min_samples_split=100, random_state=0, loss='ls')
#clf = GradientBoostingClassifier(n_estimators=100, max_depth=200,min_samples_split=100, random_state=0)
#clf1 = LogisticRegression(random_state=0)
#clf2 = RandomForestClassifier(n_estimators=70,max_depth=10)
#clf3 = GaussianNB()
#clf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],voting='hard')
#clf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],voting='soft', weights=[1, 2.5, 1])
#eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],voting='soft')
#clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
#clf = clf.fit(data[features], y_train)
#clf=AdaBoostClassifier(n_estimators=100)
#clf0=GradientBoostingClassifier(n_estimators=100,max_depth=5,min_samples_split=100, random_state=0)

# One ShuffleSplit CV run per model. Note: Ridge, Lasso and BayesianRidge are
# regressors; the roc_auc scorer ranks their continuous predictions directly.
models = [('GaussianNB', clf1), ('Ridge', clf2), ('Lasso', clf3),
          ('BayesianRidge', clf4), ('DecisionTreeClassifier', clf5)]
cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
results = {}
for name, clf in models:
    scores = cross_val_score(clf, data[features], y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
    results[name] = scores
    print("AUC %s: %0.2f ± %0.2f" % (name, scores.mean(), scores.std() * 2))
    print(scores)
AUC GaussianNB: 0.61 ± 0.00
[0.61127381 0.61207757 0.61227318 0.61150377 0.61259991 0.61097553
 0.61150396 0.61165697 0.61154221 0.61180107]
AUC Ridge: 0.67 ± 0.00
[0.66926208 0.67000393 0.66968944 0.6685762  0.67022925 0.66924955
 0.66907674 0.66938508 0.6694791  0.66965616]
AUC Lasso: 0.62 ± 0.00
[0.61507316 0.61582768 0.61569704 0.61508921 0.61620193 0.61468971
 0.61506399 0.61540022 0.61550661 0.61542737]
AUC BayesianRidge: 0.67 ± 0.00
[0.6692621  0.67000391 0.66968943 0.66857617 0.67022923 0.66924951
 0.66907675 0.66938507 0.66947912 0.66965618]
AUC DecisionTreeClassifier: 0.68 ± 0.00
[0.6776676  0.67824791 0.67799348 0.67721683 0.67839266 0.67758505
 0.67754236 0.67800815 0.67775865 0.67784561]
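The decision tree gives the best mean AUC (0.68), with Ridge and BayesianRidge close behind at 0.67. A minimal sketch for visualizing the per-split scores collected in `results` above (assumes the CV loop has run; matplotlib is imported in cell 1):

In [ ]:
# Boxplot of the 10 per-split AUC scores for each model.
names = [name for name, _ in models]
fig, ax = plt.subplots(figsize=(8, 4))
ax.boxplot([results[n] for n in names])
ax.set_xticklabels(names, rotation=20)
ax.set_ylabel('ROC AUC')
ax.set_title('Per-split ROC AUC, ShuffleSplit cross-validation')
plt.tight_layout()
plt.show()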
In [ ]: