In [1]:
# standard library
import math
import random
import time
import datetime as dt
from operator import itemgetter
from subprocess import check_output

import numpy as np
import pandas as pd
from numpy import genfromtxt
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing, model selection, metrics
from sklearn import preprocessing
from sklearn.model_selection import (train_test_split, cross_val_score,
                                     GridSearchCV, ShuffleSplit)
from sklearn.metrics import (mean_squared_error, average_precision_score,
                             roc_curve, auc, recall_score, precision_score)

# models
from sklearn import tree, svm, linear_model
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier,
                              ExtraTreesClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, GradientBoostingRegressor,
                              VotingClassifier)
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# toy-dataset generators (from scikit-learn examples)
from sklearn.datasets import make_blobs, make_hastie_10_2, make_friedman1
In [2]:
# Optionally download features.csv from S3 (SageMaker); bucket/prefix redacted.
#bucket = 'XXX'
#prefix = 'XXX/XXX'
#import boto3
#import re
#from sagemaker import get_execution_role
#role = get_execution_role()
#s3 = boto3.client('s3')
#s3.download_file(bucket, 'features.csv', 'features.csv')

data = pd.read_csv('features.csv')
# drop posts from the last 15 days (post ids above this threshold)
data.drop(data[data.Pid > 57516454].index, inplace=True)
data.drop(['Pid', 'Row'], axis=1, inplace=True)
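Before modeling, it helps to sanity-check what was loaded. A minimal sketch (not part of the original run), assuming `Resolved` is the binary target as used below:

In [ ]:
# sketch: verify row/column counts and class balance of the target
print(data.shape)
print(data['Resolved'].value_counts(normalize=True))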
In [3]:
# ablation: drop the newly engineered features ("without new features" run)
data.drop(['CodeSnippetsLen', 'Paragraphs', 'isQuestion', 'HasError', 'HasQuote', 'HasList', 'LOC',
           'min_quality', 'max_expert_ratio', 'max_quality', 'max_problem_rate', 'avg_quality',
           'QuestionAge', 'OwnerQuestionsScore', 'OwnerAnswersScore',
           'HadYearling', 'HadNiceAnswer', 'HadNiceQuestion', 'HadStudent',
           'HadAnalytical', 'HadEnthusiast', 'HadCommentator', 'HadAutobiographer', 'HadCurious',
           'HadPopularQuestion', 'HadFamousQuestion', 'HadNotableQuestion', 'HadGoodAnswer', 'HadGreatAnswer',
           'HadNecromancer', 'HadScholar', 'HadCustodian', 'HadEditor', 'HadCritics', 'HadSupporter',
           'HadTumbleweed', 'HadInformed', 'HadTeacher', 'BodySentenceCount'], axis=1, inplace=True)
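A quick check that the ablation left the expected feature set behind; a sketch, assuming the drop above succeeded:

In [ ]:
# sketch: confirm which columns remain after dropping the new features
print(len(data.columns), 'columns remaining')
print(sorted(data.columns))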
In [4]:
# candidate models
features = list(data.columns.values)
features.remove('Resolved')
y_train = data['Resolved']

clf1 = GaussianNB()
# Ridge, Lasso and BayesianRidge are regressors; their continuous
# predictions are scored with ROC AUC as ranking scores.
clf2 = linear_model.Ridge(alpha=0.3)
clf3 = linear_model.Lasso(alpha=0.6)
clf4 = linear_model.BayesianRidge()
clf5 = DecisionTreeClassifier(max_depth=10, min_samples_split=100000, random_state=0)

# earlier experiments, kept for reference:
#clf4 = linear_model.LassoLars(alpha=0.1)
#clf6 = SGDClassifier(loss="log", max_iter=5, n_jobs=-1)
#clf = NearestCentroid()
#clf7 = MLPClassifier(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(3, 2), random_state=0)
#dtrain = xgb.DMatrix(train[features], y_train, missing=-1)
#clf = svm.SVC()
#clf.score(data[features], y_test)
#clf = tree.DecisionTreeRegressor()
#clf8 = SGDClassifier(loss="hinge", penalty="l2", max_iter=1000, n_jobs=-1)
#clf = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
#clf = RandomForestClassifier(n_estimators=70, max_depth=10)
#clf6 = KNeighborsClassifier(n_neighbors=3)
#clf7 = QuadraticDiscriminantAnalysis()
#kernel = 1.0 * RBF(1.0)
#clf8 = GaussianProcessClassifier(kernel=kernel, random_state=0)
#clf9 = svm.SVC()
#clf10 = RandomForestClassifier(n_estimators=30, max_depth=10)
#clf10 = DecisionTreeClassifier(max_depth=10, min_samples_split=10000, random_state=0)
#clf11 = DecisionTreeClassifier(max_depth=10, min_samples_split=1000, random_state=0)
#clf12 = DecisionTreeClassifier(max_depth=7, min_samples_split=100000, random_state=0)
#clf = AdaBoostClassifier(n_estimators=100)
#clf = GradientBoostingClassifier(n_estimators=80, max_depth=7, min_samples_split=10000, random_state=0)
#clf = GradientBoostingRegressor(n_estimators=100, max_depth=7, min_samples_split=100, random_state=0, loss='ls')
#clf = GradientBoostingClassifier(n_estimators=100, max_depth=200, min_samples_split=100, random_state=0)
#clf1 = LogisticRegression(random_state=0)
#clf2 = RandomForestClassifier(n_estimators=70, max_depth=10)
#clf3 = GaussianNB()
#clf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
#clf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft', weights=[1, 2.5, 1])
#eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')
#clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
#clf = clf.fit(data[features], y_train)
#clf = AdaBoostClassifier(n_estimators=100)
#clf0 = GradientBoostingClassifier(n_estimators=100, max_depth=5, min_samples_split=100, random_state=0)

# 10-split shuffled cross-validation, ROC AUC for each candidate model
cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
for name, clf in [('GaussianNB', clf1), ('Ridge', clf2), ('Lasso', clf3),
                  ('BayesianRidge', clf4), ('DecisionTreeClassifier', clf5)]:
    scores = cross_val_score(clf, data[features], y_train, cv=cv,
                             scoring='roc_auc', n_jobs=-1)
    print("AUC %s: %0.2f ± %0.2f" % (name, scores.mean(), scores.std() * 2))
    print(scores)
AUC GaussianNB: 0.55 ± 0.00
[0.55277102 0.55266268 0.55333704 0.55303093 0.55318329 0.55312786
 0.55282189 0.55348395 0.5531132  0.55382223]
AUC Ridge: 0.56 ± 0.00
[0.56293949 0.56281695 0.5638043  0.56328891 0.56329171 0.56321492
 0.56324228 0.56406467 0.56305277 0.56328155]
AUC Lasso: 0.51 ± 0.00
[0.51130848 0.51209834 0.51074356 0.51136845 0.51101295 0.5103669
 0.51077746 0.51042552 0.5104577  0.51178685]
AUC BayesianRidge: 0.56 ± 0.00
[0.56293775 0.56282062 0.56380331 0.56329197 0.56329634 0.56320993
 0.56324338 0.56406721 0.56305439 0.5632838 ]
AUC DecisionTreeClassifier: 0.64 ± 0.00
[0.6437019  0.64387454 0.64349041 0.64275567 0.64372305 0.64349979
 0.64296187 0.64376884 0.64312921 0.64339253]
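The decision tree is the strongest of the five candidates (AUC ≈ 0.64). A sketch for inspecting which features drive it, assuming clf5, data, features, and y_train from the cells above; this cell was not part of the original run:

In [ ]:
# sketch: fit the best model on the full data and rank its feature importances
clf5.fit(data[features], y_train)
importances = sorted(zip(clf5.feature_importances_, features), reverse=True)
for imp, name in importances[:15]:
    print("%-30s %.4f" % (name, imp))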