In [1]:
#install XGBoost:
#!conda install -y -c conda-forge xgboost
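# if conda is unavailable, pip works as well (assumes the notebook environment has internet access):
#!pip install xgboost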
In [2]:
import datetime as dt
import numpy as np
import pandas as pd 
import xgboost as xgb
import random
import time
from numpy import genfromtxt
In [3]:
#download files
#bucket = 'XXX'
#prefix = 'XXX/XXX'
#import boto3
#import re
#from sagemaker import get_execution_role
#role = get_execution_role()
#s3 = boto3.client('s3')
#s3.download_file(bucket, 'features.csv', 'features.csv')

data = pd.read_csv('features.csv')
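A quick sanity check on the loaded frame (not part of the original run) confirms the CSV parsed as expected and shows the balance of the `Resolved` label that is modelled below.
In [ ]:
print(data.shape)                                     # rows x columns
print(data['Resolved'].value_counts(normalize=True))  # class balance of the label
data.head()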
In [4]:
data.drop(data[data.Pid > 57516454].index, inplace=True)  # remove the last 15 days
data.drop(['Pid', 'Row'], axis=1, inplace=True)           # drop identifier columns that are not features
In [5]:
# drop the new (engineered) features so the model is trained without them
data.drop(['WordCountBody', 'WordCountTitle', 'HasWhQuestion', 'CodeSnippets', 'isCapital',
           'isBodyCapital', 'Links', 'BodyAVGSentence', 'BodyAVGWord', 'TitleAVGWord', 'TagCount',
           'SOExperience', 'AskingWeekDay', 'AskingHour', 'OwnerQuestions',
           'OwnerAnswers', 'OwnerAcceptedAnswers'], axis=1, inplace=True)
In [6]:
features = list(data.columns.values)
features.remove('Resolved')  # 'Resolved' is the binary label, not a feature
params = {
    "objective": "binary:logistic",  # binary classification, probability output
    "booster": "gbtree",
    "eval_metric": "auc",
    "eta": .56,                      # learning rate
    "tree_method": 'exact',
    "max_depth": 20,
    "subsample": 1,
    "colsample_bytree": 0.5,         # use half of the features per tree
    "silent": 1,                     # older-style verbosity flag
    "min_child_weight": 1,
    "n_jobs": -1,                    # sklearn-style name for the thread count
    "gamma": 15,                     # minimum loss reduction required to split
    "num_parallel_tree": 8,          # build 8 parallel trees per boosting round
}
dtrain = xgb.DMatrix(data[features], data['Resolved'], missing=-1)  # -1 marks missing values
gbm = xgb.cv(params, dtrain, num_boost_round=35, nfold=10, verbose_eval=1)
print(gbm)
[0]	train-auc:0.662685+0.000401641	test-auc:0.661907+0.00305681
[1]	train-auc:0.677973+0.00670439	test-auc:0.676975+0.0079246
[2]	train-auc:0.685254+0.00474899	test-auc:0.684042+0.00672126
[3]	train-auc:0.68956+0.00314734	test-auc:0.688081+0.00477924
[4]	train-auc:0.69314+0.0016293	test-auc:0.691414+0.00368946
[5]	train-auc:0.695315+0.00117596	test-auc:0.693456+0.00330223
[6]	train-auc:0.696675+0.00114213	test-auc:0.694666+0.00342186
[7]	train-auc:0.698043+0.000752688	test-auc:0.695879+0.00295984
[8]	train-auc:0.699155+0.000648822	test-auc:0.696825+0.00265414
[9]	train-auc:0.699989+0.000558154	test-auc:0.697586+0.00263659
[10]	train-auc:0.700691+0.000536032	test-auc:0.698215+0.00261467
[11]	train-auc:0.701423+0.000441579	test-auc:0.698836+0.00259644
[12]	train-auc:0.701975+0.000391942	test-auc:0.699226+0.00258716
[13]	train-auc:0.702518+0.000451379	test-auc:0.699672+0.00250232
[14]	train-auc:0.703067+0.000390394	test-auc:0.700111+0.0025803
[15]	train-auc:0.703472+0.000410142	test-auc:0.70041+0.00263863
[16]	train-auc:0.703822+0.000385917	test-auc:0.700723+0.00260144
[17]	train-auc:0.704133+0.000388442	test-auc:0.701007+0.00253001
[18]	train-auc:0.704334+0.000391589	test-auc:0.701146+0.00251072
[19]	train-auc:0.704654+0.000304292	test-auc:0.701418+0.00251942
[20]	train-auc:0.704848+0.000307176	test-auc:0.701607+0.00246845
[21]	train-auc:0.705033+0.000301679	test-auc:0.701733+0.00247376
[22]	train-auc:0.705164+0.000277508	test-auc:0.701817+0.00246814
[23]	train-auc:0.705357+0.000318212	test-auc:0.702014+0.00242917
[24]	train-auc:0.705485+0.000340432	test-auc:0.702114+0.00242802
[25]	train-auc:0.705559+0.000335568	test-auc:0.702159+0.00244245
[26]	train-auc:0.705647+0.000316686	test-auc:0.702234+0.00243973
[27]	train-auc:0.705778+0.000263337	test-auc:0.702327+0.00246393
[28]	train-auc:0.705843+0.000292908	test-auc:0.702355+0.00246958
[29]	train-auc:0.705871+0.000279555	test-auc:0.702375+0.00248088
[30]	train-auc:0.705925+0.000285021	test-auc:0.702414+0.00247065
[31]	train-auc:0.705965+0.000317821	test-auc:0.702441+0.00245234
[32]	train-auc:0.706062+0.000344069	test-auc:0.702479+0.00246428
[33]	train-auc:0.706138+0.000397608	test-auc:0.702531+0.00239998
[34]	train-auc:0.706243+0.000447135	test-auc:0.702622+0.0023271
    train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0         0.662685       0.000402       0.661907      0.003057
1         0.677973       0.006704       0.676975      0.007925
2         0.685254       0.004749       0.684042      0.006721
3         0.689560       0.003147       0.688081      0.004779
4         0.693140       0.001629       0.691414      0.003689
5         0.695315       0.001176       0.693456      0.003302
6         0.696675       0.001142       0.694666      0.003422
7         0.698043       0.000753       0.695879      0.002960
8         0.699155       0.000649       0.696825      0.002654
9         0.699989       0.000558       0.697586      0.002637
10        0.700691       0.000536       0.698215      0.002615
11        0.701423       0.000442       0.698836      0.002596
12        0.701975       0.000392       0.699226      0.002587
13        0.702518       0.000451       0.699672      0.002502
14        0.703067       0.000390       0.700111      0.002580
15        0.703472       0.000410       0.700410      0.002639
16        0.703822       0.000386       0.700723      0.002601
17        0.704133       0.000388       0.701007      0.002530
18        0.704334       0.000392       0.701146      0.002511
19        0.704654       0.000304       0.701418      0.002519
20        0.704848       0.000307       0.701607      0.002468
21        0.705033       0.000302       0.701733      0.002474
22        0.705164       0.000278       0.701817      0.002468
23        0.705357       0.000318       0.702014      0.002429
24        0.705485       0.000340       0.702114      0.002428
25        0.705559       0.000336       0.702159      0.002442
26        0.705647       0.000317       0.702234      0.002440
27        0.705778       0.000263       0.702327      0.002464
28        0.705843       0.000293       0.702355      0.002470
29        0.705871       0.000280       0.702375      0.002481
30        0.705925       0.000285       0.702414      0.002471
31        0.705965       0.000318       0.702441      0.002452
32        0.706062       0.000344       0.702479      0.002464
33        0.706138       0.000398       0.702531      0.002400
34        0.706243       0.000447       0.702622      0.002327
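The test AUC is still creeping up slowly at round 35. To go from cross-validation to a usable model, one option is to pick the round with the best mean test AUC and fit a single booster on the full training matrix. The sketch below is not part of the original run; it only reuses the `params`, `dtrain`, and `gbm` objects defined above.
In [ ]:
# pick the boosting round with the highest mean test AUC from the CV table
best_round = int(gbm['test-auc-mean'].idxmax()) + 1
print('best round:', best_round, 'mean test AUC:', gbm['test-auc-mean'].max())

# fit one final model on all of the training data with that many rounds
final_model = xgb.train(params, dtrain, num_boost_round=best_round)

# rank features by total gain to see which ones drive the splits
importance = final_model.get_score(importance_type='gain')
for name, gain in sorted(importance.items(), key=lambda kv: kv[1], reverse=True):
    print(f'{name}: {gain:.1f}')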
In [ ]: