# Install XGBoost (one-time environment setup):
#!conda install -y -c conda-forge xgboost
import datetime as dt
import numpy as np
import pandas as pd
import xgboost as xgb
import random
import time
from numpy import genfromtxt
# Optionally download features.csv from S3 (uncomment and fill in bucket/prefix):
#bucket = 'XXX'
#prefix = 'XXX/XXX'
#import boto3
#import re
#from sagemaker import get_execution_role
#role = get_execution_role()
#s3 = boto3.client('s3')
#s3.download_file(bucket, 'features.csv', 'features.csv')
# Load the pre-computed feature matrix; one row per post, label column 'Resolved'.
data = pd.read_csv('features.csv')

# Posts with Pid above this cutoff fall in the last 15 days of the dump; drop
# them so the 'Resolved' label has had time to settle.
PID_CUTOFF_LAST_15_DAYS = 57516454
data.drop(data[data.Pid > PID_CUTOFF_LAST_15_DAYS].index, inplace=True)

# Identifier columns carry no predictive signal.
data.drop(['Pid', 'Row'], axis=1, inplace=True)

# Ablation run WITHOUT the newly engineered features: drop them all so the
# model trains on the original feature set only.
NEW_FEATURE_COLUMNS = [
    'CodeSnippetsLen', 'Paragraphs', 'isQuestion', 'HasError', 'HasQuote', 'HasList', 'LOC',
    'min_quality', 'max_expert_ratio', 'max_quality', 'max_problem_rate', 'avg_quality',
    'QuestionAge', 'OwnerQuestionsScore', 'OwnerAnswersScore',
    'HadYearling', 'HadNiceAnswer', 'HadNiceQuestion', 'HadStudent',
    'HadAnalytical', 'HadEnthusiast', 'HadCommentator', 'HadAutobiographer', 'HadCurious',
    'HadPopularQuestion', 'HadFamousQuestion', 'HadNotableQuestion', 'HadGoodAnswer', 'HadGreatAnswer',
    'HadNecromancer', 'HadScholar', 'HadCustodian', 'HadEditor', 'HadCritics', 'HadSupporter',
    'HadTumbleweed', 'HadInformed', 'HadTeacher', 'BodySentenceCount',
]
data.drop(NEW_FEATURE_COLUMNS, axis=1, inplace=True)

# Every remaining column except the label is a model feature.
features = list(data.columns.values)
features.remove('Resolved')
# XGBoost native-API training parameters (passed straight to xgb.cv).
params = {
    "objective": "binary:logistic",  # binary classification, probabilistic output
    "booster": "gbtree",
    "eval_metric": "auc",
    "eta": .56,                      # learning rate (unusually high; paired with few rounds)
    "tree_method": 'exact',
    "max_depth": 20,
    "subsample": 1,
    "colsample_bytree": 0.5,
    # 'silent' was removed in XGBoost 1.0; 'verbosity' replaced it (0 = silent).
    "verbosity": 0,
    "min_child_weight": 1,
    # The native (non-sklearn) API spells the thread-count parameter 'nthread',
    # not 'n_jobs'; -1 means "use all available cores".
    "nthread": -1,
    "gamma": 15,
    "num_parallel_tree": 8,          # boosted random forest: 8 trees per round
}
# Assemble the training matrix: all feature columns with 'Resolved' as the
# label; -1 encodes missing values in the CSV.
labels = data['Resolved']
dtrain = xgb.DMatrix(data[features], labels, missing=-1)

# 10-fold cross-validation over 35 boosting rounds; per-round metrics are
# printed as it runs, and the summary DataFrame afterwards.
gbm = xgb.cv(params, dtrain, num_boost_round=35, nfold=10, verbose_eval=1)
print(gbm)