In [1]:
!conda install -y -c conda-forge xgboost
Solving environment: done


## Package Plan ##

  environment location: /home/ec2-user/anaconda3/envs/tensorflow_p36

  added / updated specs: 
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    libcurl-7.69.1             |       hf7181ac_0         573 KB  conda-forge
    py-xgboost-1.0.2           |   py36h9f0ad1d_1         2.2 MB  conda-forge
    cryptography-2.9.2         |   py36h45558ae_0         613 KB  conda-forge
    xgboost-1.0.2              |   py36h831f99a_1          11 KB  conda-forge
    curl-7.69.1                |       h33f0ec9_0         137 KB  conda-forge
    pycurl-7.43.0.5            |   py36h16ce93b_0          69 KB  conda-forge
    python-3.6.7               |    h381d211_1004        34.5 MB  conda-forge
    pykerberos-1.2.1           |   py36h2afdebe_2          27 KB  conda-forge
    krb5-1.17.1                |       h2fd8d38_0         1.5 MB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    libxgboost-1.0.2           |       he1b5a44_1         2.8 MB  conda-forge
    ------------------------------------------------------------
                                           Total:        44.6 MB

The following NEW packages will be INSTALLED:

    _py-xgboost-mutex: 2.0-cpu_0               conda-forge
    libxgboost:        1.0.2-he1b5a44_1        conda-forge
    py-xgboost:        1.0.2-py36h9f0ad1d_1    conda-forge
    python_abi:        3.6-1_cp36m             conda-forge
    xgboost:           1.0.2-py36h831f99a_1    conda-forge

The following packages will be UPDATED:

    ca-certificates:   2020.1.1-0                          --> 2020.4.5.1-hecc5488_0     conda-forge
    certifi:           2019.11.28-py36_0                   --> 2020.4.5.1-py36h9f0ad1d_0 conda-forge
    cryptography:      2.3.1-py36hc365091_0                --> 2.9.2-py36h45558ae_0      conda-forge
    curl:              7.61.0-h84994c4_0                   --> 7.69.1-h33f0ec9_0         conda-forge
    krb5:              1.14.2-hcdc1b81_6                   --> 1.17.1-h2fd8d38_0         conda-forge
    libcurl:           7.61.0-h1ad7b7a_0                   --> 7.69.1-hf7181ac_0         conda-forge
    libssh2:           1.8.0-h9cfc8f7_4                    --> 1.8.2-h22169c7_2          conda-forge
    openssl:           1.0.2u-h7b6447c_0                   --> 1.1.1g-h516909a_0         conda-forge
    pycurl:            7.43.0.2-py36hb7f436b_0             --> 7.43.0.5-py36h16ce93b_0   conda-forge
    pykerberos:        1.2.1-py36h14c3975_0                --> 1.2.1-py36h2afdebe_2      conda-forge
    python:            3.6.6-h6e4f718_2                    --> 3.6.7-h381d211_1004       conda-forge
    qt:                5.9.6-h8703b6f_2                    --> 5.9.7-h5867ecd_1                     


Downloading and Extracting Packages
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
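A quick sanity check that the new build is actually picked up (my addition, not part of the original run; restart the kernel first if xgboost was already imported):

import xgboost
print(xgboost.__version__)  # should report 1.0.2, matching the install log above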
In [2]:
import datetime as dt
import numpy as np
import pandas as pd 
import xgboost as xgb
import random
import time
from numpy import genfromtxt
In [3]:
# Download the feature file from S3 when running on SageMaker
# (fill in the bucket/prefix placeholders before uncommenting):
# bucket = 'XXX'
# prefix = 'XXX/XXX'
# import boto3
# from sagemaker import get_execution_role
# role = get_execution_role()
# s3 = boto3.client('s3')
# s3.download_file(bucket, 'features.csv', 'features.csv')

data = pd.read_csv('features.csv')
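Before preprocessing, a quick look at what was loaded — a minimal sketch, assuming features.csv fits in memory and contains the Pid, Row, and Resolved columns used below:

print(data.shape)
print(data.columns.tolist())
print(data.head())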
In [4]:
data.drop(data[data.Pid > 57516454].index, inplace=True)  # remove the last 15 days of observations
data.drop(['Pid', 'Row'], axis=1, inplace=True)           # drop identifier columns, which are not features
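A quick post-check (my addition) that the identifiers are gone and the binary target survived the filtering:

assert 'Pid' not in data.columns and 'Row' not in data.columns
print(data['Resolved'].value_counts(normalize=True))  # class balance of the target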
In [5]:
features = list(data.columns.values)
features.remove('Resolved')
params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "eval_metric": "auc",
    "eta": 0.56,
    "tree_method": "exact",
    "max_depth": 20,
    "subsample": 1,
    "colsample_bytree": 0.5,
    "verbosity": 0,          # "silent" is deprecated in XGBoost 1.0; use "verbosity"
    "min_child_weight": 1,
    "nthread": -1,           # the native API uses "nthread", not the sklearn-style "n_jobs"
    "gamma": 15,
    "num_parallel_tree": 8,  # boosted random forest: 8 trees per boosting round
}
# -1 encodes missing values in the feature matrix
dtrain = xgb.DMatrix(data[features], data['Resolved'], missing=-1)
# xgb.cv returns a DataFrame of per-round train/test metrics, not a trained model
gbm = xgb.cv(params, dtrain, num_boost_round=35, nfold=10, verbose_eval=1)
print(gbm)
[0]	train-auc:0.682154+0.000322335	test-auc:0.680988+0.00252942
[1]	train-auc:0.69168+0.00169488	test-auc:0.690114+0.00246721
[2]	train-auc:0.697172+0.00107793	test-auc:0.695334+0.00243357
[3]	train-auc:0.700677+0.00086672	test-auc:0.698696+0.00258216
[4]	train-auc:0.703303+0.000635645	test-auc:0.701142+0.00262974
[5]	train-auc:0.705391+0.000415623	test-auc:0.703063+0.0025151
[6]	train-auc:0.706851+0.000273995	test-auc:0.704358+0.002427
[7]	train-auc:0.708135+0.000322682	test-auc:0.705356+0.00220353
[8]	train-auc:0.709199+0.0003464	test-auc:0.706342+0.00232223
[9]	train-auc:0.710138+0.000275356	test-auc:0.707086+0.00235856
[10]	train-auc:0.71094+0.000301732	test-auc:0.707706+0.00224876
[11]	train-auc:0.71154+0.000324379	test-auc:0.70813+0.00227818
[12]	train-auc:0.712108+0.0004138	test-auc:0.708494+0.00222826
[13]	train-auc:0.71273+0.000433147	test-auc:0.708929+0.00224433
[14]	train-auc:0.713411+0.000441736	test-auc:0.709495+0.00232511
[15]	train-auc:0.713843+0.000411378	test-auc:0.709814+0.00235225
[16]	train-auc:0.714235+0.000356222	test-auc:0.710178+0.00227136
[17]	train-auc:0.714654+0.000344586	test-auc:0.710465+0.0023427
[18]	train-auc:0.715104+0.000337058	test-auc:0.710821+0.00226592
[19]	train-auc:0.715409+0.000314641	test-auc:0.711017+0.00227906
[20]	train-auc:0.715824+0.000388957	test-auc:0.711325+0.0022089
[21]	train-auc:0.71622+0.000450116	test-auc:0.711598+0.00204566
[22]	train-auc:0.716426+0.000471536	test-auc:0.711743+0.00203462
[23]	train-auc:0.71657+0.000515038	test-auc:0.711867+0.00201331
[24]	train-auc:0.71679+0.000538778	test-auc:0.711989+0.00196111
[25]	train-auc:0.716934+0.000529491	test-auc:0.712086+0.00199815
[26]	train-auc:0.717113+0.000498556	test-auc:0.712203+0.00197488
[27]	train-auc:0.717305+0.000503702	test-auc:0.712323+0.00196269
[28]	train-auc:0.717516+0.000597805	test-auc:0.71249+0.00193788
[29]	train-auc:0.717673+0.000560451	test-auc:0.712611+0.00195121
[30]	train-auc:0.717858+0.000493259	test-auc:0.712724+0.00192662
[31]	train-auc:0.718019+0.000513389	test-auc:0.712819+0.00192664
[32]	train-auc:0.718192+0.000426822	test-auc:0.712966+0.00200415
[33]	train-auc:0.718337+0.000400466	test-auc:0.713021+0.00198646
[34]	train-auc:0.718518+0.0004652	test-auc:0.713121+0.00199394
    train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0         0.682154       0.000322       0.680988      0.002529
1         0.691680       0.001695       0.690114      0.002467
2         0.697172       0.001078       0.695334      0.002434
3         0.700677       0.000867       0.698696      0.002582
4         0.703303       0.000636       0.701142      0.002630
5         0.705391       0.000416       0.703063      0.002515
6         0.706851       0.000274       0.704358      0.002427
7         0.708135       0.000323       0.705356      0.002204
8         0.709199       0.000346       0.706342      0.002322
9         0.710138       0.000275       0.707086      0.002359
10        0.710940       0.000302       0.707706      0.002249
11        0.711540       0.000324       0.708130      0.002278
12        0.712108       0.000414       0.708494      0.002228
13        0.712730       0.000433       0.708929      0.002244
14        0.713411       0.000442       0.709495      0.002325
15        0.713843       0.000411       0.709814      0.002352
16        0.714235       0.000356       0.710178      0.002271
17        0.714654       0.000345       0.710465      0.002343
18        0.715104       0.000337       0.710821      0.002266
19        0.715409       0.000315       0.711017      0.002279
20        0.715824       0.000389       0.711325      0.002209
21        0.716220       0.000450       0.711598      0.002046
22        0.716426       0.000472       0.711743      0.002035
23        0.716570       0.000515       0.711867      0.002013
24        0.716790       0.000539       0.711989      0.001961
25        0.716934       0.000529       0.712086      0.001998
26        0.717113       0.000499       0.712203      0.001975
27        0.717305       0.000504       0.712323      0.001963
28        0.717516       0.000598       0.712490      0.001938
29        0.717673       0.000560       0.712611      0.001951
30        0.717858       0.000493       0.712724      0.001927
31        0.718019       0.000513       0.712819      0.001927
32        0.718192       0.000427       0.712966      0.002004
33        0.718337       0.000400       0.713021      0.001986
34        0.718518       0.000465       0.713121      0.001994
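Test AUC is still creeping up at round 35 (≈0.713) with a small train/test gap. Note that xgb.cv only returns the metrics DataFrame printed above; it does not keep a trained model. To obtain a Booster for prediction or feature importance, one extra training run is needed — a minimal sketch reusing params and dtrain from above (the name final_model is my own):

final_model = xgb.train(params, dtrain, num_boost_round=35)  # same round count the CV was run for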
In [ ]:
import pickle
from operator import itemgetter

# Feature-importance helpers for a single trained Booster
# (not for the CV results above, which are a plain DataFrame)
def create_fp(features):
    # Write an XGBoost feature-map file: index, name, type ('q' = quantitative)
    with open('xgb', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

def get_imp(gbm, features):
    create_fp(features)
    imp = gbm.get_fscore(fmap='xgb')
    imp = sorted(imp.items(), key=itemgetter(1), reverse=True)
    return imp

# Save the cross-validation results for later use
# (note: gbm here is the DataFrame returned by xgb.cv, not a Booster)
pickle.dump(gbm, open("a.pickle.dat", "wb"))
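For completeness, a hedged usage sketch of the helpers above: get_imp needs a trained Booster (e.g. final_model from the earlier sketch, not the CV DataFrame), and the pickled CV results reload with pickle.load:

imp = get_imp(final_model, features)  # [(feature_name, split_count), ...], highest first
with open("a.pickle.dat", "rb") as f:
    cv_results = pickle.load(f)  # the xgb.cv metrics DataFrame saved above
print(cv_results.tail())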