Gradient Boosting Decision Tree


Overview

  • Task: Using the Titanic: Machine Learning from Disaster dataset (see https://www.kaggle.com/c/titanic/data for details), train a decision tree model that predicts whether a passenger survived, given features of categorical, text, numeric, and set types.
  • Input: a labeled train file (../input/titanic/train.csv) and an unlabeled test file (../input/titanic/test.csv)
  • Label: 'Survived' (1 = survived, 0 = did not survive)
  • Features:

| Index | Variable   | Definition                                                                   | Type |
|-------|------------|------------------------------------------------------------------------------|------|
| 0     | 'Pclass'   | Passenger's class (1st, 2nd, or 3rd)                                         | cat  |
| 1     | 'Name'     | Passenger's name                                                             | text |
| 2     | 'Sex'      | Passenger's sex                                                              | cat  |
| 3     | 'Age'      | Passenger's age                                                              | num  |
| 4     | 'SibSp'    | Number of siblings/spouses aboard the Titanic                                | cat  |
| 5     | 'Parch'    | Number of parents/children aboard the Titanic                                | cat  |
| 6     | 'Ticket'   | Ticket number                                                                | text |
| 7     | 'Fare'     | Fare paid for ticket                                                         | num  |
| 8     | 'Cabin'    | Cabin number(s)                                                              | set  |
| 9     | 'Embarked' | Where the passenger boarded (C - Cherbourg, S - Southampton, Q - Queenstown) | cat  |
In [1]:
import numpy as np
In [2]:
# feature type of each column (after dropping PassengerId and Survived):
# 0 Pclass, 1 Name, 2 Sex, 3 Age, 4 SibSp, 5 Parch, 6 Ticket, 7 Fare, 8 Cabin, 9 Embarked
FEAT_TYPE = {0: 'cat', 1: 'text', 2: 'cat', 3: 'num', 4: 'cat', 5: 'cat', 6: 'text', 7: 'num', 8: 'set', 9: 'cat'}
In [3]:
# split a CSV line into fields, ignoring commas inside double-quoted fields
def parse_feat(line):
    in_quotes = False
    j = 0
    feats = []
    for i in range(len(line)):
        if line[i] == '\"':
            in_quotes = not in_quotes
        if line[i] == ',' and not in_quotes:
            feats.append(line[j:i])
            j = i + 1
    return feats + [line[j:]]


# load a CSV file: print the header line, split each data line with parse_feat()
def load_file(file_name):
    data = []
    with open(file_name, 'r') as fin:
        print('field_names:', fin.readline().strip().split(','))
        for line in fin:
            line = line.strip()
            data.append(parse_feat(line))
    return np.array(data)

train_data = load_file('../input/titanic/train.csv')
test_data = load_file('../input/titanic/test.csv')

# train rows: [PassengerId, Survived, features...]; test rows: [PassengerId, features...]
train_id, train_label, train_feat = train_data[:, 0], train_data[:, 1], train_data[:, 2:]
test_id, test_feat = test_data[:, 0], test_data[:, 1:]

# blank out the text fields Name and Ticket (they become the string 'None' in
# the string array); text fields are skipped by the encoders below
train_feat[:, [1, 6]] = None
test_feat[:, [1, 6]] = None

print('train_feat:\n', train_feat[0])
print('test_feat:\n', test_feat[0])
field_names: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
field_names: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
train_feat:
 ['3' 'None' 'male' '22' '1' '0' 'None' '7.25' '' 'S']
test_feat:
 ['3' 'None' 'male' '34.5' '0' '0' 'None' '7.8292' '' 'Q']
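For reference, Python's standard csv module performs the same quote-aware splitting as the hand-rolled parse_feat() above; a minimal alternative loader (a sketch, not used below; note that csv.reader strips the surrounding quotes, whereas parse_feat() keeps them) could be:

import csv

def load_file_csv(file_name):
    # csv.reader handles commas inside quoted fields (e.g. passenger names)
    with open(file_name, 'r') as fin:
        reader = csv.reader(fin)
        print('field_names:', next(reader))
        return np.array(list(reader))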
In [4]:
def get_feat_name(field, feat_val):
    # text fields are never encoded as features
    assert FEAT_TYPE[field] != 'text'
    if FEAT_TYPE[field] == 'cat':
        return str(field) + ':' + feat_val
    elif FEAT_TYPE[field] == 'num':
        return str(field) + ':'
    elif FEAT_TYPE[field] == 'set':
        return [str(field) + ':' + fv for fv in feat_val.split()]

# assign a dense integer id to every distinct feature name in the data
def build_feat_map(data):
    feat_map = {}
    for i in range(len(FEAT_TYPE)):
        if FEAT_TYPE[i] == 'num':
            fn = get_feat_name(i, None)
            if fn not in feat_map:
                feat_map[fn] = len(feat_map)
            continue
        elif FEAT_TYPE[i] == 'text':
            continue
            
        feat = data[:, i]
        for f in feat:
            if FEAT_TYPE[i] == 'cat':
                fn = get_feat_name(i, f)
                if fn not in feat_map:
                    feat_map[fn] = len(feat_map)
            elif FEAT_TYPE[i] == 'set':
                for fn in get_feat_name(i, f):
                    if fn not in feat_map:
                        feat_map[fn] = len(feat_map)
                    
    return feat_map
    
feat_map = build_feat_map(np.vstack([train_feat, test_feat]))
print(feat_map)
print(feat_map.keys())
print(feat_map.values())
{'0:3': 0, '0:1': 1, '0:2': 2, '2:male': 3, '2:female': 4, '3:': 5, '4:1': 6, '4:0': 7, '4:3': 8, '4:4': 9, '4:2': 10, '4:5': 11, '4:8': 12, '5:0': 13, '5:1': 14, '5:2': 15, '5:5': 16, '5:3': 17, '5:4': 18, '5:6': 19, '5:9': 20, '7:': 21, '8:C85': 22, '8:C123': 23, '8:E46': 24, '8:G6': 25, '8:C103': 26, '8:D56': 27, '8:A6': 28, '8:C23': 29, '8:C25': 30, '8:C27': 31, '8:B78': 32, '8:D33': 33, '8:B30': 34, '8:C52': 35, '8:B28': 36, '8:C83': 37, '8:F33': 38, '8:F': 39, '8:G73': 40, '8:E31': 41, '8:A5': 42, '8:D10': 43, '8:D12': 44, '8:D26': 45, '8:C110': 46, '8:B58': 47, '8:B60': 48, '8:E101': 49, '8:E69': 50, '8:D47': 51, '8:B86': 52, '8:F2': 53, '8:C2': 54, '8:E33': 55, '8:B19': 56, '8:A7': 57, '8:C49': 58, '8:F4': 59, '8:A32': 60, '8:B4': 61, '8:B80': 62, '8:A31': 63, '8:D36': 64, '8:D15': 65, '8:C93': 66, '8:C78': 67, '8:D35': 68, '8:C87': 69, '8:B77': 70, '8:E67': 71, '8:B94': 72, '8:C125': 73, '8:C99': 74, '8:C118': 75, '8:D7': 76, '8:A19': 77, '8:B49': 78, '8:D': 79, '8:C22': 80, '8:C26': 81, '8:C106': 82, '8:C65': 83, '8:E36': 84, '8:C54': 85, '8:B57': 86, '8:B59': 87, '8:B63': 88, '8:B66': 89, '8:C7': 90, '8:E34': 91, '8:C32': 92, '8:B18': 93, '8:C124': 94, '8:C91': 95, '8:E40': 96, '8:T': 97, '8:C128': 98, '8:D37': 99, '8:B35': 100, '8:E50': 101, '8:C82': 102, '8:B96': 103, '8:B98': 104, '8:E10': 105, '8:E44': 106, '8:A34': 107, '8:C104': 108, '8:C111': 109, '8:C92': 110, '8:E38': 111, '8:D21': 112, '8:E12': 113, '8:E63': 114, '8:A14': 115, '8:B37': 116, '8:C30': 117, '8:D20': 118, '8:B79': 119, '8:E25': 120, '8:D46': 121, '8:B73': 122, '8:C95': 123, '8:B38': 124, '8:B39': 125, '8:B22': 126, '8:C86': 127, '8:C70': 128, '8:A16': 129, '8:C101': 130, '8:C68': 131, '8:A10': 132, '8:E68': 133, '8:B41': 134, '8:A20': 135, '8:D19': 136, '8:D50': 137, '8:D9': 138, '8:A23': 139, '8:B50': 140, '8:A26': 141, '8:D48': 142, '8:E58': 143, '8:C126': 144, '8:B71': 145, '8:B51': 146, '8:B53': 147, '8:B55': 148, '8:D49': 149, '8:B5': 150, '8:B20': 151, '8:G63': 152, '8:C62': 153, '8:C64': 154, '8:E24': 155, '8:C90': 156, '8:C45': 157, '8:E8': 158, '8:B101': 159, '8:D45': 160, '8:C46': 161, '8:D30': 162, '8:E121': 163, '8:D11': 164, '8:E77': 165, '8:F38': 166, '8:B3': 167, '8:D6': 168, '8:B82': 169, '8:B84': 170, '8:D17': 171, '8:A36': 172, '8:B102': 173, '8:B69': 174, '8:E49': 175, '8:C47': 176, '8:D28': 177, '8:E17': 178, '8:A24': 179, '8:C50': 180, '8:B42': 181, '8:C148': 182, '8:B45': 183, '8:B36': 184, '8:A21': 185, '8:D34': 186, '8:A9': 187, '8:C31': 188, '8:B61': 189, '8:C53': 190, '8:D43': 191, '8:C130': 192, '8:C132': 193, '8:C55': 194, '8:C57': 195, '8:C116': 196, '8:A29': 197, '8:C6': 198, '8:C28': 199, '8:C51': 200, '8:C97': 201, '8:D22': 202, '8:B10': 203, '8:E45': 204, '8:E52': 205, '8:A11': 206, '8:B11': 207, '8:C80': 208, '8:C89': 209, '8:B26': 210, '8:E57': 211, '8:A18': 212, '8:E60': 213, '8:E39': 214, '8:E41': 215, '8:B52': 216, '8:B54': 217, '8:B56': 218, '8:C39': 219, '8:B24': 220, '8:D40': 221, '8:D38': 222, '8:C105': 223, '9:S': 224, '9:C': 225, '9:Q': 226, '9:': 227}
dict_keys(['0:3', '0:1', '0:2', '2:male', '2:female', '3:', '4:1', '4:0', '4:3', '4:4', '4:2', '4:5', '4:8', '5:0', '5:1', '5:2', '5:5', '5:3', '5:4', '5:6', '5:9', '7:', '8:C85', '8:C123', '8:E46', '8:G6', '8:C103', '8:D56', '8:A6', '8:C23', '8:C25', '8:C27', '8:B78', '8:D33', '8:B30', '8:C52', '8:B28', '8:C83', '8:F33', '8:F', '8:G73', '8:E31', '8:A5', '8:D10', '8:D12', '8:D26', '8:C110', '8:B58', '8:B60', '8:E101', '8:E69', '8:D47', '8:B86', '8:F2', '8:C2', '8:E33', '8:B19', '8:A7', '8:C49', '8:F4', '8:A32', '8:B4', '8:B80', '8:A31', '8:D36', '8:D15', '8:C93', '8:C78', '8:D35', '8:C87', '8:B77', '8:E67', '8:B94', '8:C125', '8:C99', '8:C118', '8:D7', '8:A19', '8:B49', '8:D', '8:C22', '8:C26', '8:C106', '8:C65', '8:E36', '8:C54', '8:B57', '8:B59', '8:B63', '8:B66', '8:C7', '8:E34', '8:C32', '8:B18', '8:C124', '8:C91', '8:E40', '8:T', '8:C128', '8:D37', '8:B35', '8:E50', '8:C82', '8:B96', '8:B98', '8:E10', '8:E44', '8:A34', '8:C104', '8:C111', '8:C92', '8:E38', '8:D21', '8:E12', '8:E63', '8:A14', '8:B37', '8:C30', '8:D20', '8:B79', '8:E25', '8:D46', '8:B73', '8:C95', '8:B38', '8:B39', '8:B22', '8:C86', '8:C70', '8:A16', '8:C101', '8:C68', '8:A10', '8:E68', '8:B41', '8:A20', '8:D19', '8:D50', '8:D9', '8:A23', '8:B50', '8:A26', '8:D48', '8:E58', '8:C126', '8:B71', '8:B51', '8:B53', '8:B55', '8:D49', '8:B5', '8:B20', '8:G63', '8:C62', '8:C64', '8:E24', '8:C90', '8:C45', '8:E8', '8:B101', '8:D45', '8:C46', '8:D30', '8:E121', '8:D11', '8:E77', '8:F38', '8:B3', '8:D6', '8:B82', '8:B84', '8:D17', '8:A36', '8:B102', '8:B69', '8:E49', '8:C47', '8:D28', '8:E17', '8:A24', '8:C50', '8:B42', '8:C148', '8:B45', '8:B36', '8:A21', '8:D34', '8:A9', '8:C31', '8:B61', '8:C53', '8:D43', '8:C130', '8:C132', '8:C55', '8:C57', '8:C116', '8:A29', '8:C6', '8:C28', '8:C51', '8:C97', '8:D22', '8:B10', '8:E45', '8:E52', '8:A11', '8:B11', '8:C80', '8:C89', '8:B26', '8:E57', '8:A18', '8:E60', '8:E39', '8:E41', '8:B52', '8:B54', '8:B56', '8:C39', '8:B24', '8:D40', '8:D38', '8:C105', '9:S', '9:C', '9:Q', '9:'])
dict_values([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227])
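A few spot-checks of the naming scheme (hypothetical examples, values taken from the feature table above): a 'cat' field maps to a single 'index:value' key, a 'num' field maps to a single bare 'index:' key, and a 'set' field maps to one key per space-separated token.

print(get_feat_name(2, 'female'))   # 'cat' -> '2:female'
print(get_feat_name(3, '22'))       # 'num' -> '3:'
print(get_feat_name(8, 'C23 C25'))  # 'set' -> ['8:C23', '8:C25']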
In [5]:
# convert a numeric string to float; empty strings (missing values) become -1
def to_float(x):
    if len(x):
        return float(x)
    return -1

def get_feat_id_val(field, feat_val):
    assert FEAT_TYPE[field] != 'text'
    feat_name = get_feat_name(field, feat_val)
    if FEAT_TYPE[field] == 'cat':
        return feat_map[feat_name], 1
    elif FEAT_TYPE[field] == 'num':
        return feat_map[feat_name], to_float(feat_val)
    elif FEAT_TYPE[field] == 'set':
        return [feat_map[fn] for fn in feat_name], [1] * len(feat_name)

# encode each row as a list of (feature_id, value) pairs (libsvm style)
def to_libsvm(data):
    libsvm_data = []
    for d in data:
        libsvm_data.append([])
        for i in range(len(FEAT_TYPE)):
            if FEAT_TYPE[i] == 'cat' or FEAT_TYPE[i] == 'num':
                fv = get_feat_id_val(i, d[i])
                libsvm_data[-1].append(fv)
            elif FEAT_TYPE[i] == 'set':
                fvs = get_feat_id_val(i, d[i])
                for fv in zip(*fvs):
                    libsvm_data[-1].append(fv)
    return libsvm_data

train_data = to_libsvm(train_feat)
test_data = to_libsvm(test_feat)
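To sanity-check the conversion, printing the first encoded row shows a list of (feature_id, value) pairs; 'cat' and 'set' features carry value 1, while 'num' features carry their float value (-1 when missing).

print(train_data[0])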
In [6]:
# random 80/20 split of the labeled data into train and validation sets
np.random.seed(123)
rnd = np.random.random(len(train_data))
train_ind = np.where(rnd < 0.8)[0]
valid_ind = np.where(rnd >= 0.8)[0]

# pad the first line of each file with a zero-valued dummy feature at index
# len(feat_map), so that every file declares the same feature dimension
MAX_FEAT = ' %d:0\n' % len(feat_map)

def write_svm(file_name, indices, data, labels=None):
    # unlabeled rows get the dummy label '0'
    first = True
    with open(file_name, 'w') as fout:
        for i in indices:
            line = labels[i] if labels is not None else '0'
            for fv in data[i]:
                line += ' {}:{}'.format(*fv)
            if first:
                line += MAX_FEAT
                first = False
            else:
                line += '\n'
            fout.write(line)

write_svm('train.svm', train_ind, train_data, train_label)
write_svm('valid.svm', valid_ind, train_data, train_label)
write_svm('test.svm', range(len(test_data)), test_data)
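As a quick check that the files are well-formed libsvm, one can peek at the first line: the label, the id:value pairs, and the trailing padding feature.

with open('train.svm') as fin:
    print(fin.readline().strip())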
In [7]:
import xgboost as xgb
# read in data
dtrain = xgb.DMatrix('train.svm')
dtest = xgb.DMatrix('valid.svm')
[08:55:31] 707x229 matrix with 5138 entries loaded from train.svm
[08:55:31] 184x229 matrix with 1339 entries loaded from valid.svm
In [8]:
# TODO tune params for gbtree
param = {
    # learner params
    'booster': 'gbtree', # gbtree or gblinear
    'nthread': 1, 
    'silent': 1, 
    # tree params
    'eta': 1, 
    'gamma': 0,
    'max_depth': 4, 
    'subsample': 1,
    'lambda': 1,
    'alpha': 0,
    # learning params
    'objective': 'binary:logistic', 
    'eval_metric': 'error', }
evallist = [(dtrain, 'train'), (dtest, 'eval')]
num_round = 100
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=5)

# bst.save_model('xgb.model')
bst.dump_model('xgb.dump.raw.txt')
# bst = xgb.Booster({'nthread': 4})  # init model
# bst.load_model('xgb.model')  # load data
[0]	train-error:0.171146	eval-error:0.206522
Multiple eval metrics have been passed: 'eval-error' will be used for early stopping.

Will train until eval-error hasn't improved in 5 rounds.
[1]	train-error:0.164074	eval-error:0.217391
[2]	train-error:0.155587	eval-error:0.228261
[3]	train-error:0.134371	eval-error:0.217391
[4]	train-error:0.123055	eval-error:0.179348
[5]	train-error:0.130127	eval-error:0.179348
[6]	train-error:0.127298	eval-error:0.179348
[7]	train-error:0.125884	eval-error:0.179348
[8]	train-error:0.113154	eval-error:0.184783
[9]	train-error:0.09901	eval-error:0.173913
[10]	train-error:0.097595	eval-error:0.190217
[11]	train-error:0.091938	eval-error:0.195652
[12]	train-error:0.084866	eval-error:0.157609
[13]	train-error:0.076379	eval-error:0.13587
[14]	train-error:0.076379	eval-error:0.13587
[15]	train-error:0.066478	eval-error:0.157609
[16]	train-error:0.069307	eval-error:0.168478
[17]	train-error:0.063649	eval-error:0.163043
[18]	train-error:0.057992	eval-error:0.168478
Stopping. Best iteration:
[13]	train-error:0.076379	eval-error:0.13587

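The cell above stops at training; to actually score the unlabeled test file, a minimal submission sketch for the xgboost model (assuming this xgboost version exposes best_ntree_limit after early stopping; 'submission_xgb.csv' is a hypothetical output name) could be:

dsubmit = xgb.DMatrix('test.svm')
# predict with the early-stopped number of trees
prob = bst.predict(dsubmit, ntree_limit=bst.best_ntree_limit)
with open('submission_xgb.csv', 'w') as fout:
    fout.write('PassengerId,Survived\n')
    for pid, p in zip(test_id, prob):
        fout.write('%s,%d\n' % (pid, int(p > 0.5)))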
In [9]:
# TODO tune params for gblinear
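One possible (untuned) starting point: with 'booster': 'gblinear' the tree parameters are ignored, and regularization is controlled by lambda (L2) and alpha (L1). A sketch, not a tuned configuration:

param_linear = {
    'booster': 'gblinear',
    'nthread': 1,
    'silent': 1,
    'lambda': 1,   # L2 regularization on weights
    'alpha': 0,    # L1 regularization on weights
    'objective': 'binary:logistic',
    'eval_metric': 'error', }
bst_linear = xgb.train(param_linear, dtrain, num_round, evallist,
                       early_stopping_rounds=5)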
In [10]:
import lightgbm as lgb

dtrain = lgb.Dataset('train.svm')
dtest = lgb.Dataset('valid.svm')
In [11]:
# TODO tune params for gbdt
param = {
    'objective':'binary',
    'boosting': 'gbdt',
    'num_threads': 1,
    'learning_rate': 1,
    'num_leaves': 31, 
    'max_depth': 9,
    'metric': 'binary_error',
    'lambda_l1': 0,
    'lambda_l2': 0,
    }

num_round = 100
bst = lgb.train(param, dtrain, num_round, valid_sets=[dtest], early_stopping_rounds=5)

bst.save_model('model.txt')
# json_model = bst.dump_model()
# bst = lgb.Booster(model_file='model.txt') 
[1]	valid_0's binary_error: 0.211957
Training until validation scores don't improve for 5 rounds
[2]	valid_0's binary_error: 0.211957
[3]	valid_0's binary_error: 0.184783
[4]	valid_0's binary_error: 0.146739
[5]	valid_0's binary_error: 0.163043
[6]	valid_0's binary_error: 0.141304
[7]	valid_0's binary_error: 0.13587
[8]	valid_0's binary_error: 0.152174
[9]	valid_0's binary_error: 0.163043
[10]	valid_0's binary_error: 0.168478
[11]	valid_0's binary_error: 0.157609
[12]	valid_0's binary_error: 0.152174
Early stopping, best iteration is:
[7]	valid_0's binary_error: 0.13587
Out[11]:
<lightgbm.basic.Booster at 0x7fc172570320>
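As with xgboost, producing test predictions is left open here; one sketch (assuming scikit-learn is available to read the libsvm file back into memory, since LightGBM's Booster.predict takes a feature matrix rather than a Dataset; 'submission_lgb.csv' is a hypothetical output name) could be:

from sklearn.datasets import load_svmlight_file

# the max feature index is len(feat_map) (the padding feature), hence +1 columns
X_test, _ = load_svmlight_file('test.svm', n_features=len(feat_map) + 1)
prob = bst.predict(X_test, num_iteration=bst.best_iteration)
with open('submission_lgb.csv', 'w') as fout:
    fout.write('PassengerId,Survived\n')
    for pid, p in zip(test_id, prob):
        fout.write('%s,%d\n' % (pid, int(p > 0.5)))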