Data preprocessing
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import pandas as pd
import numpy as np
import xgboost as xgb

# Fix the NumPy RNG seed so later random steps are reproducible.
np.random.seed(1234)

trn = pd.read_csv("data/train_ver2.csv")
tst = pd.read_csv("data/test_ver2.csv")

# ---------------------------------------------------------------------------
# Data preprocessing
# ---------------------------------------------------------------------------

# Keep the product columns separately: every training column from
# position 24 onward.
prods = trn.columns[24:].tolist()

# Replace NaN in the product columns with 0 and store them compactly.
trn[prods] = trn[prods].fillna(0.0).astype(np.int8)

# Remove training rows whose product columns sum to 0 (no product held).
no_prod = trn[prods].sum(axis=1) == 0
trn = trn[~no_prod]

# Give the test frame the same product columns, filled with 0, so that the
# two frames can be stacked and preprocessed together.
for col in prods:
    tst[col] = 0
df = pd.concat([trn, tst], axis=0)

# Names of the columns used as training features.
features = []

# Categorical variables -> integer label encoding via factorize().
categorical_cols = [
    'ind_empleado', 'pais_residencia', 'sexo', 'tiprel_1mes', 'indresi',
    'indext', 'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'nomprov',
    'segmento',
]
for col in categorical_cols:
    # factorize() marks missing values with -1 by default; remap that to the
    # -99 sentinel used throughout this script.  This avoids the
    # `na_sentinel` keyword, which newer pandas releases no longer accept.
    codes, _ = df[col].factorize()
    df[col] = np.where(codes == -1, -99, codes)
features += categorical_cols

# Replace the ' NA' marker in 'age' with -99, then cast to integer.
# Assign the result back instead of calling replace(inplace=True) on the
# column selection, which is not guaranteed to modify `df`.
# int16 rather than int8: int8 tops out at 127, so larger ages would not fit.
df['age'] = df['age'].replace(' NA', -99).astype(np.int16)

# TODO: the original snippet elides the cleaning of 'antiguedad', 'renta'
# and 'indrel_1mes' ("same process" as 'age').  Those columns are listed as
# features below and still need equivalent handling before training.

# Add the numerical variables to the training feature list.
features += [
    'age', 'antiguedad', 'renta', 'ind_nuevo', 'indrel', 'indrel_1mes',
    'ind_actividad_cliente',
]