Kaggle M5 Competition Part 2 - Modeling
5. Feature Engineering
# Dictionaries to map category codes back to their original labels
# (used later to name the per-store models and to build the submission ids)
d_id = dict(zip(df.id.cat.codes, df.id))
d_item_id = dict(zip(df.item_id.cat.codes, df.item_id))
d_dept_id = dict(zip(df.dept_id.cat.codes, df.dept_id))
d_cat_id = dict(zip(df.cat_id.cat.codes, df.cat_id))
d_store_id = dict(zip(df.store_id.cat.codes, df.store_id))
d_state_id = dict(zip(df.state_id.cat.codes, df.state_id))
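These dictionaries simply invert pandas' category encoding, so integer codes can be turned back into readable labels after the conversion below. A toy illustration of the pattern (the store labels here are made up):

import pandas as pd

s = pd.Series(['CA_1', 'CA_2', 'CA_1'], dtype='category')
decode = dict(zip(s.cat.codes, s))
print(decode)  # {0: 'CA_1', 1: 'CA_2'}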
gc.collect()

# Turn the day label 'd_1', 'd_2', ... into its integer part
df['d'] = df['d'].apply(lambda x: x.split('_')[1]).astype(np.int16)

# Replace every categorical column with its integer codes to save memory
cols = df.dtypes.index.tolist()
types = df.dtypes.values.tolist()
for i, col_type in enumerate(types):
    if col_type.name == 'category':
        df[cols[i]] = df[cols[i]].cat.codes

df.drop('date', axis=1, inplace=True)
# Lag features: the same series' sales 1 to 36 days earlier
lags = [1, 2, 3, 6, 12, 24, 36]
for lag in lags:
    df['sold_lag_' + str(lag)] = df.groupby(
        ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
        as_index=False)['sold'].shift(lag).astype(np.float16)
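Because the shift is applied within each group, a lag never leaks across series boundaries. A minimal sketch with made-up numbers:

import pandas as pd

toy = pd.DataFrame({'id': ['x', 'x', 'x', 'y', 'y'], 'sold': [1, 2, 3, 10, 20]})
# Each series gets its own NaN at the start; series 'y' never sees values from 'x'
print(toy.groupby('id')['sold'].shift(1).tolist())  # [nan, 1.0, 2.0, nan, 10.0]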
%%time
# Mean encodings: average sales at every useful aggregation level
df['item_sold_avg'] = df.groupby('item_id')['sold'].transform('mean').astype(np.float16)
df['state_sold_avg'] = df.groupby('state_id')['sold'].transform('mean').astype(np.float16)
df['store_sold_avg'] = df.groupby('store_id')['sold'].transform('mean').astype(np.float16)
df['cat_sold_avg'] = df.groupby('cat_id')['sold'].transform('mean').astype(np.float16)
df['dept_sold_avg'] = df.groupby('dept_id')['sold'].transform('mean').astype(np.float16)
df['cat_dept_sold_avg'] = df.groupby(['cat_id', 'dept_id'])['sold'].transform('mean').astype(np.float16)
df['store_item_sold_avg'] = df.groupby(['store_id', 'item_id'])['sold'].transform('mean').astype(np.float16)
df['cat_item_sold_avg'] = df.groupby(['cat_id', 'item_id'])['sold'].transform('mean').astype(np.float16)
df['dept_item_sold_avg'] = df.groupby(['dept_id', 'item_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_sold_avg'] = df.groupby(['state_id', 'store_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_cat_sold_avg'] = df.groupby(['state_id', 'store_id', 'cat_id'])['sold'].transform('mean').astype(np.float16)
df['store_cat_dept_sold_avg'] = df.groupby(['store_id', 'cat_id', 'dept_id'])['sold'].transform('mean').astype(np.float16)
Wall time: 1 ms
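Unlike agg, transform('mean') broadcasts each group's mean back onto every row of that group, which is what makes these usable as per-row features. A quick illustration with made-up values:

import pandas as pd

toy = pd.DataFrame({'cat_id': ['a', 'a', 'b'], 'sold': [1, 3, 5]})
# The group mean is repeated for every member row instead of collapsing the group
print(toy.groupby('cat_id')['sold'].transform('mean').tolist())  # [2.0, 2.0, 5.0]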
# 7-day rolling mean of each series' sales
# (grouping by 'id' alone would suffice; the extra keys do not change the groups)
df['rolling_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'] \
    .transform(lambda x: x.rolling(window=7).mean()).astype(np.float16)
# Expanding mean of each series' sales, using all history up to each day (min. 2 observations)
df['expanding_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'] \
    .transform(lambda x: x.expanding(2).mean()).astype(np.float16)
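The difference between the two windows is easiest to see on a toy series (values made up): a rolling mean uses a fixed trailing window, while an expanding mean uses all history so far.

import pandas as pd

s = pd.Series([2, 4, 6, 8])
print(s.rolling(window=2).mean().tolist())  # [nan, 3.0, 5.0, 7.0]  (fixed 2-step window)
print(s.expanding(2).mean().tolist())       # [nan, 3.0, 4.0, 5.0]  (growing window, min 2 points)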
# Selling trend: how far each day's sales sit above or below the series' overall average
# (each (id, d) pair is a single row, so 'daily_avg_sold' is effectively that day's sales)
df['daily_avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd'])['sold'].transform('mean').astype(np.float16)
df['avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform('mean').astype(np.float16)
df['selling_trend'] = (df['daily_avg_sold'] - df['avg_sold']).astype(np.float16)
df.drop(['daily_avg_sold', 'avg_sold'], axis=1, inplace=True)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 58967660 entries, 1067150 to 60034809
Data columns (total 43 columns):
id int16
item_id int16
dept_id int8
cat_id int8
store_id int8
state_id int8
d int16
sold int16
wm_yr_wk int16
weekday int8
wday int8
month int8
year int16
event_name_1 int8
event_type_1 int8
event_name_2 int8
event_type_2 int8
snap_CA int8
snap_TX int8
snap_WI int8
sell_price float16
sold_lag_1 float16
sold_lag_2 float16
sold_lag_3 float16
sold_lag_6 float16
sold_lag_12 float16
sold_lag_24 float16
sold_lag_36 float16
item_sold_avg float16
state_sold_avg float16
store_sold_avg float16
cat_sold_avg float16
dept_sold_avg float16
cat_dept_sold_avg float16
store_item_sold_avg float16
cat_item_sold_avg float16
dept_item_sold_avg float16
state_store_sold_avg float16
state_store_cat_sold_avg float16
store_cat_dept_sold_avg float16
rolling_sold_mean float16
expanding_sold_mean float16
selling_trend float16
dtypes: float16(23), int16(6), int8(14)
memory usage: 4.4 GB
# Save the engineered dataset and free memory before modeling
df.to_pickle('data.pkl')
del df
gc.collect()
6. Modeling and Prediction
%%time
data = pd.read_pickle('data.pkl')
# Days 1914-1941 are the validation horizon, days 1942-1969 the evaluation horizon
valid = data[(data['d'] >= 1914) & (data['d'] < 1942)][['id', 'd', 'sold']]
test = data[data['d'] >= 1942][['id', 'd', 'sold']]
eval_preds = test['sold']
valid_preds = valid['sold']
Wall time: 0 ns
# Train one LightGBM model per store
stores = sales.store_id.cat.codes.unique().tolist()
for store in stores:
    df = data[data['store_id'] == store]

    # Split by day: train < 1914, validation 1914-1941, evaluation >= 1942
    X_train, y_train = df[df['d'] < 1914].drop('sold', axis=1), df[df['d'] < 1914]['sold']
    X_valid, y_valid = df[(df['d'] >= 1914) & (df['d'] < 1942)].drop('sold', axis=1), \
                       df[(df['d'] >= 1914) & (df['d'] < 1942)]['sold']
    X_test = df[df['d'] >= 1942].drop('sold', axis=1)

    model = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=8,
        num_leaves=50,
        min_child_weight=300
    )
    print('*****Prediction for Store: {}*****'.format(d_store_id[store]))
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric='rmse',
              verbose=20,
              early_stopping_rounds=20)
    valid_preds[X_valid.index] = model.predict(X_valid)
    eval_preds[X_test.index] = model.predict(X_test)

    # Persist each store's model for later reuse
    filename = 'model' + str(d_store_id[store]) + '.pkl'
    joblib.dump(model, filename)

    del model, X_train, y_train, X_valid, y_valid
    gc.collect()
*****Prediction for Store: CA_1*****
Training until validation scores don't improve for 20 rounds
[20] training's rmse: 0.843923 training's l2: 0.712206 valid_1's rmse: 0.556612 valid_1's l2: 0.309817
[40] training's rmse: 0.805702 training's l2: 0.649156 valid_1's rmse: 0.536648 valid_1's l2: 0.287992
[60] training's rmse: 0.782521 training's l2: 0.612339 valid_1's rmse: 0.529075 valid_1's l2: 0.27992
[80] training's rmse: 0.765509 training's l2: 0.586004 valid_1's rmse: 0.519001 valid_1's l2: 0.269362
[100] training's rmse: 0.746824 training's l2: 0.557746 valid_1's rmse: 0.516391 valid_1's l2: 0.26666
[120] training's rmse: 0.736669 training's l2: 0.542682 valid_1's rmse: 0.512239 valid_1's l2: 0.262389
[140] training's rmse: 0.725183 training's l2: 0.52589 valid_1's rmse: 0.507517 valid_1's l2: 0.257574
[160] training's rmse: 0.71879 training's l2: 0.516659 valid_1's rmse: 0.503054 valid_1's l2: 0.253063
[180] training's rmse: 0.713246 training's l2: 0.508719 valid_1's rmse: 0.501668 valid_1's l2: 0.25167
Early stopping, best iteration is:
[177] training's rmse: 0.713815 training's l2: 0.509531 valid_1's rmse: 0.501194 valid_1's l2: 0.251195
*****Prediction for Store: CA_2*****
Training until validation scores don't improve for 20 rounds
[20] training's rmse: 0.509193 training's l2: 0.259277 valid_1's rmse: 0.488679 valid_1's l2: 0.238808
[40] training's rmse: 0.476985 training's l2: 0.227515 valid_1's rmse: 0.481392 valid_1's l2: 0.231738
[60] training's rmse: 0.459124 training's l2: 0.210795 valid_1's rmse: 0.469844 valid_1's l2: 0.220753
[80] training's rmse: 0.446454 training's l2: 0.199321 valid_1's rmse: 0.466131 valid_1's l2: 0.217278
[100] training's rmse: 0.44062 training's l2: 0.194146 valid_1's rmse: 0.465138 valid_1's l2: 0.216353
[120] training's rmse: 0.435579 training's l2: 0.189729 valid_1's rmse: 0.462275 valid_1's l2: 0.213698
[140] training's rmse: 0.433312 training's l2: 0.187759 valid_1's rmse: 0.46174 valid_1's l2: 0.213204
[160] training's rmse: 0.430487 training's l2: 0.185319 valid_1's rmse: 0.461825 valid_1's l2: 0.213283
Early stopping, best iteration is:
[149] training's rmse: 0.431706 training's l2: 0.18637 valid_1's rmse: 0.461223 valid_1's l2: 0.212727
*****Prediction for Store: CA_3*****
Training until validation scores don't improve for 20 rounds
[20] training's rmse: 1.31768 training's l2: 1.73629 valid_1's rmse: 0.620532 valid_1's l2: 0.38506
[40] training's rmse: 1.25016 training's l2: 1.56289 valid_1's rmse: 0.599518 valid_1's l2: 0.359422
[60] training's rmse: 1.21357 training's l2: 1.47275 valid_1's rmse: 0.583401 valid_1's l2: 0.340357
[80] training's rmse: 1.18962 training's l2: 1.41519 valid_1's rmse: 0.580415 valid_1's l2: 0.336882
[100] training's rmse: 1.16704 training's l2: 1.36198 valid_1's rmse: 0.573824 valid_1's l2: 0.329274
Early stopping, best iteration is:
[83] training's rmse: 1.18341 training's l2: 1.40046 valid_1's rmse: 0.571149 valid_1's l2: 0.326211
*****Prediction for Store: CA_4*****
Training until validation scores don't improve for 20 rounds
[20] training's rmse: 0.379545 training's l2: 0.144055 valid_1's rmse: 0.306421 valid_1's l2: 0.0938936
[40] training's rmse: 0.362723 training's l2: 0.131568 valid_1's rmse: 0.296737 valid_1's l2: 0.0880528
[60] training's rmse: 0.352526 training's l2: 0.124275 valid_1's rmse: 0.286469 valid_1's l2: 0.0820644
[80] training's rmse: 0.347152 training's l2: 0.120515 valid_1's rmse: 0.283419 valid_1's l2: 0.0803261
[100] training's rmse: 0.342128 training's l2: 0.117052 valid_1's rmse: 0.279012 valid_1's l2: 0.0778477
[120] training's rmse: 0.339248 training's l2: 0.115089 valid_1's rmse: 0.27756 valid_1's l2: 0.0770398
[140] training's rmse: 0.336076 training's l2: 0.112947 valid_1's rmse: 0.27745 valid_1's l2: 0.0769786
Early stopping, best iteration is:
[129] training's rmse: 0.337326 training's l2: 0.113789 valid_1's rmse: 0.276789 valid_1's l2: 0.0766123
*****Prediction for Store: TX_1*****
Training until validation scores don't improve for 20 rounds
[20] training's rmse: 0.779231 training's l2: 0.607202 valid_1's rmse: 0.495078 valid_1's l2: 0.245102
[40] training's rmse: 0.734945 training's l2: 0.540143 valid_1's rmse: 0.477927 valid_1's l2: 0.228414
[60] training's rmse: 0.715 training's l2: 0.511225 valid_1's rmse: 0.474993 valid_1's l2: 0.225618
[80] training's rmse: 0.700945 training's l2: 0.491324 valid_1's rmse: 0.471686 valid_1's l2: 0.222487
[100] training's rmse: 0.688138 training's l2: 0.473534 valid_1's rmse: 0.469721 valid_1's l2: 0.220638
[120] training's rmse: 0.671506 training's l2: 0.45092 valid_1's rmse: 0.468799 valid_1's l2: 0.219772
Early stopping, best iteration is:
[111] training's rmse: 0.678168 training's l2: 0.459912 valid_1's rmse: 0.466017 valid_1's l2: 0.217172
*****Prediction for Store: TX_2*****
Training until validation scores don't improve for 20 rounds
[20] training's rmse: 0.949797 training's l2: 0.902115 valid_1's rmse: 0.519843 valid_1's l2: 0.270237
[40] training's rmse: 0.901254 training's l2: 0.812259 valid_1's rmse: 0.50753 valid_1's l2: 0.257587
[60] training's rmse: 0.860935 training's l2: 0.741208 valid_1's rmse: 0.496691 valid_1's l2: 0.246702
[80] training's rmse: 0.837279 training's l2: 0.701036 valid_1's rmse: 0.500869 valid_1's l2: 0.25087
Early stopping, best iteration is:
[60] training's rmse: 0.860935 training's l2: 0.741208 valid_1's rmse: 0.496691 valid_1's l2: 0.246702
*****Prediction for Store: TX_3*****
Training until validation scores don't improve for 20 rounds
[20] training's rmse: 0.741642 training's l2: 0.550033 valid_1's rmse: 0.569192 valid_1's l2: 0.323979
[40] training's rmse: 0.71047 training's l2: 0.504767 valid_1's rmse: 0.557032 valid_1's l2: 0.310284
[60] training's rmse: 0.68682 training's l2: 0.471721 valid_1's rmse: 0.546532 valid_1's l2: 0.298697
[80] training's rmse: 0.672727 training's l2: 0.452562 valid_1's rmse: 0.541006 valid_1's l2: 0.292688
[100] training's rmse: 0.66163 training's l2: 0.437754 valid_1's rmse: 0.539347 valid_1's l2: 0.290895
[120] training's rmse: 0.650395 training's l2: 0.423014 valid_1's rmse: 0.534985 valid_1's l2: 0.286208
[140] training's rmse: 0.645165 training's l2: 0.416238 valid_1's rmse: 0.532259 valid_1's l2: 0.2833
Early stopping, best iteration is:
[132] training's rmse: 0.646645 training's l2: 0.418149 valid_1's rmse: 0.531403 valid_1's l2: 0.28239
*****Prediction for Store: WI_1*****
Training until validation scores don't improve for 20 rounds
[20] training's rmse: 0.40387 training's l2: 0.163111 valid_1's rmse: 0.351971 valid_1's l2: 0.123884
[40] training's rmse: 0.379547 training's l2: 0.144056 valid_1's rmse: 0.339714 valid_1's l2: 0.115405
[60] training's rmse: 0.370228 training's l2: 0.137069 valid_1's rmse: 0.338534 valid_1's l2: 0.114605
[80] training's rmse: 0.362681 training's l2: 0.131537 valid_1's rmse: 0.335793 valid_1's l2: 0.112757
Early stopping, best iteration is:
[75] training's rmse: 0.363574 training's l2: 0.132186 valid_1's rmse: 0.335287 valid_1's l2: 0.112418
*****Prediction for Store: WI_2*****
Training until validation scores don't improve for 20 rounds
[20] training's rmse: 0.798844 training's l2: 0.638151 valid_1's rmse: 0.99757 valid_1's l2: 0.995147
[40] training's rmse: 0.75986 training's l2: 0.577388 valid_1's rmse: 0.979328 valid_1's l2: 0.959084
[60] training's rmse: 0.729671 training's l2: 0.53242 valid_1's rmse: 0.968394 valid_1's l2: 0.937787
Early stopping, best iteration is:
[57] training's rmse: 0.732588 training's l2: 0.536685 valid_1's rmse: 0.967836 valid_1's l2: 0.936707
*****Prediction for Store: WI_3*****
Training until validation scores don't improve for 20 rounds
[20] training's rmse: 0.803068 training's l2: 0.644919 valid_1's rmse: 0.580289 valid_1's l2: 0.336735
[40] training's rmse: 0.762335 training's l2: 0.581154 valid_1's rmse: 0.573159 valid_1's l2: 0.328512
[60] training's rmse: 0.739142 training's l2: 0.546331 valid_1's rmse: 0.566164 valid_1's l2: 0.320541
Early stopping, best iteration is:
[51] training's rmse: 0.748455 training's l2: 0.560184 valid_1's rmse: 0.563976 valid_1's l2: 0.318069
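One caveat if you rerun this today: LightGBM 4.x removed the verbose and early_stopping_rounds arguments from fit() in favor of callbacks. A sketch of the equivalent fit call inside the loop, assuming lightgbm >= 4.0:

import lightgbm as lgb

model.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_valid, y_valid)],
          eval_metric='rmse',
          callbacks=[lgb.early_stopping(stopping_rounds=20),  # stop when valid RMSE stalls for 20 rounds
                     lgb.log_evaluation(period=20)])          # print metrics every 20 iterations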
# Build the submission: a 'validation' block (days 1914-1941) and an 'evaluation' block (days 1942-1969)
actual = False
if not actual:
    # The validation-period actuals were released during the competition,
    # so we can submit the true sales for that block instead of predictions
    validation = sales[['id'] + ['d_' + str(i) for i in range(1914, 1942)]]
    validation['id'] = pd.read_csv('C:\\Eric\\Projects\\Kaggle_M5\\Dataset\\sales_train_validation.csv').id
    validation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
else:
    # Otherwise, pivot the model's validation predictions into the wide F1..F28 layout
    valid['sold'] = valid_preds
    validation = valid[['id', 'd', 'sold']]
    validation = pd.pivot(validation, index='id', columns='d', values='sold').reset_index()
    validation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
    validation.id = validation.id.map(d_id).str.replace('evaluation', 'validation')

# The evaluation block always comes from the model's predictions
test['sold'] = eval_preds
evaluation = test[['id', 'd', 'sold']]
evaluation = pd.pivot(evaluation, index='id', columns='d', values='sold').reset_index()
evaluation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
evaluation.id = evaluation.id.map(d_id)

submit = pd.concat([validation, evaluation]).reset_index(drop=True)
submit.to_csv('M5_submission.csv', index=False)
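Before uploading, a cheap sanity check on the file layout can save a rejected submission. A sketch; the row count assumes the full M5 dataset (30,490 series, so 60,980 rows across the two blocks):

# One row per series per block, with an id column plus F1..F28
assert submit.shape == (60980, 29)
assert submit['id'].is_unique
assert not submit.isna().any().any()  # every forecast cell should be filled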