Kaggle M5 Competition Part 2 - Modeling

5. Feature Engineering

#5.1 Label Encoding 
# Keep mappings from the category codes back to the original id, item, department, category, store, and state values
d_id = dict(zip(df.id.cat.codes, df.id))
d_item_id = dict(zip(df.item_id.cat.codes, df.item_id))
d_dept_id = dict(zip(df.dept_id.cat.codes, df.dept_id))
d_cat_id = dict(zip(df.cat_id.cat.codes, df.cat_id))
d_store_id = dict(zip(df.store_id.cat.codes, df.store_id))
d_state_id = dict(zip(df.state_id.cat.codes, df.state_id))
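
These dictionaries are only needed later to translate the integer codes back into readable labels (per-store model names, submission ids). A minimal round-trip sketch, with hypothetical example values, assuming the codes come from the same categorical columns as above:

import pandas as pd

# Hypothetical check: a store code decodes back to its original label (not part of the pipeline)
store_code = df.store_id.cat.codes.iloc[0]   # e.g. 0
print(d_store_id[store_code])                # e.g. 'CA_1'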
#1 Free memory before the conversions
gc.collect()

#2 Convert 'd' from 'd_xxx' strings to integers, and replace every categorical column with its integer codes
df.d = df['d'].apply(lambda x: x.split('_')[1]).astype(np.int16)
cols = df.dtypes.index.tolist()
types = df.dtypes.values.tolist()
for i, t in enumerate(types):
    if t.name == 'category':
        df[cols[i]] = df[cols[i]].cat.codes

#3 Drop the raw date column; d and the calendar columns already carry this information
df.drop('date', axis=1, inplace=True)
import time
#5.2 Introduce Lags
# Add lag columns for the sales quantity
lags = [1, 2, 3, 6, 12, 24, 36]
for lag in lags:
    df['sold_lag_'+str(lag)] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], as_index=False)['sold'].shift(lag).astype(np.float16)
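
shift(lag) within each group simply moves that series down by lag days, so the first lag rows of every id become NaN. A toy sketch with hypothetical values (not part of the pipeline) of what sold_lag_1 looks like for one series:

import pandas as pd

toy = pd.DataFrame({'id': ['A']*4, 'sold': [3, 0, 2, 5]})
toy['sold_lag_1'] = toy.groupby('id')['sold'].shift(1)
print(toy)
#   id  sold  sold_lag_1
# 0  A     3         NaN
# 1  A     0         3.0
# 2  A     2         0.0
# 3  A     5         2.0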
#5.3 Mean Encoding 
%time
# Create mean-sales columns per item, state, store, category, department, and their combinations
df['iteam_sold_avg'] = df.groupby('item_id')['sold'].transform('mean').astype(np.float16)
df['state_sold_avg'] = df.groupby('state_id')['sold'].transform('mean').astype(np.float16)
df['store_sold_avg'] = df.groupby('store_id')['sold'].transform('mean').astype(np.float16)
df['cat_sold_avg'] = df.groupby('cat_id')['sold'].transform('mean').astype(np.float16)
df['dept_sold_avg'] = df.groupby('dept_id')['sold'].transform('mean').astype(np.float16)
df['cat_dept_sold_avg'] = df.groupby(['cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
df['store_item_sold_avg'] = df.groupby(['store_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['cat_item_sold_avg'] = df.groupby(['cat_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['dept_item_sold_avg'] = df.groupby(['dept_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_sold_avg'] = df.groupby(['state_id','store_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_cat_sold_avg'] = df.groupby(['state_id','store_id','cat_id'])['sold'].transform('mean').astype(np.float16)
df['store_cat_dept_sold_avg'] = df.groupby(['store_id','cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
Wall time: 1 ms
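
transform('mean') computes the group mean and broadcasts it back onto every row of the group, so the result has the same length as df and can be assigned directly as a new column. A toy sketch with hypothetical values (not part of the pipeline):

import pandas as pd

toy = pd.DataFrame({'item_id': ['x', 'x', 'y'], 'sold': [2, 4, 10]})
toy['item_sold_avg'] = toy.groupby('item_id')['sold'].transform('mean')
print(toy)
#   item_id  sold  item_sold_avg
# 0       x     2            3.0
# 1       x     4            3.0
# 2       y    10           10.0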
#5.4 Rolling Window Statistics 
df['rolling_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform(lambda x: x.rolling(window=7).mean()).astype(np.float16)
#5.5 Expanding Window Statistics  
df['expanding_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform(lambda x: x.expanding(2).mean()).astype(np.float16)
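
The rolling mean uses a fixed 7-day window (so the first 6 rows of each series are NaN), while expanding(2) is a running mean over everything seen so far, defined from the second observation onward. A toy sketch of both on a single hypothetical series, using a 3-day window for brevity:

import pandas as pd

s = pd.Series([1, 2, 3, 4])
print(s.rolling(window=3).mean().tolist())   # [nan, nan, 2.0, 3.0]
print(s.expanding(2).mean().tolist())        # [nan, 1.5, 2.0, 2.5]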
#5.6 Trends
# Keep the selling trend simple: the gap between the daily group average and the overall average (is the item selling above or below its mean?)
df['daily_avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id','d'])['sold'].transform('mean').astype(np.float16)
df['avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform('mean').astype(np.float16)
df['selling_trend'] = (df['daily_avg_sold'] - df['avg_sold']).astype(np.float16)
df.drop(['daily_avg_sold','avg_sold'],axis=1,inplace=True)
#5.7 Save the data 
# The lag features leave many empty rows up to d 35, so drop that period
df = df[df['d']>=36]
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 58967660 entries, 1067150 to 60034809
Data columns (total 43 columns):
id                          int16
item_id                     int16
dept_id                     int8
cat_id                      int8
store_id                    int8
state_id                    int8
d                           int16
sold                        int16
wm_yr_wk                    int16
weekday                     int8
wday                        int8
month                       int8
year                        int16
event_name_1                int8
event_type_1                int8
event_name_2                int8
event_type_2                int8
snap_CA                     int8
snap_TX                     int8
snap_WI                     int8
sell_price                  float16
sold_lag_1                  float16
sold_lag_2                  float16
sold_lag_3                  float16
sold_lag_6                  float16
sold_lag_12                 float16
sold_lag_24                 float16
sold_lag_36                 float16
iteam_sold_avg              float16
state_sold_avg              float16
store_sold_avg              float16
cat_sold_avg                float16
dept_sold_avg               float16
cat_dept_sold_avg           float16
store_item_sold_avg         float16
cat_item_sold_avg           float16
dept_item_sold_avg          float16
state_store_sold_avg        float16
state_store_cat_sold_avg    float16
store_cat_dept_sold_avg     float16
rolling_sold_mean           float16
expanding_sold_mean         float16
selling_trend               float16
dtypes: float16(23), int16(6), int8(14)
memory usage: 4.4 GB
df.to_pickle('data.pkl')
del df
gc.collect()

6. Modeling and Prediction

import time
%time 
data = pd.read_pickle('data.pkl') # load the data pickled after feature engineering
valid = data[(data['d']>=1914) & (data['d']<1942)][['id','d','sold']] # d 1914-1941: validation period (28 days)
test = data[data['d']>=1942][['id','d','sold']] # d >= 1942: evaluation period (28 days)
eval_preds = test['sold'] # placeholder for evaluation-period predictions
valid_preds = valid['sold'] # placeholder for validation-period predictions
Wall time: 0 ns
from lightgbm import LGBMRegressor
import joblib

# Get the store ids (as codes) and train one model per store
stores = sales.store_id.cat.codes.unique().tolist()
for store in stores:
    df = data[data['store_id']==store]

    # Split the data: train < d 1914, validation d 1914-1941, test d >= 1942
    X_train, y_train = df[df['d']<1914].drop('sold',axis=1), df[df['d']<1914]['sold']
    X_valid, y_valid = df[(df['d']>=1914) & (df['d']<1942)].drop('sold',axis=1), df[(df['d']>=1914) & (df['d']<1942)]['sold']
    X_test = df[df['d']>=1942].drop('sold',axis=1)

    # Train and validate
    model = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=8,
        num_leaves=50,
        min_child_weight=300
    )
    print('*****Prediction for Store: {}*****'.format(d_store_id[store]))
    model.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_valid,y_valid)],
              eval_metric='rmse', verbose=20, early_stopping_rounds=20)

    # Store the predictions for the validation and evaluation periods
    valid_preds[X_valid.index] = model.predict(X_valid)
    eval_preds[X_test.index] = model.predict(X_test)

    # Save the per-store model
    filename = 'model'+str(d_store_id[store])+'.pkl'
    joblib.dump(model, filename)
    del model, X_train, y_train, X_valid, y_valid
    gc.collect()
*****Prediction for Store: CA_1*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.843923    training's l2: 0.712206    valid_1's rmse: 0.556612    valid_1's l2: 0.309817
[40]    training's rmse: 0.805702    training's l2: 0.649156    valid_1's rmse: 0.536648    valid_1's l2: 0.287992
[60]    training's rmse: 0.782521    training's l2: 0.612339    valid_1's rmse: 0.529075    valid_1's l2: 0.27992
[80]    training's rmse: 0.765509    training's l2: 0.586004    valid_1's rmse: 0.519001    valid_1's l2: 0.269362
[100]    training's rmse: 0.746824    training's l2: 0.557746    valid_1's rmse: 0.516391    valid_1's l2: 0.26666
[120]    training's rmse: 0.736669    training's l2: 0.542682    valid_1's rmse: 0.512239    valid_1's l2: 0.262389
[140]    training's rmse: 0.725183    training's l2: 0.52589    valid_1's rmse: 0.507517    valid_1's l2: 0.257574
[160]    training's rmse: 0.71879    training's l2: 0.516659    valid_1's rmse: 0.503054    valid_1's l2: 0.253063
[180]    training's rmse: 0.713246    training's l2: 0.508719    valid_1's rmse: 0.501668    valid_1's l2: 0.25167
Early stopping, best iteration is:
[177]    training's rmse: 0.713815    training's l2: 0.509531    valid_1's rmse: 0.501194    valid_1's l2: 0.251195
*****Prediction for Store: CA_2*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.509193    training's l2: 0.259277    valid_1's rmse: 0.488679    valid_1's l2: 0.238808
[40]    training's rmse: 0.476985    training's l2: 0.227515    valid_1's rmse: 0.481392    valid_1's l2: 0.231738
[60]    training's rmse: 0.459124    training's l2: 0.210795    valid_1's rmse: 0.469844    valid_1's l2: 0.220753
[80]    training's rmse: 0.446454    training's l2: 0.199321    valid_1's rmse: 0.466131    valid_1's l2: 0.217278
[100]    training's rmse: 0.44062    training's l2: 0.194146    valid_1's rmse: 0.465138    valid_1's l2: 0.216353
[120]    training's rmse: 0.435579    training's l2: 0.189729    valid_1's rmse: 0.462275    valid_1's l2: 0.213698
[140]    training's rmse: 0.433312    training's l2: 0.187759    valid_1's rmse: 0.46174    valid_1's l2: 0.213204
[160]    training's rmse: 0.430487    training's l2: 0.185319    valid_1's rmse: 0.461825    valid_1's l2: 0.213283
Early stopping, best iteration is:
[149]    training's rmse: 0.431706    training's l2: 0.18637    valid_1's rmse: 0.461223    valid_1's l2: 0.212727
*****Prediction for Store: CA_3*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 1.31768    training's l2: 1.73629    valid_1's rmse: 0.620532    valid_1's l2: 0.38506
[40]    training's rmse: 1.25016    training's l2: 1.56289    valid_1's rmse: 0.599518    valid_1's l2: 0.359422
[60]    training's rmse: 1.21357    training's l2: 1.47275    valid_1's rmse: 0.583401    valid_1's l2: 0.340357
[80]    training's rmse: 1.18962    training's l2: 1.41519    valid_1's rmse: 0.580415    valid_1's l2: 0.336882
[100]    training's rmse: 1.16704    training's l2: 1.36198    valid_1's rmse: 0.573824    valid_1's l2: 0.329274
Early stopping, best iteration is:
[83]    training's rmse: 1.18341    training's l2: 1.40046    valid_1's rmse: 0.571149    valid_1's l2: 0.326211
*****Prediction for Store: CA_4*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.379545    training's l2: 0.144055    valid_1's rmse: 0.306421    valid_1's l2: 0.0938936
[40]    training's rmse: 0.362723    training's l2: 0.131568    valid_1's rmse: 0.296737    valid_1's l2: 0.0880528
[60]    training's rmse: 0.352526    training's l2: 0.124275    valid_1's rmse: 0.286469    valid_1's l2: 0.0820644
[80]    training's rmse: 0.347152    training's l2: 0.120515    valid_1's rmse: 0.283419    valid_1's l2: 0.0803261
[100]    training's rmse: 0.342128    training's l2: 0.117052    valid_1's rmse: 0.279012    valid_1's l2: 0.0778477
[120]    training's rmse: 0.339248    training's l2: 0.115089    valid_1's rmse: 0.27756    valid_1's l2: 0.0770398
[140]    training's rmse: 0.336076    training's l2: 0.112947    valid_1's rmse: 0.27745    valid_1's l2: 0.0769786
Early stopping, best iteration is:
[129]    training's rmse: 0.337326    training's l2: 0.113789    valid_1's rmse: 0.276789    valid_1's l2: 0.0766123
*****Prediction for Store: TX_1*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.779231    training's l2: 0.607202    valid_1's rmse: 0.495078    valid_1's l2: 0.245102
[40]    training's rmse: 0.734945    training's l2: 0.540143    valid_1's rmse: 0.477927    valid_1's l2: 0.228414
[60]    training's rmse: 0.715    training's l2: 0.511225    valid_1's rmse: 0.474993    valid_1's l2: 0.225618
[80]    training's rmse: 0.700945    training's l2: 0.491324    valid_1's rmse: 0.471686    valid_1's l2: 0.222487
[100]    training's rmse: 0.688138    training's l2: 0.473534    valid_1's rmse: 0.469721    valid_1's l2: 0.220638
[120]    training's rmse: 0.671506    training's l2: 0.45092    valid_1's rmse: 0.468799    valid_1's l2: 0.219772
Early stopping, best iteration is:
[111]    training's rmse: 0.678168    training's l2: 0.459912    valid_1's rmse: 0.466017    valid_1's l2: 0.217172
*****Prediction for Store: TX_2*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.949797    training's l2: 0.902115    valid_1's rmse: 0.519843    valid_1's l2: 0.270237
[40]    training's rmse: 0.901254    training's l2: 0.812259    valid_1's rmse: 0.50753    valid_1's l2: 0.257587
[60]    training's rmse: 0.860935    training's l2: 0.741208    valid_1's rmse: 0.496691    valid_1's l2: 0.246702
[80]    training's rmse: 0.837279    training's l2: 0.701036    valid_1's rmse: 0.500869    valid_1's l2: 0.25087
Early stopping, best iteration is:
[60]    training's rmse: 0.860935    training's l2: 0.741208    valid_1's rmse: 0.496691    valid_1's l2: 0.246702
*****Prediction for Store: TX_3*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.741642    training's l2: 0.550033    valid_1's rmse: 0.569192    valid_1's l2: 0.323979
[40]    training's rmse: 0.71047    training's l2: 0.504767    valid_1's rmse: 0.557032    valid_1's l2: 0.310284
[60]    training's rmse: 0.68682    training's l2: 0.471721    valid_1's rmse: 0.546532    valid_1's l2: 0.298697
[80]    training's rmse: 0.672727    training's l2: 0.452562    valid_1's rmse: 0.541006    valid_1's l2: 0.292688
[100]    training's rmse: 0.66163    training's l2: 0.437754    valid_1's rmse: 0.539347    valid_1's l2: 0.290895
[120]    training's rmse: 0.650395    training's l2: 0.423014    valid_1's rmse: 0.534985    valid_1's l2: 0.286208
[140]    training's rmse: 0.645165    training's l2: 0.416238    valid_1's rmse: 0.532259    valid_1's l2: 0.2833
Early stopping, best iteration is:
[132]    training's rmse: 0.646645    training's l2: 0.418149    valid_1's rmse: 0.531403    valid_1's l2: 0.28239
*****Prediction for Store: WI_1*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.40387    training's l2: 0.163111    valid_1's rmse: 0.351971    valid_1's l2: 0.123884
[40]    training's rmse: 0.379547    training's l2: 0.144056    valid_1's rmse: 0.339714    valid_1's l2: 0.115405
[60]    training's rmse: 0.370228    training's l2: 0.137069    valid_1's rmse: 0.338534    valid_1's l2: 0.114605
[80]    training's rmse: 0.362681    training's l2: 0.131537    valid_1's rmse: 0.335793    valid_1's l2: 0.112757
Early stopping, best iteration is:
[75]    training's rmse: 0.363574    training's l2: 0.132186    valid_1's rmse: 0.335287    valid_1's l2: 0.112418
*****Prediction for Store: WI_2*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.798844    training's l2: 0.638151    valid_1's rmse: 0.99757    valid_1's l2: 0.995147
[40]    training's rmse: 0.75986    training's l2: 0.577388    valid_1's rmse: 0.979328    valid_1's l2: 0.959084
[60]    training's rmse: 0.729671    training's l2: 0.53242    valid_1's rmse: 0.968394    valid_1's l2: 0.937787
Early stopping, best iteration is:
[57]    training's rmse: 0.732588    training's l2: 0.536685    valid_1's rmse: 0.967836    valid_1's l2: 0.936707
*****Prediction for Store: WI_3*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.803068    training's l2: 0.644919    valid_1's rmse: 0.580289    valid_1's l2: 0.336735
[40]    training's rmse: 0.762335    training's l2: 0.581154    valid_1's rmse: 0.573159    valid_1's l2: 0.328512
[60]    training's rmse: 0.739142    training's l2: 0.546331    valid_1's rmse: 0.566164    valid_1's l2: 0.320541
Early stopping, best iteration is:
[51]    training's rmse: 0.748455    training's l2: 0.560184    valid_1's rmse: 0.563976    valid_1's l2: 0.318069
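
Each store's fitted model is also written to disk with joblib, so it can be reloaded later without retraining. A minimal sketch, assuming the file names produced by the loop above (e.g. 'modelCA_1.pkl') exist in the working directory:

import joblib

# Reload one per-store model (sketch); X_test would be rebuilt for store CA_1 as above
model_ca1 = joblib.load('modelCA_1.pkl')
# preds = model_ca1.predict(X_test)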

# The actual validation data (d 1914-1941) is released one month before the competition
# ends, so decide whether to submit those released values or the validation predictions
# generated from the training data alone.
actual = False
if not actual:
    validation = sales[['id']+['d_' + str(i) for i in range(1914,1942)]]
    validation['id'] = pd.read_csv('C:\\Eric\\Projects\\Kaggle_M5\\Dataset\\sales_train_validation.csv').id
    validation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
else:
    valid['sold'] = valid_preds
    validation = valid[['id','d','sold']]
    validation = pd.pivot(validation, index='id', columns='d', values='sold').reset_index()
    validation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
    validation.id = validation.id.map(d_id).str.replace('evaluation','validation')

# Build the prediction (evaluation-period) data
test['sold'] = eval_preds
evaluation = test[['id','d','sold']]
evaluation = pd.pivot(evaluation, index='id', columns='d', values='sold').reset_index()
evaluation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
evaluation.id = evaluation.id.map(d_id)

# Create the submission file
submit = pd.concat([validation,evaluation]).reset_index(drop=True)
submit.to_csv('M5_submission.csv',index=False)
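
Before uploading, it is worth checking that the file matches the expected layout (60,980 rows: 30,490 validation ids plus 30,490 evaluation ids, each with columns F1 to F28). A minimal sketch, assuming the competition's sample_submission.csv is available locally:

import pandas as pd

# Sanity check against the provided sample submission (sketch)
sample = pd.read_csv('sample_submission.csv')
submit = pd.read_csv('M5_submission.csv')
assert submit.shape == sample.shape        # (60980, 29)
assert set(submit.id) == set(sample.id)    # same validation/evaluation ids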

Author: Eric Park
Posted on 2020-09-21 · Updated on 2020-09-22