Kaggle M5 Competition Part 1 - EDA

This is a sales forecasting competition hosted by the US retailer Wal-Mart.

#ref:
https://mofc.unic.ac.cy/m5-competition/
https://www.kaggle.com/c/m5-forecasting-accuracy

# The code below is adapted from modeling work by Kaggle Grandmaster Rob Mulla.

Competition overview:
The goal of M5 is to forecast daily sales for the next 28 days, and to estimate their distribution, using hierarchical sales data provided by Walmart.
The data includes explanatory variables such as price, promotions, day of the week, and special events.

Datasets:
calendar.csv - information about the dates on which the products are sold.
sales_train_validation.csv - historical daily unit sales per product and store [d_1 - d_1913].
sample_submission.csv - the submission format.
sell_prices.csv - the price of the products sold, per store and week.
sales_train_evaluation.csv - daily unit sales including the validation period [d_1 - d_1941].

import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings
warnings.filterwarnings('ignore')
from lightgbm import LGBMRegressor
import joblib

1. Fetch the data

sales = pd.read_csv('C:\\Eric\\Projects\\Kaggle_M5\\Dataset\\sales_train_evaluation.csv')
sales.name = 'sales'
calendar = pd.read_csv('C:\\Eric\\Projects\\Kaggle_M5\\Dataset\\calendar.csv')
calendar.name = 'calendar'
prices = pd.read_csv('C:\\Eric\\Projects\\Kaggle_M5\\Dataset\\sell_prices.csv')
prices.name = 'prices'
sales.columns
Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'd_1932', 'd_1933', 'd_1934', 'd_1935', 'd_1936', 'd_1937', 'd_1938',
       'd_1939', 'd_1940', 'd_1941'],
      dtype='object', length=1947)
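
Beyond sales.columns, a quick glance at the shape and leading columns of all three frames helps confirm the data loaded as expected (a minimal sketch, assuming the loads above):

# Sketch: inspect the shape and first few columns of each input frame
for name, frame in [('sales', sales), ('calendar', calendar), ('prices', prices)]:
    print(name, frame.shape, list(frame.columns[:6]))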
# Add the missing d_1942 ~ d_1969 columns (the forecast horizon) and fill them with 0
for d in range(1942, 1970):
    col = 'd_' + str(d)
    sales[col] = 0
    sales[col] = sales[col].astype(np.int16)

2. Downcasting

# The raw datasets are large, so we need to reduce their memory footprint.
sales_bd = np.round(sales.memory_usage().sum()/(1024*1024),1)
calendar_bd = np.round(calendar.memory_usage().sum()/(1024*1024),1)
prices_bd = np.round(prices.memory_usage().sum()/(1024*1024),1)
# Memory downcasting, adapted from a Kaggle memory-reduction kernel:
# shrink each numeric column to the smallest dtype that still holds its value range,
# parse the date column, and convert remaining object columns to category.
def downcast(df):
    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()
    for i, t in enumerate(types):
        if 'int' in str(t):
            if df[cols[i]].min() > np.iinfo(np.int8).min and df[cols[i]].max() < np.iinfo(np.int8).max:
                df[cols[i]] = df[cols[i]].astype(np.int8)
            elif df[cols[i]].min() > np.iinfo(np.int16).min and df[cols[i]].max() < np.iinfo(np.int16).max:
                df[cols[i]] = df[cols[i]].astype(np.int16)
            elif df[cols[i]].min() > np.iinfo(np.int32).min and df[cols[i]].max() < np.iinfo(np.int32).max:
                df[cols[i]] = df[cols[i]].astype(np.int32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.int64)
        elif 'float' in str(t):
            if df[cols[i]].min() > np.finfo(np.float16).min and df[cols[i]].max() < np.finfo(np.float16).max:
                df[cols[i]] = df[cols[i]].astype(np.float16)
            elif df[cols[i]].min() > np.finfo(np.float32).min and df[cols[i]].max() < np.finfo(np.float32).max:
                df[cols[i]] = df[cols[i]].astype(np.float32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.float64)
        elif t == object:  # np.object is deprecated; plain object matches pandas object dtype
            if cols[i] == 'date':
                df[cols[i]] = pd.to_datetime(df[cols[i]], format='%Y-%m-%d')
            else:
                df[cols[i]] = df[cols[i]].astype('category')
    return df
sales = downcast(sales)
prices = downcast(prices)
calendar = downcast(calendar)
# Check memory usage after downcasting.
sales_ad = np.round(sales.memory_usage().sum()/(1024*1024),1)
calendar_ad = np.round(calendar.memory_usage().sum()/(1024*1024),1)
prices_ad = np.round(prices.memory_usage().sum()/(1024*1024),1)
# Visualize how much downcasting reduced each DataFrame's memory usage: it drops to less than a quarter of the original size.
dic = {'DataFrame': ['sales', 'calendar', 'prices'],
       'Before downcasting': [sales_bd, calendar_bd, prices_bd],
       'After downcasting': [sales_ad, calendar_ad, prices_ad]}

memory = pd.DataFrame(dic)
memory = pd.melt(memory, id_vars='DataFrame', var_name='Status', value_name='Memory (MB)')
memory.sort_values('Memory (MB)',inplace=True)
fig = px.bar(memory, x='DataFrame', y='Memory (MB)', color='Status', barmode='group', text='Memory (MB)')
fig.update_traces(texttemplate='%{text} MB', textposition='outside')
fig.update_layout(template='seaborn', title='Effect of Downcasting')
fig.show()

3. Exploratory Data Analysis

The sales data provided by Walmart is organized wrt (with respect to) the following columns:
State: CA, WI, TX (3)
Store: CA_1, CA_2, TX_1, WI_1, ... (10)
Category: FOODS, HOBBIES, HOUSEHOLD (3)
Department: FOODS_1,2,3, HOBBIES_1,2, ... (7)
item_id: each unique item id (3,049)
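
These cardinalities can be checked directly against the sales frame (a minimal sketch):

# Sketch: verify the number of unique values at each level of the hierarchy
# (expected: 3 states, 10 stores, 3 categories, 7 departments, 3,049 items)
for col in ['state_id', 'store_id', 'cat_id', 'dept_id', 'item_id']:
    print(col, sales[col].nunique())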
# Use a plotly express treemap to visualize the hierarchy columns as a directory, with the item count per group as the value.

group = sales.groupby(['state_id','store_id','cat_id','dept_id'],as_index=False)['item_id'].count().dropna()
group['USA'] = 'United States of America'
group.rename(columns={'state_id':'State','store_id':'Store','cat_id':'Category','dept_id':'Department','item_id':'Count'},inplace=True)
fig = px.treemap(group, path=['USA', 'State', 'Store', 'Category', 'Department'], values='Count',
                 color='Count',
                 color_continuous_scale=px.colors.sequential.Sunset,
                 title='Walmart: Distribution of items')
fig.update_layout(template='seaborn')
fig.show()

4. Melting the data

#4.1 Convert from wide to long format

To fit a machine-learning-friendly format, the wide sales DataFrame has to be converted to long format. The sales dataset has 30,490 rows (3,049 items x 10 stores); after melting it has 30,490 x 1,969 = 60,034,810 rows, which are then merged with calendar and prices.
df = pd.melt(sales, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='d', value_name='sold').dropna()
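
As a quick sanity check on the reshape arithmetic above (a minimal sketch, using the frames as defined in this post):

# Sketch: the long frame should have (number of item-store series) x (number of day columns) rows
n_series = sales.shape[0]        # 30,490 item-store series
n_days = sales.shape[1] - 6      # 1,969 day columns after the 6 id columns
assert df.shape[0] == n_series * n_days   # 60,034,810 rows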
df = pd.merge(df, calendar, on='d', how='left')
df = pd.merge(df, prices, on=['store_id','item_id','wm_yr_wk'], how='left')
# Visualize the distribution of total daily items sold per store with a violin plot.
group = df.groupby(['year','date','state_id','store_id'], as_index=False)['sold'].sum().dropna()
fig = px.violin(group, x='store_id', color='state_id', y='sold',box=True)
fig.update_xaxes(title_text='Store')
fig.update_yaxes(title_text='Total items sold')
fig.update_layout(template='seaborn',title='Distribution of Items sold wrt Stores',legend_title_text='State')
fig.show()

Kaggle M5 Competition Part 2 - Modeling

5. Feature Engineering

#5.1 Label Encoding 
# Build lookup tables mapping the category codes of id, item, department, category, store, and state back to their labels
d_id = dict(zip(df.id.cat.codes, df.id))
d_item_id = dict(zip(df.item_id.cat.codes, df.item_id))
d_dept_id = dict(zip(df.dept_id.cat.codes, df.dept_id))
d_cat_id = dict(zip(df.cat_id.cat.codes, df.cat_id))
d_store_id = dict(zip(df.store_id.cat.codes, df.store_id))
d_state_id = dict(zip(df.state_id.cat.codes, df.state_id))
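
These dictionaries are used later (for example to name the per-store model files) to map the integer codes back to readable labels; a minimal sketch of the lookup:

# Sketch: recover the original store label from its integer category code
code = df['store_id'].cat.codes.iloc[0]   # an integer code, e.g. 0
print(code, '->', d_store_id[code])       # e.g. 0 -> 'CA_1'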
# 1. Free up memory
gc.collect()

# 2. Strip the 'd_' prefix from the day column and replace every categorical column with its integer codes
df.d = df['d'].apply(lambda x: x.split('_')[1]).astype(np.int16)
cols = df.dtypes.index.tolist()
types = df.dtypes.values.tolist()
for i, t in enumerate(types):
    if t.name == 'category':
        df[cols[i]] = df[cols[i]].cat.codes

# 3. Drop the raw date column; its information is captured by d, wday, month, and year
df.drop('date', axis=1, inplace=True)
import time
#5.2 introduce lags
# Add lag features: units sold 1, 2, 3, 6, 12, 24, and 36 days earlier for each item-store series
lags = [1,2,3,6,12,24,36]
for lag in lags:
    df['sold_lag_'+str(lag)] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],as_index=False)['sold'].shift(lag).astype(np.float16)
#5.3 Mean Encoding 
%time
# Mean encoding: average units sold wrt item, state, store, category, department, and their combinations
df['iteam_sold_avg'] = df.groupby('item_id')['sold'].transform('mean').astype(np.float16)
df['state_sold_avg'] = df.groupby('state_id')['sold'].transform('mean').astype(np.float16)
df['store_sold_avg'] = df.groupby('store_id')['sold'].transform('mean').astype(np.float16)
df['cat_sold_avg'] = df.groupby('cat_id')['sold'].transform('mean').astype(np.float16)
df['dept_sold_avg'] = df.groupby('dept_id')['sold'].transform('mean').astype(np.float16)
df['cat_dept_sold_avg'] = df.groupby(['cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
df['store_item_sold_avg'] = df.groupby(['store_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['cat_item_sold_avg'] = df.groupby(['cat_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['dept_item_sold_avg'] = df.groupby(['dept_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_sold_avg'] = df.groupby(['state_id','store_id'])['sold'].transform('mean').astype(np.float16)
df['state_store_cat_sold_avg'] = df.groupby(['state_id','store_id','cat_id'])['sold'].transform('mean').astype(np.float16)
df['store_cat_dept_sold_avg'] = df.groupby(['store_id','cat_id','dept_id'])['sold'].transform('mean').astype(np.float16)
Wall time: 1 ms
#5.4 Rolling Window Statistics 
# 7-day rolling mean of units sold for each item-store series
df['rolling_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform(lambda x: x.rolling(window=7).mean()).astype(np.float16)
#5.5 Expanding Window Statistics  
# Expanding (cumulative) mean of units sold for each item-store series, starting from the second observation
df['expanding_sold_mean'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform(lambda x: x.expanding(2).mean()).astype(np.float16)
#5.6 Trends
# Selling trend: a simple feature that only captures whether sales on a given day are above or below the series average.
df['daily_avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id','d'])['sold'].transform('mean').astype(np.float16)
df['avg_sold'] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].transform('mean').astype(np.float16)
df['selling_trend'] = (df['daily_avg_sold'] - df['avg_sold']).astype(np.float16)
df.drop(['daily_avg_sold','avg_sold'],axis=1,inplace=True)
#5.7 Save the data 
# The lag features leave many empty rows up through d_35, so exclude that period.
df = df[df['d']>=36]
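
To see how much missingness remains in the engineered feature columns after dropping the early days (a minimal sketch):

# Sketch: fraction of NaNs left in a few of the lag/rolling features
print(df[['sold_lag_36', 'rolling_sold_mean', 'expanding_sold_mean']].isna().mean())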
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 58967660 entries, 1067150 to 60034809
Data columns (total 43 columns):
id                          int16
item_id                     int16
dept_id                     int8
cat_id                      int8
store_id                    int8
state_id                    int8
d                           int16
sold                        int16
wm_yr_wk                    int16
weekday                     int8
wday                        int8
month                       int8
year                        int16
event_name_1                int8
event_type_1                int8
event_name_2                int8
event_type_2                int8
snap_CA                     int8
snap_TX                     int8
snap_WI                     int8
sell_price                  float16
sold_lag_1                  float16
sold_lag_2                  float16
sold_lag_3                  float16
sold_lag_6                  float16
sold_lag_12                 float16
sold_lag_24                 float16
sold_lag_36                 float16
iteam_sold_avg              float16
state_sold_avg              float16
store_sold_avg              float16
cat_sold_avg                float16
dept_sold_avg               float16
cat_dept_sold_avg           float16
store_item_sold_avg         float16
cat_item_sold_avg           float16
dept_item_sold_avg          float16
state_store_sold_avg        float16
state_store_cat_sold_avg    float16
store_cat_dept_sold_avg     float16
rolling_sold_mean           float16
expanding_sold_mean         float16
selling_trend               float16
dtypes: float16(23), int16(6), int8(14)
memory usage: 4.4 GB
# Save the engineered dataset to a pickle and free memory
df.to_pickle('data.pkl')
del df
gc.collect()

6. Modeling and Prediction

import time
%time
data = pd.read_pickle('data.pkl')  # Load the feature-engineered data saved as a pickle
valid = data[(data['d']>=1914) & (data['d']<1942)][['id','d','sold']]  # d_1914 ~ d_1941: validation period
test = data[data['d']>=1942][['id','d','sold']]  # d >= 1942: evaluation period
eval_preds = test['sold']    # placeholder for the evaluation-period predictions
valid_preds = valid['sold']  # placeholder for the validation-period predictions
Wall time: 0 ns
#Get the store ids
stores = sales.store_id.cat.codes.unique().tolist()
for store in stores:  # run prediction separately for each store
    df = data[data['store_id']==store]

    #Split the data: d < 1914 train, 1914-1941 validation, >= 1942 evaluation
    X_train, y_train = df[df['d']<1914].drop('sold',axis=1), df[df['d']<1914]['sold']
    X_valid, y_valid = df[(df['d']>=1914) & (df['d']<1942)].drop('sold',axis=1), df[(df['d']>=1914) & (df['d']<1942)]['sold']
    X_test = df[df['d']>=1942].drop('sold',axis=1)

    #Train and validate
    model = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=8,
        num_leaves=50,
        min_child_weight=300
    )
    print('*****Prediction for Store: {}*****'.format(d_store_id[store]))
    model.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_valid,y_valid)],
              eval_metric='rmse', verbose=20, early_stopping_rounds=20)

    valid_preds[X_valid.index] = model.predict(X_valid)
    eval_preds[X_test.index] = model.predict(X_test)

    # save the per-store model
    filename = 'model'+str(d_store_id[store])+'.pkl'
    joblib.dump(model, filename)
    del model, X_train, y_train, X_valid, y_valid
    gc.collect()
*****Prediction for Store: CA_1*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.843923    training's l2: 0.712206    valid_1's rmse: 0.556612    valid_1's l2: 0.309817
[40]    training's rmse: 0.805702    training's l2: 0.649156    valid_1's rmse: 0.536648    valid_1's l2: 0.287992
[60]    training's rmse: 0.782521    training's l2: 0.612339    valid_1's rmse: 0.529075    valid_1's l2: 0.27992
[80]    training's rmse: 0.765509    training's l2: 0.586004    valid_1's rmse: 0.519001    valid_1's l2: 0.269362
[100]    training's rmse: 0.746824    training's l2: 0.557746    valid_1's rmse: 0.516391    valid_1's l2: 0.26666
[120]    training's rmse: 0.736669    training's l2: 0.542682    valid_1's rmse: 0.512239    valid_1's l2: 0.262389
[140]    training's rmse: 0.725183    training's l2: 0.52589    valid_1's rmse: 0.507517    valid_1's l2: 0.257574
[160]    training's rmse: 0.71879    training's l2: 0.516659    valid_1's rmse: 0.503054    valid_1's l2: 0.253063
[180]    training's rmse: 0.713246    training's l2: 0.508719    valid_1's rmse: 0.501668    valid_1's l2: 0.25167
Early stopping, best iteration is:
[177]    training's rmse: 0.713815    training's l2: 0.509531    valid_1's rmse: 0.501194    valid_1's l2: 0.251195
*****Prediction for Store: CA_2*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.509193    training's l2: 0.259277    valid_1's rmse: 0.488679    valid_1's l2: 0.238808
[40]    training's rmse: 0.476985    training's l2: 0.227515    valid_1's rmse: 0.481392    valid_1's l2: 0.231738
[60]    training's rmse: 0.459124    training's l2: 0.210795    valid_1's rmse: 0.469844    valid_1's l2: 0.220753
[80]    training's rmse: 0.446454    training's l2: 0.199321    valid_1's rmse: 0.466131    valid_1's l2: 0.217278
[100]    training's rmse: 0.44062    training's l2: 0.194146    valid_1's rmse: 0.465138    valid_1's l2: 0.216353
[120]    training's rmse: 0.435579    training's l2: 0.189729    valid_1's rmse: 0.462275    valid_1's l2: 0.213698
[140]    training's rmse: 0.433312    training's l2: 0.187759    valid_1's rmse: 0.46174    valid_1's l2: 0.213204
[160]    training's rmse: 0.430487    training's l2: 0.185319    valid_1's rmse: 0.461825    valid_1's l2: 0.213283
Early stopping, best iteration is:
[149]    training's rmse: 0.431706    training's l2: 0.18637    valid_1's rmse: 0.461223    valid_1's l2: 0.212727
*****Prediction for Store: CA_3*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 1.31768    training's l2: 1.73629    valid_1's rmse: 0.620532    valid_1's l2: 0.38506
[40]    training's rmse: 1.25016    training's l2: 1.56289    valid_1's rmse: 0.599518    valid_1's l2: 0.359422
[60]    training's rmse: 1.21357    training's l2: 1.47275    valid_1's rmse: 0.583401    valid_1's l2: 0.340357
[80]    training's rmse: 1.18962    training's l2: 1.41519    valid_1's rmse: 0.580415    valid_1's l2: 0.336882
[100]    training's rmse: 1.16704    training's l2: 1.36198    valid_1's rmse: 0.573824    valid_1's l2: 0.329274
Early stopping, best iteration is:
[83]    training's rmse: 1.18341    training's l2: 1.40046    valid_1's rmse: 0.571149    valid_1's l2: 0.326211
*****Prediction for Store: CA_4*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.379545    training's l2: 0.144055    valid_1's rmse: 0.306421    valid_1's l2: 0.0938936
[40]    training's rmse: 0.362723    training's l2: 0.131568    valid_1's rmse: 0.296737    valid_1's l2: 0.0880528
[60]    training's rmse: 0.352526    training's l2: 0.124275    valid_1's rmse: 0.286469    valid_1's l2: 0.0820644
[80]    training's rmse: 0.347152    training's l2: 0.120515    valid_1's rmse: 0.283419    valid_1's l2: 0.0803261
[100]    training's rmse: 0.342128    training's l2: 0.117052    valid_1's rmse: 0.279012    valid_1's l2: 0.0778477
[120]    training's rmse: 0.339248    training's l2: 0.115089    valid_1's rmse: 0.27756    valid_1's l2: 0.0770398
[140]    training's rmse: 0.336076    training's l2: 0.112947    valid_1's rmse: 0.27745    valid_1's l2: 0.0769786
Early stopping, best iteration is:
[129]    training's rmse: 0.337326    training's l2: 0.113789    valid_1's rmse: 0.276789    valid_1's l2: 0.0766123
*****Prediction for Store: TX_1*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.779231    training's l2: 0.607202    valid_1's rmse: 0.495078    valid_1's l2: 0.245102
[40]    training's rmse: 0.734945    training's l2: 0.540143    valid_1's rmse: 0.477927    valid_1's l2: 0.228414
[60]    training's rmse: 0.715    training's l2: 0.511225    valid_1's rmse: 0.474993    valid_1's l2: 0.225618
[80]    training's rmse: 0.700945    training's l2: 0.491324    valid_1's rmse: 0.471686    valid_1's l2: 0.222487
[100]    training's rmse: 0.688138    training's l2: 0.473534    valid_1's rmse: 0.469721    valid_1's l2: 0.220638
[120]    training's rmse: 0.671506    training's l2: 0.45092    valid_1's rmse: 0.468799    valid_1's l2: 0.219772
Early stopping, best iteration is:
[111]    training's rmse: 0.678168    training's l2: 0.459912    valid_1's rmse: 0.466017    valid_1's l2: 0.217172
*****Prediction for Store: TX_2*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.949797    training's l2: 0.902115    valid_1's rmse: 0.519843    valid_1's l2: 0.270237
[40]    training's rmse: 0.901254    training's l2: 0.812259    valid_1's rmse: 0.50753    valid_1's l2: 0.257587
[60]    training's rmse: 0.860935    training's l2: 0.741208    valid_1's rmse: 0.496691    valid_1's l2: 0.246702
[80]    training's rmse: 0.837279    training's l2: 0.701036    valid_1's rmse: 0.500869    valid_1's l2: 0.25087
Early stopping, best iteration is:
[60]    training's rmse: 0.860935    training's l2: 0.741208    valid_1's rmse: 0.496691    valid_1's l2: 0.246702
*****Prediction for Store: TX_3*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.741642    training's l2: 0.550033    valid_1's rmse: 0.569192    valid_1's l2: 0.323979
[40]    training's rmse: 0.71047    training's l2: 0.504767    valid_1's rmse: 0.557032    valid_1's l2: 0.310284
[60]    training's rmse: 0.68682    training's l2: 0.471721    valid_1's rmse: 0.546532    valid_1's l2: 0.298697
[80]    training's rmse: 0.672727    training's l2: 0.452562    valid_1's rmse: 0.541006    valid_1's l2: 0.292688
[100]    training's rmse: 0.66163    training's l2: 0.437754    valid_1's rmse: 0.539347    valid_1's l2: 0.290895
[120]    training's rmse: 0.650395    training's l2: 0.423014    valid_1's rmse: 0.534985    valid_1's l2: 0.286208
[140]    training's rmse: 0.645165    training's l2: 0.416238    valid_1's rmse: 0.532259    valid_1's l2: 0.2833
Early stopping, best iteration is:
[132]    training's rmse: 0.646645    training's l2: 0.418149    valid_1's rmse: 0.531403    valid_1's l2: 0.28239
*****Prediction for Store: WI_1*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.40387    training's l2: 0.163111    valid_1's rmse: 0.351971    valid_1's l2: 0.123884
[40]    training's rmse: 0.379547    training's l2: 0.144056    valid_1's rmse: 0.339714    valid_1's l2: 0.115405
[60]    training's rmse: 0.370228    training's l2: 0.137069    valid_1's rmse: 0.338534    valid_1's l2: 0.114605
[80]    training's rmse: 0.362681    training's l2: 0.131537    valid_1's rmse: 0.335793    valid_1's l2: 0.112757
Early stopping, best iteration is:
[75]    training's rmse: 0.363574    training's l2: 0.132186    valid_1's rmse: 0.335287    valid_1's l2: 0.112418
*****Prediction for Store: WI_2*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.798844    training's l2: 0.638151    valid_1's rmse: 0.99757    valid_1's l2: 0.995147
[40]    training's rmse: 0.75986    training's l2: 0.577388    valid_1's rmse: 0.979328    valid_1's l2: 0.959084
[60]    training's rmse: 0.729671    training's l2: 0.53242    valid_1's rmse: 0.968394    valid_1's l2: 0.937787
Early stopping, best iteration is:
[57]    training's rmse: 0.732588    training's l2: 0.536685    valid_1's rmse: 0.967836    valid_1's l2: 0.936707
*****Prediction for Store: WI_3*****
Training until validation scores don't improve for 20 rounds
[20]    training's rmse: 0.803068    training's l2: 0.644919    valid_1's rmse: 0.580289    valid_1's l2: 0.336735
[40]    training's rmse: 0.762335    training's l2: 0.581154    valid_1's rmse: 0.573159    valid_1's l2: 0.328512
[60]    training's rmse: 0.739142    training's l2: 0.546331    valid_1's rmse: 0.566164    valid_1's l2: 0.320541
Early stopping, best iteration is:
[51]    training's rmse: 0.748455    training's l2: 0.560184    valid_1's rmse: 0.563976    valid_1's l2: 0.318069

actual = False
if actual == False:
    # The validation-period actuals were released one month before the competition ended.
    # Decide whether to fill the validation rows with those released actuals (this branch)
    # or with the model's own validation predictions (else branch).
    validation = sales[['id']+['d_' + str(i) for i in range(1914,1942)]]
    validation['id'] = pd.read_csv('C:\\Eric\\Projects\\Kaggle_M5\\Dataset\\sales_train_validation.csv').id
    validation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
else:
    valid['sold'] = valid_preds
    validation = valid[['id','d','sold']]
    validation = pd.pivot(validation, index='id', columns='d', values='sold').reset_index()
    validation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
    validation.id = validation.id.map(d_id).str.replace('evaluation','validation')

# Build the evaluation-period predictions
test['sold'] = eval_preds
evaluation = test[['id','d','sold']]
evaluation = pd.pivot(evaluation, index='id', columns='d', values='sold').reset_index()
evaluation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
evaluation.id = evaluation.id.map(d_id)

# Create the submission file
submit = pd.concat([validation,evaluation]).reset_index(drop=True)
submit.to_csv('M5_submission.csv', index=False)
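
Since every per-store model was saved with joblib above, it can be reloaded later for inference without retraining; a minimal sketch (assuming data.pkl and d_store_id from the earlier steps are available):

import joblib
import pandas as pd

# Sketch: reload one per-store model (filenames follow the 'model<store_label>.pkl'
# pattern used in the training loop above)
model_ca1 = joblib.load('modelCA_1.pkl')

# Rebuild the evaluation-period features for that store, as in the training loop
data = pd.read_pickle('data.pkl')
store_code = {v: k for k, v in d_store_id.items()}['CA_1']   # label -> integer code
X_test_ca1 = data[(data['store_id'] == store_code) & (data['d'] >= 1942)].drop('sold', axis=1)

preds = model_ca1.predict(X_test_ca1)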