
[Python] Time Series Data Analysis - Basic Version

우주먼지의하루 2020. 3. 27. 03:43
Time series
In [68]:
#tistory layout tweak (not needed for the analysis)
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))

import pandas as pd
pd.set_option('display.max_columns',500) #show all columns without truncation

Furniture sales forecasting

In [1]:
import warnings
import itertools # functions and generators for working with iterable data streams
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')
import pandas as pd
import statsmodels.api as sm #Python package for statistical modeling and analysis
import matplotlib

#default chart label/tick sizes and text color
matplotlib.rcParams['axes.labelsize'] = 14
matplotlib.rcParams['xtick.labelsize'] = 12
matplotlib.rcParams['ytick.labelsize'] = 12
matplotlib.rcParams['text.color'] = 'k'
In [2]:
pd.set_option('display.max_columns',500) #show all columns without truncation
In [3]:
df = pd.read_excel("C://Users//82106//Desktop//Sample - Superstore.xls")
In [4]:
df.head()
Out[4]:
Row ID Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment Country City State Postal Code Region Product ID Category Sub-Category Product Name Sales Quantity Discount Profit
0 1 CA-2016-152156 2016-11-08 2016-11-11 Second Class CG-12520 Claire Gute Consumer United States Henderson Kentucky 42420 South FUR-BO-10001798 Furniture Bookcases Bush Somerset Collection Bookcase 261.9600 2 0.00 41.9136
1 2 CA-2016-152156 2016-11-08 2016-11-11 Second Class CG-12520 Claire Gute Consumer United States Henderson Kentucky 42420 South FUR-CH-10000454 Furniture Chairs Hon Deluxe Fabric Upholstered Stacking Chairs,... 731.9400 3 0.00 219.5820
2 3 CA-2016-138688 2016-06-12 2016-06-16 Second Class DV-13045 Darrin Van Huff Corporate United States Los Angeles California 90036 West OFF-LA-10000240 Office Supplies Labels Self-Adhesive Address Labels for Typewriters b... 14.6200 2 0.00 6.8714
3 4 US-2015-108966 2015-10-11 2015-10-18 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale Florida 33311 South FUR-TA-10000577 Furniture Tables Bretford CR4500 Series Slim Rectangular Table 957.5775 5 0.45 -383.0310
4 5 US-2015-108966 2015-10-11 2015-10-18 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale Florida 33311 South OFF-ST-10000760 Office Supplies Storage Eldon Fold 'N Roll Cart System 22.3680 2 0.20 2.5164
In [5]:
#keep only the rows where Category is Furniture
furniture = df.loc[df['Category'] == 'Furniture']
In [6]:
furniture.head()
Out[6]:
Row ID Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment Country City State Postal Code Region Product ID Category Sub-Category Product Name Sales Quantity Discount Profit
0 1 CA-2016-152156 2016-11-08 2016-11-11 Second Class CG-12520 Claire Gute Consumer United States Henderson Kentucky 42420 South FUR-BO-10001798 Furniture Bookcases Bush Somerset Collection Bookcase 261.9600 2 0.00 41.9136
1 2 CA-2016-152156 2016-11-08 2016-11-11 Second Class CG-12520 Claire Gute Consumer United States Henderson Kentucky 42420 South FUR-CH-10000454 Furniture Chairs Hon Deluxe Fabric Upholstered Stacking Chairs,... 731.9400 3 0.00 219.5820
3 4 US-2015-108966 2015-10-11 2015-10-18 Standard Class SO-20335 Sean O'Donnell Consumer United States Fort Lauderdale Florida 33311 South FUR-TA-10000577 Furniture Tables Bretford CR4500 Series Slim Rectangular Table 957.5775 5 0.45 -383.0310
5 6 CA-2014-115812 2014-06-09 2014-06-14 Standard Class BH-11710 Brosina Hoffman Consumer United States Los Angeles California 90032 West FUR-FU-10001487 Furniture Furnishings Eldon Expressions Wood and Plastic Desk Access... 48.8600 7 0.00 14.1694
10 11 CA-2014-115812 2014-06-09 2014-06-14 Standard Class BH-11710 Brosina Hoffman Consumer United States Los Angeles California 90032 West FUR-TA-10001539 Furniture Tables Chromcraft Rectangular Conference Tables 1706.1840 9 0.20 85.3092
In [7]:
furniture['Order Date'].min(), furniture['Order Date'].max()
Out[7]:
(Timestamp('2014-01-06 00:00:00'), Timestamp('2017-12-30 00:00:00'))

Data preprocessing

In [8]:
#drop the columns we don't need
cols = ['Row ID', 'Order ID', 'Ship Date', 'Ship Mode', 'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State', 'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category', 'Product Name', 'Quantity', 'Discount', 'Profit']
furniture.drop(cols, axis=1, inplace=True)
In [9]:
furniture = furniture.sort_values('Order Date')
In [10]:
furniture.head()
Out[10]:
Order Date Sales
7474 2014-01-06 2573.820
7660 2014-01-07 76.728
866 2014-01-10 51.940
716 2014-01-11 9.940
2978 2014-01-13 545.940
In [11]:
#check for missing values
furniture.isnull().sum()
Out[11]:
Order Date    0
Sales         0
dtype: int64
In [12]:
furniture = furniture.groupby('Order Date')['Sales'].sum().reset_index() #sum the sales of orders placed on the same day
In [13]:
furniture.head()
Out[13]:
Order Date Sales
0 2014-01-06 2573.820
1 2014-01-07 76.728
2 2014-01-10 51.940
3 2014-01-11 9.940
4 2014-01-13 879.939
In [14]:
#set Order Date as the index
furniture = furniture.set_index('Order Date')
In [15]:
furniture.head()
Out[15]:
Sales
Order Date
2014-01-06 2573.820
2014-01-07 76.728
2014-01-10 51.940
2014-01-11 9.940
2014-01-13 879.939
In [16]:
furniture.index
Out[16]:
DatetimeIndex(['2014-01-06', '2014-01-07', '2014-01-10', '2014-01-11',
               '2014-01-13', '2014-01-14', '2014-01-16', '2014-01-19',
               '2014-01-20', '2014-01-21',
               ...
               '2017-12-18', '2017-12-19', '2017-12-21', '2017-12-22',
               '2017-12-23', '2017-12-24', '2017-12-25', '2017-12-28',
               '2017-12-29', '2017-12-30'],
              dtype='datetime64[ns]', name='Order Date', length=889, freq=None)

resampling

The resample operation re-bins a time series onto a different time interval. When the interval gets smaller the number of data points grows, which is called up-sampling; when the interval gets larger the number of data points shrinks, which is called down-sampling. https://rfriend.tistory.com/494
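
As a quick reference, here is a minimal sketch (not part of the original notebook) showing both directions on a tiny hypothetical monthly series:

#up- vs. down-sampling on a 3-month toy series
import pandas as pd
s = pd.Series([1.0, 2.0, 3.0], index=pd.date_range('2014-01-01', periods=3, freq='MS'))
print(s.resample('D').ffill().head())  #up-sampling: monthly -> daily (more rows, values forward-filled)
print(s.resample('QS').mean())         #down-sampling: monthly -> quarterly mean (fewer rows)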

In [17]:
#'MS' labels each monthly bin by the first day of the month ('M' would label it by the last day)
y = furniture['Sales'].resample('MS').mean()
In [18]:
#sales in 2017
y['2017':]
Out[18]:
Order Date
2017-01-01     397.602133
2017-02-01     528.179800
2017-03-01     544.672240
2017-04-01     453.297905
2017-05-01     678.302328
2017-06-01     826.460291
2017-07-01     562.524857
2017-08-01     857.881889
2017-09-01    1209.508583
2017-10-01     875.362728
2017-11-01    1277.817759
2017-12-01    1256.298672
Freq: MS, Name: Sales, dtype: float64

Visualizing Furniture Sales Time Series Data

In [19]:
y.plot(figsize = (15,6))
plt.show()

We can confirm a seasonal pattern: sales are always low at the beginning of the year and high at the end of the year.

In [20]:
#set the default figure size
from pylab import rcParams
rcParams['figure.figsize'] = 18, 8
In [21]:
decomposition = sm.tsa.seasonal_decompose(y, model='additive')
fig = decomposition.plot()
plt.show()

Here the series is decomposed with an additive model, which treats the observed values as the sum of trend, seasonal, and residual components. A multiplicative model multiplies the components together instead, and is generally more flexible (more general) than the additive one.
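
For comparison, the same series can be decomposed multiplicatively with one change of argument (a sketch, not run in the original notebook):

#observed = trend * seasonal * resid instead of a sum; valid here because all monthly sales are positive
decomposition_mul = sm.tsa.seasonal_decompose(y, model='multiplicative')
fig = decomposition_mul.plot()
plt.show()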

Time series forecasting with ARIMA

We will apply one of the most widely used methods for time-series forecasting, known as ARIMA (AutoRegressive Integrated Moving Average). ARIMA models are written with the notation ARIMA(p, d, q), where the three parameters account for the autoregressive (p), trend/differencing (d), and noise/moving-average (q) structure of the data. For example, ARIMA(1, 1, 1) differences the series once and models it with one autoregressive and one moving-average term.

In [22]:
p = d = q = range(0, 2) #each of p, d, q takes the values 0 and 1
pdq = list(itertools.product(p, d, q)) #all (p, d, q) combinations
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))] #seasonal combinations with period 12 (monthly data)

print('Examples of parameter combinations for Seasonal ARIMA...')
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[1]))
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[2]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[3]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[4]))
Examples of parameter combinations for Seasonal ARIMA...
SARIMAX: (0, 0, 1) x (0, 0, 1, 12)
SARIMAX: (0, 0, 1) x (0, 1, 0, 12)
SARIMAX: (0, 1, 0) x (0, 1, 1, 12)
SARIMAX: (0, 1, 0) x (1, 0, 0, 12)

Next comes parameter selection for the ARIMA model. The goal here is to use a "grid search" to find the set of parameters that yields the best model performance.

In [23]:
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(y,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit()
            print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))
        except:
            continue
ARIMA(0, 0, 0)x(0, 0, 0, 12)12 - AIC:769.0817523205916
ARIMA(0, 0, 0)x(0, 0, 1, 12)12 - AIC:1446.5593245884702
ARIMA(0, 0, 0)x(0, 1, 0, 12)12 - AIC:477.7170130920218
ARIMA(0, 0, 0)x(1, 0, 0, 12)12 - AIC:497.23144334183365
C:\Users\82106\Anaconda3\lib\site-packages\statsmodels\base\model.py:512: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
ARIMA(0, 0, 0)x(1, 0, 1, 12)12 - AIC:1172.2086741447833
ARIMA(0, 0, 0)x(1, 1, 0, 12)12 - AIC:318.0047199116341
ARIMA(0, 0, 1)x(0, 0, 0, 12)12 - AIC:720.92522707581
ARIMA(0, 0, 1)x(0, 0, 1, 12)12 - AIC:2900.357535652858
ARIMA(0, 0, 1)x(0, 1, 0, 12)12 - AIC:466.56074298091255
ARIMA(0, 0, 1)x(1, 0, 0, 12)12 - AIC:499.5929815918467
ARIMA(0, 0, 1)x(1, 0, 1, 12)12 - AIC:2513.139467221268
ARIMA(0, 0, 1)x(1, 1, 0, 12)12 - AIC:319.9884876946867
ARIMA(0, 1, 0)x(0, 0, 0, 12)12 - AIC:677.894766843944
ARIMA(0, 1, 0)x(0, 0, 1, 12)12 - AIC:1250.256448915547
ARIMA(0, 1, 0)x(0, 1, 0, 12)12 - AIC:486.6378567198382
ARIMA(0, 1, 0)x(1, 0, 0, 12)12 - AIC:497.78896630044073
ARIMA(0, 1, 0)x(1, 0, 1, 12)12 - AIC:1550.2028470024434
ARIMA(0, 1, 0)x(1, 1, 0, 12)12 - AIC:319.7714068109211
ARIMA(0, 1, 1)x(0, 0, 0, 12)12 - AIC:649.9056176817193
ARIMA(0, 1, 1)x(0, 0, 1, 12)12 - AIC:2626.6969858049224
ARIMA(0, 1, 1)x(0, 1, 0, 12)12 - AIC:458.8705548482836
ARIMA(0, 1, 1)x(1, 0, 0, 12)12 - AIC:486.1832977442613
ARIMA(0, 1, 1)x(1, 0, 1, 12)12 - AIC:2500.937327525841
ARIMA(0, 1, 1)x(1, 1, 0, 12)12 - AIC:310.75743684172716
ARIMA(1, 0, 0)x(0, 0, 0, 12)12 - AIC:692.1645522067712
ARIMA(1, 0, 0)x(0, 0, 1, 12)12 - AIC:1442.9450066127697
ARIMA(1, 0, 0)x(0, 1, 0, 12)12 - AIC:479.46321478521355
ARIMA(1, 0, 0)x(1, 0, 0, 12)12 - AIC:480.9259367935204
ARIMA(1, 0, 0)x(1, 0, 1, 12)12 - AIC:1265.3041082790207
ARIMA(1, 0, 0)x(1, 1, 0, 12)12 - AIC:304.46646750845906
ARIMA(1, 0, 1)x(0, 0, 0, 12)12 - AIC:665.7794442186656
ARIMA(1, 0, 1)x(0, 0, 1, 12)12 - AIC:82073.66352065685
ARIMA(1, 0, 1)x(0, 1, 0, 12)12 - AIC:468.3685195815077
ARIMA(1, 0, 1)x(1, 0, 0, 12)12 - AIC:482.57633238767863
ARIMA(1, 0, 1)x(1, 0, 1, 12)12 - AIC:nan
ARIMA(1, 0, 1)x(1, 1, 0, 12)12 - AIC:306.0156002130368
ARIMA(1, 1, 0)x(0, 0, 0, 12)12 - AIC:671.2513547541902
ARIMA(1, 1, 0)x(0, 0, 1, 12)12 - AIC:1388.9536686999006
ARIMA(1, 1, 0)x(0, 1, 0, 12)12 - AIC:479.2003422281134
ARIMA(1, 1, 0)x(1, 0, 0, 12)12 - AIC:475.3403658784957
ARIMA(1, 1, 0)x(1, 0, 1, 12)12 - AIC:1322.3326479713248
ARIMA(1, 1, 0)x(1, 1, 0, 12)12 - AIC:300.627090134543
ARIMA(1, 1, 1)x(0, 0, 0, 12)12 - AIC:649.0318019835554
ARIMA(1, 1, 1)x(0, 0, 1, 12)12 - AIC:101786.44160210453
ARIMA(1, 1, 1)x(0, 1, 0, 12)12 - AIC:460.47626876096086
ARIMA(1, 1, 1)x(1, 0, 0, 12)12 - AIC:469.5250354660887
ARIMA(1, 1, 1)x(1, 0, 1, 12)12 - AIC:2563.267567779531
ARIMA(1, 1, 1)x(1, 1, 0, 12)12 - AIC:297.78754395330014

Among the outputs above, SARIMAX(1, 1, 1)x(1, 1, 0, 12) produced the lowest AIC value, 297.78, so we select that combination as optimal.
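
Rather than scanning the log by eye, the same grid search can track the minimum as it runs; a sketch using the same settings as In [23]:

#keep the lowest-AIC combination while looping over the same grid as above
best_aic, best_param, best_seasonal = float('inf'), None, None
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            res = sm.tsa.statespace.SARIMAX(y, order=param, seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False).fit(disp=False)
            if res.aic < best_aic:
                best_aic, best_param, best_seasonal = res.aic, param, param_seasonal
        except:
            continue
print('Best: ARIMA{}x{} - AIC:{}'.format(best_param, best_seasonal, best_aic)) #expect (1, 1, 1)x(1, 1, 0, 12) per the log above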

Fitting the ARIMA model

In [24]:
mod = sm.tsa.statespace.SARIMAX(y,
                                order=(1, 1, 1),
                                seasonal_order=(1, 1, 0, 12),
                                enforce_stationarity=False,
                                enforce_invertibility=False)
results = mod.fit()
print(results.summary().tables[1])
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.0146      0.342      0.043      0.966      -0.655       0.684
ma.L1         -1.0000      0.360     -2.781      0.005      -1.705      -0.295
ar.S.L12      -0.0253      0.042     -0.609      0.543      -0.107       0.056
sigma2      2.958e+04   1.22e-05   2.43e+09      0.000    2.96e+04    2.96e+04
==============================================================================
In [25]:
results.plot_diagnostics(figsize=(16, 8))
plt.show()

The model residuals are close to normally distributed.
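
Beyond the visual diagnostics, leftover autocorrelation in the residuals can be tested numerically; a sketch, assuming statsmodels 0.11+ for the return_df argument:

#Ljung-Box test: large p-values suggest no significant autocorrelation remains in the residuals
from statsmodels.stats.diagnostic import acorr_ljungbox
print(acorr_ljungbox(results.resid, lags=[12], return_df=True))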

Validating forecasts

To gauge the accuracy of the forecasts, we compare the predicted sales with the actual sales of the time series, setting the forecasts to start at 2017-01-01 and run to the end of the data.

In [26]:
pred = results.get_prediction(start=pd.to_datetime('2017-01-01'), dynamic=False) #dynamic=False: one-step-ahead forecasts, each using all observations up to that point
pred_ci = pred.conf_int() #confidence intervals of the forecasts
ax = y['2014':].plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('Furniture Sales')
plt.legend()
plt.show()
In [27]:
y_forecasted = pred.predicted_mean
y_truth = y['2017-01-01':]
mse = ((y_forecasted - y_truth) ** 2).mean()
print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))
The Mean Squared Error of our forecasts is 22993.57

The mean squared error (MSE) measures the average squared difference between the estimated values and the actual values. As a measure of estimator quality it is always non-negative, and the smaller the MSE is, the closer we are to finding the line of best fit.
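
As a sanity check on the definition, a two-point toy example with hypothetical numbers:

#MSE and RMSE by hand: ((2 - 3)**2 + (7 - 5)**2) / 2 = 2.5, sqrt(2.5) ≈ 1.58
truth = np.array([3.0, 5.0])
estimate = np.array([2.0, 7.0])
mse_toy = ((estimate - truth) ** 2).mean()
print(mse_toy, np.sqrt(mse_toy))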

In [28]:
print('The Root Mean Squared Error of our forecasts is {}'.format(round(np.sqrt(mse), 2)))
The Root Mean Squared Error of our forecasts is 151.64

Producing and visualizing forecasts

In [29]:
pred_uc = results.get_forecast(steps=100) #forecast 100 months beyond the end of the data
pred_ci = pred_uc.conf_int() #confidence intervals of the forecasts
ax = y.plot(label='observed', figsize=(14, 7))
pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.25)
ax.set_xlabel('Date')
ax.set_ylabel('Furniture Sales')
plt.legend()
plt.show()

Time Series of Furniture vs. Office Supplies

According to the data, Office Supplies had far more sales than Furniture.

In [30]:
furniture = df.loc[df['Category'] == 'Furniture']
office = df.loc[df['Category'] == 'Office Supplies']
furniture.shape, office.shape
Out[30]:
((2121, 21), (6026, 21))
In [31]:
cols = ['Row ID', 'Order ID', 'Ship Date', 'Ship Mode', 'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State', 'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category', 'Product Name', 'Quantity', 'Discount', 'Profit']
furniture.drop(cols, axis=1, inplace=True)
office.drop(cols, axis=1, inplace=True)
In [32]:
furniture = furniture.sort_values('Order Date')
office = office.sort_values('Order Date')
In [33]:
furniture = furniture.groupby('Order Date')['Sales'].sum().reset_index()
office = office.groupby('Order Date')['Sales'].sum().reset_index()
In [34]:
furniture = furniture.set_index('Order Date')
office = office.set_index('Order Date')
In [35]:
y_furniture = furniture['Sales'].resample('MS').mean()
y_office = office['Sales'].resample('MS').mean()
In [36]:
furniture = pd.DataFrame({'Order Date':y_furniture.index, 'Sales':y_furniture.values})
office = pd.DataFrame({'Order Date': y_office.index, 'Sales': y_office.values})
In [37]:
furniture.head()
Out[37]:
Order Date Sales
0 2014-01-01 480.194231
1 2014-02-01 367.931600
2 2014-03-01 857.291529
3 2014-04-01 567.488357
4 2014-05-01 432.049188
In [38]:
office.head()
Out[38]:
Order Date Sales
0 2014-01-01 285.357647
1 2014-02-01 63.042588
2 2014-03-01 391.176318
3 2014-04-01 464.794750
4 2014-05-01 324.346545
In [39]:
store = furniture.merge(office, how='inner', on='Order Date')
In [40]:
store.head()
Out[40]:
Order Date Sales_x Sales_y
0 2014-01-01 480.194231 285.357647
1 2014-02-01 367.931600 63.042588
2 2014-03-01 857.291529 391.176318
3 2014-04-01 567.488357 464.794750
4 2014-05-01 432.049188 324.346545
In [41]:
store.rename(columns={'Sales_x': 'furniture_sales', 'Sales_y': 'office_sales'}, inplace=True)
store.head()
Out[41]:
Order Date furniture_sales office_sales
0 2014-01-01 480.194231 285.357647
1 2014-02-01 367.931600 63.042588
2 2014-03-01 857.291529 391.176318
3 2014-04-01 567.488357 464.794750
4 2014-05-01 432.049188 324.346545
In [42]:
plt.figure(figsize=(20, 8))
plt.plot(store['Order Date'], store['furniture_sales'], 'b-', label = 'furniture')
plt.plot(store['Order Date'], store['office_sales'], 'r-', label = 'office supplies')
plt.xlabel('Date'); plt.ylabel('Sales'); plt.title('Sales of Furniture and Office Supplies')
plt.legend()
Out[42]:
<matplotlib.legend.Legend at 0x2079ee6ed48>

We can observe that furniture and office supplies share a similar seasonal pattern: early in the year is the off-season for both categories, and summer appears to be quiet for office supplies as well. In addition, the average daily sales of furniture are higher than those of office supplies in most months, which is understandable since furniture should cost much more than office supplies. Occasionally, though, the average daily sales of office supplies exceeded those of furniture. Let's find out when office supplies first outsold furniture.

In [43]:
first_date = store.loc[np.min(list(np.where(store['office_sales'] > store['furniture_sales'])[0])), 'Order Date'] #.ix was removed from recent pandas; .loc behaves the same here since the index is the default RangeIndex
print("Office supplies first time produced higher sales than furniture is {}.".format(first_date.date()))
Office supplies first time produced higher sales than furniture is 2014-07-01.
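
An equivalent, more idiomatic lookup (a sketch; idxmax on a boolean Series returns the first index where the condition is True):

#first row where office sales exceed furniture sales
first_date = store.loc[(store['office_sales'] > store['furniture_sales']).idxmax(), 'Order Date']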

Time Series Modeling with Prophet

A time-series forecasting library created by Facebook:

  1. Intuitive parameters let you adjust the model without statistical expertise
  2. In typical cases the defaults alone already perform well
  3. There is no need to worry about how it works internally (a minimal interface sketch follows this list)
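
As promised above, a minimal sketch of the workflow on a hypothetical dataframe already in Prophet's required ds/y format (the real fits follow in In [48]):

#Prophet expects a dataframe with a 'ds' (datestamp) column and a 'y' (value) column
from fbprophet import Prophet
m = Prophet()                                            #the defaults are usually a sensible start
m.fit(monthly_sales)                                     #'monthly_sales' is a hypothetical ds/y dataframe
future = m.make_future_dataframe(periods=12, freq='MS')  #extend the frame 12 months past the data
forecast = m.predict(future)                             #adds yhat, yhat_lower, yhat_upper, trend, ...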
In [44]:
! pip install Prophet #note: the library imported below is published on PyPI as 'fbprophet' at this version; 'pip install Prophet' resolves to a different package
Requirement already satisfied: Prophet in c:\users\82106\anaconda3\lib\site-packages (0.1.1)
Requirement already satisfied: pandas>=0.15.1 in c:\users\82106\anaconda3\lib\site-packages (from Prophet) (0.25.1)
Requirement already satisfied: pytz>=2014.9 in c:\users\82106\anaconda3\lib\site-packages (from Prophet) (2019.3)
Requirement already satisfied: six>=1.8.0 in c:\users\82106\anaconda3\lib\site-packages (from Prophet) (1.12.0)
Requirement already satisfied: numpy>=1.13.3 in c:\users\82106\anaconda3\lib\site-packages (from pandas>=0.15.1->Prophet) (1.16.5)
Requirement already satisfied: python-dateutil>=2.6.1 in c:\users\82106\anaconda3\lib\site-packages (from pandas>=0.15.1->Prophet) (2.8.0)
In [45]:
! pip install pystan
Requirement already satisfied: pystan in c:\users\82106\anaconda3\lib\site-packages (2.19.1.1)
Requirement already satisfied: Cython!=0.25.1,>=0.22 in c:\users\82106\anaconda3\lib\site-packages (from pystan) (0.29.13)
Requirement already satisfied: numpy>=1.7 in c:\users\82106\anaconda3\lib\site-packages (from pystan) (1.16.5)
In [46]:
from fbprophet import Prophet
Importing plotly failed. Interactive plots will not work.
In [47]:
furniture.head()
Out[47]:
Order Date Sales
0 2014-01-01 480.194231
1 2014-02-01 367.931600
2 2014-03-01 857.291529
3 2014-04-01 567.488357
4 2014-05-01 432.049188
In [48]:
furniture = furniture.rename(columns={'Order Date': 'ds', 'Sales': 'y'}) #Prophet requires the columns to be named 'ds' and 'y'
furniture_model = Prophet(interval_width=0.95) #95% uncertainty intervals
furniture_model.fit(furniture)
office = office.rename(columns={'Order Date': 'ds', 'Sales': 'y'})
office_model = Prophet(interval_width=0.95)
office_model.fit(office)
furniture_forecast = furniture_model.make_future_dataframe(periods=36, freq='MS') #forecast 36 months ahead
furniture_forecast = furniture_model.predict(furniture_forecast)
office_forecast = office_model.make_future_dataframe(periods=36, freq='MS')
office_forecast = office_model.predict(office_forecast)
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
In [49]:
plt.figure(figsize=(18, 6)) #Prophet's plot() creates its own figure, which is why an empty "<Figure ...>" appears in the output
furniture_model.plot(furniture_forecast, xlabel = 'Date', ylabel = 'Sales')
plt.title('Furniture Sales');
<Figure size 1296x432 with 0 Axes>
In [50]:
plt.figure(figsize=(18, 6))
office_model.plot(office_forecast, xlabel = 'Date', ylabel = 'Sales')
plt.title('Office Supplies Sales');
<Figure size 1296x432 with 0 Axes>

Comparing the forecasts

In [51]:
furniture_names = ['furniture_%s' % column for column in furniture_forecast.columns]
office_names = ['office_%s' % column for column in office_forecast.columns]
In [52]:
furniture_names
Out[52]:
['furniture_ds',
 'furniture_trend',
 'furniture_yhat_lower',
 'furniture_yhat_upper',
 'furniture_trend_lower',
 'furniture_trend_upper',
 'furniture_additive_terms',
 'furniture_additive_terms_lower',
 'furniture_additive_terms_upper',
 'furniture_yearly',
 'furniture_yearly_lower',
 'furniture_yearly_upper',
 'furniture_multiplicative_terms',
 'furniture_multiplicative_terms_lower',
 'furniture_multiplicative_terms_upper',
 'furniture_yhat']
In [53]:
merge_furniture_forecast = furniture_forecast.copy()
merge_office_forecast = office_forecast.copy()
In [54]:
merge_furniture_forecast.columns = furniture_names
merge_office_forecast.columns = office_names
In [55]:
forecast = pd.merge(merge_furniture_forecast, merge_office_forecast, how = 'inner', left_on = 'furniture_ds', right_on = 'office_ds')
forecast = forecast.rename(columns={'furniture_ds': 'Date'}).drop('office_ds', axis=1)
forecast.head()
Out[55]:
Date furniture_trend furniture_yhat_lower furniture_yhat_upper furniture_trend_lower furniture_trend_upper furniture_additive_terms furniture_additive_terms_lower furniture_additive_terms_upper furniture_yearly furniture_yearly_lower furniture_yearly_upper furniture_multiplicative_terms furniture_multiplicative_terms_lower furniture_multiplicative_terms_upper furniture_yhat office_trend office_yhat_lower office_yhat_upper office_trend_lower office_trend_upper office_additive_terms office_additive_terms_lower office_additive_terms_upper office_yearly office_yearly_lower office_yearly_upper office_multiplicative_terms office_multiplicative_terms_lower office_multiplicative_terms_upper office_yhat
0 2014-01-01 726.057713 286.506053 788.293852 726.057713 726.057713 -190.685662 -190.685662 -190.685662 -190.685662 -190.685662 -190.685662 0.0 0.0 0.0 535.372051 487.530759 46.051635 644.705253 487.530759 487.530759 -140.040481 -140.040481 -140.040481 -140.040481 -140.040481 -140.040481 0.0 0.0 0.0 347.490278
1 2014-02-01 727.494023 219.574087 688.314723 727.494023 727.494023 -276.377703 -276.377703 -276.377703 -276.377703 -276.377703 -276.377703 0.0 0.0 0.0 451.116320 494.918445 -199.376479 428.747939 494.918445 494.918445 -385.678283 -385.678283 -385.678283 -385.678283 -385.678283 -385.678283 0.0 0.0 0.0 109.240162
2 2014-03-01 728.791335 470.289440 934.523571 728.791335 728.791335 -22.389755 -22.389755 -22.389755 -22.389755 -22.389755 -22.389755 0.0 0.0 0.0 706.401580 501.591193 172.381810 770.923112 501.591193 501.591193 -31.379844 -31.379844 -31.379844 -31.379844 -31.379844 -31.379844 0.0 0.0 0.0 470.211349
3 2014-04-01 730.227645 385.387745 875.825580 730.227645 730.227645 -100.141158 -100.141158 -100.141158 -100.141158 -100.141158 -100.141158 0.0 0.0 0.0 630.086487 508.978878 93.464909 686.504737 508.978878 508.978878 -134.291690 -134.291690 -134.291690 -134.291690 -134.291690 -134.291690 0.0 0.0 0.0 374.687188
4 2014-05-01 731.617622 334.075179 812.655696 731.617622 731.617622 -160.815662 -160.815662 -160.815662 -160.815662 -160.815662 -160.815662 0.0 0.0 0.0 570.801960 516.128251 -61.843508 555.318794 516.128251 516.128251 -263.821569 -263.821569 -263.821569 -263.821569 -263.821569 -263.821569 0.0 0.0 0.0 252.306682
In [56]:
plt.figure(figsize=(10, 7))
plt.plot(forecast['Date'], forecast['furniture_trend'], 'b-', label = "furniture")
plt.plot(forecast['Date'], forecast['office_trend'], 'r-', label = "office")
plt.legend(loc = 'upper right'); plt.xlabel('Date'); plt.ylabel('Sales')
plt.title('Furniture vs. Office Supplies Sales Trend');
In [57]:
plt.figure(figsize=(10, 7))
plt.plot(forecast['Date'], forecast['furniture_yhat'], 'b-',label = "furniture")
plt.plot(forecast['Date'], forecast['office_yhat'], 'r-',label = "office")
plt.legend(loc = "upper right"); plt.xlabel('Date'); plt.ylabel('Sales')
plt.title('Furniture vs. Office Supplies Estimate');
In [58]:
#furniture
furniture_model.plot_components(furniture_forecast);
In [59]:
#office
office_model.plot_components(office_forecast);

Although the growth of office supplies looks slightly stronger, it is good to see that sales of both furniture and office supplies have grown roughly linearly over time and are forecast to keep growing. The worst month for furniture is April, while the worst month for office supplies is February; the best month for furniture is December, and the best month for office supplies is October.
