Autoregressive Moving Average (ARMA): Sunspots data

Autoregressive Moving Average (ARMA): Sunspots data

Link to Notebook GitHub

In [1]:
from __future__ import print_function
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
In [2]:
from statsmodels.graphics.api import qqplot

Sunpots Data

In [3]:
print(sm.datasets.sunspots.NOTE)
::

    Number of Observations - 309 (Annual 1700 - 2008)
    Number of Variables - 1
    Variable name definitions::

        SUNACTIVITY - Number of sunspots for each year

    The data file contains a 'YEAR' variable that is not returned by load.


In [4]:
dta = sm.datasets.sunspots.load_pandas().data
In [5]:
dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
del dta["YEAR"]
In [6]:
dta.plot(figsize=(12,8));
In [7]:
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(dta.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(dta, lags=40, ax=ax2)
In [8]:
arma_mod20 = sm.tsa.ARMA(dta, (2,0)).fit()
print(arma_mod20.params)
const                49.659321
ar.L1.SUNACTIVITY     1.390656
ar.L2.SUNACTIVITY    -0.688571
dtype: float64

In [9]:
arma_mod30 = sm.tsa.ARMA(dta, (3,0)).fit()
In [10]:
print(arma_mod20.aic, arma_mod20.bic, arma_mod20.hqic)
2622.63633806 2637.56970317 2628.60672591

In [11]:
print(arma_mod30.params)
const                49.749928
ar.L1.SUNACTIVITY     1.300810
ar.L2.SUNACTIVITY    -0.508093
ar.L3.SUNACTIVITY    -0.129649
dtype: float64

In [12]:
print(arma_mod30.aic, arma_mod30.bic, arma_mod30.hqic)
2619.4036287 2638.07033508 2626.8666135

  • Does our model obey the theory?
In [13]:
sm.stats.durbin_watson(arma_mod30.resid.values)
Out[13]:
1.95648094813114
In [14]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax = arma_mod30.resid.plot(ax=ax);
In [15]:
resid = arma_mod30.resid
In [16]:
stats.normaltest(resid)
Out[16]:
(49.845018628344008, 1.5006925608124672e-11)
In [17]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
fig = qqplot(resid, line='q', ax=ax, fit=True)
In [18]:
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(resid.values.squeeze(), lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax2)
In [19]:
r,q,p = sm.tsa.acf(resid.values.squeeze(), qstat=True)
data = np.c_[range(1,41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))
           AC          Q      Prob(>Q)
lag
1    0.009179   0.026286  8.712035e-01
2    0.041793   0.573042  7.508713e-01
3   -0.001335   0.573602  9.024482e-01
4    0.136089   6.408921  1.706203e-01
5    0.092468   9.111828  1.046860e-01
6    0.091948  11.793242  6.674350e-02
7    0.068748  13.297199  6.518987e-02
8   -0.015020  13.369227  9.976139e-02
9    0.187592  24.641907  3.393913e-03
10   0.213718  39.321991  2.229477e-05
11   0.201082  52.361134  2.344953e-07
12   0.117182  56.804186  8.574269e-08
13  -0.014055  56.868322  1.893905e-07
14   0.015398  56.945561  3.997663e-07
15  -0.024967  57.149316  7.741477e-07
16   0.080916  59.296767  6.872171e-07
17   0.041138  59.853735  1.110945e-06
18  -0.052021  60.747425  1.548433e-06
19   0.062496  62.041689  1.831645e-06
20  -0.010301  62.076976  3.381245e-06
21   0.074453  63.926651  3.193589e-06
22   0.124955  69.154768  8.978363e-07
23   0.093162  72.071032  5.799788e-07
24  -0.082152  74.346684  4.713020e-07
25   0.015695  74.430040  8.289048e-07
26  -0.025037  74.642899  1.367285e-06
27  -0.125861  80.041145  3.722569e-07
28   0.053225  81.009979  4.716282e-07
29  -0.038693  81.523805  6.916637e-07
30  -0.016904  81.622223  1.151662e-06
31  -0.019296  81.750936  1.868767e-06
32   0.104990  85.575062  8.927963e-07
33   0.040086  86.134564  1.247509e-06
34   0.008829  86.161807  2.047826e-06
35   0.014588  86.236444  3.263809e-06
36  -0.119329  91.248895  1.084455e-06
37  -0.036665  91.723863  1.521924e-06
38  -0.046193  92.480512  1.938735e-06
39  -0.017768  92.592881  2.990680e-06
40  -0.006220  92.606703  4.696986e-06

  • This indicates a lack of fit.
  • In-sample dynamic prediction. How good does our model do?
In [20]:
predict_sunspots = arma_mod30.predict('1990', '2012', dynamic=True)
print(predict_sunspots)
1990-12-31    167.047411
1991-12-31    140.992990
1992-12-31     94.859100
1993-12-31     46.860889
1994-12-31     11.242579
1995-12-31     -4.721294
1996-12-31     -1.166910
1997-12-31     16.185692
1998-12-31     39.021879
1999-12-31     59.449862
2000-12-31     72.170127
2001-12-31     75.376765
2002-12-31     70.436439
2003-12-31     60.731569
2004-12-31     50.201783
2005-12-31     42.076018
2006-12-31     38.114281
2007-12-31     38.454638
2008-12-31     41.963810
2009-12-31     46.869278
2010-12-31     51.423249
2011-12-31     54.399704
2012-12-31     55.321675
Freq: A-DEC, dtype: float64

In [21]:
fig, ax = plt.subplots(figsize=(12, 8))
ax = dta.ix['1950':].plot(ax=ax)
fig = arma_mod30.plot_predict('1990', '2012', dynamic=True, ax=ax, plot_insample=False)
In [22]:
def mean_forecast_err(y, yhat):
    return y.sub(yhat).mean()
In [23]:
mean_forecast_err(dta.SUNACTIVITY, predict_sunspots)
Out[23]:
5.6369668786254596

Exercise: Can you obtain a better fit for the Sunspots model? (Hint: sm.tsa.AR has a method select_order)

Simulated ARMA(4,1): Model Identification is Difficult

In [24]:
from statsmodels.tsa.arima_process import arma_generate_sample, ArmaProcess
In [25]:
np.random.seed(1234)
# include zero-th lag
arparams = np.array([1, .75, -.65, -.55, .9])
maparams = np.array([1, .65])

Let's make sure this model is estimable.

In [26]:
arma_t = ArmaProcess(arparams, maparams)
In [27]:
arma_t.isinvertible()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-70-d3a1a0e5898b> in <module>()
----> 1arma_t.isinvertible()

TypeError: 'bool' object is not callable
In [28]:
arma_t.isstationary()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-71-55a9b2cc43b1> in <module>()
----> 1arma_t.isstationary()

TypeError: 'bool' object is not callable
  • What does this mean?
In [29]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax.plot(arma_t.generate_sample(size=50));
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-72-d059f8971c1a> in <module>()
      1 fig = plt.figure(figsize=(12,8))
      2 ax = fig.add_subplot(111)
----> 3ax.plot(arma_t.generate_sample(size=50));

TypeError: generate_sample() got an unexpected keyword argument 'size'
In [30]:
arparams = np.array([1, .35, -.15, .55, .1])
maparams = np.array([1, .65])
arma_t = ArmaProcess(arparams, maparams)
arma_t.isstationary()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-73-317f1b2ac56b> in <module>()
      2 maparams = np.array([1, .65])
      3 arma_t = ArmaProcess(arparams, maparams)
----> 4arma_t.isstationary()

TypeError: 'bool' object is not callable
In [31]:
arma_rvs = arma_t.generate_sample(size=500, burnin=250, scale=2.5)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-74-e0a3cc13cb6e> in <module>()
----> 1arma_rvs = arma_t.generate_sample(size=500, burnin=250, scale=2.5)

TypeError: generate_sample() got an unexpected keyword argument 'size'
In [32]:
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(arma_rvs, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(arma_rvs, lags=40, ax=ax2)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-75-8e761b44cfae> in <module>()
      1 fig = plt.figure(figsize=(12,8))
      2 ax1 = fig.add_subplot(211)
----> 3fig = sm.graphics.tsa.plot_acf(arma_rvs, lags=40, ax=ax1)
      4 ax2 = fig.add_subplot(212)
      5 fig = sm.graphics.tsa.plot_pacf(arma_rvs, lags=40, ax=ax2)

NameError: name 'arma_rvs' is not defined
  • For mixed ARMA processes the Autocorrelation function is a mixture of exponentials and damped sine waves after (q-p) lags.
  • The partial autocorrelation function is a mixture of exponentials and dampened sine waves after (p-q) lags.
In [33]:
arma11 = sm.tsa.ARMA(arma_rvs, (1,1)).fit()
resid = arma11.resid
r,q,p = sm.tsa.acf(resid, qstat=True)
data = np.c_[range(1,41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-76-03653831c71c> in <module>()
----> 1arma11 = sm.tsa.ARMA(arma_rvs, (1,1)).fit()
      2 resid = arma11.resid
      3 r,q,p = sm.tsa.acf(resid, qstat=True)
      4 data = np.c_[range(1,41), r[1:], q, p]
      5 table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])

NameError: name 'arma_rvs' is not defined
In [34]:
arma41 = sm.tsa.ARMA(arma_rvs, (4,1)).fit()
resid = arma41.resid
r,q,p = sm.tsa.acf(resid, qstat=True)
data = np.c_[range(1,41), r[1:], q, p]
table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])
print(table.set_index('lag'))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-77-30d9c2f35894> in <module>()
----> 1arma41 = sm.tsa.ARMA(arma_rvs, (4,1)).fit()
      2 resid = arma41.resid
      3 r,q,p = sm.tsa.acf(resid, qstat=True)
      4 data = np.c_[range(1,41), r[1:], q, p]
      5 table = pd.DataFrame(data, columns=['lag', "AC", "Q", "Prob(>Q)"])

NameError: name 'arma_rvs' is not defined

Exercise: How good of in-sample prediction can you do for another series, say, CPI

In [35]:
macrodta = sm.datasets.macrodata.load_pandas().data
macrodta.index = pd.Index(sm.tsa.datetools.dates_from_range('1959Q1', '2009Q3'))
cpi = macrodta["cpi"]

Hint:

In [36]:
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax = cpi.plot(ax=ax);
ax.legend();

P-value of the unit-root test, resoundly rejects the null of no unit-root.

In [37]:
print(sm.tsa.adfuller(cpi)[1])
0.990432818834

doc_statsmodels
2017-01-18 16:07:15
Comments
Leave a Comment

Please login to continue.