import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.stats.stattools import durbin_watson
from sklearn.linear_model import LinearRegression, LassoLars, lars_path, lasso_path
from sklearn.preprocessing import normalize
from statsmodels.tsa.arima.model import ARIMA  # statsmodels.tsa.arima_model was removed in statsmodels 0.13
%matplotlib inline
def custom_read_csv(filename):
    '''Read a CSV file, parse its date column, and set the date as the index.'''
    df = pd.read_csv(filename)
    df['date'] = pd.to_datetime(df['date'])
    return df.set_index('date')
Read retail gas prices from CSV files.
# note that all prices are released on Monday, dayofweek = 0
national_price = custom_read_csv('national.csv').dropna()
state_price = custom_read_csv('state.csv').dropna()
state_price = state_price.drop('Unnamed: 10', axis=1).dropna()
city_price = custom_read_csv('city.csv').dropna()
price = pd.concat([national_price, state_price, city_price], axis=1)
The training period is 1992 - 2010. Save 2011 - 2016 for cross-validation.
trainingdates = np.intersect1d(price.index, pd.date_range('1992-01-01','2010-12-31'))
price.national.loc[trainingdates].plot(figsize=(15,6), title='National Retail Gas Price Trend')
plt.show()
The national retail gas price is not a stationary time series, which is especially obvious after 1999. The Augmented Dickey-Fuller test confirms this intuition: it fails to reject the non-stationary (unit root) null hypothesis.
pvalue = adfuller(price.national[trainingdates])[1]
pvalue
Difference the series by taking simple returns, which approximate log returns.
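For small weekly moves, the simple return approximates the log return:
$$\log\frac{p_t}{p_{t-1}} = \log\Big(1 + \frac{p_t - p_{t-1}}{p_{t-1}}\Big) \approx \frac{p_t - p_{t-1}}{p_{t-1}},$$
which is exactly what the next cell computes.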
logreturn = (price.national - price.national.shift(1)) / price.national.shift(1)
logreturn = logreturn.rename('logreturn')
logreturn[trainingdates].plot(figsize=(15,6), title='Log Return of National Retail Gas')
plt.show()
The ADF test says the differenced series is stationary, so we build the regression model on the log return.
pvalue = adfuller(logreturn[trainingdates].dropna())[1]
pvalue
Read data for the predictors and take log returns. The predictors, all weekly time series, are spot prices (spot.csv), gasoline spot prices (gasspot.csv), product supplied (productsupplied.csv), and stocks (stocks.csv).
spot = custom_read_csv('spot.csv').drop('Unnamed: 3', axis=1).dropna()
gasspot = custom_read_csv('gasspot.csv').drop('Unnamed: 3', axis=1).dropna()
supply = custom_read_csv('productsupplied.csv').dropna()
stocks = custom_read_csv('stocks.csv').dropna()
predictors_raw = pd.concat([spot, gasspot, supply, stocks], axis=1)
# take log return for all predictors
predictors = (predictors_raw - predictors_raw.shift(1)) / predictors_raw.shift(1)
# the predictors are released on Friday; add 3 days to align with the Monday price release
predictors.index = predictors.index + pd.DateOffset(days=3)
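As a quick sanity check (a small addition, not in the original notebook), confirm the shifted index now lands on the Monday release dates of the price series:
# after adding 3 days to the Friday release dates, every timestamp should
# fall on a Monday (dayofweek == 0), matching the price index
assert (predictors.index.dayofweek == 0).all(), 'predictor dates not aligned to Mondays'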
correls = pd.concat([logreturn, predictors], axis=1).rolling(window=52*2).corr()
correls = correls.loc[(slice(None), 'logreturn'),:].dropna().drop('logreturn', axis=1)
correls.index = correls.index.droplevel(level=1)
correls.loc[trainingdates].plot(
figsize=(15,6), title='2-year Rolling Correlation with National Retail Gas Log Return')
plt.show()
Many predictors are under consideration, and they are highly correlated with each other. Use the LASSO path (computed via LARS) to pick a sparse subset.
y = logreturn.loc[trainingdates].values
X = predictors.loc[trainingdates].values
# normalize each predictor (column) to unit norm before computing the path;
# the default axis=1 of sklearn's normalize would scale rows, not features
alphas, _, coefs = lars_path(normalize(X, axis=0), y, method='lasso', verbose=False)
# cumulative L1 norm of the coefficients along the path, scaled to [0, 1]
xx = np.sum(np.abs(coefs.T), axis=1)
xx /= xx[-1]
plt.figure(figsize=(15,6))
plt.plot(xx, coefs.T)
ymin, ymax = plt.ylim()
plt.vlines(xx, ymin, ymax, linestyle='dashed')
plt.xlabel('|coef| / max|coef|')
plt.ylabel('Coefficients')
plt.title('LASSO Path')
plt.axis('tight')
plt.legend(predictors.columns)
plt.show()
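To make the selection explicit, read off the order in which coefficients become nonzero along the path (a small sketch, not in the original; it assumes every predictor eventually enters the path, since argmax returns 0 for one that never activates):
# step at which each coefficient first becomes nonzero along the LARS/LASSO path
entry_step = (coefs != 0).argmax(axis=1)
for step, name in sorted(zip(entry_step, predictors.columns)):
    print('step %2d: %s' % (step, name))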
Based on the LARS path, select the following predictors:
X = predictors[['gas_spot_gulf', 'gas_spot_ny', 'stocks_gasoline']].loc[trainingdates].values
y = logreturn.loc[trainingdates].values
np.corrcoef(X, rowvar=False)
As expected, the Gulf Coast spot price and the NY spot price are highly correlated, so we include only one of them in the regression. Of the two, we select the Gulf Coast gasoline spot price based on the LARS path.
selected_predictors = predictors[['gas_spot_gulf', 'stocks_gasoline']]
Check whether the two chosen predictors are stationary. The ADF test says they both are.
for x in selected_predictors.columns:
    print('ADF test p-value on %s:' % x, adfuller(selected_predictors[x].dropna())[1])
Instead of running one regression on the whole training set, we run a rolling regression, exponentially weighted with a halflife of 5 years.
The training period spans nearly two decades, so the correlations likely change over time; a rolling regression better captures these changing relations.
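Concretely, an observation $i$ weeks before the end of the window receives weight $w_i = 2^{-i/h}$, where $h = 52 \times 5 = 260$ weeks, so weights halve every 5 years. This matches the decay loop in the helper function below.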
def regress(predictor, response, start, end, halflife, returnr2=False, outlier_cap=2):
    '''
    Helper function to run an exponentially weighted linear regression.
    predictor: dataframe of predictors
    response: series of response
    start: start date
    end: end date
    halflife: halflife (in observations) for the exponential weights
    returnr2: if True, also return the in-sample R^2
    outlier_cap: cap y and X at this many standard deviations
    '''
    y = response[start:end].values
    X = predictor[start:end].values
    # cap both y and X to avoid heavy influence of outliers
    y = np.clip(y, -outlier_cap * np.std(y), outlier_cap * np.std(y))
    X = np.clip(X, -outlier_cap * np.std(X, axis=0), outlier_cap * np.std(X, axis=0))
    assert X.shape[0] == y.shape[0], "predictor and response are not the same length"
    # exponential weights: the most recent observation gets weight 1,
    # and the weight halves every `halflife` observations
    w = np.ones(y.shape)
    decay = 2.0 ** (-1.0 / halflife)
    for i in range(len(w) - 2, -1, -1):
        w[i] = w[i + 1] * decay
    # the deprecated normalize=True kwarg is dropped; it does not change
    # the fitted coefficients of a plain least-squares regression
    reg = LinearRegression()
    reg.fit(X, y, sample_weight=w)
    if returnr2:
        return (reg, reg.score(X, y))
    return reg
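For reference, a single call looks like this (an illustrative sketch; the window bounds are arbitrary and assume the predictor and response series share the same weekly index over the window):
# illustrative single-window fit, reporting the in-sample R^2
reg5y, r2 = regress(selected_predictors, logreturn, '2000-01-01', '2004-12-31',
                    halflife=52 * 5, returnr2=True)
print('in-sample R^2: %.3f' % r2)
print('coefficients:', reg5y.coef_)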
Perform the rolling regression over the training period, using a 5-year halflife.
(In backtests on the training period, a 5-year halflife worked better than shorter ones.)
# Rolling regression
regression_window = 52 * 5
regression_halflife = 52 * 5
reg = np.empty(len(trainingdates), dtype=object)
for t in range(regression_window, len(trainingdates)):
    reg[t] = regress(selected_predictors, logreturn,
                     trainingdates[t-regression_window], trainingdates[t],
                     regression_halflife)
regobj = pd.Series(reg[regression_window:], index=trainingdates[regression_window:], name='regress_object')
As expected, the regression coefficients do change over time. The coefficient on the spot price is more stable than the one on stocks.
plt.figure(figsize=(15,6))
regobj.map(lambda x: x.coef_[0]).plot(label='Coefficient to gas_spot_gulf')
regobj.map(lambda x: x.coef_[1]).plot(label='Coefficient to stocks_gasoline')
plt.legend()
plt.show()
Since this is a predictive model, we test it by making predictions using only historical data.
Every week, we fit the regression on data from 5 years ago up through the previous week, then predict using the most recent available predictor data.
df = pd.concat([pd.Series(trainingdates[regression_window:], index=regobj.index, name='date'),
                regobj], axis=1).shift(1).dropna()
# at each date, use the regression and predictor values from the previous week
# (via shift(1)) so that only past data enters the prediction
prediction = df.apply(
    lambda x: x.regress_object.predict(selected_predictors.loc[x['date']].values.reshape(1, -1))[0],
    axis=1)
prediction = prediction.rename('predicted').rename_axis('date')
cvdates = np.intersect1d(price.index, pd.date_range('2011-01-01','2016-12-31'))
# Rolling regression, refit over the combined training and cross-validation periods
regression_window = 52 * 5
regression_halflife = 52 * 5
dates = np.union1d(trainingdates, cvdates)
reg = np.empty(len(dates), dtype=object)
for t in range(regression_window, len(dates)):
    reg[t] = regress(selected_predictors, logreturn,
                     dates[t-regression_window], dates[t],
                     regression_halflife)
regobj = pd.Series(reg[regression_window:], index=dates[regression_window:], name='regress_object')
df = pd.concat([pd.Series(dates[regression_window:], index=regobj.index, name='date'),
                regobj], axis=1).shift(1).dropna()
# again, predict each week's log return using only the previous week's regression and data
prediction = df.apply(
    lambda x: x.regress_object.predict(selected_predictors.loc[x['date']].values.reshape(1, -1))[0],
    axis=1)
prediction = prediction.rename('predicted').rename_axis('date')
corr = pd.concat([logreturn, prediction], axis=1).ewm(span=52 * 2).corr().dropna()
corr = corr.unstack()['logreturn']['predicted']
corr[cvdates].plot(
figsize=(15,6), title='2-year Exponentially Weighted Correlation between Predicted and Actual Values of Log Returns')
plt.show()
Consumers care more about the price trend (the direction of the price move) than about the exact price, since they are unlikely to trade retail gas. Therefore, let's try predicting the price trend and evaluate the task as a classification problem.
# precision: fraction of predicted up-moves where the price actually went up
1.0 * np.logical_and(logreturn[cvdates] > 0, prediction[cvdates] > 0).sum() / \
    (prediction[cvdates] > 0).sum()
# recall: fraction of actual up-moves that were predicted to be up
1.0 * np.logical_and(logreturn[cvdates] > 0, prediction[cvdates] > 0).sum() / \
    (logreturn[cvdates] > 0).sum()
# directional accuracy: fraction of weeks where predicted and actual signs agree
1.0 * (logreturn[cvdates] * prediction[cvdates] > 0).sum() / len(cvdates)
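Equivalently, the precision, recall, and full confusion matrix can be computed with sklearn (a sketch, not part of the original notebook):
from sklearn.metrics import confusion_matrix, precision_score, recall_score
actual_up = (logreturn[cvdates] > 0).values
predicted_up = (prediction[cvdates] > 0).values
print(confusion_matrix(actual_up, predicted_up))
print('precision: %.3f' % precision_score(actual_up, predicted_up))
print('recall: %.3f' % recall_score(actual_up, predicted_up))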
Since this is a classification problem, logistic regression might be better suited. Notebook 3 tests logistic regression.