Cross-Asset Financial Analytics — Discovering Statistical Inefficiencies via Deep Learning
Dr. Yves J. Hilpisch | The Python Quants GmbH
http://tpq.io | @dyjh | training@tpq.io
This tutorial shows how pandas, Plotly and Cufflinks can be combined with the Eikon Data API for cross-asset financial analytics.
import time
import eikon as ek # the Eikon package
import numpy as np # NumPy
import pandas as pd # pandas
import cufflinks as cf # Cufflinks
import configparser as cp
The following Python and package versions are used.
import sys
# Report the interpreter and package versions in use.
# The bare attribute expressions below produce output only when evaluated
# interactively (e.g. as the last expression of a notebook cell).
print(sys.version)
ek.__version__  # Eikon package version
np.__version__  # NumPy version
pd.__version__  # pandas version
cf.__version__  # Cufflinks version
This code sets the app_id used to connect to the Eikon Data API Proxy, which needs to be running locally.
# Load the local configuration file; ConfigParser.read() silently skips a
# file that does not exist, so a missing 'eikon.cfg' is not reported here.
cfg = cp.ConfigParser()
cfg.read('eikon.cfg')
# NOTE(review): registering the app key is disabled below; the lookup expects
# an [eikon] section with an 'app_id' entry -- re-enable before any live
# request to the API is made.
# ek.set_app_key(cfg['eikon']['app_id'])
We first define a small universe of RICs for which to retrieve data.
# Instrument universe, identified by RIC, spanning several asset classes.
rics = [
    'GE',      # equity: General Electric
    'AAPL.O',  # equity: Apple
    '.SPX',    # index: S&P 500
    '.VIX',    # index: VIX volatility
    'EUR=',    # FX: EUR/USD exchange rate
    'XAU=',    # commodity: gold price
    'GLD',     # ETF: gold
    'BTC=',    # crypto: Bitcoin in USD
]
Second, end-of-day (EOD) data is retrieved.
# Local cache for the end-of-day (EOD) closing prices.
fn = 'eikon_eod_data.csv'
try:
    data = pd.read_csv(fn, index_col=0, parse_dates=True)
except FileNotFoundError:
    # Only a missing cache file triggers a download. Any other failure
    # (corrupt CSV, keyboard interrupt, ...) now surfaces instead of silently
    # re-downloading everything and overwriting the cache.
    first = True
    for ric in rics:
        print(ric)
        d = ek.get_timeseries(ric,  # the RIC
                              fields='CLOSE',  # the required fields
                              start_date='2015-01-01',  # start date
                              end_date='2019-09-30')  # end date
        if first:
            # The first instrument defines the frame (and its index).
            data = d
            data.columns = [ric]
            first = False
        else:
            # Column assignment aligns the new series on the existing index.
            data[ric] = d
        time.sleep(2)  # pause between requests (presumably for rate limits)
    data.to_csv(fn)
data.head()  # first five rows
data.tail()  # final five rows
data.info()  # DataFrame meta information
To discover statistical inefficiencies we work with different features extracted from the time series data.
def add_lags(data, ric, lags, window=20):
    """Derive features for one instrument and append lagged copies of them.

    Parameters
    ----------
    data : pandas.DataFrame
        Price data with one column per instrument.
    ric : str
        Name of the column in ``data`` to process.
    lags : int
        Number of lags; every feature gets columns for lags ``1..lags``.
    window : int, optional
        Length of the rolling window behind the ``sma``, ``min``, ``max``,
        ``mom`` and ``vol`` features. Defaults to 20, the value that was
        previously hard-coded.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The feature frame with all incomplete rows removed, and the names of
        the lagged columns in creation order (``'<feature>_lag_<n>'``).
    """
    df = pd.DataFrame(data[ric])
    prices = df[ric]
    # Log return, taken from the price Series itself rather than from a
    # one-column DataFrame.
    df['r'] = np.log(prices / prices.shift())
    df['sma'] = prices.rolling(window).mean()
    df['min'] = prices.rolling(window).min()
    df['max'] = prices.rolling(window).max()
    df['mom'] = df['r'].rolling(window).mean()
    df['vol'] = df['r'].rolling(window).std()
    # Drop the warm-up rows before deriving the direction, so that a NaN
    # return is never labelled as a "down" move.
    df.dropna(inplace=True)
    df['d'] = np.where(df['r'] > 0, 1, 0)
    features = [ric, 'r', 'd', 'sma', 'min', 'max', 'mom', 'vol']
    cols = []
    for f in features:
        for lag in range(1, lags + 1):
            col = f'{f}_lag_{lag}'  # defines the column name
            df[col] = df[f].shift(lag)  # creates the lagged data column
            cols.append(col)  # stores the column name
    df.dropna(inplace=True)  # gets rid of incomplete data rows
    return df, cols
Second, we iterate over all RICs, using the add_lags function and storing the resulting DataFrame objects in a dictionary.
# Number of historical lags created per feature.
lags = 7

# Maps each RIC to a (feature DataFrame, lagged column names) pair.
dfs = dict()
for ric in rics:
    print(ric)
    df, cols = add_lags(data, ric, lags)
    dfs[ric] = (df.dropna(), cols)
dfs.keys()  # the keys of the dictionary
# Each value is a tuple: index 0 is the DataFrame, index 1 the column names.
The matrix consisting of the lagged data columns is used to "predict" the direction of the next day's movement of the RIC via machine learning.
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
# Print NumPy floats with four decimals and without scientific notation.
np.set_printoptions(precision=4, suppress=True)
def create_model(algo=4, n_estimators=75):
    """Return an unfitted bagging ensemble around a selectable base classifier.

    Parameters
    ----------
    algo : int, optional
        Index of the base estimator to bag:
        0 = Gaussian naive Bayes, 1 = logistic regression, 2 = decision tree,
        3 = multi-layer perceptron, 4 = random forest.
        The argument used to be ignored (index 4 was hard-coded); it is now
        honoured, and the default is 4 so that existing calls still get the
        same model as before.
    n_estimators : int, optional
        Number of base estimators in the bagging ensemble.

    Returns
    -------
    sklearn.ensemble.BaggingClassifier
        The configured, unfitted ensemble.

    Raises
    ------
    IndexError
        If ``algo`` does not index one of the five base estimators.
    """
    base_estimators = [
        GaussianNB(),
        LogisticRegression(C=1),
        DecisionTreeClassifier(random_state=100, max_depth=3,
                               min_samples_leaf=12),
        MLPClassifier(hidden_layer_sizes=2 * [128, ],
                      early_stopping=True, shuffle=False,
                      random_state=100,
                      activation='relu',
                      max_iter=250),
        RandomForestClassifier(n_estimators=5, max_depth=3,
                               min_samples_leaf=8, random_state=100),
    ]
    # The base estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, while
    # the first positional slot is the same in both.
    model = BaggingClassifier(base_estimators[algo],
                              n_estimators=n_estimators,
                              bootstrap=True,
                              max_features=0.75,
                              max_samples=0.75,
                              n_jobs=4,
                              random_state=100)
    return model
First, training and prediction in-sample only.
%%time
# Fit and score on the very same rows: an in-sample sanity check only.
for ric in rics:
    df, cols = dfs[ric]  # feature frame and lagged column names for the RIC
    model = create_model(n_estimators=10)
    features, target = df[cols], df['d']
    model.fit(features, target)  # the fitting step
    pred = model.predict(features)  # the prediction step
    acc = accuracy_score(target, pred)  # prediction accuracy
    print(f'IN-SAMPLE | {ric:7s} | acc={acc:.4f}')
Second, training in-sample and prediction out-of-sample.
# NOTE(review): `ric` is whatever value the preceding `for ric in rics` loop
# left behind, so this 70% split index is derived from a single RIC's frame.
split = int(len(dfs[ric][0]) * 0.7)
%%time
# Train on the first 70% of each instrument's rows, score on the rest.
for ric in rics:
    model = create_model()
    df, cols = dfs[ric]  # getting data for the RIC
    # Derive the split from this RIC's own frame; the frames can differ in
    # length, so one shared index would shift the train/test boundary.
    split = int(len(df) * 0.7)
    # Copy the slices so the in-place normalization below neither touches
    # the cached frame in `dfs` nor triggers chained-assignment warnings.
    train = df.iloc[:split].copy()  # training data set
    mu, std = train[cols].mean(), train[cols].std()
    # A feature that is constant in the training window has zero std;
    # dividing by 1 instead keeps it finite (it becomes all zeros).
    std = std.replace(0, 1)
    train[cols] = (train[cols] - mu) / std  # normalization
    model.fit(train[cols], train['d'])  # the fitting step
    test = df.iloc[split:].copy()  # test data set
    # Training statistics only: no information leaks from the test rows.
    test[cols] = (test[cols] - mu) / std  # normalization
    pred = model.predict(test[cols])  # the prediction step
    acc = accuracy_score(test['d'], pred)  # prediction accuracy
    print(f'OUT-OF-SAMPLE | {ric:7s} | acc={acc:.4f}')
Let us quickly check whether the results are similar on an intraday basis.
# Local cache for the intraday (one-minute) closing prices.
fn = 'eikon_id_data.csv'
try:
    data = pd.read_csv(fn, index_col=0, parse_dates=True)
except FileNotFoundError:
    # Only a missing cache file triggers a download. Any other failure
    # (corrupt CSV, keyboard interrupt, ...) now surfaces instead of silently
    # re-downloading everything and overwriting the cache.
    first = True
    for ric in rics:
        print(ric)
        d = ek.get_timeseries(ric,  # the RIC
                              fields='CLOSE',  # the required fields
                              start_date='2019-10-07',  # start date
                              end_date='2019-10-12',  # end date
                              interval='minute')
        if first:
            # The first instrument defines the frame (and its index).
            data = d
            data.columns = [ric]
            first = False
        else:
            # Column assignment aligns the new series on the existing index.
            data[ric] = d
        time.sleep(2)  # pause between requests (presumably for rate limits)
    data.to_csv(fn)
# Keep only the timestamps for which every instrument has a price.
data.dropna(inplace=True)
data.info()
data.tail()
# Rebuild the per-RIC (feature frame, lagged column names) pairs from the
# intraday data.
dfs = dict()
for ric in rics:
    df, cols = add_lags(data, ric, lags)
    dfs[ric] = (df.dropna(), cols)
# 80% split index, measured on the frame of the last RIC processed above.
split = int(len(dfs[ric][0]) * 0.8)
%%time
# Train on the first 80% of each instrument's rows, score on the rest.
for ric in rics:
    model = create_model(n_estimators=75)
    df, cols = dfs[ric]  # getting data for the RIC
    # Derive the split from this RIC's own frame rather than from a value
    # computed once for a single instrument.
    split = int(len(df) * 0.8)
    # Copy the slices so the in-place normalization below neither touches
    # the cached frame in `dfs` nor triggers chained-assignment warnings.
    train = df.iloc[:split].copy()  # training data set
    mu, std = train[cols].mean(), train[cols].std()
    # A feature that is constant in the training window has zero std;
    # dividing by 1 instead keeps it finite (it becomes all zeros).
    std = std.replace(0, 1)
    train[cols] = (train[cols] - mu) / std  # normalization
    model.fit(train[cols], train['d'])  # the fitting step
    test = df.iloc[split:].copy()  # test data set
    # Training statistics only: no information leaks from the test rows.
    test[cols] = (test[cols] - mu) / std  # normalization
    pred = model.predict(test[cols])  # the prediction step
    acc = accuracy_score(test['d'], pred)  # prediction accuracy
    print(f'OUT-OF-SAMPLE | {ric:7s} | acc={acc:.4f}')
Based on this tutorial, we can conclude that Plotly and Cufflinks make financial data visualization convenient.
Data Item Browser Application: Type DIB into Eikon Search Bar.