AI in Finance

Are Markets Predictable?

Dr Yves J Hilpisch | The AI Machine

http://aimachine.io | http://twitter.com/dyjh

Imports

For the tpqoa package see http://github.com/yhilpisch/tpqoa.

In [1]:
import math
import tpqoa
import numpy as np
import pandas as pd
from pylab import plt
import cufflinks as cf
plt.style.use('seaborn')
%matplotlib inline
cf.set_config_file(offline=True)
np.random.seed(1)

Data

In [2]:
dates = ['2019-03-01', '2019-06-01', '2019-09-01']
In [3]:
symbol = 'EUR_USD'
start = dates[1]
end = dates[2]
granularity = 'M10'
price = 'A'
fn = f'data/oanda_{symbol}_{start}_{end}_{granularity}_{price}.csv'
fn
Out[3]:
'data/oanda_EUR_USD_2019-06-01_2019-09-01_M10_A.csv'
In [4]:
%%time
try:
    raw = pd.read_csv(fn, index_col=0, parse_dates=True)  # use cached data if available
except FileNotFoundError:
    api = tpqoa.tpqoa('dyjh.cfg')  # credentials are read from the config file
    raw = api.get_history(symbol, start, end, granularity, price)
    raw.to_csv(fn)  # cache the data for subsequent runs
CPU times: user 75.9 ms, sys: 6.35 ms, total: 82.3 ms
Wall time: 116 ms
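If no cached CSV file is found, the data is retrieved via the OANDA API. The tpqoa.tpqoa('dyjh.cfg') call reads the account credentials from a plain-text configuration file; following the tpqoa README, it has roughly this form (all values are placeholders):

[oanda]
account_id = YOUR_ACCOUNT_ID
access_token = YOUR_ACCESS_TOKEN
account_type = practice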
In [5]:
raw.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9358 entries, 2019-06-02 21:00:00 to 2019-08-30 20:50:00
Data columns (total 6 columns):
c           9358 non-null float64
complete    9358 non-null bool
h           9358 non-null float64
l           9358 non-null float64
o           9358 non-null float64
volume      9358 non-null int64
dtypes: bool(1), float64(4), int64(1)
memory usage: 447.8 KB
In [6]:
data = pd.DataFrame(raw['c'])  # work with the closing prices only
data.columns = [symbol]
data['r'] = np.log(data[symbol] / data[symbol].shift(1))  # log returns
data['d'] = np.where(data['r'] > 0, 1, 0)  # direction label (1 = up, 0 = down)
data.dropna(inplace=True)
In [7]:
data['long'] = data['r']  # benchmark: always long
data['short'] = -data['r']  # benchmark: always short
data['random'] = np.random.choice([-1, 1], len(data)) * data['r']  # benchmark: random positions
In [8]:
ld = len(data)
ld
Out[8]:
9357
In [9]:
split = int(ld * 0.6)  # first 60% of the sample (training incl. validation)
val_size = int(ld * 0.1)  # 10% of the sample, carved out of the end of the training slice
In [10]:
train = data.iloc[:split]
val = train[-val_size:]
train = train[:-val_size]
test = data.iloc[split:].copy()
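A quick sanity check of the resulting split sizes (roughly 50% training, 10% validation, and 40% test of the full sample):

len(train), len(val), len(test)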
In [11]:
lags = 5
In [12]:
def create_lags(df, mu, std):
    # normalize the returns with the *training* mean and standard
    # deviation (passed in explicitly to avoid lookahead bias) and
    # add lagged versions of the normalized returns as features
    global cols
    cols = []
    df['rs'] = df['r'] - mu
    df['rs'] /= std
    for lag in range(1, lags + 1):
        col = 'lag_{}'.format(lag)
        df[col] = df['rs'].shift(lag)
        cols.append(col)
    df.dropna(inplace=True)
In [13]:
mu = train['r'].mean()
std = train['r'].std()
create_lags(train, mu, std)
In [14]:
train.head(5)
Out[14]:
EUR_USD r d long short random rs lag_1 lag_2 lag_3 lag_4 lag_5
time
2019-06-02 22:00:00 1.11728 0.000295 1 0.000295 -0.000295 0.000295 1.047357 -0.575584 0.188176 1.015723 -0.893941 -0.161873
2019-06-02 22:10:00 1.11727 -0.000009 0 -0.000009 0.000009 -0.000009 -0.034573 1.047357 -0.575584 0.188176 1.015723 -0.893941
2019-06-02 22:20:00 1.11712 -0.000134 0 -0.000134 0.000134 -0.000134 -0.480046 -0.034573 1.047357 -0.575584 0.188176 1.015723
2019-06-02 22:30:00 1.11714 0.000018 1 0.000018 -0.000018 0.000018 0.060886 -0.480046 -0.034573 1.047357 -0.575584 0.188176
2019-06-02 22:40:00 1.11737 0.000206 1 0.000206 -0.000206 -0.000206 0.729049 0.060886 -0.480046 -0.034573 1.047357 -0.575584
In [15]:
create_lags(val, mu, std)
In [16]:
val.head()
Out[16]:
EUR_USD r d long short random rs lag_1 lag_2 lag_3 lag_4 lag_5
time
2019-07-17 09:50:00 1.12176 -0.000250 0 -0.000250 0.000250 -0.000250 -0.889962 1.866970 -0.066153 0.409342 0.155756 0.472823
2019-07-17 10:00:00 1.12170 -0.000053 0 -0.000053 0.000053 0.000053 -0.192901 -0.889962 1.866970 -0.066153 0.409342 0.155756
2019-07-17 10:10:00 1.12152 -0.000160 0 -0.000160 0.000160 0.000160 -0.573250 -0.192901 -0.889962 1.866970 -0.066153 0.409342
2019-07-17 10:20:00 1.12174 0.000196 1 0.000196 -0.000196 0.000196 0.694501 -0.573250 -0.192901 -0.889962 1.866970 -0.066153
2019-07-17 10:30:00 1.12156 -0.000160 0 -0.000160 0.000160 -0.000160 -0.573230 0.694501 -0.573250 -0.192901 -0.889962 1.866970
In [17]:
create_lags(test, mu, std)
In [18]:
test.head()
Out[18]:
EUR_USD r d long short random rs lag_1 lag_2 lag_3 lag_4 lag_5
time
2019-07-25 21:40:00 1.11484 0.000027 1 0.000027 -0.000027 -0.000027 0.092905 -0.098417 0.252345 -0.098422 -0.321622 0.507442
2019-07-25 21:50:00 1.11489 0.000045 1 0.000045 -0.000045 -0.000045 0.156673 0.092905 -0.098417 0.252345 -0.098422 -0.321622
2019-07-25 22:00:00 1.11480 -0.000081 0 -0.000081 0.000081 0.000081 -0.289734 0.156673 0.092905 -0.098417 0.252345 -0.098422
2019-07-25 22:10:00 1.11472 -0.000072 0 -0.000072 0.000072 0.000072 -0.257867 -0.289734 0.156673 0.092905 -0.098417 0.252345
2019-07-25 22:20:00 1.11475 0.000027 1 0.000027 -0.000027 -0.000027 0.092912 -0.257867 -0.289734 0.156673 0.092905 -0.098417

Bagging

In [19]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
In [20]:
base_estimator = DecisionTreeClassifier(random_state=1, max_depth=3,
                                        min_samples_leaf=10)
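For reference (not part of the original analysis), the base decision tree can be fitted and scored on its own; the ensemble below works on clones of it, so this does not affect the bagging results:

base_estimator.fit(train[cols], train['d'])
base_estimator.score(test[cols], test['d'])  # single-tree test accuracy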
In [21]:
model = BaggingClassifier(base_estimator=base_estimator,
                          n_estimators=100,
                          bootstrap=True,
                          oob_score=True,
                          n_jobs=4,
                          random_state=100)
In [22]:
%time model.fit(train[cols], train['d'])
CPU times: user 139 ms, sys: 65.8 ms, total: 205 ms
Wall time: 2.35 s
Out[22]:
BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=100, n_jobs=4, oob_score=True,
         random_state=100, verbose=0, warm_start=False)
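Since oob_score=True, the fitted ensemble also reports an out-of-bag accuracy estimate, obtained without touching the test set:

model.oob_score_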
In [23]:
model.score(test[cols], test['d'])  # prediction accuracy
Out[23]:
0.5393258426966292
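To put the roughly 54% into perspective, compare it with the accuracy of a constant majority-class prediction, a simple baseline check:

freq_up = test['d'].mean()  # frequency of upward moves in the test set
max(freq_up, 1 - freq_up)  # accuracy of always predicting the majority class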
In [24]:
test['p'] = model.predict(test[cols])
test['p'] = np.where(test['p'] > 0, 1, -1)  # translate predictions into positions (+1 long, -1 short)
In [25]:
test['p'].value_counts()  # positions taken
Out[25]:
-1    2687
 1    1051
Name: p, dtype: int64
In [26]:
sum(test['p'].diff() != 0)  # trades necessary
Out[26]:
1261
In [27]:
test['strategy_bag'] = test['p'] * test['r']
In [28]:
test[['strategy_bag', 'random', 'short', 'long']].cumsum(
    ).apply(np.exp).iplot(colorscale='rdbu')
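The roughly 1,250 position changes would not be free in practice. A minimal sketch of proportional transaction costs applied to the bagging strategy; the cost level is an assumption for illustration only, and subtracting it from the log returns is an approximation:

tc = 0.00007  # assumed cost per position change (illustrative only)
trades = test['p'].diff().fillna(0) != 0  # bars on which the position changes
(test['strategy_bag'] - trades * tc).cumsum().apply(np.exp).iloc[-1]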

MLP Classifier

In [29]:
from sklearn.neural_network import MLPClassifier
In [30]:
model = MLPClassifier(hidden_layer_sizes=(192, 192),
                      activation='relu',
                      learning_rate_init=0.0005,
                      random_state=100,
                      max_iter=500,
                      validation_fraction=0.1,
                      shuffle=False,
                      early_stopping=True,
                      verbose=False)
In [31]:
%time model.fit(train[cols], train['d'])
CPU times: user 2.1 s, sys: 45.1 ms, total: 2.14 s
Wall time: 1.46 s
Out[31]:
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(192, 192), learning_rate='constant',
       learning_rate_init=0.0005, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=100, shuffle=False, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
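With early_stopping=True, the classifier holds out validation_fraction=0.1 of the training data and stops once the validation score no longer improves. The fitted attributes report the number of epochs actually run and the best validation accuracy:

model.n_iter_, model.best_validation_score_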
In [32]:
test['p'] = model.predict(test[cols])
test['p'] = np.where(test['p'] > 0, 1, -1)
In [33]:
test['p'].value_counts()
Out[33]:
-1    1951
 1    1787
Name: p, dtype: int64
In [34]:
sum(test['p'].diff() != 0)
Out[34]:
1263
In [35]:
test['strategy_mlp'] = test['p'] * test['r']
In [36]:
test[['strategy_mlp', 'strategy_bag', 'random', 'short', 'long']].cumsum(
    ).apply(np.exp).iplot(colorscale='rdbu')

Keras DNN

In [37]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import RMSprop
Using TensorFlow backend.
In [38]:
np.random.seed(100)
tf.random.set_random_seed(100)
In [39]:
opt = RMSprop(lr=0.005, rho=0.9, epsilon=None, decay=0.0)
In [40]:
model = Sequential()

model.add(Dense(128, activation='relu', input_shape=(lags,)))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['acc'])
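The layer structure and parameter counts of the network can be inspected via:

model.summary()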
In [41]:
%%time
model.fit(train[cols], train['d'],
          epochs=50, batch_size=32, verbose=False,
          validation_data=(val[cols], val['d']));
CPU times: user 15.8 s, sys: 2.63 s, total: 18.5 s
Wall time: 10.5 s
Out[41]:
<keras.callbacks.History at 0x1a2916ba20>
In [42]:
res = pd.DataFrame(model.history.history)
In [43]:
res.tail(3)
Out[43]:
val_loss val_acc loss acc
47 0.828585 0.522581 0.630126 0.642704
48 0.836504 0.508602 0.626266 0.636072
49 0.858591 0.509677 0.629465 0.641635
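The gap between the final training accuracy (about 64%) and the validation accuracy (about 51%) suggests that the network overfits the training data:

res[['acc', 'val_acc']].tail(1)  # final epoch: training vs. validation accuracy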
In [44]:
res.iplot()
In [45]:
model.evaluate(test[cols], test['d'])  # returns [loss, accuracy]
3738/3738 [==============================] - 0s 31us/step
Out[45]:
[0.8469630597810046, 0.5171214553396482]
In [46]:
test['p'] = model.predict_classes(test[cols])
test['p'] = np.where(test['p'] > 0, 1, -1)
In [47]:
test['p'].value_counts()
Out[47]:
-1    1966
 1    1772
Name: p, dtype: int64
In [48]:
sum(test['p'].diff() != 0)
Out[48]:
1879
In [49]:
test['strategy_dnn'] = test['p'] * test['r']
In [50]:
res_cols = ['strategy_dnn', 'strategy_mlp', 'strategy_bag',
            'random', 'short', 'long']
In [51]:
r = test[res_cols].sum().apply(np.exp).sort_values(ascending=False)
r
Out[51]:
strategy_dnn    1.037899
strategy_bag    1.037430
strategy_mlp    1.026202
short           1.013878
random          1.009287
long            0.986312
dtype: float64
In [52]:
r - r['long']
Out[52]:
strategy_dnn    0.051587
strategy_bag    0.051118
strategy_mlp    0.039891
short           0.027567
random          0.022975
long            0.000000
dtype: float64
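Gross performance alone says nothing about risk. A quick look at the mean and dispersion of the per-bar log returns of each strategy (illustrative only):

test[res_cols].agg(['mean', 'std'])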
In [53]:
test[res_cols].cumsum().apply(np.exp).iplot(colorscale='rdbu')