# Reinforcement Learning: From Playing Games to Trading Stocks

Dr Yves J Hilpisch | The AI Machine

## General Imports

In [1]:
import os
import math
import time
import random
import numpy as np
import pandas as pd
from pylab import plt
from IPython import display
plt.style.use('seaborn')
np.set_printoptions(precision=4, suppress=True)
os.environ['PYTHONHASHSEED'] = '0'

In [2]:
import warnings; warnings.simplefilter('ignore')


## OpenAI Gym Environment

In [3]:
import gym

In [4]:
env = gym.make('CartPole-v0')

In [5]:
env.observation_space

Out[5]:
Box(4,)
In [6]:
env.observation_space.low.astype(np.float16)

Out[6]:
array([-4.8  ,   -inf, -0.419,   -inf], dtype=float16)
In [7]:
env.observation_space.high.astype(np.float16)

Out[7]:
array([4.8  ,   inf, 0.419,   inf], dtype=float16)
In [8]:
state = env.reset()

In [9]:
state # [cart position, cart velocity, pole angle, pole angular velocity]

Out[9]:
array([ 0.0096, -0.0041,  0.0018, -0.0279])
In [10]:
env.action_space

Out[10]:
Discrete(2)
In [11]:
env.action_space.n

Out[11]:
2
In [12]:
env.action_space.sample()

Out[12]:
0
In [13]:
env.action_space.sample()

Out[13]:
0
In [14]:
a = env.action_space.sample()
a

Out[14]:
1
In [15]:
state, reward, done, info = env.step(a)
state, reward, done, info

Out[15]:
(array([ 0.0095,  0.191 ,  0.0012, -0.32  ]), 1.0, False, {})
In [16]:
# Play one episode with purely random actions and print every transition.
# The loop runs for steps 1..199; an episode that ends earlier is reported
# as a failure.
env.reset()
for e in range(1, 200):
    a = env.action_space.sample()  # random action choice
    state, reward, done, info = env.step(a)  # taking the action
    print(f'step={e:2d} | state={state} | action={a} | reward={reward}')
    if done and (e + 1) < 200:
        print('*** FAILED ***')
        break

step= 1 | state=[-0.0021 -0.1613 -0.0191  0.2619] | action=0 | reward=1.0
step= 2 | state=[-0.0053  0.0341 -0.0138 -0.0367] | action=1 | reward=1.0
step= 3 | state=[-0.0046 -0.1608 -0.0146  0.2516] | action=0 | reward=1.0
step= 4 | state=[-0.0078  0.0345 -0.0095 -0.0457] | action=1 | reward=1.0
step= 5 | state=[-0.0071  0.2298 -0.0105 -0.3414] | action=1 | reward=1.0
step= 6 | state=[-0.0025  0.425  -0.0173 -0.6373] | action=1 | reward=1.0
step= 7 | state=[ 0.006   0.2302 -0.03   -0.3501] | action=0 | reward=1.0
step= 8 | state=[ 0.0106  0.4257 -0.037  -0.6521] | action=1 | reward=1.0
step= 9 | state=[ 0.0191  0.2311 -0.0501 -0.3713] | action=0 | reward=1.0
step=10 | state=[ 0.0237  0.0367 -0.0575 -0.0949] | action=0 | reward=1.0
step=11 | state=[ 0.0244  0.2326 -0.0594 -0.4051] | action=1 | reward=1.0
step=12 | state=[ 0.0291  0.4285 -0.0675 -0.7159] | action=1 | reward=1.0
step=13 | state=[ 0.0376  0.2344 -0.0818 -0.4452] | action=0 | reward=1.0
step=14 | state=[ 0.0423  0.0405 -0.0907 -0.1794] | action=0 | reward=1.0
step=15 | state=[ 0.0431 -0.1532 -0.0943  0.0833] | action=0 | reward=1.0
step=16 | state=[ 0.0401  0.0432 -0.0926 -0.2376] | action=1 | reward=1.0
step=17 | state=[ 0.0409 -0.1505 -0.0974  0.0245] | action=0 | reward=1.0
step=18 | state=[ 0.0379 -0.3441 -0.0969  0.285 ] | action=0 | reward=1.0
step=19 | state=[ 0.0311 -0.5377 -0.0912  0.5456] | action=0 | reward=1.0
step=20 | state=[ 0.0203 -0.3415 -0.0803  0.2256] | action=1 | reward=1.0
step=21 | state=[ 0.0135 -0.5354 -0.0758  0.4919] | action=0 | reward=1.0
step=22 | state=[ 0.0028 -0.7293 -0.0659  0.7598] | action=0 | reward=1.0
step=23 | state=[-0.0118 -0.9235 -0.0508  1.031 ] | action=0 | reward=1.0
step=24 | state=[-0.0303 -0.7277 -0.0301  0.7228] | action=1 | reward=1.0
step=25 | state=[-0.0448 -0.9224 -0.0157  1.0059] | action=0 | reward=1.0
step=26 | state=[-0.0633 -1.1173  0.0044  1.2936] | action=0 | reward=1.0
step=27 | state=[-0.0856 -1.3125  0.0303  1.5877] | action=0 | reward=1.0
step=28 | state=[-0.1119 -1.508   0.0621  1.8897] | action=0 | reward=1.0
step=29 | state=[-0.1421 -1.7037  0.0999  2.2009] | action=0 | reward=1.0
step=30 | state=[-0.1761 -1.5097  0.1439  1.9407] | action=1 | reward=1.0
step=31 | state=[-0.2063 -1.706   0.1827  2.2743] | action=0 | reward=1.0
step=32 | state=[-0.2404 -1.9023  0.2282  2.6172] | action=0 | reward=1.0
*** FAILED ***

In [17]:
done

Out[17]:
True
In [18]:
# Animate up to 100 random steps inline by re-drawing a single image object.
env.reset()
img = plt.imshow(env.render(mode='rgb_array'))  # initialize bitmap embedding
for e in range(100):
    img.set_data(env.render(mode='rgb_array'))  # updating the data
    display.display(plt.gcf())
    display.clear_output(wait=True)
    a = env.action_space.sample()  # random action choice
    obs, rew, done, _ = env.step(a)  # taking action
    # With at most 100 steps, the original extra check ``(e + 1) < 200`` was
    # always true, so any ``done`` inside this loop counts as a failure.
    if done:
        print('*** FAILED ***')
        break

*** FAILED ***


## Keras and Seeds

In [19]:
import tensorflow as tf
from keras.layers import Dense, Dropout
from keras.models import Sequential
from sklearn.metrics import accuracy_score

Using TensorFlow backend.

In [20]:
def set_seeds(seed=100):
    """Seed every random number source used in this notebook.

    Seeds Python's ``random`` module, NumPy, TensorFlow and the
    module-level environment ``env`` with the same value.

    Args:
        seed: Integer seed value passed to all four seeding calls.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    env.seed(seed)


## DQL Agent

In [21]:
from collections import deque

In [22]:
class DQLAgent:
    """Deep Q-learning agent acting on the module-level ``env`` object.

    The agent chooses actions epsilon-greedily, stores every transition in a
    bounded replay memory (at most 2000 entries) and, after each episode,
    fits its model on a random batch drawn from that memory.

    Args:
        gamma: Discount factor applied to the estimated future reward.
        lr: Learning rate stored on the agent and handed to the optimizer.
        finish: Training stops once the moving average of episode lengths
            exceeds this value.
    """

    def __init__(self, gamma=0.95, lr=0.001, finish=1e10):
        self.finish = finish
        self.epsilon = 1.0  # start fully exploratory
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.gamma = gamma
        self.batch_size = 32
        self.lr = lr
        self.max_treward = 0
        self.averages = list()
        self.memory = deque(maxlen=2000)
        self.osn = env.observation_space.shape[0]
        self.model = self._build_model()

    def _build_model(self):
        """Create and compile the network that approximates action values.

        NOTE(review): the layer/compile lines of this method were lost in the
        source this file was recovered from (only ``model = Sequential()``, a
        dangling ``activation='relu'))`` and ``return model`` survived). The
        architecture below -- two hidden ReLU layers with 24 units, a linear
        output with one unit per action, MSE loss and Adam with ``self.lr``
        -- is a reconstruction; confirm it against the original notebook.
        """
        # Local import: the file-level imports do not provide an optimizer.
        from keras.optimizers import Adam

        model = Sequential()
        model.add(Dense(24, input_dim=self.osn,
                        activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(env.action_space.n, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.lr))
        return model

    def act(self, state):
        """Return a random action with probability ``epsilon``, otherwise
        the action with the highest predicted value for ``state``."""
        if random.random() <= self.epsilon:
            return env.action_space.sample()
        action = self.model.predict(state)[0]
        return np.argmax(action)

    def replay(self):
        """Fit the model on a random batch of stored transitions.

        For each sampled transition the target of the taken action is set to
        the observed reward plus, unless the episode ended, the discounted
        maximum predicted value of the next state. Afterwards ``epsilon`` is
        decayed once, as long as it is above ``epsilon_min``.
        """
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            if not done:
                reward += self.gamma * np.amax(
                    self.model.predict(next_state)[0])
            target = self.model.predict(state)
            target[0, action] = reward
            self.model.fit(state, target, epochs=1,
                           verbose=False)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def learn(self, episodes, max_iter=200):
        """Train the agent for up to ``episodes`` episodes.

        Args:
            episodes: Maximum number of episodes to play.
            max_iter: Maximum number of steps per episode.

        The per-episode score is the number of steps taken. The moving
        average always divides the last (up to) 25 scores by 25, so it
        understates the true mean during the first 24 episodes.
        """
        trewards = []
        for e in range(1, episodes + 1):
            state = env.reset()
            state = np.reshape(state, [1, self.osn])
            treward = 0
            for step in range(1, max_iter + 1):
                action = self.act(state)
                next_state, reward, done, info = env.step(action)
                next_state = np.reshape(next_state,
                                        [1, self.osn])
                self.memory.append([state, action, reward,
                                    next_state, done])
                state = next_state
                treward = step
                if done:
                    break
            # Record the episode even when it was cut off by ``max_iter``;
            # previously ``av`` was only assigned on ``done`` and the check
            # below could raise NameError or compare a stale value.
            trewards.append(treward)
            av = sum(trewards[-25:]) / 25
            self.averages.append(av)
            self.max_treward = max(self.max_treward, treward)
            templ = 'episode: {:4d}/{} | treward: {:4d} | '
            templ += 'av: {:6.1f} | max: {:4d}'
            print(templ.format(e, episodes, treward, av,
                               self.max_treward), end='\r')
            if av > self.finish:
                break
            if len(self.memory) > self.batch_size:
                self.replay()

    def test(self, episodes, max_iter=200):
        """Play ``episodes`` episodes greedily (no exploration, no learning).

        Args:
            episodes: Number of episodes to play.
            max_iter: Maximum number of steps per episode.

        Returns:
            List with the number of steps taken in each episode; episodes cut
            off by ``max_iter`` are included as well.
        """
        trewards = []
        for e in range(1, episodes + 1):
            state = env.reset()
            treward = 0
            for step in range(1, max_iter + 1):
                state = np.reshape(state, [1, self.osn])
                action = np.argmax(self.model.predict(state)[0])
                next_state, reward, done, info = env.step(action)
                state = next_state
                treward = step
                if done:
                    break
            trewards.append(treward)
            print('episode: {:4d}/{} | treward: {:4d}'
                  .format(e, episodes, treward), end='\r')
            time.sleep(0.05)  # short pause so the progress line is readable
        return trewards

In [23]:
set_seeds(100)
agent = DQLAgent(lr=0.001, finish=195)

In [24]:
episodes = 1000

In [25]:
agent.learn(episodes)

episode:  467/1000 | treward:  200 | av:  195.3 | max:  200

In [26]:
agent.epsilon

Out[26]:
0.09770335251664321
In [27]:
# Moving average of the episode scores together with a cubic trend line.
x = range(len(agent.averages))
y = np.polyval(np.polyfit(x, agent.averages, deg=3), x)  # fitted cubic
plt.figure(figsize=(10, 6))
plt.plot(agent.averages, label='moving average')
plt.plot(x, y, 'r--', label='regression')
plt.xlabel('episodes')
plt.ylabel('total reward')
plt.legend();

In [28]:
trewards = agent.test(100)

episode:  100/100 | treward:  200

In [29]:
sum(trewards) / len(trewards)

Out[29]:
200.0

## Finance Environment

In [30]:
class observation_space:
    """Minimal stand-in for an environment's observation space.

    Only exposes ``shape``, a one-element tuple holding the number of
    values per state.
    """

    def __init__(self, n):
        self.shape = (n,)

In [31]:
class action_space:
    """Minimal stand-in for a discrete action space with ``n`` actions."""

    def __init__(self, n):
        self.n = n

    def sample(self):
        """Return a uniformly drawn action from ``0`` to ``n - 1``."""
        # Same draw as ``random.randint(0, self.n - 1)``.
        return random.randrange(self.n)

In [32]:
class Finance:
    """Environment in which the agent predicts the direction of a price series.

    A state consists of the last ``osn`` (here 4) values of the selected
    feature column(s); an action is correct (reward 1) when it equals the
    direction label ``d`` of the current bar.

    Args:
        symbol: Column of the raw data set to use as the price series.
        features: Column label(s) of the prepared data that form the state.
    """

    url = 'http://hilpisch.com/aiif_eikon_eod_data.csv'

    def __init__(self, symbol, features):
        self.symbol = symbol
        self.features = features
        self.observation_space = observation_space(4)
        self.osn = self.observation_space.shape[0]
        self.action_space = action_space(2)
        self.min_accuracy = 0.5
        self._get_data()
        self._prepare_data()

    def _get_data(self):
        """Load the raw data set from ``url`` into ``self.raw``."""
        # NOTE(review): the first line of this statement was lost in the
        # source this file was recovered from; ``self.raw = pd.read_csv(
        # self.url, index_col=0,`` is reconstructed from the surviving
        # continuation line and from the later use of ``self.raw`` -- confirm.
        self.raw = pd.read_csv(self.url, index_col=0,
                               parse_dates=True).dropna()

    def _prepare_data(self):
        """Derive normalised log returns ``r`` and direction labels ``d``."""
        self.data = pd.DataFrame(self.raw[self.symbol])
        prices = self.data[self.symbol]
        self.data['r'] = np.log(prices / prices.shift(1))
        self.data = self.data.dropna()
        self.data = (self.data - self.data.mean()) / self.data.std()
        # NOTE(review): ``d`` is taken from the *normalised* return, i.e. it
        # marks returns above the sample mean rather than strictly positive
        # returns. Kept as is to preserve the published behaviour.
        self.data['d'] = np.where(self.data['r'] > 0, 1, 0)

    def _get_state(self):
        """Return the ``osn`` feature values preceding the current bar."""
        return self.data[self.features].iloc[
            self.bar - self.osn:self.bar].values

    def seed(self, seed=None):
        """Accept a seed for interface compatibility; nothing is seeded."""
        pass

    def reset(self):
        """Start a new episode and return the initial state."""
        self.treward = 0
        self.accuracy = 0
        self.bar = self.osn
        return self._get_state()

    def step(self, action):
        """Evaluate ``action`` for the current bar and advance by one bar.

        Returns:
            Tuple ``(state, reward, done, info)``. ``reward`` is 1 for a
            correct direction call and 0 otherwise. ``done`` is set when the
            data is exhausted, or when a wrong call leaves the running
            accuracy below ``min_accuracy`` after more than 10 steps.
        """
        correct = action == self.data['d'].iloc[self.bar]
        reward = 1 if correct else 0
        self.treward += reward
        self.bar += 1
        self.accuracy = self.treward / (self.bar - self.osn)
        out_of_data = self.bar >= len(self.data)
        too_inaccurate = (reward == 0 and
                          self.accuracy < self.min_accuracy and
                          self.bar > self.osn + 10)
        done = out_of_data or too_inaccurate
        state = self._get_state()
        info = {}
        return state, reward, done, info

In [33]:
env = Finance('EUR=', 'r')

In [34]:
env.reset()

Out[34]:
array([-0.5125,  0.5603, -1.1434,  1.1788])
In [35]:
a = env.action_space.sample()
a

Out[35]:
1
In [36]:
env.step(a)

Out[36]:
(array([ 0.5603, -1.1434,  1.1788,  1.2569]), 1, False, {})

In [37]:
set_seeds(100)
agent = DQLAgent(lr=0.001, gamma=0.5, finish=2400)

In [38]:
episodes = 1000

In [39]:
agent.learn(episodes, max_iter=2600)

episode:  830/1000 | treward: 2511 | av: 2427.2 | max: 2511

In [40]:
agent.epsilon

Out[40]:
0.01583754189442009
In [41]:
agent.test(3, max_iter=2600)

episode:    3/3 | treward: 2511

Out[41]:
[2511, 2511, 2511]
In [42]:
# Moving average of the episode scores together with a cubic trend line.
x = range(len(agent.averages))
y = np.polyval(np.polyfit(x, agent.averages, deg=3), x)  # fitted cubic
plt.figure(figsize=(10, 6))
plt.plot(agent.averages, label='moving average')
plt.plot(x, y, 'r--', label='regression')
plt.xlabel('episodes')
plt.ylabel('total reward')
plt.legend();