Reinforcement Learning: From Playing Games to Trading Stocks

Dr Yves J Hilpisch | The AI Machine

http://aimachine.io | http://twitter.com/dyjh

General Imports

In [1]:
import os
import math
import time
import random
import numpy as np
import pandas as pd
from pylab import plt
from IPython import display
plt.style.use('seaborn')
np.set_printoptions(precision=4, suppress=True)
os.environ['PYTHONHASHSEED'] = '0'
In [2]:
import warnings; warnings.simplefilter('ignore')

OpenAI Environment

In [3]:
import gym
In [4]:
env = gym.make('CartPole-v0')
In [5]:
env.observation_space
Out[5]:
Box(4,)
In [6]:
env.observation_space.low.astype(np.float16)
Out[6]:
array([-4.8  ,   -inf, -0.419,   -inf], dtype=float16)
In [7]:
env.observation_space.high.astype(np.float16)
Out[7]:
array([4.8  ,   inf, 0.419,   inf], dtype=float16)
In [8]:
state = env.reset()
In [9]:
state # [cart position, cart velocity, pole angle, pole angular velocity]
Out[9]:
array([ 0.0096, -0.0041,  0.0018, -0.0279])
In [10]:
env.action_space
Out[10]:
Discrete(2)
In [11]:
env.action_space.n
Out[11]:
2
In [12]:
env.action_space.sample()
Out[12]:
0
In [13]:
env.action_space.sample() 
Out[13]:
0
In [14]:
a = env.action_space.sample()
a
Out[14]:
1
In [15]:
state, reward, done, info = env.step(a)
state, reward, done, info
Out[15]:
(array([ 0.0095,  0.191 ,  0.0012, -0.32  ]), 1.0, False, {})
In [16]:
env.reset()
for e in range(1, 200):
    a = env.action_space.sample()
    state, reward, done, info = env.step(a)  # take the random action
    print(f'step={e:2d} | state={state} | action={a} | reward={reward}')
    if done and (e + 1) < 200:
        print('*** FAILED ***')
        break
step= 1 | state=[-0.0021 -0.1613 -0.0191  0.2619] | action=0 | reward=1.0
step= 2 | state=[-0.0053  0.0341 -0.0138 -0.0367] | action=1 | reward=1.0
step= 3 | state=[-0.0046 -0.1608 -0.0146  0.2516] | action=0 | reward=1.0
step= 4 | state=[-0.0078  0.0345 -0.0095 -0.0457] | action=1 | reward=1.0
step= 5 | state=[-0.0071  0.2298 -0.0105 -0.3414] | action=1 | reward=1.0
step= 6 | state=[-0.0025  0.425  -0.0173 -0.6373] | action=1 | reward=1.0
step= 7 | state=[ 0.006   0.2302 -0.03   -0.3501] | action=0 | reward=1.0
step= 8 | state=[ 0.0106  0.4257 -0.037  -0.6521] | action=1 | reward=1.0
step= 9 | state=[ 0.0191  0.2311 -0.0501 -0.3713] | action=0 | reward=1.0
step=10 | state=[ 0.0237  0.0367 -0.0575 -0.0949] | action=0 | reward=1.0
step=11 | state=[ 0.0244  0.2326 -0.0594 -0.4051] | action=1 | reward=1.0
step=12 | state=[ 0.0291  0.4285 -0.0675 -0.7159] | action=1 | reward=1.0
step=13 | state=[ 0.0376  0.2344 -0.0818 -0.4452] | action=0 | reward=1.0
step=14 | state=[ 0.0423  0.0405 -0.0907 -0.1794] | action=0 | reward=1.0
step=15 | state=[ 0.0431 -0.1532 -0.0943  0.0833] | action=0 | reward=1.0
step=16 | state=[ 0.0401  0.0432 -0.0926 -0.2376] | action=1 | reward=1.0
step=17 | state=[ 0.0409 -0.1505 -0.0974  0.0245] | action=0 | reward=1.0
step=18 | state=[ 0.0379 -0.3441 -0.0969  0.285 ] | action=0 | reward=1.0
step=19 | state=[ 0.0311 -0.5377 -0.0912  0.5456] | action=0 | reward=1.0
step=20 | state=[ 0.0203 -0.3415 -0.0803  0.2256] | action=1 | reward=1.0
step=21 | state=[ 0.0135 -0.5354 -0.0758  0.4919] | action=0 | reward=1.0
step=22 | state=[ 0.0028 -0.7293 -0.0659  0.7598] | action=0 | reward=1.0
step=23 | state=[-0.0118 -0.9235 -0.0508  1.031 ] | action=0 | reward=1.0
step=24 | state=[-0.0303 -0.7277 -0.0301  0.7228] | action=1 | reward=1.0
step=25 | state=[-0.0448 -0.9224 -0.0157  1.0059] | action=0 | reward=1.0
step=26 | state=[-0.0633 -1.1173  0.0044  1.2936] | action=0 | reward=1.0
step=27 | state=[-0.0856 -1.3125  0.0303  1.5877] | action=0 | reward=1.0
step=28 | state=[-0.1119 -1.508   0.0621  1.8897] | action=0 | reward=1.0
step=29 | state=[-0.1421 -1.7037  0.0999  2.2009] | action=0 | reward=1.0
step=30 | state=[-0.1761 -1.5097  0.1439  1.9407] | action=1 | reward=1.0
step=31 | state=[-0.2063 -1.706   0.1827  2.2743] | action=0 | reward=1.0
step=32 | state=[-0.2404 -1.9023  0.2282  2.6172] | action=0 | reward=1.0
*** FAILED ***
In [17]:
done
Out[17]:
True
In [18]:
env.reset()
img = plt.imshow(env.render(mode='rgb_array'))  # initialize the rendering bitmap
for e in range(100):
    img.set_data(env.render(mode='rgb_array'))  # update the bitmap with the current frame
    display.display(plt.gcf())
    display.clear_output(wait=True)
    a = env.action_space.sample()  # choose a random action
    obs, rew, done, _ = env.step(a)  # take the action
    if done and (e + 1) < 200:
        print('*** FAILED ***')
        break
*** FAILED ***
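
For reference, the following sketch (not part of the original notebook) estimates the average total reward that a purely random policy achieves on CartPole-v0; the DQL agent trained below should clearly beat this baseline.

trewards = []
for episode in range(100):
    env.reset()
    treward = 0
    for step in range(200):
        state, reward, done, info = env.step(env.action_space.sample())
        treward += reward
        if done:
            break
    trewards.append(treward)
sum(trewards) / len(trewards)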

Keras and Seeds

In [19]:
import tensorflow as tf
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score
Using TensorFlow backend.
In [20]:
def set_seeds(seed=100):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    env.seed(seed)

DQL Agent

In [21]:
from collections import deque
In [22]:
class DQLAgent:
    def __init__(self, gamma=0.95, lr=0.001, finish=1e10):
        self.finish = finish
        self.epsilon = 1.0  # initial exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.gamma = gamma  # discount factor
        self.batch_size = 32
        self.lr = lr  # learning rate of the Adam optimizer
        self.max_treward = 0
        self.averages = list()
        self.memory = deque(maxlen=2000)  # replay buffer of past transitions
        self.osn = env.observation_space.shape[0]  # relies on the global env object
        self.model = self._build_model()
        
    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.osn,
                        activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(env.action_space.n, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.lr))
        return model
        
    def act(self, state):
        # epsilon-greedy policy: explore with probability epsilon,
        # otherwise take the action with the highest estimated Q value
        if random.random() <= self.epsilon:
            return env.action_space.sample()
        action = self.model.predict(state)[0]
        return np.argmax(action)
    
    def replay(self):
        # experience replay: retrain on a random mini-batch of stored transitions
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            if not done:
                # one-step Q-learning target: r + gamma * max_a' Q(s', a')
                reward += self.gamma * np.amax(
                    self.model.predict(next_state)[0])
            target = self.model.predict(state)
            target[0, action] = reward
            self.model.fit(state, target, epochs=1,
                           verbose=False)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    
    def learn(self, episodes, max_iter=200):
        trewards = []
        for e in range(1, episodes + 1):
            state = env.reset()
            state = np.reshape(state, [1, self.osn])
            for _ in range(max_iter):
                action = self.act(state)
                next_state, reward, done, info = env.step(action)
                next_state = np.reshape(next_state,
                                        [1, self.osn])
                self.memory.append([state, action, reward,
                                     next_state, done])
                state = next_state
                if done:
                    treward = _ + 1
                    trewards.append(treward)
                    av = sum(trewards[-25:]) / 25  # trailing 25-episode average
                    self.averages.append(av)
                    self.max_treward = max(self.max_treward, treward)
                    templ = 'episode: {:4d}/{} | treward: {:4d} | '
                    templ += 'av: {:6.1f} | max: {:4d}'
                    print(templ.format(e, episodes, treward, av,
                                       self.max_treward), end='\r')
                    break
            if av > self.finish:
                break
            if len(self.memory) > self.batch_size:
                self.replay()

    def test(self, episodes, max_iter=200):
        trewards = []
        for e in range(1, episodes + 1):
            state = env.reset()
            for _ in range(max_iter):
                state = np.reshape(state, [1, self.osn])
                action = np.argmax(self.model.predict(state)[0])
                next_state, reward, done, info = env.step(action)
                state = next_state
                if done:
                    treward = _ + 1
                    trewards.append(treward)
                    print('episode: {:4d}/{} | treward: {:4d}'
                          .format(e, episodes, treward), end='\r')
                    time.sleep(0.05)
                    break
        return trewards
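
The replay() method above trains the network toward the standard one-step Q-learning target: for a stored transition $(s, a, r, s')$ that does not end the episode, the action value estimate is pulled toward

$$Q(s, a) \leftarrow r + \gamma \max_{a'} Q(s', a'),$$

while for a terminal transition the target is simply the reward $r$. The discount factor gamma and the exploration rate epsilon (decayed multiplicatively after every replay) are the agent's main hyperparameters.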
In [23]:
set_seeds(100)
agent = DQLAgent(lr=0.001, finish=195)
In [24]:
episodes = 1000
In [25]:
agent.learn(episodes)
episode:  467/1000 | treward:  200 | av:  195.3 | max:  200
In [26]:
agent.epsilon
Out[26]:
0.09770335251664321
In [27]:
plt.figure(figsize=(10, 6))
x = range(len(agent.averages))
y = np.polyval(np.polyfit(x, agent.averages, deg=3), x)
plt.plot(agent.averages, label='moving average')
plt.plot(x, y, 'r--', label='regression')
plt.xlabel('episodes')
plt.ylabel('total reward')
plt.legend();
In [28]:
trewards = agent.test(100)
episode:  100/100 | treward:  200
In [29]:
sum(trewards) / len(trewards)
Out[29]:
200.0
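
A minimal sketch (not part of the original notebook) for persisting the trained policy network via the standard Keras model API; the file name is illustrative only.

from keras.models import load_model
agent.model.save('dqlagent_cartpole.h5')  # save architecture, weights and optimizer state
reloaded_model = load_model('dqlagent_cartpole.h5')  # reload for later testing or further training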

Finance Environment

In [30]:
class observation_space:
    def __init__(self, n):
        self.shape = (n,)
In [31]:
class action_space:
    def __init__(self, n):
        self.n = n
    def sample(self):
        return random.randint(0, self.n - 1)
In [32]:
class Finance:
    url = 'http://hilpisch.com/aiif_eikon_eod_data.csv'
    def __init__(self, symbol, features):
        self.symbol = symbol
        self.features = features
        self.observation_space = observation_space(4)
        self.osn = self.observation_space.shape[0]
        self.action_space = action_space(2)
        self.min_accuracy = 0.5
        self._get_data()
        self._prepare_data()
    def _get_data(self):
        self.raw = pd.read_csv(self.url, index_col=0,
                               parse_dates=True).dropna()
    def _prepare_data(self):
        self.data = pd.DataFrame(self.raw[self.symbol])
        # log returns of the price series
        self.data['r'] = np.log(self.data / self.data.shift(1))
        self.data.dropna(inplace=True)
        # Gaussian normalization of the data
        self.data = (self.data - self.data.mean()) / self.data.std()
        # binary direction label based on the sign of the normalized return
        self.data['d'] = np.where(self.data['r'] > 0, 1, 0)
    def _get_state(self):
        # state = the osn most recent values of the feature column(s)
        return self.data[self.features].iloc[
            self.bar - self.osn:self.bar].values
    def seed(self, seed=None):
        pass
    def reset(self):
        self.treward = 0
        self.accuracy = 0
        self.bar = self.osn
        state = self.data[self.features].iloc[
            self.bar - self.osn:self.bar]
        return state.values
    def step(self, action):
        # reward of 1 if the action matches the direction label, 0 otherwise
        correct = action == self.data['d'].iloc[self.bar]
        reward = 1 if correct else 0
        self.treward += reward
        self.bar += 1
        self.accuracy = self.treward / (self.bar - self.osn)
        if self.bar >= len(self.data):
            # end of the data set reached
            done = True
        elif reward == 1:
            done = False
        elif (self.accuracy < self.min_accuracy and
              self.bar > self.osn + 10):
            # early termination once the accuracy drops below the
            # minimum after a short burn-in period
            done = True
        else:
            done = False
        state = self._get_state()
        info = {}
        return state, reward, done, info
In [33]:
env = Finance('EUR=', 'r')
In [34]:
env.reset()
Out[34]:
array([-0.5125,  0.5603, -1.1434,  1.1788])
In [35]:
a = env.action_space.sample()
a
Out[35]:
1
In [36]:
env.step(a)
Out[36]:
(array([ 0.5603, -1.1434,  1.1788,  1.2569]), 1, False, {})
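
As a quick sanity check, the following sketch (not part of the original notebook) runs a purely random policy on the Finance environment; due to the min_accuracy rule in step(), such an episode typically terminates early once the hit ratio falls below 50%.

state = env.reset()
treward = 0
for _ in range(len(env.data)):
    state, reward, done, info = env.step(env.action_space.sample())
    treward += reward
    if done:
        break
treward, round(env.accuracy, 4)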

Trading Bot

In [37]:
set_seeds(100)
agent = DQLAgent(lr=0.001, gamma=0.5, finish=2400)
In [38]:
episodes = 1000
In [39]:
agent.learn(episodes, max_iter=2600)
episode:  830/1000 | treward: 2511 | av: 2427.2 | max: 2511
In [40]:
agent.epsilon
Out[40]:
0.01583754189442009
In [41]:
agent.test(3, max_iter=2600)
episode:    3/3 | treward: 2511
Out[41]:
[2511, 2511, 2511]
In [42]:
plt.figure(figsize=(10, 6))
x = range(len(agent.averages))
y = np.polyval(np.polyfit(x, agent.averages, deg=3), x)
plt.plot(agent.averages, label='moving average')
plt.plot(x, y, 'r--', label='regression')
plt.xlabel('episodes')
plt.ylabel('total reward')
plt.legend();