by Dr. Yves J. Hilpisch
The Python Quants GmbH
Dagstuhl Castle, 18 June 2013
Continuum is about Python, Data, Analytics, Visualization.
The team consists of 40+ people.
Continuum builds the Python-based data analytics infrastructure of choice for financial, corporate and governmental institutions.
One of my recent Python-based projects:
The following is an implementation of the Least-Squares Monte Carlo (LSM) algorithm of Longstaff and Schwartz (2001): "Valuing American Options by Simulation: A Simple Least-Squares Approach." Review of Financial Studies, Vol. 14, 113-147. The algorithm values American options by Monte Carlo simulation, using ordinary least-squares regression to estimate continuation values.
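The simulation step in the code below uses the exact discretization of geometric Brownian motion under the risk-neutral measure. With short rate $r$, volatility $\sigma$ (vol in the code), time step $\Delta t$ (dt) and standard normal draws $z_t$, each index level is generated as

$$S_t = S_{t-\Delta t} \exp\left(\left(r - \frac{\sigma^2}{2}\right)\Delta t + \sigma\sqrt{\Delta t}\, z_t\right).$$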
def optionValue(S0, vol, T, K=40, M=50, I=4096, r=0.06):
    import numpy as np
    np.random.seed(150000)  # fix the seed for every valuation
    dt = T / M  # length of time interval
    df = np.exp(-r * dt)  # discount factor per time interval
    # Simulation of Index Levels
    S = np.zeros((M + 1, I), 'd')  # stock price matrix
    S[0, :] = S0  # initial values for stock price
    for t in range(1, M + 1):
        ran = np.random.standard_normal(I / 2)
        ran = np.concatenate((ran, -ran))  # antithetic variates
        ran = ran - np.mean(ran)  # correct first moment
        ran = ran / np.std(ran)  # correct second moment
        S[t, :] = S[t - 1, :] * np.exp((r - vol ** 2 / 2) * dt
                        + vol * ran * np.sqrt(dt))
    h = np.maximum(K - S, 0)  # inner values for put option
    V = np.zeros_like(h)  # value matrix
    V[-1] = h[-1]
    # Valuation by LSM
    for t in range(M - 1, 0, -1):
        rg = np.polyfit(S[t, :], V[t + 1, :] * df, 5)  # regression
        C = np.polyval(rg, S[t, :])  # evaluation of regression
        V[t, :] = np.where(h[t, :] > C, h[t, :],
                           V[t + 1, :] * df)  # exercise decision/optimization
    V0 = np.sum(V[1, :] * df) / I  # LSM estimator
    print "S0 %4.1f|vol %4.2f|T %2.1f| Option Value %8.3f" % (S0, vol, T, V0)
    return V0
We want to replicate Table 1 of the seminal paper in full.
def seqValue():
    optionValues = []
    for S0 in (36., 38., 40., 42., 44.):  # initial stock price values
        for vol in (0.2, 0.4):  # volatility values
            for T in (1.0, 2.0):  # times-to-maturity
                optionValues.append(optionValue(S0, vol, T))
    return optionValues
Now we measure the time needed to value the 20 different American put options of Table 1 with sequential execution.
from time import time
t0 = time()
optionValues = seqValue() # calculate all values
t1 = time(); d1 = t1 - t0
print "Duration in Seconds %6.3f" % d1
S0 36.0|vol 0.20|T 1.0| Option Value 4.500
S0 36.0|vol 0.20|T 2.0| Option Value 4.854
S0 36.0|vol 0.40|T 1.0| Option Value 7.112
S0 36.0|vol 0.40|T 2.0| Option Value 8.472
S0 38.0|vol 0.20|T 1.0| Option Value 3.262
S0 38.0|vol 0.20|T 2.0| Option Value 3.723
S0 38.0|vol 0.40|T 1.0| Option Value 6.112
S0 38.0|vol 0.40|T 2.0| Option Value 7.682
S0 40.0|vol 0.20|T 1.0| Option Value 2.305
S0 40.0|vol 0.20|T 2.0| Option Value 2.861
S0 40.0|vol 0.40|T 1.0| Option Value 5.258
S0 40.0|vol 0.40|T 2.0| Option Value 6.899
S0 42.0|vol 0.20|T 1.0| Option Value 1.569
S0 42.0|vol 0.20|T 2.0| Option Value 2.164
S0 42.0|vol 0.40|T 1.0| Option Value 4.544
S0 42.0|vol 0.40|T 2.0| Option Value 6.171
S0 44.0|vol 0.20|T 1.0| Option Value 1.020
S0 44.0|vol 0.20|T 2.0| Option Value 1.602
S0 44.0|vol 0.40|T 1.0| Option Value 3.912
S0 44.0|vol 0.40|T 2.0| Option Value 5.591
Duration in Seconds  2.673
First, start a local cluster if you have multiple cores on your machine.
# in the shell ...
####
# ipcluster start -n 8
####
# or fewer/more engines, depending on the number of cores available
Enable parallel computing capabilities.
from IPython.parallel import Client
cluster_profile = "default"
# the profile is set to the default one
# see IPython docs for further details
c = Client(profile=cluster_profile)
view = c.load_balanced_view()
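Note that optionValue imports numpy inside the function body; this way each engine has the required module available when the function is shipped to it for execution.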
Again, a loop over the 20 options of Table 1, this time distributed asynchronously to the engines of the cluster.
def parValue():
    optionValues = []
    for S in (36., 38., 40., 42., 44.):
        for vol in (0.2, 0.4):
            for T in (1.0, 2.0):
                value = view.apply_async(optionValue, S, vol, T)
                # asynchronously calculate the option values
                optionValues.append(value)
    return optionValues
Now, we measure the time needed with parallel execution.
def execution():
    optionValues = parValue()  # submit all valuation tasks
    print "Submitted tasks %d" % len(optionValues)
    c.wait(optionValues)  # wait for all tasks to finish
    return optionValues
t0 = time()
optionValues = execution()
t1 = time(); d2 = t1 - t0
print "Duration in Seconds %6.3f" % d2
Submitted tasks 20
Duration in Seconds  0.654
d1 / d2 # speed-up of parallel execution
4.084153618252551
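The measured speed-up of about 4 (with 8 engines started) suggests that scaling here is bounded by the number of physical cores available and by per-task overhead, not by the number of engines.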
Single values for the American put options can be directly accessed.
optionValues[0].result
4.4995294086976036
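Up to the rounding in the printed output, this agrees with the first value (4.500) of the sequential run.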
You can also inspect the metadata and access single data fields in a JSON-like manner.
optionValues[0].metadata
{'after': [],
 'completed': datetime.datetime(2013, 6, 18, 11, 24, 18, 801104),
 'data': {},
 'engine_id': 0,
 'engine_uuid': '7ab67f8d-1977-45a1-bc18-ecbb5c5ac028',
 'follow': [],
 'msg_id': '40682209-fa2a-412c-ae87-dd2bdb2c1c66',
 'outputs': [],
 'outputs_ready': True,
 'pyerr': None,
 'pyin': None,
 'pyout': None,
 'received': datetime.datetime(2013, 6, 18, 11, 24, 18, 805855),
 'started': datetime.datetime(2013, 6, 18, 11, 24, 18, 398935),
 'status': 'ok',
 'stderr': '',
 'stdout': 'S0 36.0|vol 0.20|T 1.0| Option Value 4.500\n',
 'submitted': datetime.datetime(2013, 6, 18, 11, 24, 18, 394793)}
The whole output of the 20 valuations.
for result in optionValues:
    print result.metadata['stdout'],
S0 36.0|vol 0.20|T 1.0| Option Value 4.500
S0 36.0|vol 0.20|T 2.0| Option Value 4.854
S0 36.0|vol 0.40|T 1.0| Option Value 7.112
S0 36.0|vol 0.40|T 2.0| Option Value 8.472
S0 38.0|vol 0.20|T 1.0| Option Value 3.262
S0 38.0|vol 0.20|T 2.0| Option Value 3.723
S0 38.0|vol 0.40|T 1.0| Option Value 6.112
S0 38.0|vol 0.40|T 2.0| Option Value 7.682
S0 40.0|vol 0.20|T 1.0| Option Value 2.305
S0 40.0|vol 0.20|T 2.0| Option Value 2.861
S0 40.0|vol 0.40|T 1.0| Option Value 5.258
S0 40.0|vol 0.40|T 2.0| Option Value 6.899
S0 42.0|vol 0.20|T 1.0| Option Value 1.569
S0 42.0|vol 0.20|T 2.0| Option Value 2.164
S0 42.0|vol 0.40|T 1.0| Option Value 4.544
S0 42.0|vol 0.40|T 2.0| Option Value 6.171
S0 44.0|vol 0.20|T 1.0| Option Value 1.020
S0 44.0|vol 0.20|T 2.0| Option Value 1.602
S0 44.0|vol 0.40|T 1.0| Option Value 3.912
S0 44.0|vol 0.40|T 2.0| Option Value 5.591
This example illustrates the collection and analysis first of daily and then of high-frequency stock price data, both popular topics in today's financial markets. It uses the powerful and convenient pandas library.
import numpy as np  # needed for np.log below
import pandas as pd
import pandas.io.data as pdd
AAPL = pdd.DataReader('AAPL', data_source='yahoo', start='1/1/1984')  # retrieve stock prices since IPO
AAPL
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7256 entries, 1984-09-07 00:00:00 to 2013-06-17 00:00:00
Data columns:
Open         7256 non-null values
High         7256 non-null values
Low          7256 non-null values
Close        7256 non-null values
Volume       7256 non-null values
Adj Close    7256 non-null values
dtypes: float64(5), int64(1)
AAPL.head()
| Date       | Open  | High  | Low   | Close | Volume  | Adj Close |
|------------|-------|-------|-------|-------|---------|-----------|
| 1984-09-07 | 26.50 | 26.87 | 26.25 | 26.50 | 2981600 | 2.96 |
| 1984-09-10 | 26.50 | 26.62 | 25.87 | 26.37 | 2346400 | 2.95 |
| 1984-09-11 | 26.62 | 27.37 | 26.62 | 26.87 | 5444000 | 3.00 |
| 1984-09-12 | 26.87 | 27.00 | 26.12 | 26.12 | 4773600 | 2.92 |
| 1984-09-13 | 27.50 | 27.62 | 27.50 | 27.50 | 7429600 | 3.07 |
AAPL['Close'].plot()
<matplotlib.axes.AxesSubplot at 0x3b6cf10>
AAPL['log_ret'] = np.log(AAPL['Close'] / AAPL['Close'].shift(1))  # daily log returns
AAPL.head()
| Date       | Open  | High  | Low   | Close | Volume  | Adj Close | log_ret   |
|------------|-------|-------|-------|-------|---------|-----------|-----------|
| 1984-09-07 | 26.50 | 26.87 | 26.25 | 26.50 | 2981600 | 2.96 | NaN |
| 1984-09-10 | 26.50 | 26.62 | 25.87 | 26.37 | 2346400 | 2.95 | -0.004918 |
| 1984-09-11 | 26.62 | 27.37 | 26.62 | 26.87 | 5444000 | 3.00 | 0.018783 |
| 1984-09-12 | 26.87 | 27.00 | 26.12 | 26.12 | 4773600 | 2.92 | -0.028309 |
| 1984-09-13 | 27.50 | 27.62 | 27.50 | 27.50 | 7429600 | 3.07 | 0.051485 |
AAPL['log_ret'].describe()
count    7255.000000
mean        0.000385
std         0.033125
min        -0.731247
25%        -0.014855
50%         0.000000
75%         0.015923
max         0.286796
AAPL['log_ret'].hist(bins=40)
<matplotlib.axes.AxesSubplot at 0x3c0ddd0>
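From the daily log returns one can estimate the annualized historical volatility; a minimal sketch, assuming the usual convention of 252 trading days per year:

vol = AAPL['log_ret'].std() * np.sqrt(252)  # scale daily standard deviation to annual terms
print "Annualized historical volatility %6.3f" % vol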
Using pandas, the following reads intraday, high-frequency data from a Web source, plots it, and resamples it.
url = 'http://hopey.netfonds.no/posdump.php?date=20130614&paper=AAPL.O&csv_format=csv'
# you may have to adjust the date since only recent dates are available
AAPL = pd.read_csv(url, index_col=0, header=0, parse_dates=True)
AAPL
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12533 entries, 2013-06-14 01:16:55 to 2013-06-14 23:19:32
Data columns:
bid                  12533 non-null values
bid_depth            12533 non-null values
bid_depth_total      12533 non-null values
offer                12533 non-null values
offer_depth          12533 non-null values
offer_depth_total    12533 non-null values
dtypes: float64(2), int64(4)
AAPL['bid'].plot()
<matplotlib.axes.AxesSubplot at 0x41fc2d0>
import datetime
AAPL = AAPL[AAPL.index > datetime.datetime(2013, 6, 14, 12, 0, 0)]  # taking a subset of the data
# this resamples the record frequency to 10 minutes, using mean as aggregation rule
AAPL_10min = AAPL.resample(rule='10min', how='mean')
AAPL_10min.head()
| time                | bid        | bid_depth | bid_depth_total | offer      | offer_depth | offer_depth_total |
|---------------------|------------|-----------|-----------------|------------|-------------|-------------------|
| 2013-06-14 12:10:00 | 435.300000 | 100 | 100 | 435.740000 | 200.000000 | 200.000000 |
| 2013-06-14 12:20:00 | 435.250000 | 100 | 100 | 435.860000 | 100.000000 | 100.000000 |
| 2013-06-14 12:30:00 | 435.200000 | 100 | 100 | 435.860000 | 100.000000 | 100.000000 |
| 2013-06-14 12:40:00 | 435.216667 | 100 | 100 | 435.860000 | 133.333333 | 133.333333 |
| 2013-06-14 12:50:00 | 434.955000 | 200 | 200 | 435.774167 | 191.666667 | 191.666667 |
AAPL_10min['bid'].plot()
<matplotlib.axes.AxesSubplot at 0x4523d10>
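A natural follow-up is to derive the mid quote and the bid-offer spread from the resampled data; a minimal sketch, where 'mid' and 'spread' are illustrative new column names:

AAPL_10min['mid'] = (AAPL_10min['bid'] + AAPL_10min['offer']) / 2  # mid quote
AAPL_10min['spread'] = AAPL_10min['offer'] - AAPL_10min['bid']  # bid-offer spread
AAPL_10min[['mid', 'spread']].head()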
# this function can be applied by management to reverse fortune ...
def f(x):
    return 432 - x + 432  # mirrors the price path around the 432 level
AAPL_10min['bid'].apply(f).plot()
<matplotlib.axes.AxesSubplot at 0x45449d0>
This example illustrates typical data analytics tasks based on the PyTables library, which rests on the HDF5 database standard.
import datetime
import numpy as np
import tables as tb
h5 = tb.open_file('/home/yhilpisch/Temp/example.h5', 'w')
row_des = {
    'DATE': tb.StringCol(26, pos=1),
    'NO1': tb.Float64Col(pos=2),
    'NO2': tb.Float64Col(pos=3),
    'NO3': tb.Float64Col(pos=4),
    'NO4': tb.Float64Col(pos=5),
}
filters = tb.Filters(complevel=2)  # zlib compression, compression level 2
rows = 10000000
tab = h5.create_table('/', 'dummy_data', title='table', description=row_des,
                      expectedrows=rows, filters=filters)
# populating the table with dummy data
r = tab.row
def genTab():
    for i in xrange(rows):  # xrange avoids building a 10M element list in Python 2
        rand = np.random.standard_normal(4)
        r['DATE'] = datetime.datetime.now()
        r['NO1'], r['NO2'], r['NO3'], r['NO4'] = rand
        r.append()
    tab.flush()
%time genTab()
tab
CPU times: user 1min 40s, sys: 11.4 s, total: 1min 52s
Wall time: 1min 54s
/dummy_data (Table(10000000,), shuffle, zlib(2)) 'table'
  description := {
  "DATE": StringCol(itemsize=26, shape=(), dflt='', pos=0),
  "NO1": Float64Col(shape=(), dflt=0.0, pos=1),
  "NO2": Float64Col(shape=(), dflt=0.0, pos=2),
  "NO3": Float64Col(shape=(), dflt=0.0, pos=3),
  "NO4": Float64Col(shape=(), dflt=0.0, pos=4)}
  byteorder := 'little'
  chunkshape := (4519,)
# looking into the table
tab[:3]
array([('2013-06-18 11:24:28.272186', 1.1203410743749882, -0.8352379153375216, -1.2919781660873217, -1.6348680542132725),
       ('2013-06-18 11:24:28.272277', -0.5462648679055429, 0.8278247279360921, -0.6459374792177821, 1.2654821119198476),
       ('2013-06-18 11:24:28.272299', -1.388790769970219, 0.4573883550857721, 0.578149246726677, 0.04427853797707416)],
      dtype=[('DATE', '|S26'), ('NO1', '<f8'), ('NO2', '<f8'), ('NO3', '<f8'), ('NO4', '<f8')])
# histogram of a column's values
# (hist and grid are available here via IPython's %pylab inline mode)
hist(tab.cols.NO1[::100], bins=30)
grid(True)
# a SQL-like query for data from two columns
%time res = np.array([(row['NO1'], row['NO2']) for row in tab.where('((NO1 < -0.5) | (NO1 > 0.5)) & ((NO2 < -1) | (NO2 > 1))')])
CPU times: user 14.5 s, sys: 144 ms, total: 14.6 s
Wall time: 14.2 s
res = res[::100]  # keep only every 100th record
res[:5]
array([[-0.92639639,  1.76924943],
       [-0.86429104,  1.99071732],
       [ 0.79151535,  2.18724794],
       [ 0.64515076,  3.6376278 ],
       [-0.77100211,  2.84507245]])
plot(res.T[0], res.T[1], 'ro')
grid(True)
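Such where queries can often be accelerated by indexing the queried columns; a minimal sketch, assuming the PyTables 3.x API:

tab.cols.NO1.create_csindex()  # completely sorted index on NO1
tab.cols.NO2.create_csindex()  # ... and on NO2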
This example illustrates in-memory data analytics with pandas -- based on the previously generated HDF5 database.
import pandas as pd
%time data = pd.DataFrame(tab.read())
data.index = data['DATE']
del data['DATE']
h5.close()
CPU times: user 6.34 s, sys: 512 ms, total: 6.85 s
Wall time: 6.86 s
data
<class 'pandas.core.frame.DataFrame'>
Index: 10000000 entries, 2013-06-18 11:24:28.272186 to 2013-06-18 11:26:22.263551
Data columns:
NO1    10000000 non-null values
NO2    10000000 non-null values
NO3    10000000 non-null values
NO4    10000000 non-null values
dtypes: float64(4)
%time data.hist(grid=True, bins=30)
CPU times: user 5.45 s, sys: 248 ms, total: 5.7 s
Wall time: 5.7 s
array([[Axes(0.125,0.563043;0.336957x0.336957),
        Axes(0.563043,0.563043;0.336957x0.336957)],
       [Axes(0.125,0.125;0.336957x0.336957),
        Axes(0.563043,0.125;0.336957x0.336957)]], dtype=object)
%time res = data[['NO1', 'NO2']][(data.NO1 < -0.5) & (data.NO2 < -1)]
CPU times: user 212 ms, sys: 60 ms, total: 272 ms
Wall time: 272 ms
plot(res.NO1, res.NO2, 'ro')
grid(True)
data['SIGN'] = np.array(data['NO1']).round(0)  # generates a new column with the rounded NO1 values
data.head()
| DATE                       | NO1       | NO2       | NO3       | NO4       | SIGN |
|----------------------------|-----------|-----------|-----------|-----------|------|
| 2013-06-18 11:24:28.272186 | 1.120341  | -0.835238 | -1.291978 | -1.634868 | 1  |
| 2013-06-18 11:24:28.272277 | -0.546265 | 0.827825  | -0.645937 | 1.265482  | -1 |
| 2013-06-18 11:24:28.272299 | -1.388791 | 0.457388  | 0.578149  | 0.044279  | -1 |
| 2013-06-18 11:24:28.272310 | -0.926396 | 1.769249  | -1.341341 | 0.902475  | -1 |
| 2013-06-18 11:24:28.272320 | -0.446076 | -1.578122 | -0.815648 | 0.129584  | -0 |
%time data.groupby(['SIGN']).count()
CPU times: user 1.74 s, sys: 736 ms, total: 2.48 s
Wall time: 2.48 s
| SIGN | NO1     | NO2     | NO3     | NO4     | SIGN    |
|------|---------|---------|---------|---------|---------|
| -6 | 1 | 1 | 1 | 1 | 1 |
| -5 | 25 | 25 | 25 | 25 | 25 |
| -4 | 2328 | 2328 | 2328 | 2328 | 2328 |
| -3 | 59545 | 59545 | 59545 | 59545 | 59545 |
| -2 | 604778 | 604778 | 604778 | 604778 | 604778 |
| -1 | 2417467 | 2417467 | 2417467 | 2417467 | 2417467 |
| -0 | 3832749 | 3832749 | 3832749 | 3832749 | 3832749 |
| 1 | 2417036 | 2417036 | 2417036 | 2417036 | 2417036 |
| 2 | 604321 | 604321 | 604321 | 604321 | 604321 |
| 3 | 59414 | 59414 | 59414 | 59414 | 59414 |
| 4 | 2304 | 2304 | 2304 | 2304 | 2304 |
| 5 | 32 | 32 | 32 | 32 | 32 |
data.groupby(['SIGN']).mean()
| SIGN | NO1       | NO2       | NO3       | NO4       |
|------|-----------|-----------|-----------|-----------|
| -6 | -5.586834 | 0.057630 | 0.671113 | -1.092833 |
| -5 | -4.658663 | 0.078163 | -0.161723 | -0.246599 |
| -4 | -3.737356 | 0.009310 | 0.002855 | -0.017930 |
| -3 | -2.786637 | -0.003414 | -0.002366 | 0.008004 |
| -2 | -1.847916 | 0.002868 | 0.001307 | -0.001836 |
| -1 | -0.920871 | -0.001191 | 0.000869 | -0.001073 |
| -0 | -0.000151 | 0.000185 | 0.000875 | 0.000917 |
| 1 | 0.920679 | -0.001195 | 0.000605 | 0.001957 |
| 2 | 1.848316 | -0.001816 | 0.000322 | 0.000366 |
| 3 | 2.786931 | 0.003385 | 0.001877 | -0.004025 |
| 4 | 3.735813 | -0.009022 | 0.011115 | 0.006007 |
| 5 | 4.720344 | 0.053851 | -0.102913 | 0.162162 |
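As expected, the group means of NO1 lie close to the respective SIGN buckets, while the means of the other columns remain close to zero.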
data.boxplot(column=['NO1'], by=['SIGN'])
<matplotlib.axes.AxesSubplot at 0x522b150>
The Python Quants GmbH – the company Web site
Dr. Yves J. Hilpisch – my personal Web site
Derivatives Analytics with Python – my new book
Read an Excerpt and Order the Book
Contact Us
# Autosave function for the IPython Notebook
def autosave(interval=3):
    """Autosave the notebook every interval (in minutes)."""
    from IPython.core.display import Javascript
    interval *= 60 * 1000  # JS wants intervals in milliseconds
    tpl = 'setInterval ( "IPython.notebook.save_notebook()", %i );'
    return Javascript(tpl % interval)
autosave()
<IPython.core.display.Javascript at 0x530be90>