Bradlee Speice, Sat 26 December 2015, Blog
import requests
import pandas as pd
import numpy as np
from dateutil import parser as dtparser
from dateutil.relativedelta import relativedelta
from datetime import datetime
from html.parser import HTMLParser
from copy import copy
import Quandl
As a graduate student studying Financial Engineering, I've become a fan of the Mad Money TV show featuring the bombastic Jim Cramer. One of the things he's said is that you shouldn't use the futures to predict where the stock market is going. But he says it often enough that I've begun to wonder: who is he trying to convince?
It makes sense that because futures on things like the S&P 500 are traded continuously, they would price in market information before the stock market opens. So is Cramer right to be convinced that strategies based on the futures are a poor idea? I wanted to test it out.
The first question is where to get the futures data. I've been a member of Seeking Alpha for a while, and they publish the Wall Street Breakfast newsletter, which contains daily futures returns as of 6:20 AM EST. I'd be interested in using that data to see if we can actually make some money.
First though, let's get the data:
We're going to define two HTML parsing classes: one to get the article URLs from a listing page, and one to get the actual data from each article.
class ArticleListParser(HTMLParser):
    """Given a web page with articles on it, parse out the article links"""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # An instance attribute (not class-level), so each parser
        # starts with a fresh list instead of sharing one
        self.articles = []
    def handle_starttag(self, tag, attrs):
        if tag == 'a' and ('class', 'dashboard_article_link') in attrs:
            href = list(filter(lambda x: x[0] == 'href', attrs))[0][1]
            self.articles.append(href)
base_url = "http://seekingalpha.com/author/wall-street-breakfast/articles"
article_page_urls = [base_url] + [base_url + '/{}'.format(i) for i in range(2, 20)]
global_articles = []
for page in article_page_urls:
# We need to switch the user agent, as SA blocks the standard requests agent
articles_html = requests.get(page,
headers={"User-Agent": "Wget/1.13.4"})
parser = ArticleListParser()
parser.feed(articles_html.text)
    global_articles += parser.articles
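Before parsing each article, a quick sanity check (my addition; the exact numbers depend on what Seeking Alpha serves up) that the scrape collected a reasonable set of unique links:

print(len(global_articles), "links scraped;", len(set(global_articles)), "unique")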
class ArticleReturnParser(HTMLParser):
"Given an article, parse out the futures returns in it"
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Instance attributes so each article is parsed with fresh state
        self.record_font_tags = False
        self.in_font_tag = False
        self.counter = 0
        self.data = {}
    def handle_starttag(self, tag, attrs):
        if tag == 'span' and ('itemprop', 'datePublished') in attrs:
            date_string = list(filter(lambda x: x[0] == 'content', attrs))[0][1]
            date = dtparser.parse(date_string)
            self.data['date'] = date
        # True only while the most recent start tag is a <font> tag
        self.in_font_tag = tag == 'font'
    def safe_float(self, string):
        # Convert a percentage string like "0.32%" into a decimal return
        try:
            return float(string[:-1]) / 100
        except ValueError:
            return np.NaN
    def handle_data(self, content):
        # The "Futures at 6:20" line marks the start of the data we want
        if not self.record_font_tags and "Futures at 6" in content:
            self.record_font_tags = True
        # The returns then show up in <font> tags in a fixed order:
        # DOW, S&P, NASDAQ, Crude, Gold
        if self.record_font_tags and self.in_font_tag:
if self.counter == 0:
self.data['DOW'] = self.safe_float(content)
elif self.counter == 1:
self.data['S&P'] = self.safe_float(content)
elif self.counter == 2:
self.data['NASDAQ'] = self.safe_float(content)
elif self.counter == 3:
self.data['Crude'] = self.safe_float(content)
elif self.counter == 4:
self.data['Gold'] = self.safe_float(content)
self.counter += 1
def handle_endtag(self, tag):
self.in_font_tag = False
def retrieve_data(url):
sa = "http://seekingalpha.com"
article_html = requests.get(sa + url,
headers={"User-Agent": "Wget/1.13.4"})
parser = ArticleReturnParser()
parser.feed(article_html.text)
parser.data.update({"url": url})
parser.data.update({"text": article_html.text})
return parser.data
# This copy **MUST** be in place. I'm not sure why,
# as you'd think that the data being returned would already
# represent a different memory location. Even so, it blows up
# if you don't do this.
article_list = list(set(global_articles))
article_data = [copy(retrieve_data(url)) for url in article_list]
# If there's an issue downloading the article, drop it.
article_df = pd.DataFrame.from_dict(article_data).dropna()
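As a quick sanity check (my addition), the resulting frame should have one row per successfully-parsed article, with the five futures returns plus the date, raw text, and URL as columns:

# Expected columns: Crude, DOW, Gold, NASDAQ, S&P, date, text, url
print(article_df.columns.tolist())
print(len(article_df), "articles with complete data")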
Now that we have the futures data, we're going to compare it against four different indices: the S&P 500, the Dow Jones Industrial Average, the Russell 2000, and the NASDAQ 100 (using the SPY, DIA, IWM, and QQQ ETFs as proxies). Let's get the data from Quandl to make things easier!
# Sort the articles by date and use the earliest (less a day) as our start
start_date = article_df.sort_values(by='date').iloc[0]['date'] - relativedelta(days=1)
SPY = Quandl.get("GOOG/NYSE_SPY", trim_start=start_date)
DJIA = Quandl.get("GOOG/AMS_DIA", trim_start=start_date)
RUSS = Quandl.get("GOOG/AMEX_IWM", trim_start=start_date)
NASDAQ = Quandl.get("GOOG/EPA_QQQ", trim_start=start_date)
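One caveat: anonymous Quandl requests are rate-limited. If you run into the limit, the (pre-3.0) Quandl client accepts an API key via its authtoken parameter; a sketch with a placeholder key:

# Hypothetical - substitute your own Quandl API key:
# SPY = Quandl.get("GOOG/NYSE_SPY", trim_start=start_date, authtoken="your-key-here")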
There are two tests I want to run: how accurate each futures category is at predicting the index's opening change over the previous close, and how accurate it is at predicting the index's return from open to close.
Let's first calculate how good each future is at predicting the opening return over the previous day's close. I expect the futures to be more than 50% accurate, since this information is recorded about three hours before the markets open.
def calculate_opening_ret(frame):
# I'm not a huge fan of the appending for loop,
# but it's a bit verbose for a comprehension
data = {}
for i in range(1, len(frame)):
date = frame.iloc[i].name
prior_close = frame.iloc[i-1]['Close']
open_val = frame.iloc[i]['Open']
data[date] = (open_val - prior_close) / prior_close
return data
SPY_open_ret = calculate_opening_ret(SPY)
DJIA_open_ret = calculate_opening_ret(DJIA)
RUSS_open_ret = calculate_opening_ret(RUSS)
NASDAQ_open_ret = calculate_opening_ret(NASDAQ)
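As an aside, the same computation can be written without the explicit loop using pandas' shift(). A sketch (my addition) that should give the same values as the loop above, minus the first row:

def calculate_opening_ret_vectorized(frame):
    # Shift closes down a row so each open lines up with the prior close
    prior_close = frame['Close'].shift(1)
    return ((frame['Open'] - prior_close) / prior_close).dropna().to_dict()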
def signs_match(list_1, list_2):
    # This is a surprisingly difficult task - we have to match
    # up the dates in order to check if opening returns actually match
    matches = []
    for row in list_1.iterrows():
        # list_1 is a two-column frame: [futures return, date]
        row_value = row[1][0]
        row_dt = row[1][1]
        index_dt = datetime(row_dt.year, row_dt.month, row_dt.day)
        if index_dt in list_2:
            index_value = list_2[index_dt]
            if (row_value > 0 and index_value > 0) or \
                (row_value < 0 and index_value < 0) or \
                (row_value == 0 and index_value == 0):
                matches += [1]
            else:
                matches += [0]
    return matches
prediction_dict = {}
matches_dict = {}
count_dict = {}
index_dict = {"SPY": SPY_open_ret, "DJIA": DJIA_open_ret, "RUSS": RUSS_open_ret, "NASDAQ": NASDAQ_open_ret}
indices = ["SPY", "DJIA", "RUSS", "NASDAQ"]
futures = ["Crude", "Gold", "DOW", "NASDAQ", "S&P"]
for index in indices:
matches_dict[index] = {future: signs_match(article_df[[future, 'date']],
index_dict[index]) for future in futures}
count_dict[index] = {future: len(matches_dict[index][future]) for future in futures}
prediction_dict[index] = {future: np.mean(matches_dict[index][future])
for future in futures}
print("Articles Checked: ")
print(pd.DataFrame.from_dict(count_dict))
print()
print("Prediction Accuracy:")
print(pd.DataFrame.from_dict(prediction_dict))
This data is very interesting. All said, it appears that futures data is important for determining market direction for both the S&P 500 and Russell 2000. Cramer is half-right: futures data isn't very helpful for the Dow and NASDAQ indices, but is great for the S&P and Russell indices.
Given the code we currently have, I'd like to predict the close of the market as well. We can re-use most of the code, so let's see what happens:
def calculate_closing_ret(frame):
# I'm not a huge fan of the appending for loop,
# but it's a bit verbose for a comprehension
data = {}
for i in range(0, len(frame)):
date = frame.iloc[i].name
open_val = frame.iloc[i]['Open']
close_val = frame.iloc[i]['Close']
data[date] = (close_val - open_val) / open_val
return data
SPY_close_ret = calculate_closing_ret(SPY)
DJIA_close_ret = calculate_closing_ret(DJIA)
RUSS_close_ret = calculate_closing_ret(RUSS)
NASDAQ_close_ret = calculate_closing_ret(NASDAQ)
# signs_match() is identical to the version defined above, so we re-use it as-is.
matches_dict = {}
count_dict = {}
prediction_dict = {}
index_dict = {"SPY": SPY_close_ret, "DJIA": DJIA_close_ret,
"RUSS": RUSS_close_ret, "NASDAQ": NASDAQ_close_ret}
indices = ["SPY", "DJIA", "RUSS", "NASDAQ"]
futures = ["Crude", "Gold", "DOW", "NASDAQ", "S&P"]
for index in indices:
matches_dict[index] = {future: signs_match(article_df[[future, 'date']],
index_dict[index]) for future in futures}
count_dict[index] = {future: len(matches_dict[index][future]) for future in futures}
prediction_dict[index] = {future: np.mean(matches_dict[index][future])
for future in futures}
print("Articles Checked:")
print(pd.DataFrame.from_dict(count_dict))
print()
print("Prediction Accuracy:")
print(pd.DataFrame.from_dict(prediction_dict))
Well, it appears that the futures data is terrible at predicting the market close. NASDAQ predicting NASDAQ is the most interesting data point, but 63% accuracy isn't high enough to make money consistently.
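Out of curiosity, we can also ask whether that 63% is even distinguishable from coin-flipping given the sample size. This check is my addition rather than part of the original analysis; it uses scipy's binomial test on the 1/0 match indicators computed above:

from scipy.stats import binom_test

# Does NASDAQ futures predicting the NASDAQ index beat a 50/50 coin flip?
nasdaq_matches = matches_dict['NASDAQ']['NASDAQ']
print(binom_test(sum(nasdaq_matches), len(nasdaq_matches), p=0.5))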
In summary, the data bears out very close to what I expected: the futures have some real predictive power for the market open, at least for the S&P and Russell indices, but essentially none for the close.
I hope you've enjoyed this; I quite enjoyed taking a deep dive into the analytics this way. I'll be posting more soon!