Add NASDAQ Company and ETF lists

master
Bradlee Speice 2016-08-19 22:02:19 -04:00
parent c42f5a6980
commit 88c4401960
8 changed files with 106 additions and 4 deletions

6
metrik/targets/noop.py Normal file
View File

@ -0,0 +1,6 @@
from luigi.target import Target
class NoOpTarget(Target):
    """Target whose ``exists`` check always succeeds."""

    def exists(self):
        """Return ``True`` unconditionally."""
        return True

View File

@ -1,7 +1,12 @@
from __future__ import print_function
from luigi import Task
from metrik.targets.mongo_target import MongoTarget
import logging
from luigi import Task
from luigi.parameter import DateMinuteParameter
from metrik.targets.mongo import MongoTarget
from metrik.targets.noop import NoOpTarget
class MongoCreateTask(Task):
@ -27,3 +32,31 @@ class MongoCreateTask(Task):
def retrieve_data(self, *args, **kwargs):
    """Placeholder data hook that always raises.

    Subclasses are expected to supply their own ``retrieve_data``; this
    version accepts any arguments and ignores them.

    Raises:
        NotImplementedError: always.
    """
    message = 'Get me some data!'
    raise NotImplementedError(message)
# noinspection PyAbstractClass
class MongoNoBackCreateTask(MongoCreateTask):
    """Create-task for data that can only be captured at the moment it runs.

    When ``live`` is false the task is inert: ``output`` returns a
    ``NoOpTarget`` and ``run`` does nothing. When ``live`` is true both
    methods defer to ``MongoCreateTask``.
    """

    # Have one parameter to make sure that the MongoTarget created by `super`
    # doesn't blow up.
    current_datetime = DateMinuteParameter()

    def __init__(self, live=False, *args, **kwargs):
        """Initialise the task and remember whether it is running live.

        Args:
            live: when false, a warning is logged and the task becomes a
                no-op (see ``output`` and ``run``).
            *args: forwarded unchanged to ``MongoCreateTask.__init__``.
            **kwargs: forwarded unchanged to ``MongoCreateTask.__init__``.
        """
        super(MongoNoBackCreateTask, self).__init__(*args, **kwargs)
        self.live = live

        if not live:
            child_name = type(self).__name__
            # The placeholder is a *named* field, so the value has to be
            # supplied by keyword; passing it positionally raises
            # KeyError('child_name') instead of logging the warning.
            logging.warning('Trying to create {child_name} without running'
                            ' live, errors potentially to ensue.'.format(
                                child_name=child_name))

    def output(self):
        """Return the parent's output when live, otherwise a ``NoOpTarget``."""
        if self.live:
            return super(MongoNoBackCreateTask, self).output()
        else:
            return NoOpTarget()

    def run(self):
        """Delegate to the parent's ``run`` when live; otherwise do nothing.

        Returns:
            Whatever the parent's ``run`` returns when live, else ``None``.
        """
        # It only makes sense to run these tasks live: they can only retrieve
        # data in the moment, and can not go back to back-fill data. This is
        # very unfortunate, but there is plenty of valuable data to be had
        # that we wish to persist for the future.
        if self.live:
            return super(MongoNoBackCreateTask, self).run()

37
metrik/tasks/nasdaq.py Normal file
View File

@ -0,0 +1,37 @@
import requests
import pandas as pd
from six import StringIO
from metrik.tasks.base import MongoNoBackCreateTask
class NasdaqCompanyList(MongoNoBackCreateTask):
    """Task that captures the NASDAQ company screener CSV download."""

    def get_collection_name(self):
        """Return the collection name used for the company list."""
        return 'nasdaq_company_list'

    @staticmethod
    def retrieve_data(*args, **kwargs):
        """Download the company CSV and return its rows.

        All positional and keyword arguments are accepted and ignored.

        Returns:
            dict: ``{'companies': rows}`` where ``rows`` is a list with one
            dict per CSV record, restricted to the Symbol, Name, LastSale,
            MarketCap, Country, IPOyear, Sector and Industry columns.
        """
        # Local import keeps this block self-contained; `io.BytesIO` exists
        # on both Python 2 and Python 3.
        from io import BytesIO

        # Explicitly use requests to make mocking easy
        csv_bytes = requests.get(
            'http://www.nasdaq.com/screening/'
            'companies-by-region.aspx?&render=download').content
        # `.content` is bytes. A text StringIO rejects bytes on Python 3
        # (TypeError), so hand pandas a binary buffer instead.
        csv_filelike = BytesIO(csv_bytes)
        company_csv = pd.read_csv(csv_filelike)[
            ['Symbol', 'Name', 'LastSale', 'MarketCap', 'Country', 'IPOyear',
             'Sector', 'Industry']
        ]
        return {'companies': company_csv.to_dict(orient='records')}
class NasdaqETFList(MongoNoBackCreateTask):
    """Task that captures the NASDAQ ETF finder CSV download."""

    def get_collection_name(self):
        """Return the collection name used for the ETF list."""
        return 'nasdaq_etf_list'

    @staticmethod
    def retrieve_data(*args, **kwargs):
        """Download the ETF CSV and return its rows.

        All positional and keyword arguments are accepted and ignored.

        Returns:
            dict: ``{'etfs': rows}`` where ``rows`` is a list with one dict
            per CSV record, restricted to the Symbol, Name and LastSale
            columns.
        """
        # Local import keeps this block self-contained; `io.BytesIO` exists
        # on both Python 2 and Python 3.
        from io import BytesIO

        csv_bytes = requests.get(
            'http://www.nasdaq.com/investing/etfs/'
            'etf-finder-results.aspx?download=Yes').content
        # `.content` is bytes. A text StringIO rejects bytes on Python 3
        # (TypeError), so hand pandas a binary buffer instead.
        csv_filelike = BytesIO(csv_bytes)
        etf_csv = pd.read_csv(csv_filelike)[['Symbol', 'Name', 'LastSale']]
        return {'etfs': etf_csv.to_dict(orient='records')}

View File

@ -2,7 +2,7 @@ from unittest import TestCase
from pymongo import MongoClient
from metrik.conf import MONGO_DATABASE, MONGO_PORT, MONGO_HOST
from metrik.targets.mongo_target import MongoTarget
from metrik.targets.mongo import MongoTarget
class MongoTest(TestCase):

View File

@ -1,7 +1,7 @@
from pymongo import MongoClient
from random import randint
from metrik.targets.mongo_target import MongoTarget
from metrik.targets.mongo import MongoTarget
from metrik.conf import MONGO_DATABASE, MONGO_HOST, MONGO_PORT
from test.mongo_test import MongoTest

View File

@ -0,0 +1,9 @@
from unittest import TestCase
from metrik.targets.noop import NoOpTarget
class NoOpTest(TestCase):
    """Sanity check for ``NoOpTarget``."""

    def test_sanity(self):
        # A freshly built no-op target must report that it exists.
        assert NoOpTarget().exists()

17
test/tasks/test_nasdaq.py Normal file
View File

@ -0,0 +1,17 @@
from unittest import TestCase
from metrik.tasks.nasdaq import NasdaqCompanyList, NasdaqETFList
class NasdaqTest(TestCase):
    """Size checks on the lists returned by the NASDAQ tasks.

    Both tests call ``retrieve_data`` directly on the task classes.
    """

    def test_company_list(self):
        payload = NasdaqCompanyList.retrieve_data()
        company_rows = payload['companies']
        assert len(company_rows) > 6000

        # TODO: Get lists of companies from ETF holdings and verify that they
        # can be found here as well - this should be a superset

    def test_etf_list(self):
        payload = NasdaqETFList.retrieve_data()
        etf_rows = payload['etfs']
        assert len(etf_rows) > 1500