From 88c440196050f690aec6e9e21305403bb1cb34b7 Mon Sep 17 00:00:00 2001
From: Bradlee Speice
Date: Fri, 19 Aug 2016 22:02:19 -0400
Subject: [PATCH] Add NASDAQ Company and ETF lists

Rename the metrik.targets.mongo_target module to metrik.targets.mongo and
update its importers. Add NoOpTarget, a luigi Target whose exists() always
returns True. Add MongoNoBackCreateTask, a MongoCreateTask variant that
only runs (and only exposes the output of its parent class) when it was
created with live=True; otherwise run() does nothing and output() is a
NoOpTarget. NasdaqCompanyList and NasdaqETFList build on it: each downloads
a CSV listing from nasdaq.com with requests, parses it with pandas and
returns the selected columns as a list of records.

The "not live" warning passes the task name as a logging argument, and the
downloaded CSV bytes are wrapped in six.BytesIO so that parsing works on
both Python 2 and Python 3.

---
 metrik/targets/{mongo_target.py => mongo.py} |  0
 metrik/targets/noop.py                       |  6 ++++
 metrik/tasks/base.py                         | 37 ++++++++++++++++++--
 metrik/tasks/nasdaq.py                       | 37 ++++++++++++++++++++
 test/mongo_test.py                           |  2 +-
 test/targets/test_mongo_target.py            |  2 +-
 test/targets/test_noop.py                    |  9 +++++
 test/tasks/test_nasdaq.py                    | 17 +++++++++
 8 files changed, 106 insertions(+), 4 deletions(-)
 rename metrik/targets/{mongo_target.py => mongo.py} (100%)
 create mode 100644 metrik/targets/noop.py
 create mode 100644 metrik/tasks/nasdaq.py
 create mode 100644 test/targets/test_noop.py
 create mode 100644 test/tasks/test_nasdaq.py

diff --git a/metrik/targets/mongo_target.py b/metrik/targets/mongo.py
similarity index 100%
rename from metrik/targets/mongo_target.py
rename to metrik/targets/mongo.py
diff --git a/metrik/targets/noop.py b/metrik/targets/noop.py
new file mode 100644
index 0000000..3f11449
--- /dev/null
+++ b/metrik/targets/noop.py
@@ -0,0 +1,6 @@
+from luigi.target import Target
+
+
+class NoOpTarget(Target):
+    def exists(self):
+        return True
\ No newline at end of file
diff --git a/metrik/tasks/base.py b/metrik/tasks/base.py
index 7bed2cb..0c3d45f 100644
--- a/metrik/tasks/base.py
+++ b/metrik/tasks/base.py
@@ -1,7 +1,12 @@
 from __future__ import print_function
 
-from luigi import Task
-from metrik.targets.mongo_target import MongoTarget
+import logging
+
+from luigi import Task
+from luigi.parameter import DateMinuteParameter
+
+from metrik.targets.mongo import MongoTarget
+from metrik.targets.noop import NoOpTarget
 
 
 class MongoCreateTask(Task):
@@ -27,3 +32,31 @@ class MongoCreateTask(Task):
 
     def retrieve_data(self, *args, **kwargs):
         raise NotImplementedError('Get me some data!')
+
+# noinspection PyAbstractClass
+class MongoNoBackCreateTask(MongoCreateTask):
+    # Have one parameter to make sure that the MongoTarget created by `super`
+    # doesn't blow up.
+    current_datetime = DateMinuteParameter()
+
+    def __init__(self, live=False, *args, **kwargs):
+        super(MongoNoBackCreateTask, self).__init__(*args, **kwargs)
+        self.live = live
+        child_name = type(self).__name__
+        if not live:
+            logging.warning('Trying to create %s without running live, '
+                            'errors potentially to ensue.', child_name)
+
+    def output(self):
+        if self.live:
+            return super(MongoNoBackCreateTask, self).output()
+        else:
+            return NoOpTarget()
+
+    def run(self):
+        # It only makes sense to run these tasks live: they can only retrieve
+        # data in the moment, and can not go back to back-fill data. This is
+        # very unfortunate, but there is plenty of value to be had that we
+        # wish to persist for the future.
+        if self.live:
+            return super(MongoNoBackCreateTask, self).run()
diff --git a/metrik/tasks/nasdaq.py b/metrik/tasks/nasdaq.py
new file mode 100644
index 0000000..828c544
--- /dev/null
+++ b/metrik/tasks/nasdaq.py
@@ -0,0 +1,37 @@
+import requests
+import pandas as pd
+from six import BytesIO
+
+from metrik.tasks.base import MongoNoBackCreateTask
+
+
+class NasdaqCompanyList(MongoNoBackCreateTask):
+    def get_collection_name(self):
+        return 'nasdaq_company_list'
+
+    @staticmethod
+    def retrieve_data(*args, **kwargs):
+        # Explicitly use requests to make mocking easy
+        csv_bytes = requests.get('http://www.nasdaq.com/screening/'
+                                 'companies-by-region.aspx?&render=download') \
+            .content
+        csv_filelike = BytesIO(csv_bytes)
+        company_csv = pd.read_csv(csv_filelike)[
+            ['Symbol', 'Name', 'LastSale', 'MarketCap', 'Country', 'IPOyear',
+             'Sector', 'Industry']
+        ]
+        return {'companies': company_csv.to_dict(orient='records')}
+
+
+class NasdaqETFList(MongoNoBackCreateTask):
+    def get_collection_name(self):
+        return 'nasdaq_etf_list'
+
+    @staticmethod
+    def retrieve_data(*args, **kwargs):
+        csv_bytes = requests.get('http://www.nasdaq.com/investing/etfs/'
+                                 'etf-finder-results.aspx?download=Yes') \
+            .content
+        csv_filelike = BytesIO(csv_bytes)
+        etf_csv = pd.read_csv(csv_filelike)[['Symbol', 'Name', 'LastSale']]
+        return {'etfs': etf_csv.to_dict(orient='records')}
diff --git a/test/mongo_test.py b/test/mongo_test.py
index c145a7a..7419463 100644
--- a/test/mongo_test.py
+++ b/test/mongo_test.py
@@ -2,7 +2,7 @@ from unittest import TestCase
 from pymongo import MongoClient
 
 from metrik.conf import MONGO_DATABASE, MONGO_PORT, MONGO_HOST
-from metrik.targets.mongo_target import MongoTarget
+from metrik.targets.mongo import MongoTarget
 
 
 class MongoTest(TestCase):
diff --git a/test/targets/test_mongo_target.py b/test/targets/test_mongo_target.py
index c44046c..faa50f5 100644
--- a/test/targets/test_mongo_target.py
+++ b/test/targets/test_mongo_target.py
@@ -1,7 +1,7 @@
 from pymongo import MongoClient
 from random import randint
 
-from metrik.targets.mongo_target import MongoTarget
+from metrik.targets.mongo import MongoTarget
 from metrik.conf import MONGO_DATABASE, MONGO_HOST, MONGO_PORT
 from test.mongo_test import MongoTest
 
diff --git a/test/targets/test_noop.py b/test/targets/test_noop.py
new file mode 100644
index 0000000..7296ee7
--- /dev/null
+++ b/test/targets/test_noop.py
@@ -0,0 +1,9 @@
+from unittest import TestCase
+
+from metrik.targets.noop import NoOpTarget
+
+
+class NoOpTest(TestCase):
+    def test_sanity(self):
+        t = NoOpTarget()
+        assert t.exists()
diff --git a/test/tasks/test_nasdaq.py b/test/tasks/test_nasdaq.py
new file mode 100644
index 0000000..daee5ce
--- /dev/null
+++ b/test/tasks/test_nasdaq.py
@@ -0,0 +1,17 @@
+from unittest import TestCase
+
+from metrik.tasks.nasdaq import NasdaqCompanyList, NasdaqETFList
+
+
+class NasdaqTest(TestCase):
+
+    def test_company_list(self):
+        companies = NasdaqCompanyList.retrieve_data()['companies']
+        assert len(companies) > 6000
+        # TODO: Get lists of companies from ETF holdings and verify that they
+        # can be found here as well - this should be a superset
+
+
+    def test_etf_list(self):
+        etfs = NasdaqETFList.retrieve_data()['etfs']
+        assert len(etfs) > 1500
\ No newline at end of file