elektricity/src/podcasters/bassdrive.py

133 lines
3.8 KiB
Python

"""
Podcast provider for the Bassdrive Archives
"""
from datetime import datetime
from html.parser import HTMLParser
from urllib.parse import unquote
import requests
from feedgen.feed import FeedGenerator
from pytz import UTC
from podcasters.base import BasePodcast
class BassdriveParser(HTMLParser):
    """
    Collect ``(link text, href)`` pairs for every mp3 anchor found on a
    Bassdrive archive page.
    """

    def __init__(self, *args, **kwargs):
        # noinspection PyArgumentList
        super().__init__(*args, **kwargs)
        # All parser state lives on the instance (not the class) so that
        # independent parser instances can never share link data.
        self.links = []
        self.record_link_text = False
        self.link_url = ''

    def error(self, message):
        # Stub kept to satisfy IDE abstract-method inspection; defer to the
        # base implementation where one exists.
        return super().error(message)

    def handle_starttag(self, tag, attrs):
        """
        If we find an 'a' tag, make sure that we record
        the next link we come across

        >>> b = BassdriveParser()
        >>> b.handle_starttag('a', (('href', 'something.mp3'),))
        >>> b.record_link_text
        True
        >>> b.link_url
        'something.mp3'
        """
        href = ''
        for attr, val in attrs:
            if attr == 'href':
                href = val
        # Only anchors pointing at mp3 files are of interest; everything
        # else on the page is navigation chrome.
        if tag == 'a' and 'mp3' in href:
            self.record_link_text = True
            self.link_url = href

    def handle_data(self, data):
        """
        If we receive a new link, record it if we're inside an `a` tag

        >>> b = BassdriveParser()
        >>> not b.get_links()
        True
        >>> b.handle_data("some_link")
        >>> not b.get_links()
        True
        >>> b.handle_starttag('a', [['href', 'something.mp3']])
        >>> b.handle_data("some text")
        >>> len(b.get_links()) == 1
        True
        """
        if self.record_link_text:
            self.links.append((data, self.link_url))
            self.record_link_text = False

    def get_links(self):
        """Return the (text, url) pairs collected so far, in page order."""
        # NOTE(review): an earlier comment claimed the list was reversed
        # into descending date order, but no reversal was ever performed;
        # the links come back exactly as they appeared on the page.
        return self.links

    def clear_links(self):
        """
        Reset the collected link list so this parser instance can be
        reused for another page.

        >>> b = BassdriveParser()
        >>> b.feed('<a href="show.mp3">Show [2020.01.01]</a>')
        >>> len(b.get_links()) > 0
        True
        >>> b.clear_links()
        >>> len(b.get_links()) == 0
        True
        """
        self.links = []
class BassdriveFeed(BasePodcast):
    """Podcast feed built from a single Bassdrive Archive show page."""

    def __init__(self, *args, **kwargs):
        self.url = kwargs['url']
        self.logo = kwargs.get('logo', '')
        # Derive the show title and DJ from the last path segment, e.g.
        # ".../Subfactory%20Show%20-%20DJ%20Spim" ->
        # ("Subfactory Show", "DJ Spim").  Dropping empty segments also
        # handles a trailing slash on the URL.
        segments = [part for part in unquote(self.url).split('/') if part]
        self.title, self.dj = segments[-1].split(' - ')

    def build_feed(self):
        """Fetch the archive page and return a populated FeedGenerator."""
        # Scrape every mp3 link off the show's archive page.
        parser = BassdriveParser()
        parser.feed(str(requests.get(self.url).content))

        # Channel-level metadata.
        feed = FeedGenerator()
        feed.id(self.url)
        feed.title(self.title)
        feed.description(self.title)
        feed.author({'name': self.dj})
        feed.language('en')
        feed.link({'href': self.url, 'rel': 'alternate'})
        feed.logo(self.logo)

        # One entry per scraped link.
        for text, href in parser.get_links():
            entry = feed.add_entry()
            entry.author({'name': self.dj})
            entry.title(text)
            entry.description(text)
            entry.enclosure(self.url + href, 0, 'audio/mpeg')
            # Bassdrive always uses date strings of
            # [yyyy.mm.dd] with 0 padding on days and months,
            # so that makes our lives easy
            start = text.find('[')
            when = datetime.strptime(text[start:start + 12], '[%Y.%m.%d]')
            entry.pubdate(UTC.localize(when))
            entry.guid(text)
        return feed