elektricity/src/podcasters/bassdrive.py

95 lines
2.7 KiB
Python

"""
Podcast provider for the Bassdrive Archives
"""
from datetime import datetime
from html.parser import HTMLParser
from urllib.parse import unquote
import requests
from feedgen.feed import FeedGenerator
from pytz import UTC
from podcasters.base import BasePodcast
class BassdriveParser(HTMLParser):
def error(self, message):
return super().error(message)
record_link_text = False
link_url = ''
def __init__(self, *args, **kwargs):
# noinspection PyArgumentList
super().__init__(*args, **kwargs)
self.links = []
def handle_starttag(self, tag, attrs):
href = ''
for attr, val in attrs:
if attr == 'href':
href = val
if tag == 'a' and href.find('mp3') != -1:
self.record_link_text = True
self.link_url = href
def handle_data(self, data):
if self.record_link_text:
self.links.append((data, self.link_url))
self.record_link_text = False
def get_links(self):
# Reverse to sort in descending date order
return self.links
def clear_links(self):
self.links = []
class BassdriveFeed(BasePodcast):
def __init__(self, *args, **kwargs):
self.url = kwargs['url']
self.logo = kwargs['logo']
# Get the title and DJ while handling trailing slash
url_pretty = unquote(self.url)
elems = filter(lambda x: x, url_pretty.split('/'))
self.title, self.dj = list(elems)[-1].split(' - ')
def build_feed(self):
"Build the feed given our existing URL"
# Get all the episodes
page_content = str(requests.get(self.url).content)
parser = BassdriveParser()
parser.feed(page_content)
links = parser.get_links()
# And turn them into something usable
fg = FeedGenerator()
fg.id(self.url)
fg.title(self.title)
fg.description(self.title)
fg.author({'name': self.dj})
fg.language('en')
fg.link({'href': self.url, 'rel': 'alternate'})
fg.logo(self.logo)
for link in links:
fe = fg.add_entry()
fe.author({'name': self.dj})
fe.title(link[0])
fe.description(link[0])
fe.enclosure(self.url + link[1], 0, 'audio/mpeg')
# Bassdrive always uses date strings of
# [yyyy.mm.dd] with 0 padding on days and months,
# so that makes our lives easy
date_start = link[0].find('[')
date_str = link[0][date_start:date_start+12]
published = datetime.strptime(date_str, '[%Y.%m.%d]')
fe.pubdate(UTC.localize(published))
fe.guid((link[0]))
parser.clear_links()
return fg