From a00d73c8c8fe1c33053ccc102153c13ead1614ec Mon Sep 17 00:00:00 2001
From: Erik Johnson
Date: Mon, 28 Jul 2014 13:40:58 -0500
Subject: [PATCH 1/4] Add Patreon extractor

---
 youtube_dl/extractor/patreon.py | 127 ++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 youtube_dl/extractor/patreon.py

diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py
new file mode 100644
index 000000000..da1ad065a
--- /dev/null
+++ b/youtube_dl/extractor/patreon.py
@@ -0,0 +1,127 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    compat_html_parser,
+    #compat_urllib_request,
+    #compat_urllib_parse,
+)
+
+
+class PatreonHTMLParser(compat_html_parser.HTMLParser):
+    _PREFIX = 'http://www.patreon.com'
+    _ATTACH_TAGS = 5 * ['div']
+    _ATTACH_CLASSES = [
+        'fancyboxhidden', 'box photo double', 'boxwrapper double',
+        'hiddendisplay shareinfo', 'attach'
+    ]
+    _INFO_TAGS = 4 * ['div']
+    _INFO_CLASSES = [
+        'fancyboxhidden', 'box photo double', 'boxwrapper double',
+        'hiddendisplay shareinfo'
+    ]
+
+    def get_creation_info(self, html_data):
+        self.tag_stack = []
+        self.attrs_stack = []
+        self.creation_info = {}
+        self.feed(html_data)
+
+    def handle_starttag(self, tag, attrs):
+        self.tag_stack.append(tag.lower())
+        self.attrs_stack.append(dict(attrs))
+
+    def handle_endtag(self, tag):
+        self.tag_stack.pop()
+        self.attrs_stack.pop()
+
+    def handle_data(self, data):
+        # Check first if this is a creation attachment
+        if self.tag_stack[-6:-1] == self._ATTACH_TAGS:
+            attrs_classes = [
+                x.get('class', '').lower() for x in self.attrs_stack[-6:-1]
+            ]
+            if attrs_classes == self._ATTACH_CLASSES:
+                if self.tag_stack[-1] == 'a':
+                    url = self._PREFIX + self.attrs_stack[-1].get('href')
+                    self.creation_info['url'] = url
+                    if '.' in data:
+                        self.creation_info['ext'] = data.rsplit('.')[-1]
+        # Next, check if this is within the div containing the creation info
+        if self.tag_stack[-5:-1] == self._INFO_TAGS:
+            attrs_classes = [
+                x.get('class', '').lower() for x in self.attrs_stack[-5:-1]
+            ]
+            if attrs_classes == self._INFO_CLASSES:
+                if self.attrs_stack[-1].get('class') == 'utitle':
+                    self.creation_info['title'] = data.strip()
+
+
+class PatreonIE(InfoExtractor):
+    IE_NAME = 'patreon'
+    _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(.+)'
+    _TESTS = [
+        {
+            'url': 'http://www.patreon.com/creation?hid=743933',
+            'md5': 'e25505eec1053a6e6813b8ed369875cc',
+            'name': 'Patreon',
+            'info_dict': {
+                'id': '743933',
+                'ext': 'mp3',
+                'title': 'Episode 166: David Smalley of Dogma Debate',
+                'uploader': 'Cognitive Dissonance Podcast',
+            },
+        },
+    ]
+
+    # Currently Patreon exposes download URL via hidden CSS, so login is not
+    # needed. Keeping this commented for when this inevitably changes.
+    '''
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'redirectUrl': 'http://www.patreon.com/',
+            'email': username,
+            'password': password,
+        }
+
+        request = compat_urllib_request.Request(
+            'https://www.patreon.com/processLogin',
+            compat_urllib_parse.urlencode(login_form).encode('utf-8')
+        )
+        login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
+
+        if re.search(r'onLoginFailed', login_page):
+            raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
+
+    def _real_initialize(self):
+        self._login()
+    '''
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+
+        info_page = self._download_webpage(url, video_id)
+
+        ret = {'id': video_id}
+        try:
+            ret['uploader'] = re.search(
+                r'(.+) is creating', info_page
+            ).group(1)
+        except AttributeError:
+            pass
+
+        parser = PatreonHTMLParser()
+        parser.get_creation_info(info_page)
+        if not parser.creation_info.get('url'):
+            raise ExtractionError('Unable to retrieve creation URL')
+        ret.update(parser.creation_info)
+        return ret

From 27ace98f5159efdf6942e69ec73c5840cce26fbf Mon Sep 17 00:00:00 2001
From: Erik Johnson
Date: Mon, 28 Jul 2014 13:41:28 -0500
Subject: [PATCH 2/4] Add import for Patreon extractor

---
 youtube_dl/extractor/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index b8b341afd..cf7a8d6e1 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -226,6 +226,7 @@ from .oe1 import OE1IE
 from .ooyala import OoyalaIE
 from .orf import ORFIE
 from .parliamentliveuk import ParliamentLiveUKIE
+from .patreon import PatreonIE
 from .pbs import PBSIE
 from .photobucket import PhotobucketIE
 from .playvid import PlayvidIE

From c3f0b12b0f1db168b6d30df68e33a9dba077728e Mon Sep 17 00:00:00 2001
From: Erik Johnson
Date: Wed, 30 Jul 2014 15:30:07 -0500
Subject: [PATCH 3/4] fix exception

---
 youtube_dl/extractor/patreon.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py
index da1ad065a..4efb18d6c 100644
--- a/youtube_dl/extractor/patreon.py
+++ b/youtube_dl/extractor/patreon.py
@@ -122,6 +122,6 @@ class PatreonIE(InfoExtractor):
         parser = PatreonHTMLParser()
         parser.get_creation_info(info_page)
         if not parser.creation_info.get('url'):
-            raise ExtractionError('Unable to retrieve creation URL')
+            raise ExtractorError('Unable to retrieve creation URL')
         ret.update(parser.creation_info)
         return ret

From 6994e70651180cfe7c5d097fee517de78429c18d Mon Sep 17 00:00:00 2001
From: Erik Johnson
Date: Tue, 5 Aug 2014 00:26:23 -0500
Subject: [PATCH 4/4] Fix CSS parsing for Patreon

Some of the CSS classes end in " double", so this commit refines the
HTML parsing to account for both kinds of classes, and also adds an
additional test case.
---
 youtube_dl/extractor/patreon.py | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py
index 4efb18d6c..4ddff42e6 100644
--- a/youtube_dl/extractor/patreon.py
+++ b/youtube_dl/extractor/patreon.py
@@ -16,15 +16,25 @@ class PatreonHTMLParser(compat_html_parser.HTMLParser):
     _PREFIX = 'http://www.patreon.com'
     _ATTACH_TAGS = 5 * ['div']
     _ATTACH_CLASSES = [
-        'fancyboxhidden', 'box photo double', 'boxwrapper double',
+        'fancyboxhidden', 'box photo', 'boxwrapper',
         'hiddendisplay shareinfo', 'attach'
     ]
     _INFO_TAGS = 4 * ['div']
     _INFO_CLASSES = [
-        'fancyboxhidden', 'box photo double', 'boxwrapper double',
+        'fancyboxhidden', 'box photo', 'boxwrapper',
         'hiddendisplay shareinfo'
     ]
 
+    def _match(self, attrs_classes, desired):
+        if attrs_classes == desired:
+            return True
+        elif len(attrs_classes) == len(desired):
+            return all(
+                x.startswith(y)
+                for x, y in zip(attrs_classes, desired)
+            )
+        return False
+
     def get_creation_info(self, html_data):
         self.tag_stack = []
         self.attrs_stack = []
@@ -45,7 +55,7 @@ class PatreonHTMLParser(compat_html_parser.HTMLParser):
             attrs_classes = [
                 x.get('class', '').lower() for x in self.attrs_stack[-6:-1]
             ]
-            if attrs_classes == self._ATTACH_CLASSES:
+            if self._match(attrs_classes, self._ATTACH_CLASSES):
                 if self.tag_stack[-1] == 'a':
                     url = self._PREFIX + self.attrs_stack[-1].get('href')
                     self.creation_info['url'] = url
@@ -56,7 +66,7 @@ class PatreonHTMLParser(compat_html_parser.HTMLParser):
             attrs_classes = [
                 x.get('class', '').lower() for x in self.attrs_stack[-5:-1]
             ]
-            if attrs_classes == self._INFO_CLASSES:
+            if self._match(attrs_classes, self._INFO_CLASSES):
                 if self.attrs_stack[-1].get('class') == 'utitle':
                     self.creation_info['title'] = data.strip()
 
@@ -65,10 +75,10 @@ class PatreonIE(InfoExtractor):
     IE_NAME = 'patreon'
     _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(.+)'
     _TESTS = [
+        # CSS names with "double" in the name, i.e. "boxwrapper double"
         {
             'url': 'http://www.patreon.com/creation?hid=743933',
             'md5': 'e25505eec1053a6e6813b8ed369875cc',
-            'name': 'Patreon',
             'info_dict': {
                 'id': '743933',
                 'ext': 'mp3',
@@ -76,6 +86,16 @@ class PatreonIE(InfoExtractor):
                 'uploader': 'Cognitive Dissonance Podcast',
             },
         },
+        {
+            'url': 'http://www.patreon.com/creation?hid=754133',
+            'md5': '3eb09345bf44bf60451b8b0b81759d0a',
+            'info_dict': {
+                'id': '754133',
+                'ext': 'mp3',
+                'title': 'CD 167 Extra',
+                'uploader': 'Cognitive Dissonance Podcast',
+            },
+        },
     ]
 
     # Currently Patreon exposes download URL via hidden CSS, so login is not