Added new IE for Grooveshark

This commit is contained in:
Keith Beckman 2014-05-20 02:47:34 -04:00
parent f50ee8d1c3
commit dd06c95e43
3 changed files with 202 additions and 0 deletions

View File

@ -56,6 +56,7 @@ __authors__ = (
'Nicolas Évrard', 'Nicolas Évrard',
'Jason Normore', 'Jason Normore',
'Hoje Lee', 'Hoje Lee',
'Keith Beckman'
) )
__license__ = 'Public Domain' __license__ = 'Public Domain'

View File

@ -109,6 +109,7 @@ from .gdcvault import GDCVaultIE
from .generic import GenericIE from .generic import GenericIE
from .googleplus import GooglePlusIE from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE from .googlesearch import GoogleSearchIE
from .grooveshark import GroovesharkIE
from .hark import HarkIE from .hark import HarkIE
from .helsinki import HelsinkiIE from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE from .hentaistigma import HentaiStigmaIE

View File

@ -0,0 +1,200 @@
# coding: utf-8
from __future__ import unicode_literals
import time
import math
import re
from urlparse import urlparse, urlunparse, urldefrag
from urllib import quote, urlencode
from os.path import basename
from .common import InfoExtractor
from ..utils import ExtractorError, compat_urllib_request, compat_html_parser
class GroovesharkHtmlParser(compat_html_parser.HTMLParser):
def __init__(self):
self._current_object = None
self.objects = []
compat_html_parser.HTMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
attrs = dict((k, v) for k, v in attrs)
if tag == 'object':
self._current_object = { 'attrs': attrs, 'params': [] }
elif tag == 'param':
self._current_object['params'].append(attrs)
def handle_endtag(self, tag):
if tag == 'object':
self.objects.append(self._current_object)
self._current_object = None
@classmethod
def extract_object_tags(cls, html):
p = cls()
p.feed(html)
p.close()
return p.objects
class GroovesharkIE(InfoExtractor):
_VALID_URL = r'https?://(www\.)?grooveshark\.com/#!/s/([^/]+)/([^/]+)'
_TEST = {
'url': 'http://grooveshark.com/#!/s/Jolene+Tenth+Key+Remix+Ft+Will+Sessions/6SS1DW?src=5',
'md5': 'bbccc50b19daca23b8f961152c1dc95b',
'info_dict': {
'id': '6SS1DW',
'title': 'Jolene (Tenth Key Remix ft. Will Sessions)',
'ext': 'mp3',
'duration': 227
}
}
do_playerpage_request = True
do_bootstrap_request = True
def _parse_target(self, target):
uri = urlparse(target)
hash = uri.fragment[1:].split('?')[0]
token = basename(hash.rstrip('/'))
return (uri, hash, token)
def _build_bootstrap_url(self, target):
(uri, hash, token) = self._parse_target(target)
query = 'getCommunicationToken=1&hash=%s&%d' % (quote(hash, safe=''), self.ts)
return (urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token)
def _build_meta_url(self, target):
(uri, hash, token) = self._parse_target(target)
query = 'hash=%s&%d' % (quote(hash, safe=''), self.ts)
return (urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token)
def _build_stream_url(self, meta):
return urlunparse(('http', meta['streamKey']['ip'], '/stream.php', None, None, None))
def _build_swf_referer(self, target, obj):
(uri, _, _) = self._parse_target(target)
return urlunparse((uri.scheme, uri.netloc, obj['attrs']['data'], None, None, None))
def _transform_bootstrap(self, js):
return re.split('^\s*try\s*{', js, flags=re.M)[0] \
.split(' = ', 1)[1].strip().rstrip(';')
def _transform_meta(self, js):
return js.split('\n')[0].split('=')[1].rstrip(';')
def _get_meta(self, target):
(meta_url, token) = self._build_meta_url(target)
self.to_screen('Metadata URL: %s' % meta_url)
headers = {'Referer': urldefrag(target)[0]}
req = compat_urllib_request.Request(meta_url, headers=headers)
res = self._download_json(req, token,
transform_source=self._transform_meta)
if 'getStreamKeyWithSong' not in res:
raise ExtractorError(
'Metadata not found. URL may be malformed, or Grooveshark API may have changed.')
if res['getStreamKeyWithSong'] is None:
raise ExtractorError(
'Metadata download failed, probably due to Grooveshark anti-abuse throttling. Wait at least an hour before retrying from this IP.',
expected=True)
return res['getStreamKeyWithSong']
def _get_bootstrap(self, target):
(bootstrap_url, token) = self._build_bootstrap_url(target)
headers = {'Referer': urldefrag(target)[0]}
req = compat_urllib_request.Request(bootstrap_url, headers=headers)
res = self._download_json(req, token, fatal=False,
note='Downloading player bootstrap data',
errnote='Unable to download player bootstrap data',
transform_source=self._transform_bootstrap)
return res
def _get_playerpage(self, target):
(_, _, token) = self._parse_target(target)
res = self._download_webpage(
target, token,
note='Downloading player page',
errnote='Unable to download player page',
fatal=False)
if res is not None:
o = GroovesharkHtmlParser.extract_object_tags(res)
return (res, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed'])
return (res, None)
def _real_extract(self, url):
(target_uri, _, token) = self._parse_target(url)
# 1. Fill cookiejar by making a request to the player page
if self.do_playerpage_request:
(_, player_objs) = self._get_playerpage(url)
if player_objs is not None:
swf_referer = self._build_swf_referer(url, player_objs[0])
self.to_screen('SWF Referer: %s' % swf_referer)
# 2. Ask preload.php for swf bootstrap data to better mimic webapp
if self.do_bootstrap_request:
bootstrap = self._get_bootstrap(url)
self.to_screen('CommunicationToken: %s' % bootstrap['getCommunicationToken'])
# 3. Ask preload.php for track metadata.
meta = self._get_meta(url)
# 4. Construct stream request for track.
stream_url = self._build_stream_url(meta)
duration = int(math.ceil(float(meta['streamKey']['uSecs']) / 1000000))
post_dict = {'streamKey': meta['streamKey']['streamKey']}
post_data = urlencode(post_dict).encode('utf-8')
headers = {
'Content-Length': len(post_data),
'Content-Type': 'application/x-www-form-urlencoded'
}
if 'swf_referer' in locals():
headers['Referer'] = swf_referer
req = compat_urllib_request.Request(streamurl, post_data, headers)
info_dict = {
'id': token,
'title': meta['song']['Name'],
'http_method': 'POST',
'url': stream_url,
'ext': 'mp3',
'format': 'mp3 audio',
'duration': duration,
'post_data': post_data,
'post_dict': post_dict,
'headers': headers,
'request': req
}
if 'swf_referer' in locals():
info_dict['http_referer'] = swf_referer
return info_dict
def _real_initialize(self):
self.ts = int(time.time() * 1000) # timestamp in millis
def _download_json(self, url_or_request, video_id,
note=u'Downloading JSON metadata',
errnote=u'Unable to download JSON metadata',
fatal=True,
transform_source=None):
try:
out = super(GroovesharkIE, self)._download_json(
url_or_request, video_id, note, errnote, transform_source)
return out
except ExtractorError as ee:
if fatal:
raise ee
return None