[extractor] Simplify search extractors

pukkandan 2021-10-09 02:09:55 +05:30
parent a903d8285c
commit cc16383ff3
6 changed files with 27 additions and 72 deletions


@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import base64
 import datetime
 import hashlib
+import itertools
 import json
 import netrc
 import os
@@ -3617,7 +3618,14 @@ class SearchInfoExtractor(InfoExtractor):
         return self._get_n_results(query, n)
 
     def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
+        """Get a specified number of results for a query.
+        Either this function or _search_results must be overridden by subclasses """
+        return self.playlist_result(
+            itertools.islice(self._search_results(query), 0, None if n == float('inf') else n),
+            query, query)
+
+    def _search_results(self, query):
+        """Returns an iterator of search results"""
         raise NotImplementedError('This method must be implemented by subclasses')
 
     @property
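
The heart of the change is the new base-class helper above: _get_n_results now wraps whatever _search_results yields in itertools.islice, so results are produced lazily and the slice simply stops pulling from the generator once n items have been consumed (or never stops, when n is float('inf')). A minimal, self-contained sketch of that behaviour, where fake_results is only an illustrative stand-in for a _search_results implementation, not code from the commit:

import itertools


def fake_results():
    # Stand-in for a _search_results() generator: an unbounded, lazily
    # evaluated stream of search results.
    for i in itertools.count(1):
        print(f'fetching result {i}')
        yield {'id': i}


n = 5  # e.g. what a 'gvsearch5:...' style query resolves to
limited = itertools.islice(fake_results(), 0, None if n == float('inf') else n)
print(list(limited))  # only the first 5 results are ever fetched

Because the slice is lazy, a paginating extractor never downloads more pages than are needed to satisfy n.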


@@ -11,6 +11,7 @@ class GoogleSearchIE(SearchInfoExtractor):
     _MAX_RESULTS = 1000
     IE_NAME = 'video.google:search'
     _SEARCH_KEY = 'gvsearch'
+    _WORKING = False
     _TEST = {
         'url': 'gvsearch15:python language',
         'info_dict': {
@@ -20,16 +21,7 @@ class GoogleSearchIE(SearchInfoExtractor):
         'playlist_count': 15,
     }
 
-    def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
-
-        entries = []
-        res = {
-            '_type': 'playlist',
-            'id': query,
-            'title': query,
-        }
-
+    def _search_results(self, query):
         for pagenum in itertools.count():
             webpage = self._download_webpage(
                 'http://www.google.com/search',
@@ -44,16 +36,8 @@ class GoogleSearchIE(SearchInfoExtractor):
 
             for hit_idx, mobj in enumerate(re.finditer(
                     r'<h3 class="r"><a href="([^"]+)"', webpage)):
-
-                # Skip playlists
-                if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):
-                    continue
-
-                entries.append({
-                    '_type': 'url',
-                    'url': mobj.group(1)
-                })
-
-            if (len(entries) >= n) or not re.search(r'id="pnnext"', webpage):
-                res['entries'] = entries[:n]
-                return res
+                if re.search(f'id="vidthumb{hit_idx + 1}"', webpage):
+                    yield self.url_result(mobj.group(1))
+
+            if not re.search(r'id="pnnext"', webpage):
+                return
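
All of the per-site conversions in this commit follow the same shape: _search_results becomes a plain generator that pages through the site and yields url_result entries until the source runs dry, leaving the truncation to n to the base class. Roughly, a new-style search extractor looks like the sketch below; ExampleSearchIE, its endpoint and the JSON field names are hypothetical and only illustrate the contract:

import itertools

from yt_dlp.extractor.common import SearchInfoExtractor


class ExampleSearchIE(SearchInfoExtractor):
    # Hypothetical extractor, not part of this commit.
    IE_NAME = 'example:search'
    _SEARCH_KEY = 'examplesearch'
    _MAX_RESULTS = float('inf')

    def _search_results(self, query):
        for page in itertools.count(1):
            # Endpoint and response layout are made up for illustration.
            data = self._download_json(
                f'https://example.com/api/search?q={query}&page={page}',
                query, note=f'Downloading page {page}')
            for item in data.get('items') or []:
                yield self.url_result(item['url'])
            if not data.get('has_next'):
                return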


@@ -709,11 +709,9 @@ class NicovideoSearchIE(SearchInfoExtractor, NicovideoSearchURLIE):
     _SEARCH_KEY = 'nicosearch'
     _TESTS = []
 
-    def _get_n_results(self, query, n):
-        entries = self._entries(self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query)
-        if n < float('inf'):
-            entries = itertools.islice(entries, 0, n)
-        return self.playlist_result(entries, query, query)
+    def _search_results(self, query):
+        return self._entries(
+            self._proto_relative_url(f'//www.nicovideo.jp/search/{query}'), query)
 
 
 class NicovideoSearchDateIE(NicovideoSearchIE):


@@ -880,25 +880,14 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
         })
         next_url = update_url_query(self._API_V2_BASE + endpoint, query)
 
-        collected_results = 0
-
         for i in itertools.count(1):
             response = self._download_json(
-                next_url, collection_id, 'Downloading page {0}'.format(i),
+                next_url, collection_id, f'Downloading page {i}',
                 'Unable to download API page', headers=self._HEADERS)
 
-            collection = response.get('collection', [])
-            if not collection:
-                break
-
-            collection = list(filter(bool, collection))
-            collected_results += len(collection)
-
-            for item in collection:
-                yield self.url_result(item['uri'], SoundcloudIE.ie_key())
-
-            if not collection or collected_results >= limit:
-                break
+            for item in response.get('collection') or []:
+                if item:
+                    yield self.url_result(item['uri'], SoundcloudIE.ie_key())
 
             next_url = response.get('next_href')
             if not next_url:
@@ -906,4 +895,4 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
 
     def _get_n_results(self, query, n):
         tracks = self._get_collection('search/tracks', query, limit=n, q=query)
-        return self.playlist_result(tracks, playlist_title=query)
+        return self.playlist_result(tracks, query, query)
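
Switching to _search_results is not mandatory: the new docstring in common.py allows a subclass to override either method, and SoundcloudSearchIE above keeps its own _get_n_results because its collection helper already accepts a limit. Schematically, the two override points look like this; both classes and the _fetch_up_to helper are illustrative placeholders, not code from the commit:

from yt_dlp.extractor.common import SearchInfoExtractor


class GeneratorStyleSearchIE(SearchInfoExtractor):
    # Option 1: yield results one at a time and let the base class slice off n.
    def _search_results(self, query):
        yield self.url_result('https://example.com/video/1')  # illustrative


class LimitStyleSearchIE(SearchInfoExtractor):
    # Option 2: keep overriding _get_n_results when the site can be asked
    # for at most n results directly (the SoundCloud approach above).
    def _get_n_results(self, query, n):
        return self.playlist_result(self._fetch_up_to(query, n), query, query)

    def _fetch_up_to(self, query, n):
        # Hypothetical helper that requests no more than n results.
        return []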


@@ -334,31 +334,15 @@ class YahooSearchIE(SearchInfoExtractor):
     IE_NAME = 'screen.yahoo:search'
     _SEARCH_KEY = 'yvsearch'
 
-    def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
-        entries = []
+    def _search_results(self, query):
         for pagenum in itertools.count(0):
             result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
             info = self._download_json(result_url, query,
                                        note='Downloading results page ' + str(pagenum + 1))
-            m = info['m']
-            results = info['results']
-
-            for (i, r) in enumerate(results):
-                if (pagenum * 30) + i >= n:
-                    break
-                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
-                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
-                entries.append(e)
-            if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
+            yield from (self.url_result(result['rurl']) for result in info['results'])
+            if info['m']['last'] >= info['m']['total'] - 1:
                 break
-
-        return {
-            '_type': 'playlist',
-            'id': query,
-            'entries': entries,
-        }
 
 
 class YahooGyaOPlayerIE(InfoExtractor):
     IE_NAME = 'yahoo:gyao:player'


@@ -4615,11 +4615,10 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
     _SEARCH_PARAMS = None
     _TESTS = []
 
-    def _entries(self, query, n):
+    def _search_results(self, query):
         data = {'query': query}
         if self._SEARCH_PARAMS:
             data['params'] = self._SEARCH_PARAMS
-        total = 0
         continuation = {}
         for page_num in itertools.count(1):
             data.update(continuation)
@@ -4662,17 +4661,10 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
                         continue
 
                     yield self._extract_video(video)
-                    total += 1
-                    if total == n:
-                        return
 
             if not continuation:
                 break
 
-    def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
-        return self.playlist_result(self._entries(query, n), query, query)
-
 
 class YoutubeSearchDateIE(YoutubeSearchIE):
     IE_NAME = YoutubeSearchIE.IE_NAME + ':date'