[Motherless] Pull from yt-dlp, etc

* use username field
* loosen regexes
* warn on page count 0 in group
* avoid reloading group page 1
Closes #29626
This commit is contained in:
dirkf 2022-10-12 01:09:55 +01:00 committed by GitHub
parent c91cbf6072
commit 7bbd5b13d4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 26 additions and 7 deletions

View File

@ -126,9 +126,10 @@ class MotherlessIE(InfoExtractor):
kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
comment_count = webpage.count('class="media-comment-contents"') comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage))
uploader_id = self._html_search_regex( uploader_id = self._html_search_regex(
r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)''', (r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''',
r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''),
webpage, 'uploader_id') webpage, 'uploader_id')
categories = self._html_search_meta('keywords', webpage, default=None) categories = self._html_search_meta('keywords', webpage, default=None)
@ -171,6 +172,17 @@ class MotherlessGroupIE(InfoExtractor):
'any kind!' 'any kind!'
}, },
'playlist_mincount': 0, 'playlist_mincount': 0,
'expected_warnings': [
'This group has no videos.',
]
}, {
'url': 'https://motherless.com/g/beautiful_cock',
'info_dict': {
'id': 'beautiful_cock',
'title': 'Beautiful Cock',
'description': 'Group for lovely cocks yours, mine, a friends anything human',
},
'playlist_mincount': 2500,
}] }]
@classmethod @classmethod
@ -211,14 +223,21 @@ class MotherlessGroupIE(InfoExtractor):
'description', webpage, fatal=False) 'description', webpage, fatal=False)
page_count = str_to_int(self._search_regex( page_count = str_to_int(self._search_regex(
r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b', r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b',
webpage, 'page_count', default='1')) webpage, 'page_count', default=0))
if not page_count:
message = self._search_regex(
r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''',
webpage, 'error_msg', default=None) or 'This group has no videos.'
self.report_warning(message, group_id)
page_count = 1
PAGE_SIZE = 80 PAGE_SIZE = 80
def _get_page(idx): def _get_page(idx):
webpage = self._download_webpage( if idx > 0:
page_url, group_id, query={'page': idx + 1}, webpage = self._download_webpage(
note='Downloading page %d/%d' % (idx + 1, page_count) page_url, group_id, query={'page': idx + 1},
) note='Downloading page %d/%d' % (idx + 1, page_count)
)
for entry in self._extract_entries(webpage, url): for entry in self._extract_entries(webpage, url):
yield entry yield entry