mirror of https://github.com/yt-dlp/yt-dlp.git

[youtube:comments] Add more options for limiting number of comments extracted (#1626)

Extends the `max_comments` extractor arg to support `max-parents,max-replies,max-replies-per-thread`.

Authored-by: coletdjnz
parent 0bb322b9c0
commit 46383212b3
README.md:

```diff
@@ -1565,8 +1565,10 @@ The following extractors use this feature:
 * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details
 * `include_live_dash`: Include live dash formats (These formats don't download properly)
 * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
-* `max_comments`: Maximum amount of comments to download (default all)
+* `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all`.
+    * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total.
 * `max_comment_depth`: Maximum depth for nested comments. YouTube supports depths 1 or 2 (default)
+    * **Deprecated**: Set `max-replies` to `0` or `all` in `max_comments` instead (e.g. `max_comments=all,all,0` to get no replies)
 
 #### youtubetab (YouTube playlists, channels, feeds, etc.)
 * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details)
```
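For readers who want to try the new argument, the CLI form documented above is e.g. `yt-dlp --write-comments --extractor-args "youtube:max_comments=all,all,1000,10" URL`. The same thing can be exercised through the embedding API; the sketch below is illustrative only: the video URL is a placeholder, and `getcomments`/`extractor_args` are assumed here to be the programmatic counterparts of `--write-comments`/`--extractor-args`.

```python
# Sketch: no overall comment cap, but at most 1000 replies in total
# and at most 10 replies per thread (mirrors `max_comments=all,all,1000,10`).
import yt_dlp

ydl_opts = {
    'getcomments': True,  # the switch that --write-comments turns on
    'extractor_args': {'youtube': {'max_comments': ['all', 'all', '1000', '10']}},
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info('https://www.youtube.com/watch?v=BaW_jenozKc', download=False)
    print('comments extracted:', len(info.get('comments') or []))
```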
yt_dlp/extractor/youtube.py:

```diff
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 
-import base64
 import calendar
 import copy
 import datetime
@@ -13,6 +12,7 @@ import math
 import os.path
 import random
 import re
+import sys
 import time
 import traceback
 
@@ -30,7 +30,6 @@ from ..compat import (
 from ..jsinterp import JSInterpreter
 from ..utils import (
     bug_reports_message,
-    bytes_to_intlist,
     clean_html,
     datetime_from_str,
     dict_get,
@@ -39,7 +38,6 @@ from ..utils import (
     float_or_none,
     format_field,
     int_or_none,
-    intlist_to_bytes,
     is_html,
     join_nonempty,
     mimetype2ext,
```
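The newly added `import sys` is there because the rewritten `max_comments` parsing (in the `_comment_entries` hunk further down) uses `sys.maxsize` as its "unlimited" sentinel: every entry that is missing or not an integer (such as `all`) falls back to it. Below is a standalone sketch of that behaviour, using a simplified stand-in for yt-dlp's `int_or_none` helper; it is not code from the commit.

```python
import sys

def int_or_none(value, default=None):
    # Simplified stand-in for yt_dlp.utils.int_or_none
    try:
        return int(value)
    except (TypeError, ValueError):
        return default

def parse_max_comments(arg_values):
    # arg_values is the list the extractor receives for `max_comments`,
    # e.g. ['all', 'all', '1000', '10']; pad it so all four limits exist.
    limits = [int_or_none(v, default=sys.maxsize) for v in (arg_values + [''] * 4)[:4]]
    return dict(zip(('max_comments', 'max_parents', 'max_replies', 'max_replies_per_thread'), limits))

print(parse_max_comments(['all', 'all', '1000', '10']))
# {'max_comments': 9223372036854775807, 'max_parents': 9223372036854775807,
#  'max_replies': 1000, 'max_replies_per_thread': 10}   (values on a 64-bit build)
```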
```diff
@@ -2117,20 +2115,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'parent': parent or 'root'
         }
 
-    def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, comment_counts=None):
+    def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None):
+
+        get_single_config_arg = lambda c: self._configuration_arg(c, [''])[0]
 
         def extract_header(contents):
             _continuation = None
             for content in contents:
-                comments_header_renderer = try_get(content, lambda x: x['commentsHeaderRenderer'])
+                comments_header_renderer = traverse_obj(content, 'commentsHeaderRenderer')
                 expected_comment_count = parse_count(self._get_text(
                     comments_header_renderer, 'countText', 'commentsCount', max_runs=1))
 
                 if expected_comment_count:
-                    comment_counts[1] = expected_comment_count
-                    self.to_screen('Downloading ~%d comments' % expected_comment_count)
-                sort_mode_str = self._configuration_arg('comment_sort', [''])[0]
-                comment_sort_index = int(sort_mode_str != 'top')  # 1 = new, 0 = top
+                    tracker['est_total'] = expected_comment_count
+                    self.to_screen(f'Downloading ~{expected_comment_count} comments')
+                comment_sort_index = int(get_single_config_arg('comment_sort') != 'top')  # 1 = new, 0 = top
 
                 sort_menu_item = try_get(
                     comments_header_renderer,
```
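For orientation, the `tracker` dict that replaces the positional `comment_counts` list in this and the following hunks lines up with the old indices roughly as follows (a sketch assembled from the keys visible in the diff, not code taken from the commit):

```python
# Old: a positional list, "comment so far, est. total comments, current comment thread #"
comment_counts = [0, 0, 0]

# New: a named dict shared across recursive _comment_entries() calls
tracker = dict(
    running_total=0,          # was comment_counts[0]
    est_total=0,              # was comment_counts[1]
    current_page_thread=0,    # was comment_counts[2]
    total_parent_comments=0,  # new: drives the max-parents limit
    total_reply_comments=0,   # new: drives the max-replies limit
)
```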
```diff
@@ -2141,76 +2140,84 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 if not _continuation:
                     continue
 
-                sort_text = sort_menu_item.get('title')
-                if isinstance(sort_text, compat_str):
-                    sort_text = sort_text.lower()
-                else:
+                sort_text = str_or_none(sort_menu_item.get('title'))
+                if not sort_text:
                     sort_text = 'top comments' if comment_sort_index == 0 else 'newest first'
-                self.to_screen('Sorting comments by %s' % sort_text)
+                self.to_screen('Sorting comments by %s' % sort_text.lower())
                 break
             return _continuation
 
         def extract_thread(contents):
             if not parent:
-                comment_counts[2] = 0
+                tracker['current_page_thread'] = 0
             for content in contents:
+                if not parent and tracker['total_parent_comments'] >= max_parents:
+                    yield
                 comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
-                comment_renderer = try_get(
-                    comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
-                    content, (lambda x: x['commentRenderer'], dict))
+                comment_renderer = get_first(
+                    (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
+                    expected_type=dict, default={})
 
-                if not comment_renderer:
-                    continue
                 comment = self._extract_comment(comment_renderer, parent)
                 if not comment:
                     continue
-                comment_counts[0] += 1
+
+                tracker['running_total'] += 1
+                tracker['total_reply_comments' if parent else 'total_parent_comments'] += 1
                 yield comment
 
                 # Attempt to get the replies
                 comment_replies_renderer = try_get(
                     comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)
 
                 if comment_replies_renderer:
-                    comment_counts[2] += 1
+                    tracker['current_page_thread'] += 1
                     comment_entries_iter = self._comment_entries(
                         comment_replies_renderer, ytcfg, video_id,
-                        parent=comment.get('id'), comment_counts=comment_counts)
-
-                    for reply_comment in comment_entries_iter:
+                        parent=comment.get('id'), tracker=tracker)
+                    for reply_comment in itertools.islice(comment_entries_iter, min(max_replies_per_thread, max(0, max_replies - tracker['total_reply_comments']))):
                         yield reply_comment
 
+        # Keeps track of counts across recursive calls
+        if not tracker:
+            tracker = dict(
+                running_total=0,
+                est_total=0,
+                current_page_thread=0,
+                total_parent_comments=0,
+                total_reply_comments=0)
+
+        # TODO: Deprecated
         # YouTube comments have a max depth of 2
-        max_depth = int_or_none(self._configuration_arg('max_comment_depth', [''])[0]) or float('inf')
+        max_depth = int_or_none(get_single_config_arg('max_comment_depth'))
+        if max_depth:
+            self._downloader.deprecation_warning(
+                '[youtube] max_comment_depth extractor argument is deprecated. Set max replies in the max-comments extractor argument instead.')
         if max_depth == 1 and parent:
             return
-        if not comment_counts:
-            # comment so far, est. total comments, current comment thread #
-            comment_counts = [0, 0, 0]
+
+        max_comments, max_parents, max_replies, max_replies_per_thread, *_ = map(
+            lambda p: int_or_none(p, default=sys.maxsize), self._configuration_arg('max_comments', ) + [''] * 4)
 
         continuation = self._extract_continuation(root_continuation_data)
-        if continuation and len(continuation['continuation']) < 27:
-            self.write_debug('Detected old API continuation token. Generating new API compatible token.')
-            continuation_token = self._generate_comment_continuation(video_id)
-            continuation = self._build_api_continuation_query(continuation_token, None)
 
         message = self._get_text(root_continuation_data, ('contents', ..., 'messageRenderer', 'text'), max_runs=1)
         if message and not parent:
             self.report_warning(message, video_id=video_id)
 
-        visitor_data = None
+        response = None
         is_first_continuation = parent is None
 
         for page_num in itertools.count(0):
             if not continuation:
                 break
-            headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data)
-            comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
+            headers = self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response))
+            comment_prog_str = f"({tracker['running_total']}/{tracker['est_total']})"
             if page_num == 0:
                 if is_first_continuation:
                     note_prefix = 'Downloading comment section API JSON'
                 else:
                     note_prefix = ' Downloading comment API JSON reply thread %d %s' % (
-                        comment_counts[2], comment_prog_str)
+                        tracker['current_page_thread'], comment_prog_str)
             else:
                 note_prefix = '%sDownloading comment%s API JSON page %d %s' % (
                     ' ' if parent else '', ' replies' if parent else '',
```
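To see how the new limits cooperate, here is a self-contained toy generator (not the extractor itself) that mimics the pattern above: a `tracker` shared across recursive calls, a bail-out once `max_parents` top-level comments have been taken, and `itertools.islice` capping each reply thread at the smaller of `max_replies_per_thread` and the remaining `max_replies` budget. The real code signals the parent limit by yielding a falsy sentinel that the page loop turns into a `return`; the toy simply returns.

```python
import itertools

def comment_entries(threads, parent=None, tracker=None,
                    max_parents=2, max_replies=3, max_replies_per_thread=2):
    # threads: list of (comment, replies) pairs; replies has the same shape.
    if tracker is None:
        tracker = dict(total_parent_comments=0, total_reply_comments=0)
    for comment, replies in threads:
        if not parent and tracker['total_parent_comments'] >= max_parents:
            return
        tracker['total_reply_comments' if parent else 'total_parent_comments'] += 1
        yield comment
        if not parent and replies:
            remaining = max(0, max_replies - tracker['total_reply_comments'])
            yield from itertools.islice(
                comment_entries(replies, parent=comment, tracker=tracker,
                                max_replies=max_replies,
                                max_replies_per_thread=max_replies_per_thread),
                min(max_replies_per_thread, remaining))

threads = [
    ('c1', [('r1', []), ('r2', []), ('r3', [])]),
    ('c2', [('r4', [])]),
    ('c3', []),
]
print(list(comment_entries(threads)))
# ['c1', 'r1', 'r2', 'c2', 'r4'] (two parents, at most 2 replies per thread, at most 3 replies total)
```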
```diff
@@ -2219,83 +2226,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             response = self._extract_response(
                 item_id=None, query=continuation,
                 ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix,
-                check_get_keys=('onResponseReceivedEndpoints', 'continuationContents'))
-            if not response:
-                break
-            visitor_data = try_get(
-                response,
-                lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
-                compat_str) or visitor_data
+                check_get_keys='onResponseReceivedEndpoints')
 
-            continuation_contents = dict_get(response, ('onResponseReceivedEndpoints', 'continuationContents'))
+            continuation_contents = traverse_obj(
+                response, 'onResponseReceivedEndpoints', expected_type=list, default=[])
 
             continuation = None
-            if isinstance(continuation_contents, list):
-                for continuation_section in continuation_contents:
-                    if not isinstance(continuation_section, dict):
-                        continue
-                    continuation_items = try_get(
-                        continuation_section,
-                        (lambda x: x['reloadContinuationItemsCommand']['continuationItems'],
-                         lambda x: x['appendContinuationItemsAction']['continuationItems']),
-                        list) or []
-                    if is_first_continuation:
-                        continuation = extract_header(continuation_items)
-                        is_first_continuation = False
-                        if continuation:
-                            break
-                        continue
-                    count = 0
-                    for count, entry in enumerate(extract_thread(continuation_items)):
-                        yield entry
-                    continuation = self._extract_continuation({'contents': continuation_items})
-                    if continuation:
-                        # Sometimes YouTube provides a continuation without any comments
-                        # In most cases we end up just downloading these with very little comments to come.
-                        if count == 0:
-                            if not parent:
-                                self.report_warning('No comments received - assuming end of comments')
-                            continuation = None
-                        break
-
-            # Deprecated response structure
-            elif isinstance(continuation_contents, dict):
-                known_continuation_renderers = ('itemSectionContinuation', 'commentRepliesContinuation')
-                for key, continuation_renderer in continuation_contents.items():
-                    if key not in known_continuation_renderers:
-                        continue
-                    if not isinstance(continuation_renderer, dict):
-                        continue
-                    if is_first_continuation:
-                        header_continuation_items = [continuation_renderer.get('header') or {}]
-                        continuation = extract_header(header_continuation_items)
-                        is_first_continuation = False
-                        if continuation:
-                            break
-
-                    # Sometimes YouTube provides a continuation without any comments
-                    # In most cases we end up just downloading these with very little comments to come.
-                    count = 0
-                    for count, entry in enumerate(extract_thread(continuation_renderer.get('contents') or {})):
-                        yield entry
-                    continuation = self._extract_continuation(continuation_renderer)
-                    if count == 0:
-                        if not parent:
-                            self.report_warning('No comments received - assuming end of comments')
-                        continuation = None
-                    break
+            for continuation_section in continuation_contents:
+                continuation_items = traverse_obj(
+                    continuation_section,
+                    (('reloadContinuationItemsCommand', 'appendContinuationItemsAction'), 'continuationItems'),
+                    get_all=False, expected_type=list) or []
+                if is_first_continuation:
+                    continuation = extract_header(continuation_items)
+                    is_first_continuation = False
+                    if continuation:
+                        break
+                    continue
+
+                for entry in extract_thread(continuation_items):
+                    if not entry:
+                        return
+                    yield entry
+                continuation = self._extract_continuation({'contents': continuation_items})
+                if continuation:
+                    break
 
-    @staticmethod
-    def _generate_comment_continuation(video_id):
-        """
-        Generates initial comment section continuation token from given video id
-        """
-        b64_vid_id = base64.b64encode(bytes(video_id.encode('utf-8')))
-        parts = ('Eg0SCw==', b64_vid_id, 'GAYyJyIRIgs=', b64_vid_id, 'MAB4AjAAQhBjb21tZW50cy1zZWN0aW9u')
-        new_continuation_intlist = list(itertools.chain.from_iterable(
-            [bytes_to_intlist(base64.b64decode(part)) for part in parts]))
-        return base64.b64encode(intlist_to_bytes(new_continuation_intlist)).decode('utf-8')
-
     def _get_comments(self, ytcfg, video_id, contents, webpage):
         """Entry for comment extraction"""
         def _real_comment_extract(contents):
```
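The consolidated `traverse_obj` call above replaces two hand-written `try_get` paths. For readers unfamiliar with that helper, the lookup it performs here is roughly equivalent to the plain-dict code below (a sketch; the real `traverse_obj` in `yt_dlp.utils` is much more general):

```python
def continuation_items_of(continuation_section):
    # Roughly what the new call does:
    #   traverse_obj(continuation_section,
    #                (('reloadContinuationItemsCommand', 'appendContinuationItemsAction'),
    #                 'continuationItems'),
    #                get_all=False, expected_type=list) or []
    for key in ('reloadContinuationItemsCommand', 'appendContinuationItemsAction'):
        branch = continuation_section.get(key)
        items = branch.get('continuationItems') if isinstance(branch, dict) else None
        if isinstance(items, list):   # expected_type=list
            return items              # get_all=False: first match wins
    return []

section = {'appendContinuationItemsAction': {'continuationItems': [{'commentThreadRenderer': {}}]}}
print(continuation_items_of(section))  # [{'commentThreadRenderer': {}}]
```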