[outtmpl] Limit changes during sanitization

Closes #2761
2022-03-27 10:04:04 +05:30 · 2022-03-27 10:04:04 +05:30 · 5c3895fff1
parent fd2ad7cb24
commit 5c3895fff1
5 changed files with 30 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -144,6 +144,7 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu
 * Some private fields such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this
 * When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this
 * `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi`
 * youtube-dl tries to remove some superfluous punctuation from filenames. While this can sometimes be helpful, it is often undesirable. So yt-dlp tries to keep the fields in the filenames as close to their original values as possible. You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior
 For ease of use, a few more compat options are available:
 * `--compat-options all`: Use all compat options
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -160,10 +160,12 @@ class TestUtil(unittest.TestCase):
            sanitize_filename('New World record at 0:12:34'),
            'New World record at 0_12_34')
-        self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf')
+        self.assertEqual(sanitize_filename('--gasdgf'), '--gasdgf')
        self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf')
-        self.assertEqual(sanitize_filename('.gasdgf'), 'gasdgf')
+        self.assertEqual(sanitize_filename('--gasdgf', is_id=False), '_-gasdgf')
        self.assertEqual(sanitize_filename('.gasdgf'), '.gasdgf')
        self.assertEqual(sanitize_filename('.gasdgf', is_id=True), '.gasdgf')
        self.assertEqual(sanitize_filename('.gasdgf', is_id=False), 'gasdgf')
        forbidden = '"\0\\/'
        for fc in forbidden:
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@ -87,6 +87,7 @@ from .utils import (
    MaxDownloadsReached,
    merge_headers,
    network_exceptions,
    NO_DEFAULT,
    number_of_digits,
    orderedSet,
    OUTTMPL_TYPES,
@ -1150,8 +1151,10 @@ class YoutubeDL(object):
        na = self.params.get('outtmpl_na_placeholder', 'NA')
        def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')):
-            return sanitize_filename(str(value), restricted=restricted,
+            return sanitize_filename(str(value), restricted=restricted, is_id=(
-                                     is_id=re.search(r'(^|[_.])id(\.|$)', key))
+                bool(re.search(r'(^|[_.])id(\.|$)', key))
                if 'filename-sanitization' in self.params.get('compat_opts', [])
                else NO_DEFAULT))
        sanitizer = sanitize if callable(sanitize) else filename_sanitizer
        sanitize = bool(sanitize)
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@ -338,7 +338,7 @@ def create_parser():
        action='callback', callback=_set_from_options_callback,
        callback_kwargs={
            'allowed_values': {
-                'filename', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles',
+                'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles',
                'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge',
                'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', 'embed-metadata',
                'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi',
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@ -705,36 +705,40 @@ def timeconvert(timestr):
    return timestamp
-def sanitize_filename(s, restricted=False, is_id=False):
+def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
-    If restricted is set, use a stricter subset of allowed characters.
+    @param restricted   Use a stricter subset of allowed characters
-    Set is_id if this is not an arbitrary string, but an ID that should be kept
+    @param is_id        Whether this is an ID that should be kept unchanged if possible.
-    if possible.
+                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
-            return ' '
+            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
-            return '_-' if restricted else ' -'
+            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
-            return '_'
+            return '\0_'
-        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
+        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
-            return '_'
+            return '\0_'
        if restricted and ord(char) > 127:
            return '_'
        return char
-    if s == '':
+    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')