yt-dlp/yt_dlp/webvtt.py

"""
A partial parser for WebVTT segments. Interprets enough of the WebVTT stream
to be able to assemble a single stand-alone subtitle file, suitably adjusting
timestamps on the way, while everything else is passed through unmodified.

Regular expressions based on the W3C WebVTT specification
<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described
in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.
"""

import io
import re

from .utils import int_or_none, timetuple_from_msec


class _MatchParser:
    """
    An object that maintains the current parsing position and allows
    conveniently advancing it as syntax elements are successfully parsed.
    """

    def __init__(self, string):
        self._data = string
        self._pos = 0

    def match(self, r):
        if isinstance(r, re.Pattern):
            return r.match(self._data, self._pos)
        if isinstance(r, str):
            if self._data.startswith(r, self._pos):
                return len(r)
            return None
        raise ValueError(r)

    def advance(self, by):
        if by is None:
            amt = 0
        elif isinstance(by, re.Match):
            amt = len(by.group(0))
        elif isinstance(by, str):
            amt = len(by)
        elif isinstance(by, int):
            amt = by
        else:
            raise ValueError(by)
        self._pos += amt
        return by

    def consume(self, r):
        return self.advance(self.match(r))

    def child(self):
        return _MatchChildParser(self)


class _MatchChildParser(_MatchParser):
    """
    A tentative parser state. It reads the same data as the parser it was
    created from, but keeps a position of its own, so a failed attempt can
    simply be dropped without disturbing the parent.
    """

    def __init__(self, parent):
        super().__init__(parent._data)
        # Start where the parent currently is
        self._pos = parent._pos
        self.__parent = parent

    def commit(self):
        """
        Move the parent to this child's current position and return the parent.
        """
        parent = self.__parent
        parent._pos = self._pos
        return parent


class ParseError(Exception):
    """
    Raised when the input cannot be parsed. The message reports the parser
    position and quotes up to 100 characters of the data found there.
    """

    def __init__(self, parser):
        start = parser._pos
        data = parser._data[start:start + 100]
        super().__init__(f'Parse error at position {parser._pos} (near {data!r})')


# A WebVTT timestamp. The capture groups are, in order: hours (optional),
# minutes, seconds and milliseconds (optional).
#
# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>
# prescribes that hours must be *2 or more* digits, timestamps with a single
# digit for the hour part have been seen in the wild.
# See https://github.com/yt-dlp/yt-dlp/issues/921
_REGEX_TS = re.compile(r'''(?x)
    (?:([0-9]{1,}):)?
    ([0-9]{2}):
    ([0-9]{2})\.
    ([0-9]{3})?
''')
# Zero-width match at the very end of the data
_REGEX_EOF = re.compile(r'\Z')
# A single line terminator or, failing that, the zero-width `$` anchor,
# so that a last line without a terminator is accepted as well
_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)')
# One or more consecutive line terminators
_REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+')
# Any run of spaces and tabs, including an empty one
_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')


def _parse_ts(ts):
    """
    Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)
    into an MPEG PES timestamp: a tick counter at 90 kHz resolution.
    """
    return 90 * sum(
        int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1)))


def _format_ts(ts):
    """
    Render an MPEG PES timestamp (90 kHz ticks) as a WebVTT timestamp.
    Precision below one millisecond is lost in the process.
    """
    # Adding half a millisecond (45 ticks) before the floor division
    # rounds to the nearest millisecond
    msec = int((ts + 45) // 90)
    return '%02u:%02u:%02u.%03u' % timetuple_from_msec(msec)


class Block:
    """
    Base class of all WebVTT blocks.

    Subclasses that rely on the parse() defined here must provide a compiled
    pattern in `_REGEX`; the matched text is then stored as `raw`.
    """

    def __init__(self, **kwargs):
        # Every keyword argument becomes an attribute of the same name
        for name in kwargs:
            setattr(self, name, kwargs[name])

    @classmethod
    def parse(cls, parser):
        """
        Try to read one block of this type at the parser's position.

        On success the parser is moved past the block and a new instance
        holding the matched text is returned; otherwise the parser is left
        alone and None is returned.
        """
        found = parser.match(cls._REGEX)
        if found:
            parser.advance(found)
            return cls(raw=found.group(0))
        return None

    def write_into(self, stream):
        """
        Write the block's text, exactly as it was matched, to `stream`.
        """
        stream.write(self.raw)


class HeaderBlock(Block):
    """
    Base class for blocks that are only allowed in the header part of
    a WebVTT file, that is, ahead of the first cue block.
    """


class Magic(HeaderBlock):
    """
    The file header: the ``WEBVTT`` signature line together with the header
    lines that follow it, up to and including the line that ends the header.

    Attributes set by parse():
        extra   text after ``WEBVTT`` on the signature line (with its
                leading space or tab), or None when there is none
        local   LOCAL component of X-TIMESTAMP-MAP in 90 kHz ticks,
                or None when absent
        mpegts  MPEGTS component of X-TIMESTAMP-MAP as an int,
                or None when absent
        meta    the raw ``name: value`` metadata header lines, concatenated
    """

    _REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n|[\r\n])')

    # XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5
    # <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC
    # doesn't specify the exact grammar nor where in the WebVTT
    # syntax it should be placed; the below has been devised based
    # on usage in the wild
    #
    # And strictly speaking, the presence of this extension violates
    # the W3C WebVTT spec. Oh well.

    _REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')
    _REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')
    _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')
    _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*')

    # This was removed from the spec in the 2017 revision;
    # the last spec draft to describe this syntax element is
    # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.
    # Nevertheless, YouTube keeps serving those
    _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')

    @classmethod
    def __parse_tsmap(cls, parser):
        """
        Parse the value of an X-TIMESTAMP-MAP header, i.e. whatever follows
        the ``=``, through the end of the line.

        Returns a (local, mpegts) tuple. A component that does not occur in
        the header is reported as None. Raises ParseError on malformed input.
        """
        parser = parser.child()

        # A header may carry only one of the two components. Start both out
        # as None so that the return statement below never hits an unbound
        # name; None is what parse() uses for "absent" as well
        local, mpegts = None, None

        while True:
            if parser.consume(cls._REGEX_TSMAP_LOCAL):
                m = parser.consume(_REGEX_TS)
                if m is None:
                    raise ParseError(parser)
                local = _parse_ts(m)
            else:
                m = parser.consume(cls._REGEX_TSMAP_MPEGTS)
                if m is None:
                    raise ParseError(parser)
                mpegts = int_or_none(m.group(1))
                if mpegts is None:
                    raise ParseError(parser)
            # Components are comma-separated; the line end closes the list
            if parser.consume(cls._REGEX_TSMAP_SEP):
                continue
            if parser.consume(_REGEX_NL):
                break
            raise ParseError(parser)

        parser.commit()
        return local, mpegts

    @classmethod
    def parse(cls, parser):
        """
        Parse the header at the parser's position and return a Magic
        instance. Raises ParseError if the signature line is missing or
        a header line is neither X-TIMESTAMP-MAP nor a metadata header.
        """
        parser = parser.child()

        m = parser.consume(cls._REGEX)
        if not m:
            raise ParseError(parser)

        extra = m.group(1)
        local, mpegts, meta = None, None, ''
        # Header lines continue until a line terminator (or the end of
        # input) is found where the next header line would start
        while not parser.consume(_REGEX_NL):
            if parser.consume(cls._REGEX_TSMAP):
                local, mpegts = cls.__parse_tsmap(parser)
                continue
            m = parser.consume(cls._REGEX_META)
            if m:
                meta += m.group(0)
                continue
            raise ParseError(parser)
        parser.commit()
        return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)

    def write_into(self, stream):
        """
        Write the header to `stream`, using ``\\n`` as the line terminator
        for the lines generated here. The X-TIMESTAMP-MAP line is emitted
        only if `local` or `mpegts` is non-zero; a component that is None
        is written as zero.
        """
        stream.write('WEBVTT')
        if self.extra is not None:
            stream.write(self.extra)
        stream.write('\n')
        if self.local or self.mpegts:
            stream.write('X-TIMESTAMP-MAP=LOCAL:')
            stream.write(_format_ts(self.local if self.local is not None else 0))
            stream.write(',MPEGTS:')
            stream.write(str(self.mpegts if self.mpegts is not None else 0))
            stream.write('\n')
        if self.meta:
            stream.write(self.meta)
        stream.write('\n')


class StyleBlock(HeaderBlock):
    """
    A STYLE block. It is matched as a whole and kept verbatim in `raw`.
    """
    # Matches the `STYLE` line (optionally followed by spaces or tabs),
    # any number of non-empty lines that do not contain `-->`, and the
    # line terminator that closes the block.
    _REGEX = re.compile(r'''(?x)
        STYLE[\ \t]*(?:\r\n|[\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class RegionBlock(HeaderBlock):
    """
    A REGION block. It is matched as a whole and kept verbatim in `raw`.
    """
    # Matches `REGION` (optionally followed by spaces or tabs), any number
    # of non-empty lines that do not contain `-->`, and the line terminator
    # that closes the block.
    #
    # The W3C grammar puts a line terminator between `REGION` and the region
    # settings; without accepting it here, only the `REGION` line itself
    # would match and the settings lines would be left behind as unparsable
    # input. The terminator is optional so that settings placed on the
    # `REGION` line itself remain accepted as before.
    _REGEX = re.compile(r'''(?x)
        REGION[\ \t]*(?:\r\n|[\r\n])?
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class CommentBlock(Block):
    """
    A NOTE (comment) block. It is matched as a whole and kept verbatim
    in `raw`.
    """
    # Matches `NOTE` followed by a space, tab or line terminator, then any
    # number of non-empty lines that do not contain `-->`, and finally the
    # line terminator that closes the block.
    _REGEX = re.compile(r'''(?x)
        NOTE(?:\r\n|[\ \t\r\n])
        ((?:(?!-->)[^\r\n])+(?:\r\n|[\r\n]))*
        (?:\r\n|[\r\n])
    ''')


class CueBlock(Block):
    """
    A cue block. The payload is not interpreted.

    Attributes:
        id        the cue identifier line, or None when the cue has none
        start     start time in 90 kHz ticks
        end       end time in 90 kHz ticks
        settings  the cue settings that follow the timings, or None
        text      the payload lines with their line terminators, as found
    """

    _REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n|[\r\n])')
    _REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')
    _REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')
    _REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n|[\r\n])?')

    @classmethod
    def parse(cls, parser):
        """
        Try to read a cue block at the parser's position.

        Returns the parsed CueBlock and moves the parser past it, or returns
        None and leaves the parser untouched if there is no cue here.
        """
        # Work on a child state so that a failed attempt consumes nothing
        parser = parser.child()

        id_ = None
        m = parser.consume(cls._REGEX_ID)
        if m:
            id_ = m.group(1)

        m0 = parser.consume(_REGEX_TS)
        if not m0:
            return None
        if not parser.consume(cls._REGEX_ARROW):
            return None
        m1 = parser.consume(_REGEX_TS)
        if not m1:
            return None
        m2 = parser.consume(cls._REGEX_SETTINGS)
        parser.consume(_REGEX_OPTIONAL_WHITESPACE)
        if not parser.consume(_REGEX_NL):
            return None

        start = _parse_ts(m0)
        end = _parse_ts(m1)
        settings = m2.group(1) if m2 is not None else None

        # The payload runs up to the first empty line or the end of input
        text = io.StringIO()
        while True:
            m = parser.consume(cls._REGEX_PAYLOAD)
            if not m:
                break
            text.write(m.group(0))

        parser.commit()
        return cls(
            id=id_,
            start=start, end=end, settings=settings,
            text=text.getvalue(),
        )

    def write_into(self, stream):
        """
        Write the cue to `stream`, followed by the blank line that
        separates it from the next block.
        """
        if self.id is not None:
            stream.write(self.id)
            stream.write('\n')
        stream.write(_format_ts(self.start))
        stream.write(' --> ')
        stream.write(_format_ts(self.end))
        if self.settings is not None:
            stream.write(' ')
            stream.write(self.settings)
        stream.write('\n')
        stream.write(self.text)
        # The last payload line has no line terminator if the input ended
        # right after it. Supply one, so that the line written below really
        # is a blank line and the next block is not glued onto this cue
        if self.text and not self.text.endswith(('\r', '\n')):
            stream.write('\n')
        stream.write('\n')

    @property
    def as_json(self):
        """
        The cue's fields as a plain dict; from_json() is the inverse.
        """
        return {
            'id': self.id,
            'start': self.start,
            'end': self.end,
            'text': self.text,
            'settings': self.settings,
        }

    def __eq__(self, other):
        # Two cues are equal if all of their fields are. For anything that
        # is not a cue, defer to Python's default handling instead of
        # failing on the missing `as_json` attribute
        if not isinstance(other, CueBlock):
            return NotImplemented
        return self.as_json == other.as_json

    @classmethod
    def from_json(cls, json):
        """
        Build a cue from a dict shaped like the one returned by as_json.
        """
        return cls(
            id=json['id'],
            start=json['start'],
            end=json['end'],
            text=json['text'],
            settings=json['settings'],
        )

    def hinges(self, other):
        """
        Tell whether `other` has the same text and settings as this cue
        and starts exactly when this cue ends (with neither of the two
        having its end before its start).
        """
        if self.text != other.text:
            return False
        if self.settings != other.settings:
            return False
        return self.start <= self.end == other.start <= other.end


def parse_fragment(frag_content):
    """
    Lazily split one WebVTT fragment into (partially) parsed blocks.

    `frag_content` is a bytes object holding the raw contents of a WebVTT
    file. The Magic header is yielded first, then the header blocks and
    finally the comment and cue blocks, in file order. ParseError is raised
    when the input cannot be parsed.
    """

    parser = _MatchParser(frag_content.decode())

    def first_match(block_types):
        # Try the given block types in order; the first one that parses wins
        for block_type in block_types:
            found = block_type.parse(parser)
            if found:
                return found
        return None

    yield Magic.parse(parser)

    # Header section: it ends at the first thing that is not a header block
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        # XXX: comment blocks could be skipped instead of yielded
        block = first_match((RegionBlock, StyleBlock, CommentBlock))
        if not block:
            break
        yield block

    # Cue section: nothing but comments and cues may follow
    while not parser.match(_REGEX_EOF):
        if parser.consume(_REGEX_BLANK):
            continue

        # XXX: comment blocks could be skipped instead of yielded
        block = first_match((CommentBlock, CueBlock))
        if not block:
            raise ParseError(parser)
        yield block
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`"""`
			`A partial parser for WebVTT segments. Interprets enough of the WebVTT stream`
			`to be able to assemble a single stand-alone subtitle file, suitably adjusting`
			`timestamps on the way, while everything else is passed through unmodified.`

			`Regular expressions based on the W3C WebVTT specification`
			`<https://www.w3.org/TR/webvtt1/>. The X-TIMESTAMP-MAP extension is described`
			`in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>.`
			`"""`

			`import io`
Remove Python 3.6 support Closes #3764 2022-07-17 18:20:54 -06:00			`import re`
[cleanup] Sort imports Using https://github.com/PyCQA/isort isort -m VERTICAL_HANGING_INDENT --py 36 -l 80 --rr -n --tc . 2022-04-11 16:32:57 -06:00
[utils] Standardize timestamp formatting code Closes #1285 2021-10-19 11:28:14 -06:00			`from .utils import int_or_none, timetuple_from_msec`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00

[cleanup] Upgrade syntax Using https://github.com/asottile/pyupgrade 1. `__future__` imports and `coding: utf-8` were removed 2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format` 3. f-strings were cherry-picked from `pyupgrade --py36-plus` Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts 2022-04-11 09:10:28 -06:00			`class _MatchParser:`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`"""`
			`An object that maintains the current parsing position and allows`
			`conveniently advancing it as syntax elements are successfully parsed.`
			`"""`

			`def __init__(self, string):`
			`self._data = string`
			`self._pos = 0`

			`def match(self, r):`
[compat] Split into sub-modules (#2173) Authored by: fstirlitz, pukkandan 2022-02-04 06:37:02 -07:00			`if isinstance(r, re.Pattern):`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`return r.match(self._data, self._pos)`
			`if isinstance(r, str):`
			`if self._data.startswith(r, self._pos):`
			`return len(r)`
			`return None`
			`raise ValueError(r)`

			`def advance(self, by):`
			`if by is None:`
			`amt = 0`
[compat] Split into sub-modules (#2173) Authored by: fstirlitz, pukkandan 2022-02-04 06:37:02 -07:00			`elif isinstance(by, re.Match):`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`amt = len(by.group(0))`
			`elif isinstance(by, str):`
			`amt = len(by)`
			`elif isinstance(by, int):`
			`amt = by`
			`else:`
			`raise ValueError(by)`
			`self._pos += amt`
			`return by`

			`def consume(self, r):`
			`return self.advance(self.match(r))`

			`def child(self):`
			`return _MatchChildParser(self)`


			`class _MatchChildParser(_MatchParser):`
			`"""`
			`A child parser state, which advances through the same data as`
			`its parent, but has an independent position. This is useful when`
			`advancing through syntax elements we might later want to backtrack`
			`from.`
			`"""`

			`def __init__(self, parent):`
[cleanup] Upgrade syntax Using https://github.com/asottile/pyupgrade 1. `__future__` imports and `coding: utf-8` were removed 2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format` 3. f-strings were cherry-picked from `pyupgrade --py36-plus` Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts 2022-04-11 09:10:28 -06:00			`super().__init__(parent._data)`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`self.__parent = parent`
			`self._pos = parent._pos`

			`def commit(self):`
			`"""`
			`Advance the parent state to the current position of this child state.`
			`"""`
			`self.__parent._pos = self._pos`
			`return self.__parent`


			`class ParseError(Exception):`
			`def __init__(self, parser):`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 17:09:58 -06:00			`data = parser._data[parser._pos:parser._pos + 100]`
			`super().__init__(f'Parse error at position {parser._pos} (near {data!r})')`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00

[WebVTT] Adjust parser to accommodate PBS subtitles (#922) Closes #921 2021-09-08 04:40:10 -06:00			`# While the specification <https://www.w3.org/TR/webvtt1/#webvtt-timestamp>`
			`# prescribes that hours must be 2 or more digits, timestamps with a single`
			`# digit for the hour part has been seen in the wild.`
			`# See https://github.com/yt-dlp/yt-dlp/issues/921`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`_REGEX_TS = re.compile(r'''(?x)`
[WebVTT] Adjust parser to accommodate PBS subtitles (#922) Closes #921 2021-09-08 04:40:10 -06:00			`(?:([0-9]{1,}):)?`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`([0-9]{2}):`
			`([0-9]{2})\.`
			`([0-9]{3})?`
			`''')`
			`_REGEX_EOF = re.compile(r'\Z')`
[webvtt] Handle premature EOF Closes #2867, closes #5600 Authored by: flashdagger 2022-11-20 01:42:23 -07:00			`_REGEX_NL = re.compile(r'(?:\r\n\|[\r\n]\|$)')`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`_REGEX_BLANK = re.compile(r'(?:\r\n\|[\r\n])+')`
[webvtt] Allow spaces before newlines for CueBlock (#7681) Closes #7453 Ref: https://www.w3.org/TR/webvtt1/#webvtt-cue-block 2023-11-28 16:20:06 -07:00			`_REGEX_OPTIONAL_WHITESPACE = re.compile(r'[ \t]*')`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00

			`def _parse_ts(ts):`
			`"""`
			`Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS)`
			`into an MPEG PES timestamp: a tick counter at 90 kHz resolution.`
			`"""`
[cleanup] Misc cleanup and refactor (#2173) 2022-04-17 14:58:28 -06:00			`return 90 * sum(`
			`int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1)))`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00

			`def _format_ts(ts):`
			`"""`
			`Convert an MPEG PES timestamp into a WebVTT timestamp.`
			`This will lose sub-millisecond precision.`
			`"""`
[utils] Standardize timestamp formatting code Closes #1285 2021-10-19 11:28:14 -06:00			`return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90))`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00

[cleanup] Upgrade syntax Using https://github.com/asottile/pyupgrade 1. `__future__` imports and `coding: utf-8` were removed 2. Files were rewritten with `pyupgrade --py36-plus --keep-percent-format` 3. f-strings were cherry-picked from `pyupgrade --py36-plus` Extractors are left untouched (except removing header) to avoid unnecessary merge conflicts 2022-04-11 09:10:28 -06:00			`class Block:`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`"""`
			`An abstract WebVTT block.`
			`"""`

			`def __init__(self, **kwargs):`
			`for key, val in kwargs.items():`
			`setattr(self, key, val)`

			`@classmethod`
			`def parse(cls, parser):`
			`m = parser.match(cls._REGEX)`
			`if not m:`
			`return None`
			`parser.advance(m)`
			`return cls(raw=m.group(0))`

			`def write_into(self, stream):`
			`stream.write(self.raw)`


			`class HeaderBlock(Block):`
			`"""`
			`A WebVTT block that may only appear in the header part of the file,`
			`i.e. before any cue blocks.`
			`"""`
			`pass`


			`class Magic(HeaderBlock):`
			`_REGEX = re.compile(r'\ufeff?WEBVTT([ \t][^\r\n]*)?(?:\r\n\|[\r\n])')`

			`# XXX: The X-TIMESTAMP-MAP extension is described in RFC 8216 §3.5`
			`# <https://tools.ietf.org/html/rfc8216#section-3.5>, but the RFC`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 17:09:58 -06:00			`# doesn't specify the exact grammar nor where in the WebVTT`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`# syntax it should be placed; the below has been devised based`
			`# on usage in the wild`
			`#`
			`# And strictly speaking, the presence of this extension violates`
			`# the W3C WebVTT spec. Oh well.`

			`_REGEX_TSMAP = re.compile(r'X-TIMESTAMP-MAP=')`
			`_REGEX_TSMAP_LOCAL = re.compile(r'LOCAL:')`
			`_REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)')`
[WebVTT] Adjust parser to accommodate PBS subtitles (#922) Closes #921 2021-09-08 04:40:10 -06:00			`_REGEX_TSMAP_SEP = re.compile(r'[ \t],[ \t]')`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00
[webvtt, extractor/youtube] Extract auto-subs from livestream VODs Closes #4130 Authored by: pukkandan, fstirlitz 2022-06-21 16:16:54 -06:00			`# This was removed from the spec in the 2017 revision;`
			`# the last spec draft to describe this syntax element is`
			`# <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>.`
			`# Nevertheless, YouTube keeps serving those`
			`_REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n])')`

[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`@classmethod`
			`def __parse_tsmap(cls, parser):`
			`parser = parser.child()`

			`while True:`
			`m = parser.consume(cls._REGEX_TSMAP_LOCAL)`
			`if m:`
			`m = parser.consume(_REGEX_TS)`
			`if m is None:`
			`raise ParseError(parser)`
			`local = _parse_ts(m)`
			`if local is None:`
			`raise ParseError(parser)`
			`else:`
			`m = parser.consume(cls._REGEX_TSMAP_MPEGTS)`
			`if m:`
			`mpegts = int_or_none(m.group(1))`
			`if mpegts is None:`
			`raise ParseError(parser)`
			`else:`
			`raise ParseError(parser)`
[WebVTT] Adjust parser to accommodate PBS subtitles (#922) Closes #921 2021-09-08 04:40:10 -06:00			`if parser.consume(cls._REGEX_TSMAP_SEP):`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`continue`
			`if parser.consume(_REGEX_NL):`
			`break`
			`raise ParseError(parser)`

			`parser.commit()`
			`return local, mpegts`

			`@classmethod`
			`def parse(cls, parser):`
			`parser = parser.child()`

			`m = parser.consume(cls._REGEX)`
			`if not m:`
			`raise ParseError(parser)`

			`extra = m.group(1)`
[webvtt, extractor/youtube] Extract auto-subs from livestream VODs Closes #4130 Authored by: pukkandan, fstirlitz 2022-06-21 16:16:54 -06:00			`local, mpegts, meta = None, None, ''`
			`while not parser.consume(_REGEX_NL):`
			`if parser.consume(cls._REGEX_TSMAP):`
			`local, mpegts = cls.__parse_tsmap(parser)`
			`continue`
			`m = parser.consume(cls._REGEX_META)`
			`if m:`
			`meta += m.group(0)`
			`continue`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`raise ParseError(parser)`
			`parser.commit()`
[webvtt, extractor/youtube] Extract auto-subs from livestream VODs Closes #4130 Authored by: pukkandan, fstirlitz 2022-06-21 16:16:54 -06:00			`return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00
			`def write_into(self, stream):`
			`stream.write('WEBVTT')`
			`if self.extra is not None:`
			`stream.write(self.extra)`
			`stream.write('\n')`
			`if self.local or self.mpegts:`
			`stream.write('X-TIMESTAMP-MAP=LOCAL:')`
			`stream.write(_format_ts(self.local if self.local is not None else 0))`
			`stream.write(',MPEGTS:')`
			`stream.write(str(self.mpegts if self.mpegts is not None else 0))`
			`stream.write('\n')`
[webvtt, extractor/youtube] Extract auto-subs from livestream VODs Closes #4130 Authored by: pukkandan, fstirlitz 2022-06-21 16:16:54 -06:00			`if self.meta:`
			`stream.write(self.meta)`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`stream.write('\n')`


			`class StyleBlock(HeaderBlock):`
			`_REGEX = re.compile(r'''(?x)`
			`STYLE[\ \t]*(?:\r\n\|[\r\n])`
			`((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*`
			`(?:\r\n\|[\r\n])`
			`''')`


			`class RegionBlock(HeaderBlock):`
			`_REGEX = re.compile(r'''(?x)`
			`REGION[\ \t]*`
			`((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*`
			`(?:\r\n\|[\r\n])`
			`''')`


			`class CommentBlock(Block):`
			`_REGEX = re.compile(r'''(?x)`
			`NOTE(?:\r\n\|[\ \t\r\n])`
			`((?:(?!-->)[^\r\n])+(?:\r\n\|[\r\n]))*`
			`(?:\r\n\|[\r\n])`
			`''')`


			`class CueBlock(Block):`
			`"""`
			`A cue block. The payload is not interpreted.`
			`"""`

			`_REGEX_ID = re.compile(r'((?:(?!-->)[^\r\n])+)(?:\r\n\|[\r\n])')`
			`_REGEX_ARROW = re.compile(r'[ \t]+-->[ \t]+')`
			`_REGEX_SETTINGS = re.compile(r'[ \t]+((?:(?!-->)[^\r\n])+)')`
			`_REGEX_PAYLOAD = re.compile(r'[^\r\n]+(?:\r\n\|[\r\n])?')`

			`@classmethod`
			`def parse(cls, parser):`
			`parser = parser.child()`

[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 17:09:58 -06:00			`id_ = None`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`m = parser.consume(cls._REGEX_ID)`
			`if m:`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 17:09:58 -06:00			`id_ = m.group(1)`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00
			`m0 = parser.consume(_REGEX_TS)`
			`if not m0:`
			`return None`
			`if not parser.consume(cls._REGEX_ARROW):`
			`return None`
			`m1 = parser.consume(_REGEX_TS)`
			`if not m1:`
			`return None`
			`m2 = parser.consume(cls._REGEX_SETTINGS)`
[webvtt] Fix 15f22b4880b6b3f71f350c64d70976ae65b9f1ca 2023-12-12 16:40:35 -07:00			`parser.consume(_REGEX_OPTIONAL_WHITESPACE)`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`if not parser.consume(_REGEX_NL):`
			`return None`

			`start = _parse_ts(m0)`
			`end = _parse_ts(m1)`
			`settings = m2.group(1) if m2 is not None else None`

			`text = io.StringIO()`
			`while True:`
			`m = parser.consume(cls._REGEX_PAYLOAD)`
			`if not m:`
			`break`
			`text.write(m.group(0))`

			`parser.commit()`
			`return cls(`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 17:09:58 -06:00			`id=id_,`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`start=start, end=end, settings=settings,`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 17:09:58 -06:00			`text=text.getvalue(),`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00			`)`

			`def write_into(self, stream):`
			`if self.id is not None:`
			`stream.write(self.id)`
			`stream.write('\n')`
			`stream.write(_format_ts(self.start))`
			`stream.write(' --> ')`
			`stream.write(_format_ts(self.end))`
			`if self.settings is not None:`
			`stream.write(' ')`
			`stream.write(self.settings)`
			`stream.write('\n')`
			`stream.write(self.text)`
			`stream.write('\n')`

[downloader/hls] Remove duplicate cues using a sliding window of candidates 2021-04-23 02:52:21 -06:00			`@property`
			`def as_json(self):`
			`return {`
			`'id': self.id,`
			`'start': self.start,`
			`'end': self.end,`
			`'text': self.text,`
			`'settings': self.settings,`
			`}`

[webvtt] Merge daisy-chained duplicate cues (#638) Fixes: https://github.com/yt-dlp/yt-dlp/issues/631#issuecomment-893338552 Previous deduplication algorithm only removed duplicate cues with identical text, styles and timestamps. This change also merges cues that come in ‘daisy chains’, where sequences of cues with identical text and styles appear in which the ending timestamp of one equals the starting timestamp of the next. This deduplication algorithm has the somewhat unfortunate side effect that NOTE blocks between cues, if found, will be emitted in a different order relative to their original cues. This may be unwanted if perfect fidelity is desired, but then so is daisy-chain deduplication itself. NOTE blocks ought to be ignored by WebVTT players in any case. Authored by: fstirlitz 2021-08-09 14:22:30 -06:00			`def __eq__(self, other):`
			`return self.as_json == other.as_json`

			`@classmethod`
			`def from_json(cls, json):`
			`return cls(`
			`id=json['id'],`
			`start=json['start'],`
			`end=json['end'],`
			`text=json['text'],`
[cleanup] Add more ruff rules (#10149) Authored by: seproDev Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com> Reviewed-by: Simon Sawicki <contact@grub4k.xyz> 2024-06-11 17:09:58 -06:00			`settings=json['settings'],`
[webvtt] Merge daisy-chained duplicate cues (#638) Fixes: https://github.com/yt-dlp/yt-dlp/issues/631#issuecomment-893338552 Previous deduplication algorithm only removed duplicate cues with identical text, styles and timestamps. This change also merges cues that come in ‘daisy chains’, where sequences of cues with identical text and styles appear in which the ending timestamp of one equals the starting timestamp of the next. This deduplication algorithm has the somewhat unfortunate side effect that NOTE blocks between cues, if found, will be emitted in a different order relative to their original cues. This may be unwanted if perfect fidelity is desired, but then so is daisy-chain deduplication itself. NOTE blocks ought to be ignored by WebVTT players in any case. Authored by: fstirlitz 2021-08-09 14:22:30 -06:00			`)`

			`def hinges(self, other):`
			`if self.text != other.text:`
			`return False`
			`if self.settings != other.settings:`
			`return False`
			`return self.start <= self.end == other.start <= other.end`

[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00
			`def parse_fragment(frag_content):`
			`"""`
			`A generator that yields (partially) parsed WebVTT blocks when given`
			`a bytes object containing the raw contents of a WebVTT file.`
			`"""`

[cleanup] Minor fixes (See desc) * [youtube] Fix `--youtube-skip-dash-manifest` * [build] Use `$()` in `Makefile`. Closes #3684 * Fix bug in 385ffb467b2285e85a2a5495b90314ba1f8e0700 * Fix bug in 43d7f5a5d0c77556156a3f8caa6976d3908a1e38 * [cleanup] Remove unnecessary `utf-8` from `str.encode`/`bytes.decode` * [utils] LazyList: Expose unnecessarily "protected" attributes and other minor cleanup 2022-05-09 05:54:28 -06:00			`parser = _MatchParser(frag_content.decode())`
[downloader/hls] Assemble single-file WebVTT subtitles from HLS segments 2021-04-28 04:47:30 -06:00
			`yield Magic.parse(parser)`

			`while not parser.match(_REGEX_EOF):`
			`if parser.consume(_REGEX_BLANK):`
			`continue`

			`block = RegionBlock.parse(parser)`
			`if block:`
			`yield block`
			`continue`
			`block = StyleBlock.parse(parser)`
			`if block:`
			`yield block`
			`continue`
			`block = CommentBlock.parse(parser)`
			`if block:`
			`yield block # XXX: or skip`
			`continue`

			`break`

			`while not parser.match(_REGEX_EOF):`
			`if parser.consume(_REGEX_BLANK):`
			`continue`

			`block = CommentBlock.parse(parser)`
			`if block:`
			`yield block # XXX: or skip`
			`continue`
			`block = CueBlock.parse(parser)`
			`if block:`
			`yield block`
			`continue`

			`raise ParseError(parser)`