yt-dlp/yt_dlp/extractor/sohu.py

import base64
import re
import urllib.parse

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    float_or_none,
    int_or_none,
    traverse_obj,
    try_get,
    unified_timestamp,
    url_or_none,
    urljoin,
)


class SohuIE(InfoExtractor):
    """Extractor for ``tv.sohu.com`` and ``my.tv.sohu.com`` ``.shtml`` video pages.

    A video may consist of several parts ("blocks"); a single-part video is
    returned as a plain video result, anything else as a ``multi_video``
    result with one entry per part.
    """
    _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?'

    # As long as the allot server answers with a URL on this host, the request
    # is repeated (passing back the returned 'nid' as 'idc') to obtain another URL
    _PENDING_CDN_HOST = 'newflv.sohu.ccgslb.net'
    # One initial request plus up to five retries per format and part
    _MAX_URL_ATTEMPTS = 6

    # Sohu videos give different MD5 sums on Travis CI and my machine
    _TESTS = [{
        'note': 'This video is available only in Mainland China',
        'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super',
        'info_dict': {
            'id': '382479172',
            'ext': 'mp4',
            'title': 'MV：Far East Movement《The Illest》',
        },
        'skip': 'Only available in China',
    }, {
        'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
        'info_dict': {
            'id': '409385080',
            'ext': 'mp4',
            'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》',
        },
        'skip': 'no longer available',
    }, {
        'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
        'info_dict': {
            'id': '78693464',
            'ext': 'mp4',
            'title': '【爱范品】第31期：MWC见不到的奇葩手机',
            'uploader': '爱范儿视频',
            'duration': 213,
            'timestamp': 1425519600,
            'upload_date': '20150305',
            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg',
            'tags': ['爱范儿', '爱范品', 'MWC', '手机'],
        },
    }, {
        'note': 'Multipart video',
        'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml',
        'info_dict': {
            'id': '78910339',
            'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
            'uploader': '小苍cany',
            'duration': 744.0,
            'timestamp': 1426269360,
            'upload_date': '20150313',
            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg',
            'tags': ['小苍MM', '英雄联盟', '实战秘籍'],
        },
        'playlist': [{
            'info_dict': {
                'id': '78910339_part1',
                'ext': 'mp4',
                'duration': 294,
                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
            },
        }, {
            'info_dict': {
                'id': '78910339_part2',
                'ext': 'mp4',
                'duration': 300,
                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
            },
        }, {
            'info_dict': {
                'id': '78910339_part3',
                'ext': 'mp4',
                'duration': 150,
                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
            },
        }],
    }, {
        'note': 'Video with title containing dash',
        'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml',
        'info_dict': {
            'id': '78932792',
            'ext': 'mp4',
            'title': 'youtube-dl testing video',
            'duration': 360,
            'timestamp': 1426348620,
            'upload_date': '20150314',
            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M02/8A/00/MTAuMTAuODguNzk=/6_14cee1be192g102SysCutcloud_78932792_7_7b.jpg',
            'tags': [],
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Extract formats and metadata for the video page at ``url``.

        Returns an info dict: a single video when the video has exactly one
        part, otherwise a ``multi_video`` whose entries are the parts.

        Raises ExtractorError when the site reports a broken video
        (``status == 12``), when a clip URL is missing, or when no usable
        download URL is obtained within ``_MAX_URL_ATTEMPTS`` requests; raises
        a geo-restriction error when playback is otherwise not permitted.
        """

        def _fetch_data(vid_id, mytv=False):
            # Download the JSON description for one vid; the endpoint differs
            # between my.tv.sohu.com and tv.sohu.com pages
            if mytv:
                base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid='
            else:
                base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='

            return self._download_json(
                base_data_url + vid_id, video_id,
                f'Downloading JSON data for {vid_id}',
                headers=self.geo_verification_headers())

        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        mytv = mobj.group('mytv') is not None

        webpage = self._download_webpage(url, video_id)

        # Strip the site name suffix from the og:title
        title = re.sub(r'( - 高清正版在线观看)? - 搜狐视频$', '', self._og_search_title(webpage))

        vid = self._html_search_regex(
            r'var vid ?= ?["\'](\d+)["\']',
            webpage, 'video path')
        vid_data = _fetch_data(vid, mytv)
        if vid_data['play'] != 1:
            if vid_data.get('status') == 12:
                raise ExtractorError(
                    f'{self.IE_NAME} said: There\'s something wrong in the video.',
                    expected=True)
            else:
                self.raise_geo_restricted(
                    f'{self.IE_NAME} said: The video is only licensed to users in Mainland China.')

        # Each available quality has its own vid; reuse the JSON already
        # downloaded when a quality's vid is the page's own vid
        formats_json = {}
        for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'):
            vid_id = vid_data['data'].get(f'{format_id}Vid')
            if not vid_id:
                continue
            vid_id = str(vid_id)
            formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv)

        part_count = vid_data['data']['totalBlocks']

        playlist = []
        for i in range(part_count):
            formats = []
            for format_id, format_data in formats_json.items():
                allot = format_data['allot']

                data = format_data['data']
                clip_url = traverse_obj(data, (('clipsURL', 'mp4PlayUrl'), i, {url_or_none}), get_all=False)
                if not clip_url:
                    raise ExtractorError(f'Unable to extract url for clip {i}')
                su = data['su']

                cdn_id = None
                for attempt in range(self._MAX_URL_ATTEMPTS):
                    params = {
                        'prot': 9,
                        'file': clip_url,
                        'new': su[i],
                        'prod': 'h5n',
                        'rb': 1,
                    }

                    if cdn_id is not None:
                        params['idc'] = cdn_id

                    download_note = f'Downloading {format_id} video URL part {i + 1} of {part_count}'

                    if attempt > 0:
                        download_note += f' (retry #{attempt})'
                    part_info = self._parse_json(self._download_webpage(
                        f'http://{allot}/?{urllib.parse.urlencode(params)}',
                        video_id, download_note), video_id)

                    video_url = part_info['url']
                    cdn_id = part_info.get('nid')

                    # Stop as soon as a usable URL is returned, so that a
                    # successful final attempt is not thrown away
                    if self._PENDING_CDN_HOST not in video_url:
                        break
                else:
                    raise ExtractorError('Failed to get video URL')

                formats.append({
                    'url': video_url,
                    'format_id': format_id,
                    'filesize': int_or_none(
                        try_get(data, lambda x: x['clipsBytes'][i])),
                    'width': int_or_none(data.get('width')),
                    'height': int_or_none(data.get('height')),
                    'fps': int_or_none(data.get('fps')),
                })

            playlist.append({
                'id': f'{video_id}_part{i + 1}',
                'title': title,
                # Optional metadata: a missing or short list must not abort extraction
                'duration': traverse_obj(vid_data, ('data', 'clipsDuration', i)),
                'formats': formats,
            })

        if len(playlist) == 1:
            info = playlist[0]
            info['id'] = video_id
        else:
            info = {
                '_type': 'multi_video',
                'entries': playlist,
                'id': video_id,
                'title': title,
                'duration': traverse_obj(vid_data, ('data', 'totalDuration', {float_or_none})),
            }

        if mytv:
            publish_time = unified_timestamp(self._search_regex(
                r'publishTime:\s*["\'](\d+-\d+-\d+ \d+:\d+)["\']', webpage, 'publish time', fatal=False))
        else:
            publish_time = traverse_obj(vid_data, ('tv_application_time', {unified_timestamp}))

        return {
            # NOTE(review): presumably the site reports times in UTC+8, hence
            # the 8 hour shift - confirm
            'timestamp': publish_time - 8 * 3600 if publish_time else None,
            **traverse_obj(vid_data, {
                'alt_title': ('data', 'subName', {str}),
                'uploader': ('wm_data', 'wm_username', {str}),
                'thumbnail': ('data', 'coverImg', {url_or_none}),
                'tags': ('data', 'tag', {str.split}),
            }),
            # Values from the part/playlist info take precedence over the above
            **info,
        }


class SohuVIE(InfoExtractor):
    """Resolve ``tv.sohu.com/v/<base64>.html`` URLs to the page handled by SohuIE.

    The identifier in the URL is the URL-safe base64 encoding of the path of
    the underlying ``.shtml`` page.
    """
    _VALID_URL = r'https?://tv\.sohu\.com/v/(?P<id>[\w=-]+)\.html(?:$|[#?])'

    _TESTS = [{
        'note': 'Multipart video',
        'url': 'https://tv.sohu.com/v/MjAyMzA2MTQvbjYwMTMxNTE5Mi5zaHRtbA==.html',
        'info_dict': {
            'id': '601315192',
            'title': '《淬火丹心》第1集',
            'alt_title': '“点天灯”发生事故',
            'duration': 2701.692,
            'timestamp': 1686758040,
            'upload_date': '20230614',
            'thumbnail': 'http://photocdn.tv.sohu.com/img/20230614/vrsa_hor_1686738763256_454010551.jpg',
        },
        'playlist_mincount': 9,
        'skip': 'Only available in China',
    }, {
        'url': 'https://tv.sohu.com/v/dXMvMjMyNzk5ODg5Lzc4NjkzNDY0LnNodG1s.html',
        'info_dict': {
            'id': '78693464',
            'ext': 'mp4',
            'title': '【爱范品】第31期：MWC见不到的奇葩手机',
            'uploader': '爱范儿视频',
            'duration': 213,
            'timestamp': 1425519600,
            'upload_date': '20150305',
            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg',
            'tags': ['爱范儿', '爱范品', 'MWC', '手机'],
        },
    }, {
        'note': 'Multipart video',
        'url': 'https://tv.sohu.com/v/dXMvMjQyNTYyMTYzLzc4OTEwMzM5LnNodG1s.html?src=pl',
        'info_dict': {
            'id': '78910339',
            'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
            'uploader': '小苍cany',
            'duration': 744.0,
            'timestamp': 1426269360,
            'upload_date': '20150313',
            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg',
            'tags': ['小苍MM', '英雄联盟', '实战秘籍'],
        },
        'playlist_mincount': 3,
    }]

    def _real_extract(self, url):
        """Decode the page path from ``url`` and delegate to SohuIE."""
        decoded_path = base64.urlsafe_b64decode(self._match_id(url)).decode()

        # Paths of the form '<digits>/n<digits>.shtml' are served from
        # tv.sohu.com; every other path is looked up on my.tv.sohu.com
        if re.match(r'\d+/n\d+\.shtml', decoded_path):
            host_prefix = 'tv'
        else:
            host_prefix = 'my.tv'

        target_url = urljoin(f'http://{host_prefix}.sohu.com/', decoded_path)
        return self.url_result(target_url, SohuIE)
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								import base64
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 05:59:08 -06:00
+								import re
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								import urllib.parse
-												add an extractor for tv.sohu.com

											
										
										
											2013-08-02 03:58:46 -06:00
 								from .common import InfoExtractor
-												[sohu] Fix numeric fields

											
										
										
											2017-06-08 11:16:42 -06:00
+								from ..utils import (
 								    ExtractorError,
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								    float_or_none,
-												[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)

Authored by: bashonly, seproDev, Grub4K

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com>
											
										
										
											2024-05-26 13:27:21 -06:00
+								    int_or_none,
 								    traverse_obj,
-												[sohu] Fix numeric fields

											
										
										
											2017-06-08 11:16:42 -06:00
+								    try_get,
-												[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)

Authored by: bashonly, seproDev, Grub4K

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com>
											
										
										
											2024-05-26 13:27:21 -06:00
+								    unified_timestamp,
 								    url_or_none,
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								    urljoin,
-												[sohu] Fix numeric fields

											
										
										
											2017-06-08 11:16:42 -06:00
+								)
-												add an extractor for tv.sohu.com

											
										
										
											2013-08-02 03:58:46 -06:00
 								class SohuIE(InfoExtractor):
-												[sohu] add support for my.tv.sohu.com urls (fixes #1398)

											
										
										
											2013-09-09 11:56:16 -06:00
+								    _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?'
-												add an extractor for tv.sohu.com

											
										
										
											2013-08-02 03:58:46 -06:00
-												[sohu] Update _TESTS (closes #10260)

											
										
										
											2016-08-08 04:48:21 -06:00
+								    # Sohu videos give different MD5 sums on Travis CI and my machine
-												[sohu] Fix info extractor and add tests

											
										
										
											2015-03-05 11:43:05 -07:00
+								    _TESTS = [{
 								        'note': 'This video is available only in Mainland China',
-												[sohu] Modernize

											
										
										
											2014-11-26 04:53:55 -07:00
+								        'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super',
 								        'info_dict': {
 								            'id': '382479172',
 								            'ext': 'mp4',
 								            'title': 'MV：Far East Movement《The Illest》',
-												add an extractor for tv.sohu.com

											
										
										
											2013-08-02 03:58:46 -06:00
+								        },
-												[letv/sohu] Skip tests relying on external proxies

The proxy is currently broken. See #5655 and zhuzhuor/Unblock-Youku#427

											
										
										
											2015-05-20 00:08:23 -06:00
+								        'skip': 'On available in China',
-												[sohu] Fix info extractor and add tests

											
										
										
											2015-03-05 11:43:05 -07:00
+								    }, {
 								        'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
 								        'info_dict': {
 								            'id': '409385080',
 								            'ext': 'mp4',
 								            'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》',
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								        },
 								        'skip': 'no longer available',
-												[sohu] Fix info extractor and add tests

											
										
										
											2015-03-05 11:43:05 -07:00
+								    }, {
 								        'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
 								        'info_dict': {
 								            'id': '78693464',
 								            'ext': 'mp4',
 								            'title': '【爱范品】第31期：MWC见不到的奇葩手机',
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								            'uploader': '爱范儿视频',
 								            'duration': 213,
 								            'timestamp': 1425519600,
 								            'upload_date': '20150305',
 								            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg',
 								            'tags': ['爱范儿', '爱范品', 'MWC', '手机'],
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								        },
-												[Sohu] Add a multiplart video test case

											
										
										
											2015-03-14 10:59:49 -06:00
+								    }, {
 								        'note': 'Multipart video',
 								        'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml',
 								        'info_dict': {
 								            'id': '78910339',
-												[Sohu] Fix title extraction

											
										
										
											2015-04-19 05:19:44 -06:00
+								            'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								            'uploader': '小苍cany',
 								            'duration': 744.0,
 								            'timestamp': 1426269360,
 								            'upload_date': '20150313',
 								            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg',
 								            'tags': ['小苍MM', '英雄联盟', '实战秘籍'],
-												[Sohu] Add a multiplart video test case

											
										
										
											2015-03-14 10:59:49 -06:00
+								        },
 								        'playlist': [{
 								            'info_dict': {
 								                'id': '78910339_part1',
 								                'ext': 'mp4',
 								                'duration': 294,
 								                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								            },
-												[Sohu] Add a multiplart video test case

											
										
										
											2015-03-14 10:59:49 -06:00
+								        }, {
 								            'info_dict': {
 								                'id': '78910339_part2',
 								                'ext': 'mp4',
 								                'duration': 300,
 								                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								            },
-												[Sohu] Add a multiplart video test case

											
										
										
											2015-03-14 10:59:49 -06:00
+								        }, {
 								            'info_dict': {
 								                'id': '78910339_part3',
 								                'ext': 'mp4',
 								                'duration': 150,
 								                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								            },
 								        }],
-												[Sohu] Fix title extraction

											
										
										
											2015-03-14 11:05:01 -06:00
+								    }, {
-												[sohu] Fix test's note info

											
										
										
											2015-03-17 09:39:31 -06:00
+								        'note': 'Video with title containing dash',
-												[Sohu] Fix title extraction

											
										
										
											2015-03-14 11:05:01 -06:00
+								        'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml',
 								        'info_dict': {
 								            'id': '78932792',
 								            'ext': 'mp4',
-												Completely change project name to yt-dlp (#85)

* All modules and binary names are changed
* All documentation references changed
* yt-dlp no longer loads youtube-dlc config files
* All URLs changed to point to organization account

Co-authored-by: Pccode66
Co-authored-by: pukkandan
											
										
										
											2021-02-24 11:45:56 -07:00
+								            'title': 'youtube-dl testing video',
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								            'duration': 360,
 								            'timestamp': 1426348620,
 								            'upload_date': '20150314',
 								            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M02/8A/00/MTAuMTAuODguNzk=/6_14cee1be192g102SysCutcloud_78932792_7_7b.jpg',
 								            'tags': [],
-												[Sohu] Fix title extraction

											
										
										
											2015-03-14 11:05:01 -06:00
+								        },
 								        'params': {
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								            'skip_download': True,
 								        },
-												[sohu] Fix info extractor and add tests

											
										
										
											2015-03-05 11:43:05 -07:00
+								    }]
-												add an extractor for tv.sohu.com

											
										
										
											2013-08-02 03:58:46 -06:00
 								    def _real_extract(self, url):
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 05:59:08 -06:00
-												[sohu] add support for my.tv.sohu.com urls (fixes #1398)

											
										
										
											2013-09-09 11:56:16 -06:00
+								        def _fetch_data(vid_id, mytv=False):
 								            if mytv:
 								                base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid='
 								            else:
-												[sohu] Modernize

											
										
										
											2014-11-26 04:53:55 -07:00
+								                base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
-												[sohu] Modernize and extract all formats and more metadata (Closes #4409, closes #2056, closes #3009)

											
										
										
											2014-12-25 09:25:05 -07:00
-												[sohu] Fix test's note info

											
										
										
											2015-03-17 09:39:31 -06:00
+								            return self._download_json(
-												Rename --cn-verfication-proxy to --geo-verification-proxy

And deprecate the former one

Since commit f1388739002a7fd1e8e9c41b642734786fc6c391, this option is
not limited to China websites, so rename it.

											
										
										
											2016-07-03 09:23:48 -06:00
+								                base_data_url + vid_id, video_id,
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								                f'Downloading JSON data for {vid_id}',
-												Rename --cn-verfication-proxy to --geo-verification-proxy

And deprecate the former one

Since commit f1388739002a7fd1e8e9c41b642734786fc6c391, this option is
not limited to China websites, so rename it.

											
										
										
											2016-07-03 09:23:48 -06:00
+								                headers=self.geo_verification_headers())
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 05:59:08 -06:00
-												[extractor] Common function `_match_valid_url`

											
										
										
											2021-08-18 19:41:24 -06:00
+								        mobj = self._match_valid_url(url)
-												add an extractor for tv.sohu.com

											
										
										
											2013-08-02 03:58:46 -06:00
+								        video_id = mobj.group('id')
-												[sohu] add support for my.tv.sohu.com urls (fixes #1398)

											
										
										
											2013-09-09 11:56:16 -06:00
+								        mytv = mobj.group('mytv') is not None
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 05:59:08 -06:00
-												add an extractor for tv.sohu.com

											
										
										
											2013-08-02 03:58:46 -06:00
+								        webpage = self._download_webpage(url, video_id)
-												[Sohu] Fix title extraction

											
										
										
											2015-03-14 11:05:01 -06:00
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								        title = re.sub(r'( - 高清正版在线观看)? - 搜狐视频$', '', self._og_search_title(webpage))
-												add an extractor for tv.sohu.com

											
										
										
											2013-08-02 03:58:46 -06:00
-												[sohu] Modernize and extract all formats and more metadata (Closes #4409, closes #2056, closes #3009)

											
										
										
											2014-12-25 09:25:05 -07:00
+								        vid = self._html_search_regex(
 								            r'var vid ?= ?["\'](\d+)["\']',
 								            webpage, 'video path')
 								        vid_data = _fetch_data(vid, mytv)
-												[sohu] Enhance error handling

											
										
										
											2015-05-08 23:19:54 -06:00
+								        if vid_data['play'] != 1:
 								            if vid_data.get('status') == 12:
 								                raise ExtractorError(
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								                    f'{self.IE_NAME} said: There\'s something wrong in the video.',
-												[sohu] Enhance error handling

											
										
										
											2015-05-08 23:19:54 -06:00
+								                    expected=True)
 								            else:
-												[sohu] raise GeoRestrictedError

											
										
										
											2017-02-23 03:49:35 -07:00
+								                self.raise_geo_restricted(
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								                    f'{self.IE_NAME} said: The video is only licensed to users in Mainland China.')
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 05:59:08 -06:00
-												[sohu] Modernize and extract all formats and more metadata (Closes #4409, closes #2056, closes #3009)

											
										
										
											2014-12-25 09:25:05 -07:00
+								        formats_json = {}
 								        for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'):
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								            vid_id = vid_data['data'].get(f'{format_id}Vid')
-												[sohu] Modernize and extract all formats and more metadata (Closes #4409, closes #2056, closes #3009)

											
										
										
											2014-12-25 09:25:05 -07:00
+								            if not vid_id:
 								                continue
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								            vid_id = str(vid_id)
-												[sohu] Modernize and extract all formats and more metadata (Closes #4409, closes #2056, closes #3009)

											
										
										
											2014-12-25 09:25:05 -07:00
+								            formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv)
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 05:59:08 -06:00
-												[sohu] Modernize and extract all formats and more metadata (Closes #4409, closes #2056, closes #3009)

											
										
										
											2014-12-25 09:25:05 -07:00
+								        part_count = vid_data['data']['totalBlocks']
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 05:59:08 -06:00
 								        playlist = []
 								        for i in range(part_count):
-												[sohu] Modernize and extract all formats and more metadata (Closes #4409, closes #2056, closes #3009)

											
										
										
											2014-12-25 09:25:05 -07:00
+								            formats = []
 								            for format_id, format_data in formats_json.items():
-												Revert "[sohu] Update extractor"

This reverts commit 32060c6d6b618fa858b2ce43db34d02fd43bc542.

											
										
										
											2015-06-21 09:29:40 -06:00
+								                allot = format_data['allot']
-												[sohu] Modernize and extract all formats and more metadata (Closes #4409, closes #2056, closes #3009)

											
										
										
											2014-12-25 09:25:05 -07:00
+								                data = format_data['data']
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								                clip_url = traverse_obj(data, (('clipsURL', 'mp4PlayUrl'), i, {url_or_none}), get_all=False)
 								                if not clip_url:
 								                    raise ExtractorError(f'Unable to extract url for clip {i}')
-												Revert "[sohu] Update extractor"

This reverts commit 32060c6d6b618fa858b2ce43db34d02fd43bc542.

											
										
										
											2015-06-21 09:29:40 -06:00
+								                su = data['su']
-												[sohu] Fix extraction again

											
										
										
											2015-06-21 10:59:55 -06:00
+								                video_url = 'newflv.sohu.ccgslb.net'
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								                cdn_id = None
-												[sohu] Fix extraction again

											
										
										
											2015-06-21 10:59:55 -06:00
+								                retries = 0
-												Revert "[sohu] Update extractor"

This reverts commit 32060c6d6b618fa858b2ce43db34d02fd43bc542.

											
										
										
											2015-06-21 09:29:40 -06:00
-												[sohu] Fix extraction again

											
										
										
											2015-06-21 10:59:55 -06:00
+								                while 'newflv.sohu.ccgslb.net' in video_url:
 								                    params = {
 								                        'prot': 9,
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								                        'file': clip_url,
-												[sohu] Fix extraction again

											
										
										
											2015-06-21 10:59:55 -06:00
+								                        'new': su[i],
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								                        'prod': 'h5n',
-												[sohu]fix 403 forbidden

											
										
										
											2015-12-07 23:12:32 -07:00
+								                        'rb': 1,
-												[sohu] Fix extraction again

											
										
										
											2015-06-21 10:59:55 -06:00
+								                    }
-												[sohu] Fix info extractor and add tests

											
										
										
											2015-03-05 11:43:05 -07:00
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								                    if cdn_id is not None:
 								                        params['idc'] = cdn_id
-												[sohu] Fix extraction again

											
										
										
											2015-06-21 10:59:55 -06:00
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								                    download_note = f'Downloading {format_id} video URL part {i + 1} of {part_count}'
-												[sohu] Fix extraction again

											
										
										
											2015-06-21 10:59:55 -06:00
 								                    if retries > 0:
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								                        download_note += f' (retry #{retries})'
-												[sohu] Fix extraction again

											
										
										
											2015-06-21 10:59:55 -06:00
+								                    part_info = self._parse_json(self._download_webpage(
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								                        f'http://{allot}/?{urllib.parse.urlencode(params)}',
-												[sohu] Fix extraction again

											
										
										
											2015-06-21 10:59:55 -06:00
+								                        video_id, download_note), video_id)
 								                    video_url = part_info['url']
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								                    cdn_id = part_info.get('nid')
-												[sohu] Fix extraction again

											
										
										
											2015-06-21 10:59:55 -06:00
 								                    retries += 1
 								                    if retries > 5:
 								                        raise ExtractorError('Failed to get video URL')
-												[sohu] Modernize and extract all formats and more metadata (Closes #4409, closes #2056, closes #3009)

											
										
										
											2014-12-25 09:25:05 -07:00
 								                formats.append({
 								                    'url': video_url,
 								                    'format_id': format_id,
-												[sohu] Fix numeric fields

											
										
										
											2017-06-08 11:16:42 -06:00
+								                    'filesize': int_or_none(
 								                        try_get(data, lambda x: x['clipsBytes'][i])),
 								                    'width': int_or_none(data.get('width')),
 								                    'height': int_or_none(data.get('height')),
 								                    'fps': int_or_none(data.get('fps')),
-												[sohu] Modernize and extract all formats and more metadata (Closes #4409, closes #2056, closes #3009)

											
										
										
											2014-12-25 09:25:05 -07:00
+								                })
 								            playlist.append({
-												[cleanup] Add more ruff rules (#10149)

Authored by: seproDev

Reviewed-by: bashonly <88596187+bashonly@users.noreply.github.com>
Reviewed-by: Simon Sawicki <contact@grub4k.xyz>
											
										
										
											2024-06-11 17:09:58 -06:00
+								                'id': f'{video_id}_part{i + 1}',
-												add an extractor for tv.sohu.com

											
										
										
											2013-08-02 03:58:46 -06:00
+								                'title': title,
-												[sohu] Modernize and extract all formats and more metadata (Closes #4409, closes #2056, closes #3009)

											
										
										
											2014-12-25 09:25:05 -07:00
+								                'duration': vid_data['data']['clipsDuration'][i],
 								                'formats': formats,
 								            })
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 05:59:08 -06:00
 								        if len(playlist) == 1:
 								            info = playlist[0]
-												use ..utils/clean_html()

											
										
										
											2013-08-02 20:29:58 -06:00
+								            info['id'] = video_id
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 05:59:08 -06:00
+								        else:
 								            info = {
-												[Sohu] Fix title extraction

											
										
										
											2015-04-19 05:19:44 -06:00
+								                '_type': 'multi_video',
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 05:59:08 -06:00
+								                'entries': playlist,
 								                'id': video_id,
-												[Sohu] Fix title extraction

											
										
										
											2015-04-19 05:19:44 -06:00
+								                'title': title,
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								                'duration': traverse_obj(vid_data, ('data', 'totalDuration', {float_or_none})),
-												[sohu] Handle encoding, and fix tests

											
										
										
											2013-08-28 05:59:08 -06:00
+								            }
-												[ie/sohu] Fix extractor (#7628)

Closes #1667, Closes #7463
Authored by: c-basalt, bashonly
											
										
										
											2023-09-16 15:13:04 -06:00
+								        if mytv:
 								            publish_time = unified_timestamp(self._search_regex(
 								                r'publishTime:\s*["\'](\d+-\d+-\d+ \d+:\d+)["\']', webpage, 'publish time', fatal=False))
 								        else:
 								            publish_time = traverse_obj(vid_data, ('tv_application_time', {unified_timestamp}))
 								        return {
 								            'timestamp': publish_time - 8 * 3600 if publish_time else None,
 								            **traverse_obj(vid_data, {
 								                'alt_title': ('data', 'subName', {str}),
 								                'uploader': ('wm_data', 'wm_username', {str}),
 								                'thumbnail': ('data', 'coverImg', {url_or_none}),
 								                'tags': ('data', 'tag', {str.split}),
 								            }),
 								            **info,
 								        }
 								class SohuVIE(InfoExtractor):
 								    """Redirect ``tv.sohu.com/v/<id>.html`` pages to ``SohuIE``.
 								
 								    The ``<id>`` URL component is the URL-safe base64 encoding of the path
 								    of the underlying ``*.shtml`` page. This extractor only decodes that
 								    path and hands the resulting URL over to ``SohuIE``.
 								    """
 								
 								    _VALID_URL = r'https?://tv\.sohu\.com/v/(?P<id>[\w=-]+)\.html(?:$|[#?])'
 								    _TESTS = [{
 								        'note': 'Multipart video',
 								        'url': 'https://tv.sohu.com/v/MjAyMzA2MTQvbjYwMTMxNTE5Mi5zaHRtbA==.html',
 								        'info_dict': {
 								            'id': '601315192',
 								            'title': '《淬火丹心》第1集',
 								            'alt_title': '“点天灯”发生事故',
 								            'duration': 2701.692,
 								            'timestamp': 1686758040,
 								            'upload_date': '20230614',
 								            'thumbnail': 'http://photocdn.tv.sohu.com/img/20230614/vrsa_hor_1686738763256_454010551.jpg',
 								        },
 								        'playlist_mincount': 9,
 								        'skip': 'Only available in China',
 								    }, {
 								        'url': 'https://tv.sohu.com/v/dXMvMjMyNzk5ODg5Lzc4NjkzNDY0LnNodG1s.html',
 								        'info_dict': {
 								            'id': '78693464',
 								            'ext': 'mp4',
 								            'title': '【爱范品】第31期：MWC见不到的奇葩手机',
 								            'uploader': '爱范儿视频',
 								            'duration': 213,
 								            'timestamp': 1425519600,
 								            'upload_date': '20150305',
 								            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg',
 								            'tags': ['爱范儿', '爱范品', 'MWC', '手机'],
 								        },
 								    }, {
 								        'note': 'Multipart video',
 								        'url': 'https://tv.sohu.com/v/dXMvMjQyNTYyMTYzLzc4OTEwMzM5LnNodG1s.html?src=pl',
 								        'info_dict': {
 								            'id': '78910339',
 								            'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
 								            'uploader': '小苍cany',
 								            'duration': 744.0,
 								            'timestamp': 1426269360,
 								            'upload_date': '20150313',
 								            'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg',
 								            'tags': ['小苍MM', '英雄联盟', '实战秘籍'],
 								        },
 								        'playlist_mincount': 3,
 								    }]
 								
 								    def _real_extract(self, url):
 								        """Decode the base64 path in *url* and delegate to ``SohuIE``.
 								
 								        Returns the ``url_result`` for the decoded page, rooted at
 								        ``tv.sohu.com`` when the path looks like ``<digits>/n<digits>.shtml``
 								        and at ``my.tv.sohu.com`` otherwise.
 								        """
 								        encoded_id = self._match_id(url)
 								        # _VALID_URL does not require the trailing '=' padding, but
 								        # urlsafe_b64decode rejects input whose length is not a multiple
 								        # of 4, so restore it first (no-op for already padded ids).
 								        padded_id = encoded_id + '=' * (-len(encoded_id) % 4)
 								        path = base64.urlsafe_b64decode(padded_id).decode()
 								        # Paths of the form '<digits>/n<digits>.shtml' belong to the main
 								        # site; anything else is served from the 'my.tv' subdomain.
 								        subdomain = 'tv' if re.match(r'\d+/n\d+\.shtml', path) else 'my.tv'
 								        return self.url_result(urljoin(f'http://{subdomain}.sohu.com/', path), SohuIE)