yt-dlp/yt_dlp/extractor/sina.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    HEADRequest,
    ExtractorError,
    int_or_none,
    update_url_query,
    qualities,
    get_element_by_attribute,
    clean_html,
)


class SinaIE(InfoExtractor):
    """Information extractor for videos hosted on video.sina.com.cn.

    ``_VALID_URL`` recognises three URL shapes, each exposing a different
    named group:

    * ``id``        - the numeric video id is present in the URL itself
                      (after ``view/`` or after a ``#`` fragment marker).
    * ``pseudo_id`` - an ``.html``/``.shtml`` page whose last path component
                      is not the real id; the page is downloaded and the id
                      is read from its ``video_id: '...'`` assignment.
    * ``token``     - an ``outplay.php/<token>.swf`` embed URL; a HEAD
                      request is issued and the URL it resolves to is
                      matched again to obtain one of the two forms above.
    """

    _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/
                        (?:
                            (?:view/|.*\#)(?P<id>\d+)|
                            .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)|
                            # This is used by external sites like Weibo
                            api/sinawebApi/outplay\.php/(?P<token>.+?)\.swf
                        )
                  '''

    _TESTS = [
        {
            'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622',
            'md5': 'd38433e2fc886007729735650ae4b3e9',
            'info_dict': {
                'id': '250576622',
                'ext': 'mp4',
                'title': '现场:克鲁兹宣布退选 特朗普将稳获提名',
            }
        },
        {
            'url': 'http://video.sina.com.cn/v/b/101314253-1290078633.html',
            'info_dict': {
                'id': '101314253',
                'ext': 'flv',
                'title': '军方提高对朝情报监视级别',
            },
            'skip': 'the page does not exist or has been deleted',
        },
        {
            'url': 'http://video.sina.com.cn/view/250587748.html',
            'md5': '3d1807a25c775092aab3bc157fff49b4',
            'info_dict': {
                'id': '250587748',
                'ext': 'mp4',
                'title': '瞬间泪目：8年前汶川地震珍贵视频首曝光',
            },
        },
    ]

    def _real_extract(self, url):
        """Return the info dict for the video referenced by ``url``.

        The video id is resolved from the URL (following the embed redirect
        or scraping the page when necessary), then the ``h5play`` JSON API is
        queried for metadata and the list of MP4 renditions.

        Raises:
            ExtractorError: when an embed URL does not resolve to a
                recognisable video page, when the page shows an error
                banner (``errtitle`` element), or when the API answers with
                a ``code`` other than 1.
        """
        mobj = re.match(self._VALID_URL, url)

        if mobj.group('token') is not None:
            # The video id is in the redirected url
            self.to_screen('Getting video id')
            request = HEADRequest(url)
            _, urlh = self._download_webpage_handle(request, 'NA', False)
            url = urlh.geturl()
            mobj = re.match(self._VALID_URL, url)
            # Resolve the redirect exactly once.  If the final URL is not a
            # Sina video URL, or is still an embed URL (i.e. no useful
            # redirect happened), asking again would yield the same answer,
            # so fail with a clear message instead of recursing endlessly or
            # crashing on a ``None`` match object.
            if mobj is None or mobj.group('token') is not None:
                raise ExtractorError(
                    'Unable to resolve a video page from the embed URL '
                    '(redirected to %s)' % url)

        video_id = mobj.group('id')
        if not video_id:
            pseudo_id = mobj.group('pseudo_id')
            webpage = self._download_webpage(url, pseudo_id)
            error = get_element_by_attribute('class', 'errtitle', webpage)
            if error:
                raise ExtractorError('%s said: %s' % (
                    self.IE_NAME, clean_html(error)), expected=True)
            video_id = self._search_regex(
                r"video_id\s*:\s*'(\d+)'", webpage, 'video id')

        video_data = self._download_json(
            'http://s.video.sina.com.cn/video/h5play',
            video_id, query={'video_id': video_id})
        # ``.get`` keeps a malformed error payload (missing ``code`` or
        # ``message``) from surfacing as an unrelated KeyError.
        if video_data.get('code') != 1:
            raise ExtractorError('%s said: %s' % (
                self.IE_NAME, video_data.get('message') or 'unknown error'),
                expected=True)

        video_data = video_data['data']
        title = video_data['title']
        description = video_data.get('description')
        if description:
            description = description.strip()

        # ``dict.get(key, {})`` only falls back when the key is absent; the
        # value may also be present but not a mapping (e.g. null or an empty
        # list), so the type is checked explicitly before iterating.
        videos = video_data.get('videos')
        mp4_variants = videos.get('mp4') if isinstance(videos, dict) else None
        if not isinstance(mp4_variants, dict):
            mp4_variants = {}

        # Quality ids ordered from worst to best.
        preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd'])
        formats = []
        for quality_id, quality in mp4_variants.items():
            if not isinstance(quality, dict):
                continue
            file_api = quality.get('file_api')
            file_id = quality.get('file_id')
            # Both parts are required to build the media URL.
            if not file_api or not file_id:
                continue
            formats.append({
                'format_id': quality_id,
                'url': update_url_query(file_api, {'vid': file_id}),
                'quality': preference(quality_id),
                'ext': 'mp4',
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': video_data.get('image'),
            'duration': int_or_none(video_data.get('length')),
            'timestamp': int_or_none(video_data.get('create_time')),
            'formats': formats,
        }
-												Add SinaIE (related #1039): extractor for video.sina.com.cn

											
										
										
											2013-07-18 07:31:50 -06:00
+								# coding: utf-8
-												[sina] use unicode_literals

											
										
										
											2014-01-23 06:00:29 -07:00
+								from __future__ import unicode_literals
-												Add SinaIE (related #1039): extractor for video.sina.com.cn

											
										
										
											2013-07-18 07:31:50 -06:00
 								import re
 								from .common import InfoExtractor
-												[sina] fix extraction(fixes #1146)

											
										
										
											2016-05-13 13:24:36 -06:00
+								from ..utils import (
 								    HEADRequest,
 								    ExtractorError,
 								    int_or_none,
 								    update_url_query,
 								    qualities,
 								    get_element_by_attribute,
 								    clean_html,
 								)
-												Add SinaIE (related #1039): extractor for video.sina.com.cn

											
										
										
											2013-07-18 07:31:50 -06:00
 								class SinaIE(InfoExtractor):
-												[sina] fix extraction(fixes #1146)

											
										
										
											2016-05-13 13:24:36 -06:00
+								    _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/
 								                        (?:
-												[extractor] Fix pre-checking archive for some extractors
The `id` regex group must be present for `_match_id` and pre-checking archive to work correctly

											
										
										
											2021-06-06 03:35:07 -06:00
+								                            (?:view/|.*\#)(?P<id>\d+)|
-												[sina] fix extraction(fixes #1146)

											
										
										
											2016-05-13 13:24:36 -06:00
+								                            .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)|
-												Add SinaIE (related #1039): extractor for video.sina.com.cn

											
										
										
											2013-07-18 07:31:50 -06:00
+								                            # This is used by external sites like Weibo
-												[sina] fix extraction(fixes #1146)

											
										
										
											2016-05-13 13:24:36 -06:00
+								                            api/sinawebApi/outplay.php/(?P<token>.+?)\.swf
-												Add SinaIE (related #1039): extractor for video.sina.com.cn

											
										
										
											2013-07-18 07:31:50 -06:00
+								                        )
 								                  '''
-												[sina] Recognize http://video.sina.com.cn/v/b/{id}-*.html urls (fixes #2212)

											
										
										
											2014-01-23 06:03:14 -07:00
+								    _TESTS = [
 								        {
-												[sina] fix extraction(fixes #1146)

											
										
										
											2016-05-13 13:24:36 -06:00
+								            'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622',
 								            'md5': 'd38433e2fc886007729735650ae4b3e9',
-												[sina] Recognize http://video.sina.com.cn/v/b/{id}-*.html urls (fixes #2212)

											
										
										
											2014-01-23 06:03:14 -07:00
+								            'info_dict': {
-												[sina] fix extraction(fixes #1146)

											
										
										
											2016-05-13 13:24:36 -06:00
+								                'id': '250576622',
 								                'ext': 'mp4',
 								                'title': '现场:克鲁兹宣布退选 特朗普将稳获提名',
-												[sina] Recognize http://video.sina.com.cn/v/b/{id}-*.html urls (fixes #2212)

											
										
										
											2014-01-23 06:03:14 -07:00
+								            }
 								        },
 								        {
 								            'url': 'http://video.sina.com.cn/v/b/101314253-1290078633.html',
 								            'info_dict': {
 								                'id': '101314253',
 								                'ext': 'flv',
 								                'title': '军方提高对朝情报监视级别',
 								            },
-												[sina] fix extraction(fixes #1146)

											
										
										
											2016-05-13 13:24:36 -06:00
+								            'skip': 'the page does not exist or has been deleted',
 								        },
 								        {
 								            'url': 'http://video.sina.com.cn/view/250587748.html',
 								            'md5': '3d1807a25c775092aab3bc157fff49b4',
 								            'info_dict': {
 								                'id': '250587748',
 								                'ext': 'mp4',
 								                'title': '瞬间泪目：8年前汶川地震珍贵视频首曝光',
 								            },
-												[sina] Recognize http://video.sina.com.cn/v/b/{id}-*.html urls (fixes #2212)

											
										
										
											2014-01-23 06:03:14 -07:00
+								        },
 								    ]
-												Add SinaIE (related #1039): extractor for video.sina.com.cn

											
										
										
											2013-07-18 07:31:50 -06:00
 								    def _real_extract(self, url):
-												[sina] Modernize and simplify

											
										
										
											2015-02-01 07:16:35 -07:00
+								        mobj = re.match(self._VALID_URL, url)
-												Add SinaIE (related #1039): extractor for video.sina.com.cn

											
										
										
											2013-07-18 07:31:50 -06:00
-												[extractor] Fix pre-checking archive for some extractors
The `id` regex group must be present for `_match_id` and pre-checking archive to work correctly

											
										
										
											2021-06-06 03:35:07 -06:00
+								        video_id = mobj.group('id')
-												[sina] fix extraction(fixes #1146)

											
										
										
											2016-05-13 13:24:36 -06:00
+								        if not video_id:
 								            if mobj.group('token') is not None:
 								                # The video id is in the redirected url
 								                self.to_screen('Getting video id')
 								                request = HEADRequest(url)
-												remove unnecessary assignment parenthesis

											
										
										
											2018-05-26 09:12:44 -06:00
+								                _, urlh = self._download_webpage_handle(request, 'NA', False)
-												[sina] fix extraction(fixes #1146)

											
										
										
											2016-05-13 13:24:36 -06:00
+								                return self._real_extract(urlh.geturl())
 								            else:
 								                pseudo_id = mobj.group('pseudo_id')
 								                webpage = self._download_webpage(url, pseudo_id)
 								                error = get_element_by_attribute('class', 'errtitle', webpage)
 								                if error:
 								                    raise ExtractorError('%s said: %s' % (
 								                        self.IE_NAME, clean_html(error)), expected=True)
 								                video_id = self._search_regex(
 								                    r"video_id\s*:\s*'(\d+)'", webpage, 'video id')
 								        video_data = self._download_json(
 								            'http://s.video.sina.com.cn/video/h5play',
 								            video_id, query={'video_id': video_id})
 								        if video_data['code'] != 1:
 								            raise ExtractorError('%s said: %s' % (
 								                self.IE_NAME, video_data['message']), expected=True)
 								        else:
 								            video_data = video_data['data']
 								            title = video_data['title']
 								            description = video_data.get('description')
 								            if description:
 								                description = description.strip()
 								            preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd'])
 								            formats = []
 								            for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items():
 								                file_api = quality.get('file_api')
 								                file_id = quality.get('file_id')
 								                if not file_api or not file_id:
 								                    continue
 								                formats.append({
 								                    'format_id': quality_id,
 								                    'url': update_url_query(file_api, {'vid': file_id}),
-												[formatsort] Remove misuse of 'preference'

'preference' is to be used only when the format is better than ALL qualities of a lower preference irrespective of ANY sorting order the user requests. See deezer.py for correct use of this

In the older sorting method, `preference`, `quality` and `language_preference` were functionally almost equivalent. So these disparities don't really matter there

Also, despite what the documentation says, the default for `preference` was actually 0 and not -1. I have tried to correct this and also account for it when converting `preference` to `quality`

											
										
										
											2021-02-18 15:03:16 -07:00
+								                    'quality': preference(quality_id),
-												[sina] fix extraction(fixes #1146)

											
										
										
											2016-05-13 13:24:36 -06:00
+								                    'ext': 'mp4',
 								                })
 								            self._sort_formats(formats)
 								            return {
 								                'id': video_id,
 								                'title': title,
 								                'description': description,
 								                'thumbnail': video_data.get('image'),
 								                'duration': int_or_none(video_data.get('length')),
 								                'timestamp': int_or_none(video_data.get('create_time')),
 								                'formats': formats,
 								            }