use ..utils/clean_html()
This commit is contained in:
parent
6624a2b07d
commit
4ec929dc9b
|
@ -7,7 +7,7 @@ import logging
|
||||||
import urllib2
|
import urllib2
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..utils import compat_urllib_request
|
from ..utils import compat_urllib_request, clean_html
|
||||||
|
|
||||||
|
|
||||||
class SohuIE(InfoExtractor):
|
class SohuIE(InfoExtractor):
|
||||||
|
@ -22,16 +22,6 @@ class SohuIE(InfoExtractor):
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
def _clearn_html(self, string):
|
|
||||||
tags = re.findall(r'<.+?>', string)
|
|
||||||
for t in tags:
|
|
||||||
string = string.replace(t, ' ')
|
|
||||||
for i in range(2):
|
|
||||||
spaces = re.findall(r'\s+', string)
|
|
||||||
for s in spaces:
|
|
||||||
string = string.replace(s, ' ')
|
|
||||||
string = string.strip()
|
|
||||||
return string
|
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
mobj = re.match(self._VALID_URL, url)
|
mobj = re.match(self._VALID_URL, url)
|
||||||
|
@ -40,7 +30,7 @@ class SohuIE(InfoExtractor):
|
||||||
pattern = r'<h1 id="video-title">\n*?(.+?)\n*?</h1>'
|
pattern = r'<h1 id="video-title">\n*?(.+?)\n*?</h1>'
|
||||||
compiled = re.compile(pattern, re.DOTALL)
|
compiled = re.compile(pattern, re.DOTALL)
|
||||||
title = self._search_regex(compiled, webpage, u'video title').strip('\t\n')
|
title = self._search_regex(compiled, webpage, u'video title').strip('\t\n')
|
||||||
title = self._clearn_html(title)
|
title = clean_html(title)
|
||||||
pattern = re.compile(r'var vid="(\d+)"')
|
pattern = re.compile(r'var vid="(\d+)"')
|
||||||
result = re.search(pattern, webpage)
|
result = re.search(pattern, webpage)
|
||||||
if not result:
|
if not result:
|
||||||
|
@ -93,5 +83,8 @@ class SohuIE(InfoExtractor):
|
||||||
}
|
}
|
||||||
files_info.append(info)
|
files_info.append(info)
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
if num_of_parts == 1:
|
||||||
|
info = files_info[0]
|
||||||
|
info['id'] = video_id
|
||||||
|
return info
|
||||||
return files_info
|
return files_info
|
||||||
|
|
Loading…
Reference in New Issue