From decf86044d17a8ec04e43a4805a0092622d976ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Sun, 16 Jul 2017 03:06:04 +0700
Subject: [PATCH] [pearvideo] Improve (closes #13031)

---
Reviewer notes (this section is commentary, not part of the commit):
 * Moves the extractor from pear.py (PearIE) to pearvideo.py (PearVideoIE)
   and updates the import in extractors.py accordingly.
 * The new extractor builds one format per `<name>Url = "..."` assignment
   found in the page, ranked by the order given to qualities(), instead of
   returning the single hdUrl value.
 * Title and description fall back to data-title / data-summary attributes;
   description additionally falls back to the "Description" meta tag.
   A timestamp is parsed from the element with class "date".
 * NOTE(review): this copy of the patch had been flattened onto a few lines
   and had its angle-bracket sequences stripped. Named groups (id, url,
   value) were restored from the group() calls that reference them. The
   h1/div tag names at the start of the class-based regexes, and the
   closing h1 tag in the deleted file, are assumed from upstream - confirm
   against the original commit before applying.

 youtube_dl/extractor/extractors.py |  2 +-
 youtube_dl/extractor/pear.py       | 34 ----------------
 youtube_dl/extractor/pearvideo.py  | 63 ++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 35 deletions(-)
 delete mode 100644 youtube_dl/extractor/pear.py
 create mode 100644 youtube_dl/extractor/pearvideo.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 75c1a3d0e..28f0d3f0d 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -762,7 +762,7 @@ from .pandoratv import PandoraTVIE
 from .parliamentliveuk import ParliamentLiveUKIE
 from .patreon import PatreonIE
 from .pbs import PBSIE
-from .pear import PearIE
+from .pearvideo import PearVideoIE
 from .people import PeopleIE
 from .periscope import (
     PeriscopeIE,
diff --git a/youtube_dl/extractor/pear.py b/youtube_dl/extractor/pear.py
deleted file mode 100644
index 77fd46852..000000000
--- a/youtube_dl/extractor/pear.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-
-
-class PearIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://www.pearvideo.com/video_1076290',
-        'info_dict': {
-            'id': '1076290',
-            'ext': 'mp4',
-            'title': '小浣熊在主人家玻璃上滚石头:没砸',
-            'description': '小浣熊找到一个小石头,仿佛发现了一个宝贝。它不停地用石头按在玻璃上,滚来滚去,吸引主人注意。',
-            'url': 'http://video.pearvideo.com/mp4/short/20170508/cont-1076290-10438018-hd.mp4'
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        title = self._html_search_regex(r'<h1[^>]+class="video-tt">(.+)</h1>', webpage, 'title', fatal=False)
-        description = self._html_search_regex(r'<div[^>]+class="summary"[^>]*>([^<]+)<', webpage, 'description', fatal=False)
-        url = self._html_search_regex(r'hdUrl="(.*?)"', webpage, 'url', fatal=False)
-
-        return {
-            'id': video_id,
-            'ext': 'mp4',
-            'title': title,
-            'description': description,
-            'url': url
-        }
diff --git a/youtube_dl/extractor/pearvideo.py b/youtube_dl/extractor/pearvideo.py
new file mode 100644
index 000000000..1d777221c
--- /dev/null
+++ b/youtube_dl/extractor/pearvideo.py
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    qualities,
+    unified_timestamp,
+)
+
+
+class PearVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.pearvideo.com/video_1076290',
+        'info_dict': {
+            'id': '1076290',
+            'ext': 'mp4',
+            'title': '小浣熊在主人家玻璃上滚石头:没砸',
+            'description': 'md5:01d576b747de71be0ee85eb7cac25f9d',
+            'timestamp': 1494275280,
+            'upload_date': '20170508',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        quality = qualities(
+            ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src'))
+
+        formats = [{
+            'url': mobj.group('url'),
+            'format_id': mobj.group('id'),
+            'quality': quality(mobj.group('id')),
+        } for mobj in re.finditer(
+            r'(?P<id>[a-zA-Z]+)Url\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\2',
+            webpage)]
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            (r'<h1[^>]+\bclass=(["\'])video-tt\1[^>]*>(?P<value>[^<]+)',
+             r'<[^>]+\bdata-title=(["\'])(?P<value>(?:(?!\1).)+)\1'),
+            webpage, 'title', group='value')
+        description = self._search_regex(
+            (r'<div[^>]+\bclass=(["\'])summary\1[^>]*>(?P<value>[^<]+)',
+             r'<[^>]+\bdata-summary=(["\'])(?P<value>(?:(?!\1).)+)\1'),
+            webpage, 'description', default=None,
+            group='value') or self._html_search_meta('Description', webpage)
+        timestamp = unified_timestamp(self._search_regex(
+            r'<div[^>]+\bclass=["\']date["\'][^>]*>([^<]+)',
+            webpage, 'timestamp', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'formats': formats,
+        }