From 7bc8780c576505fd87a5c85ff1f50ef2e8841d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 7 Oct 2014 22:23:05 +0700 Subject: [PATCH] [walla] Fix extractor and add subtitle tests --- test/test_subtitles.py | 28 ++++++++++ youtube_dl/extractor/walla.py | 101 ++++++++++++++++++++-------------- 2 files changed, 88 insertions(+), 41 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 48c302198..eb5f2f8dd 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -15,6 +15,7 @@ from youtube_dl.extractor import ( DailymotionIE, TEDIE, VimeoIE, + WallaIE, ) @@ -279,5 +280,32 @@ class TestVimeoSubtitles(BaseTestSubtitles): self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang) +class TestWallsSubtitles(BaseTestSubtitles): + url = 'http://vod.walla.co.il/movie/2705958/the-yes-men' + IE = WallaIE + + def test_list_subtitles(self): + self.DL.expect_warning(u'Automatic Captions not supported by this server') + self.DL.params['listsubtitles'] = True + info_dict = self.getInfoDict() + self.assertEqual(info_dict, None) + + def test_allsubtitles(self): + self.DL.expect_warning(u'Automatic Captions not supported by this server') + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['heb'])) + self.assertEqual(md5(subtitles['heb']), 'e758c5d7cb982f6bef14f377ec7a3920') + + def test_nosubtitles(self): + self.DL.expect_warning(u'video doesn\'t have subtitles') + self.url = 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(len(subtitles), 0) + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index e687c3af0..672bda7a7 100644 --- 
a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -1,70 +1,89 @@ # coding: utf-8 from __future__ import unicode_literals - import re -from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor +from ..utils import ( + xpath_text, + int_or_none, +) -class WallaIE(InfoExtractor): - _VALID_URL = r'http://vod\.walla\.co\.il/\w+/(?P<id>\d+)' +class WallaIE(SubtitlesInfoExtractor): + _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' _TEST = { 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', 'info_dict': { 'id': '2642630', + 'display_id': 'one-direction-all-for-one', 'ext': 'flv', 'title': 'וואן דיירקשן: ההיסטריה', + 'description': 'md5:de9e2512a92442574cdb0913c49bc4d8', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 3600, + }, + 'params': { + # rtmp download + 'skip_download': True, } } + _SUBTITLE_LANGS = { + 'עברית': 'heb', + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + display_id = mobj.group('display_id') - config_url = 'http://video2.walla.co.il/?w=null/null/%s/@@/video/flv_pl' % video_id - - webpage = self._download_webpage(config_url, video_id, '') - - media_id = self._html_search_regex(r'(\d+)', webpage, video_id, 'extract media id') - - prefix = '0' if len(media_id) == 7 else '' - - series = '%s%s' % (prefix, media_id[0:2]) - session = media_id[2:5] - episode = media_id[5:7] - - title = self._html_search_regex(r'(.*)', webpage, video_id, 'title') - - default_quality = self._html_search_regex(r'', webpage, video_id, 0) + video = self._download_xml( + 'http://video2.walla.co.il/?w=null/null/%s/@@/video/flv_pl' % video_id, + display_id) - quality = default_quality if default_quality else '40' + item = video.find('./items/item') - media_path = '/%s/%s/%s' % (series, session, media_id) #self._html_search_regex(r'.*(.*)' % default_quality ,webpage, '', flags=re.DOTALL) - - playpath = 'mp4:media/%s/%s/%s-%s' % (series, session, 
media_id, quality) #self._html_search_regex(r'.*(.*)' % default_quality ,webpage, '', flags=re.DOTALL) + title = xpath_text(item, './title', 'title') + description = xpath_text(item, './synopsis', 'description') + thumbnail = xpath_text(item, './preview_pic', 'thumbnail') + duration = int_or_none(xpath_text(item, './duration', 'duration')) subtitles = {} - - subtitle_url = self._html_search_regex(r'(.*).*', webpage, video_id, 0) - - print subtitle_url - - if subtitle_url: - subtitles_page = self._download_webpage(subtitle_url, video_id, '') - subtitles['heb'] = subtitles_page + for subtitle in item.findall('./subtitles/subtitle'): + lang = xpath_text(subtitle, './title') + subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = xpath_text(subtitle, './src') + + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, subtitles) + return + + subtitles = self.extract_subtitles(video_id, subtitles) + + formats = [] + for quality in item.findall('./qualities/quality'): + format_id = xpath_text(quality, './title') + fmt = { + 'url': 'rtmp://wafla.walla.co.il/vod', + 'play_path': xpath_text(quality, './src'), + 'player_url': 'http://isc.walla.co.il/w9/swf/video_swf/vod/WallaMediaPlayerAvod.swf', + 'page_url': url, + 'ext': 'flv', + 'format_id': xpath_text(quality, './title'), + } + m = re.search(r'^(?P<height>\d+)[Pp]', format_id) + if m: + fmt['height'] = int(m.group('height')) + formats.append(fmt) + self._sort_formats(formats) return { 'id': video_id, + 'display_id': display_id, 'title': title, - 'url': 'rtmp://wafla.walla.co.il:1935/vod', - 'player_url': 'http://isc.walla.co.il/w9/swf/video_swf/vod/WallaMediaPlayerAvod.swf', - 'page_url': url, - 'app': "vod", - 'play_path': playpath, - 'tc_url': 'rtmp://wafla.walla.co.il:1935/vod', - 'rtmp_protocol': 'rtmp', - 'ext': 'flv', + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, 'subtitles': subtitles, - } \ No newline at end of file + }