From f11554092b419baa919875432fe6ebc1f22f5307 Mon Sep 17 00:00:00 2001 From: Tjark Saul Date: Fri, 17 Apr 2015 09:21:54 +0200 Subject: [PATCH 1/7] [Lecture2Go] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/lecture2go.py | 33 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 youtube_dl/extractor/lecture2go.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bbf3be41d..3d6e981b2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -249,6 +249,7 @@ from .krasview import KrasViewIE from .ku6 import Ku6IE from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .lecture2go import Lecture2GoIE from .letv import ( LetvIE, LetvTvIE, diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py new file mode 100644 index 000000000..9cf28e31c --- /dev/null +++ b/youtube_dl/extractor/lecture2go.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class Lecture2GoIE(InfoExtractor): + _VALID_URL = r'https?://lecture2go.uni-hamburg.de/veranstaltungen/-/v/(?P[0-9]+)' + _TEST = { + 'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473', + 'md5': 'a9e76f83b3ef58019c4b7dbc35f406c1', + 'info_dict': { + 'id': '17473', + 'ext': 'mp4', + 'url': 'https://fms1.rrz.uni-hamburg.de/abo/64.050_FrankHeitmann_2015-04-13_14-35.mp4', + 'title': '2 - Endliche Automaten und reguläre Sprachen' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'(.*?)', webpage, 'title') + video_url = self._search_regex(r'b.isFirefox..a.useHTML5\).b.setOption.a,"src","(.*.mp4)"\).else', webpage, 'video_url') + creator = self._html_search_regex(r'
(.*)
', webpage, 'creator') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'creator': creator + } From 981b9cdc8c12d817eaf3ec6b030538c252efe48e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 22:09:01 +0800 Subject: [PATCH 2/7] [lecture2go] Improve some regular expressions --- youtube_dl/extractor/lecture2go.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py index 9cf28e31c..fd115ff54 100644 --- a/youtube_dl/extractor/lecture2go.py +++ b/youtube_dl/extractor/lecture2go.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class Lecture2GoIE(InfoExtractor): - _VALID_URL = r'https?://lecture2go.uni-hamburg.de/veranstaltungen/-/v/(?P[0-9]+)' + _VALID_URL = r'https?://lecture2go\.uni-hamburg\.de/veranstaltungen/-/v/(?P\d+)' _TEST = { 'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473', 'md5': 'a9e76f83b3ef58019c4b7dbc35f406c1', @@ -21,9 +21,9 @@ class Lecture2GoIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(.*?)', webpage, 'title') + title = self._html_search_regex(r']+class="title">(.+)', webpage, 'title') video_url = self._search_regex(r'b.isFirefox..a.useHTML5\).b.setOption.a,"src","(.*.mp4)"\).else', webpage, 'video_url') - creator = self._html_search_regex(r'
(.*)
', webpage, 'creator') + creator = self._html_search_regex(r']+id="description">([^<]+)', webpage, 'creator') return { 'id': video_id, From 795704f0f1f963d3f61a7e20074ce41eeb3cdf95 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 22:39:46 +0800 Subject: [PATCH 3/7] [lecture2go] Support more formats --- youtube_dl/extractor/lecture2go.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py index fd115ff54..0075b8a2e 100644 --- a/youtube_dl/extractor/lecture2go.py +++ b/youtube_dl/extractor/lecture2go.py @@ -1,7 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import determine_ext class Lecture2GoIE(InfoExtractor): @@ -22,12 +25,26 @@ class Lecture2GoIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex(r']+class="title">(.+)', webpage, 'title') - video_url = self._search_regex(r'b.isFirefox..a.useHTML5\).b.setOption.a,"src","(.*.mp4)"\).else', webpage, 'video_url') + + formats = [] + for url in set(re.findall(r'"src","([^"]+)"', webpage)): + ext = determine_ext(url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats(url, video_id)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats(url, video_id)) + else: + formats.append({ + 'url': url, + }) + + self._sort_formats(formats) + creator = self._html_search_regex(r']+id="description">([^<]+)', webpage, 'creator') return { 'id': video_id, 'title': title, - 'url': video_url, + 'formats': formats, 'creator': creator } From 1e124295644e3760cd457bb1a6ae717e1cb2c0fc Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 23:05:14 +0800 Subject: [PATCH 4/7] [lecture2go] Update _TEST --- youtube_dl/extractor/lecture2go.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py index 0075b8a2e..d0e9416f5 100644 --- a/youtube_dl/extractor/lecture2go.py +++ b/youtube_dl/extractor/lecture2go.py @@ -11,12 +11,12 @@ class Lecture2GoIE(InfoExtractor): _VALID_URL = r'https?://lecture2go\.uni-hamburg\.de/veranstaltungen/-/v/(?P\d+)' _TEST = { 'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473', - 'md5': 'a9e76f83b3ef58019c4b7dbc35f406c1', + 'md5': 'ac02b570883020d208d405d5a3fd2f7f', 'info_dict': { 'id': '17473', - 'ext': 'mp4', - 'url': 'https://fms1.rrz.uni-hamburg.de/abo/64.050_FrankHeitmann_2015-04-13_14-35.mp4', - 'title': '2 - Endliche Automaten und reguläre Sprachen' + 'ext': 'flv', + 'title': '2 - Endliche Automaten und reguläre Sprachen', + 'creator': 'Frank Heitmann', } } From 9c29bc69f7d6365835f495dff10f3c5f49671a55 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 23:15:22 +0800 Subject: [PATCH 5/7] [utils] Improve parse_duration Now dots are parsed. For example '87 Min.' --- test/test_utils.py | 1 + youtube_dl/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index e13e11b59..65692a9fb 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -324,6 +324,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('02:03:04'), 7384) self.assertEqual(parse_duration('01:02:03:04'), 93784) self.assertEqual(parse_duration('1 hour 3 minutes'), 3780) + self.assertEqual(parse_duration('87 Min.'), 5220) def test_fix_xml_ampersands(self): self.assertEqual( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 942f76d24..ae813099d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1309,10 +1309,10 @@ def parse_duration(s): m = re.match( r'''(?ix)(?:P?T)? (?: - (?P[0-9.]+)\s*(?:mins?|minutes?)\s*| + (?P[0-9.]+)\s*(?:mins?\.?|minutes?)\s*| (?P[0-9.]+)\s*(?:hours?)| - \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*| + \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*| (?: (?: (?:(?P[0-9]+)\s*(?:[:d]|days?)\s*)? From e9c6deffee26db40992293b3055df31804ca7e12 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 23:22:19 +0800 Subject: [PATCH 6/7] [lecture2go] Add more metadata fields --- youtube_dl/extractor/lecture2go.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py index d0e9416f5..a2f9d5c54 100644 --- a/youtube_dl/extractor/lecture2go.py +++ b/youtube_dl/extractor/lecture2go.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import ( + determine_ext, + parse_duration, + int_or_none, +) class Lecture2GoIE(InfoExtractor): @@ -17,6 +21,7 @@ class Lecture2GoIE(InfoExtractor): 'ext': 'flv', 'title': '2 - Endliche Automaten und reguläre Sprachen', 'creator': 'Frank Heitmann', + 'duration': 5220, } } @@ -41,10 +46,16 @@ class Lecture2GoIE(InfoExtractor): self._sort_formats(formats) creator = self._html_search_regex(r']+id="description">([^<]+)', webpage, 'creator') + duration = parse_duration(self._html_search_regex( + r'Duration:\s*\s*]*>([^<]+)', webpage, 'duration', fatal=False)) + view_count = int_or_none(self._html_search_regex( + r'Views:\s*\s*]+>(\d+)', webpage, 'view count', fatal=False)) return { 'id': video_id, 'title': title, 'formats': formats, - 'creator': creator + 'creator': creator, + 'duration': duration, + 'view_count': view_count, } From 40101dc311909523852a88ba69df76be9b6bc920 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 23:25:32 +0800 Subject: [PATCH 7/7] [lecture2go] Make optional fields non-fatal --- youtube_dl/extractor/lecture2go.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py index a2f9d5c54..40a3d2346 100644 --- a/youtube_dl/extractor/lecture2go.py +++ b/youtube_dl/extractor/lecture2go.py @@ -45,7 +45,8 @@ class Lecture2GoIE(InfoExtractor): self._sort_formats(formats) - creator = self._html_search_regex(r']+id="description">([^<]+)', webpage, 'creator') + creator = self._html_search_regex( + r']+id="description">([^<]+)', webpage, 'creator', fatal=False) duration = parse_duration(self._html_search_regex( r'Duration:\s*\s*]*>([^<]+)', webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex(