From c6ddbdb66c5d6ead5e198013c54ef53d641063f1 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 12:30:07 +1200 Subject: [PATCH 1/6] [voicerepublic] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/voicerepublic.py | 55 +++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/voicerepublic.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f117578a2..5cb3c304d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -634,6 +634,7 @@ from .vk import ( VKUserVideosIE, ) from .vodlocker import VodlockerIE +from .voicerepublic import VoiceRepublicIE from .vporn import VpornIE from .vrt import VRTIE from .vube import VubeIE diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py new file mode 100644 index 000000000..1a90693cb --- /dev/null +++ b/youtube_dl/extractor/voicerepublic.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..compat import ( + compat_urllib_request, +) + + +class VoiceRepublicIE(InfoExtractor): + _VALID_URL = r'https?://voicerepublic\.com/talks/(?P[0-9a-z-]+)' + _TEST = { + 'url': 'https://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', + 'md5': '0554a24d1657915aa8e8f84e15dc9353', + 'info_dict': { + 'id': '2296', + 'ext': 'm4a', + 'title': 'Watching the Watchers: Building a Sousveillance State', + 'thumbnail': 'https://voicerepublic.com/system/flyer/2296.png', + 'description': 'md5:715ba964958afa2398df615809cfecb1', + 'creator': 'M. C. McGrath', + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + req = compat_urllib_request.Request(url) + # Older versions of Firefox get redirected to an "upgrade browser" page + req.add_header('User-Agent', 'youtube-dl') + webpage = self._download_webpage(req, display_id) + thumbnail = self._og_search_thumbnail(webpage) + video_id = self._search_regex(r'/(\d+)\.png', thumbnail, 'id') + + if '
', webpage, 'author', fatal=False), + } From f900dc3fb9e17e399b0f33925ee239696cc46010 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 15:01:58 +1200 Subject: [PATCH 2/6] [voicerepublic] Extract author using _html_search_meta --- youtube_dl/extractor/voicerepublic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 1a90693cb..7d255d6fa 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -51,5 +51,5 @@ class VoiceRepublicIE(InfoExtractor): 'url': self._og_search_url(webpage), 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), - 'creator': self._search_regex(r'', webpage, 'author', fatal=False), + 'creator': self._html_search_meta('author', webpage), } From 03f760b1c0478c1f65cf6e978d7592be46873313 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 15:40:09 +1200 Subject: [PATCH 3/6] [voicerepublic] Remove creator field --- youtube_dl/extractor/voicerepublic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 7d255d6fa..960974e16 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -19,7 +19,6 @@ class VoiceRepublicIE(InfoExtractor): 'title': 'Watching the Watchers: Building a Sousveillance State', 'thumbnail': 'https://voicerepublic.com/system/flyer/2296.png', 'description': 'md5:715ba964958afa2398df615809cfecb1', - 'creator': 'M. C. McGrath', } } @@ -51,5 +50,4 @@ class VoiceRepublicIE(InfoExtractor): 'url': self._og_search_url(webpage), 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), - 'creator': self._html_search_meta('author', webpage), } From f03a8a3c4ec4dc95164c12181ffc1ddcb7583ef6 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 15:12:29 +1200 Subject: [PATCH 4/6] [voicerepublic] Raise ExtractorError if audio is still being processed --- youtube_dl/extractor/voicerepublic.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 960974e16..d3e35a815 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -2,10 +2,8 @@ from __future__ import unicode_literals from .common import InfoExtractor - -from ..compat import ( - compat_urllib_request, -) +from ..compat import compat_urllib_request +from ..utils import ExtractorError class VoiceRepublicIE(InfoExtractor): @@ -31,17 +29,16 @@ class VoiceRepublicIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) video_id = self._search_regex(r'/(\d+)\.png', thumbnail, 'id') - if '
Queued for processing, please stand by...' in webpage: + raise ExtractorError('Audio is still queued for processing') + + formats = [{ + 'url': 'https://voicerepublic.com/vrmedia/{}-clean.{}'.format(video_id, ext), + 'ext': ext, + 'format_id': ext, + 'vcodec': 'none', + } for ext in ['m4a', 'mp3', 'ogg']] + self._sort_formats(formats) return { 'id': video_id, From 28ebef0b1b1b7b97137fbd8e093c09cb51954606 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 16:03:09 +1200 Subject: [PATCH 5/6] [voicerepublic] Detect list of available formats from the web page --- youtube_dl/extractor/voicerepublic.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index d3e35a815..d150b5b5e 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_urllib_request from ..utils import ExtractorError @@ -32,12 +34,15 @@ class VoiceRepublicIE(InfoExtractor): if 'Queued for processing, please stand by...' in webpage: raise ExtractorError('Audio is still queued for processing') + ext_matches = re.finditer(r'data-\w+=\'/vrmedia/\d+-clean\.(\w+)\'', webpage) + exts = [match.group(1) for match in ext_matches] + formats = [{ 'url': 'https://voicerepublic.com/vrmedia/{}-clean.{}'.format(video_id, ext), 'ext': ext, 'format_id': ext, 'vcodec': 'none', - } for ext in ['m4a', 'mp3', 'ogg']] + } for ext in exts] self._sort_formats(formats) return { From 1dcb52188d3709711b3ea5ae1ff6bdb985e79c62 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 16:38:26 +1200 Subject: [PATCH 6/6] [voicerepublic] Remove hardcoded paths to media files --- youtube_dl/extractor/voicerepublic.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index d150b5b5e..a3e40b940 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -34,15 +34,12 @@ class VoiceRepublicIE(InfoExtractor): if 'Queued for processing, please stand by...' in webpage: raise ExtractorError('Audio is still queued for processing') - ext_matches = re.finditer(r'data-\w+=\'/vrmedia/\d+-clean\.(\w+)\'', webpage) - exts = [match.group(1) for match in ext_matches] - formats = [{ - 'url': 'https://voicerepublic.com/vrmedia/{}-clean.{}'.format(video_id, ext), + 'url': 'https://voicerepublic.com' + path, 'ext': ext, 'format_id': ext, 'vcodec': 'none', - } for ext in exts] + } for ext, path in re.findall(r"data-([^=]+)='(/[^']+\.\1)'", webpage)] self._sort_formats(formats) return {