[youtube] Support automatic captions whose original language is not English (fixes #1225) and allow downloading them in multiple languages.

master
Jaime Marquínez Ferrándiz 11 years ago
parent ac4f319ba1
commit 055e6f3657

@ -15,28 +15,33 @@ class SubtitlesInfoExtractor(InfoExtractor):
self.to_screen(u'%s: Available subtitles for video: %s' % self.to_screen(u'%s: Available subtitles for video: %s' %
(video_id, sub_lang)) (video_id, sub_lang))
def _extract_subtitles(self, video_id): def extract_subtitles(self, video_id, video_webpage=None):
""" returns {sub_lang: sub} or {} if subtitles not found """ """ returns {sub_lang: sub} or {} if subtitles not found """
available_subs_list = self._get_available_subtitles(video_id) if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
available_subs_list = self._get_available_subtitles(video_id)
elif self._downloader.params.get('writeautomaticsub', False):
available_subs_list = self._get_available_automatic_caption(video_id, video_webpage)
else:
return None
if not available_subs_list: # error, it didn't get the available subtitles if not available_subs_list: # error, it didn't get the available subtitles
return {} return {}
if self._downloader.params.get('allsubtitles', False): if self._downloader.params.get('allsubtitles', False):
sub_lang_list = available_subs_list sub_lang_list = available_subs_list
else: else:
if self._downloader.params.get('writesubtitles', False): if self._downloader.params.get('subtitleslangs', False):
if self._downloader.params.get('subtitleslangs', False): requested_langs = self._downloader.params.get('subtitleslangs')
requested_langs = self._downloader.params.get('subtitleslangs') elif 'en' in available_subs_list:
elif 'en' in available_subs_list: requested_langs = ['en']
requested_langs = ['en'] else:
else: requested_langs = [list(available_subs_list.keys())[0]]
requested_langs = [list(available_subs_list.keys())[0]]
sub_lang_list = {} sub_lang_list = {}
for sub_lang in requested_langs: for sub_lang in requested_langs:
if not sub_lang in available_subs_list: if not sub_lang in available_subs_list:
self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang) self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang)
continue continue
sub_lang_list[sub_lang] = available_subs_list[sub_lang] sub_lang_list[sub_lang] = available_subs_list[sub_lang]
subtitles = {} subtitles = {}
for sub_lang, url in sub_lang_list.items(): for sub_lang, url in sub_lang_list.items():
@ -64,23 +69,11 @@ class SubtitlesInfoExtractor(InfoExtractor):
""" """
pass pass
def _request_automatic_caption(self, video_id, webpage): def _get_available_automatic_caption(self, video_id, webpage):
""" """
returns {sub_lang: sub} or {} if not available returns {sub_lang: url} or {} if not available
Must be redefined by the subclasses that support automatic captions, Must be redefined by the subclasses that support automatic captions,
otherwise it will return {} otherwise it will return {}
""" """
self._downloader.report_warning(u'Automatic Captions not supported by this server') self._downloader.report_warning(u'Automatic Captions not supported by this server')
return {} return {}
def extract_subtitles(self, video_id, video_webpage=None):
"""
Extract the subtitles and/or the automatic captions if requested.
Returns None or a dictionary in the format {sub_lang: sub}
"""
video_subtitles = None
if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
video_subtitles = self._extract_subtitles(video_id)
elif self._downloader.params.get('writeautomaticsub', False):
video_subtitles = self._request_automatic_caption(video_id, video_webpage)
return video_subtitles

@ -5,6 +5,7 @@ import netrc
import re import re
import socket import socket
import itertools import itertools
import xml.etree.ElementTree
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor from .subtitles import SubtitlesInfoExtractor
@ -478,14 +479,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
return {} return {}
return sub_lang_list return sub_lang_list
def _request_automatic_caption(self, video_id, webpage): def _get_available_automatic_caption(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an """We need the webpage for getting the captions url, pass it as an
argument to speed up the process.""" argument to speed up the process."""
sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0]
sub_format = self._downloader.params.get('subtitlesformat') sub_format = self._downloader.params.get('subtitlesformat')
self.to_screen(u'%s: Looking for automatic captions' % video_id) self.to_screen(u'%s: Looking for automatic captions' % video_id)
mobj = re.search(r';ytplayer.config = ({.*?});', webpage) mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang err_msg = u'Couldn\'t find automatic captions for %s' % video_id
if mobj is None: if mobj is None:
self._downloader.report_warning(err_msg) self._downloader.report_warning(err_msg)
return {} return {}
@ -494,16 +494,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
args = player_config[u'args'] args = player_config[u'args']
caption_url = args[u'ttsurl'] caption_url = args[u'ttsurl']
timestamp = args[u'timestamp'] timestamp = args[u'timestamp']
params = compat_urllib_parse.urlencode({ # We get the available subtitles
'lang': 'en', list_params = compat_urllib_parse.urlencode({
'tlang': sub_lang, 'type': 'list',
'fmt': sub_format, 'tlangs': 1,
'ts': timestamp, 'asrs': 1,
'kind': 'asr',
}) })
subtitles_url = caption_url + '&' + params list_url = caption_url + '&' + list_params
sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') list_page = self._download_webpage(list_url, video_id)
return {sub_lang: sub} caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
original_lang = caption_list.find('track').attrib['lang_code']
sub_lang_list = {}
for lang_node in caption_list.findall('target'):
sub_lang = lang_node.attrib['lang_code']
params = compat_urllib_parse.urlencode({
'lang': original_lang,
'tlang': sub_lang,
'fmt': sub_format,
'ts': timestamp,
'kind': 'asr',
})
sub_lang_list[sub_lang] = caption_url + '&' + params
return sub_lang_list
# An extractor error can be raised by the download process if there are # An extractor error can be raised by the download process if there are
# no automatic captions but there are subtitles # no automatic captions but there are subtitles
except (KeyError, ExtractorError): except (KeyError, ExtractorError):

Loading…
Cancel
Save