ArteTvIE: support videos from videos.arte.tv

Each source of videos has a different extraction process, so each one is handled in a separate method of the extractor.
Changed the extension of videos from mp4 to flv.
master
Jaime Marquínez Ferrándiz 11 years ago
parent ca1c9cfe11
commit 37b6a6617f

@ -11,11 +11,21 @@ from ..utils import (
)
class ArteTvIE(InfoExtractor):
_VALID_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
"""
There are two sources of video in arte.tv: videos.arte.tv and
www.arte.tv/guide, the extraction process is different for each one.
The videos expire in 7 days, so we can't add tests.
"""
_EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
_VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html'
_LIVE_URL = r'index-[0-9]+\.html$'
IE_NAME = u'arte.tv'
@classmethod
def suitable(cls, url):
    """Tell whether this extractor can handle *url*.

    The url is accepted when it matches either the emission pattern
    (www.arte.tv/guide) or the videos pattern (videos.arte.tv).
    """
    for pattern in (cls._EMISSION_URL, cls._VIDEOS_URL):
        if re.match(pattern, url):
            return True
    return False
# TODO implement Live Stream
# def extractLiveStream(self, url):
# video_lang = url.split('/')[-4]
@ -44,17 +54,26 @@ class ArteTvIE(InfoExtractor):
# video_url = u'%s/%s' % (info.get('url'), info.get('path'))
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
# This is not a real id, it can be for example AJT for the news
# http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
video_id = mobj.group('id')
mobj = re.match(self._EMISSION_URL, url)
if mobj is not None:
name = mobj.group('name')
# This is not a real id, it can be for example AJT for the news
# http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
video_id = mobj.group('id')
return self._extract_emission(url, video_id)
mobj = re.match(self._VIDEOS_URL, url)
if mobj is not None:
id = mobj.group('id')
return self._extract_video(url, id)
if re.search(self._LIVE_URL, video_id) is not None:
raise ExtractorError(u'Arte live streams are not yet supported, sorry')
# self.extractLiveStream(url)
# return
def _extract_emission(self, url, video_id):
"""Extract from www.arte.tv/guide"""
webpage = self._download_webpage(url, video_id)
json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
@ -68,6 +87,7 @@ class ArteTvIE(InfoExtractor):
'description': player_info['VDE'],
'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
'thumbnail': player_info['programImage'],
'ext': 'flv',
}
formats = player_info['VSR'].values()
@ -78,9 +98,36 @@ class ArteTvIE(InfoExtractor):
if format_info['mediaType'] == u'rtmp':
info_dict['url'] = format_info['streamer']
info_dict['play_path'] = 'mp4:' + format_info['url']
info_dict['ext'] = 'mp4'
else:
info_dict['url'] = format_info['url']
info_dict['ext'] = 'mp4'
return info_dict
def _extract_video(self, url, video_id):
    """Extract from videos.arte.tv

    The page url is rewritten into the url of a player XML document; the
    ``ref`` attribute of its ``<video>`` element points to a second XML
    document, from which the title, the thumbnail and the stream urls are
    read.

    Returns a dict with the keys 'id', 'title', 'thumbnail', 'url' and
    'ext'.
    Raises ExtractorError if the second XML document lists no
    ``<url quality="...">`` entry.
    """
    player_xml_url = url.replace('/videos/', '/do_delegate/videos/')
    player_xml_url = player_xml_url.replace('.html', ',view,asPlayerXml.xml')
    player_xml = self._download_webpage(player_xml_url, video_id)

    config_xml_url = self._html_search_regex(r'<video lang=".*?" ref="(.*?)"', player_xml, 'config xml url')
    config_xml = self._download_webpage(config_xml_url, video_id)

    video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
    if not video_urls:
        # Fail with the extractor's own error type instead of a bare
        # IndexError from indexing an empty list below.
        raise ExtractorError(u'Unable to find any video url')

    def _key(m):
        # 'hd' ranks above every other quality label.
        if m.group('quality') == 'hd':
            return 2
        return 1

    # We pick the best quality; sorted() is stable, so among entries of
    # equal rank the one listed last in the XML is chosen.
    video_url = sorted(video_urls, key=_key)[-1].group('url')

    title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
    thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
                                        config_xml, 'thumbnail')
    return {'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'url': video_url,
            'ext': 'flv',
            }

Loading…
Cancel
Save