#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import absolute_import import base64 import datetime import itertools import netrc import os import re import socket import time import email.utils import xml.etree.ElementTree import random import math import operator import hashlib import binascii import urllib from .utils import * from .extractor.common import InfoExtractor, SearchInfoExtractor class YoutubeIE(InfoExtractor): """Information extractor for youtube.com.""" _VALID_URL = r"""^ ( (?:https?://)? # http(s):// (optional) (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ |(?: # or the v= param in all its forms (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) v= ) )? # optional -> youtube.com/xxxx is OK )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]+) # here is it! the YouTube video ID (?(1).+)? 
# if we found the ID, everything can follow $""" _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _NETRC_MACHINE = 'youtube' # Listed in order of quality _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] _video_extensions = { '13': '3gp', '17': 'mp4', '18': 'mp4', '22': 'mp4', '37': 'mp4', '38': 'video', # You actually don't know if this will be MOV, AVI or whatever '43': 'webm', '44': 'webm', '45': 'webm', '46': 'webm', } _video_dimensions = { '5': '240x400', '6': '???', '13': '???', '17': '144x176', '18': '360x640', '22': '720x1280', '34': '360x640', '35': '480x854', '37': '1080x1920', '38': '3072x4096', '43': '360x640', '44': '480x854', '45': '720x1280', '46': '1080x1920', } IE_NAME = u'youtube' @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" if YoutubePlaylistIE.suitable(url): return False return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def report_lang(self): """Report attempt to set language.""" self.to_screen(u'Setting language') def report_login(self): """Report attempt to log in.""" self.to_screen(u'Logging in') def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" self.to_screen(u'%s: Downloading video webpage' % video_id) def report_video_info_webpage_download(self, video_id): """Report attempt to download video info webpage.""" self.to_screen(u'%s: Downloading video info webpage' % video_id) def report_video_subtitles_download(self, video_id): """Report attempt to download video info webpage.""" self.to_screen(u'%s: Checking available subtitles' % video_id) def 
report_video_subtitles_request(self, video_id, sub_lang, format): """Report attempt to download video info webpage.""" self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) def report_video_subtitles_available(self, video_id, sub_lang_list): """Report available subtitles.""" sub_lang = ",".join(list(sub_lang_list.keys())) self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) def report_information_extraction(self, video_id): """Report attempt to extract video information.""" self.to_screen(u'%s: Extracting video information' % video_id) def report_unavailable_format(self, video_id, format): """Report extracted video URL.""" self.to_screen(u'%s: Format %s not available' % (video_id, format)) def report_rtmp_download(self): """Indicate the download will use the RTMP protocol.""" self.to_screen(u'RTMP download detected') @staticmethod def _decrypt_signature(s): """Decrypt the key the two subkeys must have a length of 43""" (a,b) = s.split('.') if len(a) != 43 or len(b) != 43: raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid') b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40] a = a[-40:] s_dec = '.'.join((a,b))[::-1] return s_dec def _get_available_subtitles(self, video_id): self.report_video_subtitles_download(video_id) request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: return (u'unable to download video subtitles: %s' % compat_str(err), None) sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) if not sub_lang_list: return (u'video doesn\'t have subtitles', None) return sub_lang_list def _list_available_subtitles(self, video_id): sub_lang_list = 
def _request_subtitle(self, sub_lang, sub_name, video_id, format):
    """Download a single subtitle track from the timedtext API.

    Returns a tuple (error_message, sub_lang, sub). On success
    error_message is None; on failure sub_lang and sub are None.
    """
    self.report_video_subtitles_request(video_id, sub_lang, format)
    query = compat_urllib_parse.urlencode({
        'lang': sub_lang,
        'name': sub_name,
        'v': video_id,
        'fmt': format,
    })
    subtitle_url = 'http://www.youtube.com/api/timedtext?' + query
    try:
        response = compat_urllib_request.urlopen(subtitle_url)
        subtitle = response.read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        message = u'unable to download video subtitles: %s' % compat_str(err)
        return (message, None, None)
    if subtitle:
        return (None, sub_lang, subtitle)
    # An empty body means the server had nothing for this language/format.
    return (u'Did not fetch video subtitles', None, None)
def _print_formats(self, formats):
    """Print one line per format code: code, file extension, dimensions.

    Codes missing from _video_extensions are shown as 'flv'; codes missing
    from _video_dimensions are shown as '???'.
    """
    print('Available formats:')
    for format_code in formats:
        extension = self._video_extensions.get(format_code, 'flv')
        dimensions = self._video_dimensions.get(format_code, '???')
        print('%s\t:\t%s\t[%s]' % (format_code, extension, dimensions))
def _extract_id(self, url):
    """Return the video ID, i.e. capture group 2 of _VALID_URL matched on url.

    _VALID_URL is matched in re.VERBOSE mode.
    Raises ExtractorError when url does not match.
    """
    match = re.match(self._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') video_id = self._extract_id(url) # Get video webpage self.report_video_webpage_download(video_id) url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id request = compat_urllib_request.Request(url) try: video_webpage_bytes = compat_urllib_request.urlopen(request).read() except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') # Attempt to extract SWF player URL mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) if mobj is not None: player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) else: player_url = None # Get video info self.report_video_info_webpage_download(video_id) for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' % (video_id, el_type)) video_info_webpage = self._download_webpage(video_info_url, video_id, note=False, errnote='unable to download video info webpage') video_info = compat_parse_qs(video_info_webpage) if 'token' in video_info: break if 'token' not in video_info: if 'reason' in video_info: raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0]) else: raise ExtractorError(u'"token" parameter not in video info for unknown reason') # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: raise ExtractorError(u'"rental" videos not supported') # Start extracting information self.report_information_extraction(video_id) # uploader if 'author' not in video_info: raise ExtractorError(u'Unable to extract uploader name') video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) # uploader_id video_uploader_id = None mobj = re.search(r'', 
video_webpage) if mobj is not None: video_uploader_id = mobj.group(1) else: self._downloader.report_warning(u'unable to extract uploader nickname') # title if 'title' not in video_info: raise ExtractorError(u'Unable to extract video title') video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) # thumbnail image if 'thumbnail_url' not in video_info: self._downloader.report_warning(u'unable to extract video thumbnail') video_thumbnail = '' else: # don't panic if we can't find it video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) # upload date upload_date = None mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL) if mobj is not None: upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: video_description = clean_html(video_description) else: fd_mobj = re.search(r'= 1: url_map = {} for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','): url_data = compat_parse_qs(url_data_str) if 'itag' in url_data and 'url' in url_data: url = url_data['url'][0] if 'sig' in url_data: url += '&signature=' + url_data['sig'][0] elif 's' in url_data: signature = self._decrypt_signature(url_data['s'][0]) url += '&signature=' + signature if 'ratebypass' not in url: url += '&ratebypass=yes' url_map[url_data['itag'][0]] = url format_limit = self._downloader.params.get('format_limit', None) available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats if format_limit is not None and format_limit in available_formats: format_list = available_formats[available_formats.index(format_limit):] else: format_list = available_formats existing_formats = [x for x in format_list if x in url_map] if len(existing_formats) == 0: raise ExtractorError(u'no known formats available 
for video') if self._downloader.params.get('listformats', None): self._print_formats(existing_formats) return if req_format is None or req_format == 'best': video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality elif req_format == 'worst': video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality elif req_format in ('-1', 'all'): video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats else: # Specific formats. We pick the first in a slash-delimeted sequence. # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. req_formats = req_format.split('/') video_url_list = None for rf in req_formats: if rf in url_map: video_url_list = [(rf, url_map[rf])] break if video_url_list is None: raise ExtractorError(u'requested format not available') else: raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info') results = [] for format_param, video_real_url in video_url_list: # Extension video_extension = self._video_extensions.get(format_param, 'flv') video_format = '{0} - {1}'.format(format_param if format_param else video_extension, self._video_dimensions.get(format_param, '???')) results.append({ 'id': video_id, 'url': video_real_url, 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'upload_date': upload_date, 'title': video_title, 'ext': video_extension, 'format': video_format, 'thumbnail': video_thumbnail, 'description': video_description, 'player_url': player_url, 'subtitles': video_subtitles, 'duration': video_duration }) return results class MetacafeIE(InfoExtractor): """Information Extractor for metacafe.com.""" _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = u'metacafe' def 
def _real_initialize(self):
    """Fetch the disclaimer page, then POST the age-confirmation form.

    Both responses are read and discarded; only the requests themselves
    matter. Raises ExtractorError if either request fails.
    """
    # Step 1: retrieve the disclaimer page.
    disclaimer_request = compat_urllib_request.Request(self._DISCLAIMER)
    try:
        self.report_disclaimer()
        compat_urllib_request.urlopen(disclaimer_request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

    # Step 2: submit the form confirming the user's age.
    form_data = compat_urllib_parse.urlencode({
        'filters': '0',
        'submit': "Continue - I'm over 18",
    })
    confirm_request = compat_urllib_request.Request(self._FILTER_POST, form_data)
    try:
        self.report_age_confirmation()
        compat_urllib_request.urlopen(confirm_request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
ExtractorError(u'Unable to extract media URL') vardict = compat_parse_qs(mobj.group(1)) if 'mediaData' not in vardict: raise ExtractorError(u'Unable to extract media URL') mobj = re.search(r'"mediaURL":"(?Phttp.*?)",(.*?)"key":"(?P.*?)"', vardict['mediaData'][0]) if mobj is None: raise ExtractorError(u'Unable to extract media URL') mediaURL = mobj.group('mediaURL').replace('\\/', '/') video_extension = mediaURL[-3:] video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) mobj = re.search(r'(?im)(.*) - Video', webpage) if mobj is None: raise ExtractorError(u'Unable to extract title') video_title = mobj.group(1).decode('utf-8') mobj = re.search(r'submitter=(.*?);', webpage) if mobj is None: raise ExtractorError(u'Unable to extract uploader nickname') video_uploader = mobj.group(1) return [{ 'id': video_id.decode('utf-8'), 'url': video_url.decode('utf-8'), 'uploader': video_uploader.decode('utf-8'), 'upload_date': None, 'title': video_title, 'ext': video_extension.decode('utf-8'), }] class DailymotionIE(InfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' IE_NAME = u'dailymotion' def _real_extract(self, url): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group(1).split('_')[0].split('?')[0] video_extension = 'mp4' # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url) request.add_header('Cookie', 'family_filter=off') webpage = self._download_webpage(request, video_id) # Extract URL, uploader and title from webpage self.report_extraction(video_id) mobj = re.search(r'\s*var flashvars = (.*)', webpage) if mobj is None: raise ExtractorError(u'Unable to extract media URL') flashvars = compat_urllib_parse.unquote(mobj.group(1)) for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: if key in flashvars: 
max_quality = key self.to_screen(u'Using %s' % key) break else: raise ExtractorError(u'Unable to extract video URL') mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) if mobj is None: raise ExtractorError(u'Unable to extract video URL') video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') # TODO: support choosing qualities mobj = re.search(r'', webpage) if mobj is None: raise ExtractorError(u'Unable to extract title') video_title = unescapeHTML(mobj.group('title')) video_uploader = None video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', # Looking for official user r'<(?:span|a) .*?rel="author".*?>([^<]+?)([0-9]{2})-([0-9]{2})-([0-9]{4})', webpage) if mobj is not None: video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) return [{ 'id': video_id, 'url': video_url, 'uploader': video_uploader, 'upload_date': video_upload_date, 'title': video_title, 'ext': video_extension, }] class PhotobucketIE(InfoExtractor): """Information extractor for photobucket.com.""" # TODO: the original _VALID_URL was: # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' # Check if it's necessary to keep the old extracion process _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P.*)\.(?P(flv)|(mp4))' IE_NAME = u'photobucket' def _real_extract(self, url): # Extract id from URL mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') video_extension = mobj.group('ext') # Retrieve video webpage to extract further information webpage = self._download_webpage(url, video_id) # Extract URL, uploader, and title from webpage self.report_extraction(video_id) # We try first by looking the javascript code: mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P.*?)\);', webpage) if mobj is not None: info = json.loads(mobj.group('json')) return [{ 'id': video_id, 'url': info[u'downloadUrl'], 
def _verify_video_password(self, url, video_id, webpage):
    """Submit the user-supplied password for a password-protected video.

    The password is read from the downloader's 'password' parameter and
    the xsrft token is taken from webpage; both are POSTed to
    '<url>/password', with the token also sent as a cookie.

    Raises ExtractorError when no password was supplied or when the
    token cannot be found in webpage.
    """
    password = self._downloader.params.get('password', None)
    if password is None:
        raise ExtractorError(u'This video is protected by a password, use the --password option')
    token_match = re.search(r'xsrft: \'(.*?)\'', webpage)
    if token_match is None:
        # Without this check a page lacking the token would crash with an
        # AttributeError on None instead of a reportable extractor error.
        raise ExtractorError(u'Unable to extract the password verification token')
    token = token_match.group(1)
    data = compat_urllib_parse.urlencode({'password': password, 'token': token})
    # I didn't manage to use the password with https
    if url.startswith('https'):
        # Swap only the leading scheme: str.replace would also rewrite any
        # later 'https' that happens to appear in the path or query.
        pass_url = 'http' + url[len('https'):]
    else:
        pass_url = url
    password_request = compat_urllib_request.Request(pass_url + '/password', data)
    password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
    password_request.add_header('Cookie', 'xsrft=%s' % token)
    # The response body is not needed; the request is made for its effect.
    self._download_webpage(password_request, video_id,
                           u'Verifying the password',
                           u'Wrong password')
self.report_extraction(video_id) # Extract the config JSON try: config = webpage.split(' = {config:')[1].split(',assets:')[0] config = json.loads(config) except: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') if re.search('If so please provide the correct password.', webpage): self._verify_video_password(url, video_id, webpage) return self._real_extract(url) else: raise ExtractorError(u'Unable to extract info section') # Extract title video_title = config["video"]["title"] # Extract uploader and uploader_id video_uploader = config["video"]["owner"]["name"] video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None # Extract video thumbnail video_thumbnail = config["video"]["thumbnail"] # Extract video description video_description = get_element_by_attribute("itemprop", "description", webpage) if video_description: video_description = clean_html(video_description) else: video_description = u'' # Extract upload date video_upload_date = None mobj = re.search(r' 0: video_quality = files[quality][0][2] video_codec = files[quality][0][0] video_extension = files[quality][0][1] self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality)) break else: raise ExtractorError(u'No known codec found') video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ %(video_id, sig, timestamp, video_quality, video_codec.upper()) return [{ 'id': video_id, 'url': video_url, 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'upload_date': video_upload_date, 'title': video_title, 'ext': video_extension, 'thumbnail': video_thumbnail, 'description': video_description, }] class ArteTvIE(InfoExtractor): """arte.tv information extractor.""" 
def grep_webpage(self, url, regex, regexFlags, matchTuples):
    """Fetch url, search it with regex and collect the requested groups.

    matchTuples is an iterable of (group, key, error_message) triples:
    the text captured by group is stored under key in the returned dict.

    Raises ExtractorError when regex does not match the page at all, or
    with error_message when one of the requested groups is None.
    """
    match = re.search(regex, self.fetch_webpage(url), regexFlags)
    if match is None:
        raise ExtractorError(u'Invalid URL: %s' % url)
    collected = {}
    for group, key, error_message in matchTuples:
        value = match.group(group)
        if value is None:
            raise ExtractorError(error_message)
        collected[key] = value
    return collected
def _real_extract(self, url):
    """Extract the info dictionary for a show page.

    The show's metadata is read from the 'gon.show=' JSON object embedded
    in the page; the media URL is its 'akamai_url' with '&cbr=256'
    appended, and the extension is taken from that URL's path.

    Returns a one-element list holding the info dictionary.
    Raises ExtractorError when url does not match _VALID_URL or when the
    embedded data is not valid JSON.
    """
    m = re.match(self._VALID_URL, url)
    if m is None:
        # Same guard as the other extractors in this file; without it an
        # unexpected URL crashes with AttributeError on None.
        raise ExtractorError(u'Invalid URL: %s' % url)
    video_id = m.group('videoID')

    webpage = self._download_webpage(url, video_id)
    json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
                                   webpage, u'json data', flags=re.MULTILINE)

    try:
        data = json.loads(json_data)
    except ValueError as e:
        raise ExtractorError(u'Invalid JSON: ' + str(e))

    video_url = data['akamai_url'] + '&cbr=256'
    url_parts = compat_urllib_parse_urlparse(video_url)
    video_ext = url_parts.path.rpartition('.')[2]

    # 'host' and 'image' may be absent or explicitly null in the JSON;
    # "or {}" covers both, whereas .get(key, {}) only covers absence.
    host = data.get('host') or {}
    image = data.get('image') or {}

    info = {
        'id': video_id,
        'url': video_url,
        'ext': video_ext,
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': host.get('name'),
        'uploader_id': host.get('slug'),
        'thumbnail': image.get('large_url_2x'),
        'duration': data.get('duration'),
    }
    return [info]
def _real_extract(self, url):
    """Extract the downloadable formats of a video page.

    Metadata comes from the JSON argument of 'new Video(...)' in the page;
    one info dictionary is built per link found in the download list.

    Returns a list of info dictionaries chosen by the downloader's
    'format' parameter (None/'best': first link, 'worst': last link,
    '-1'/'all': every link, anything else: the entry whose 'format'
    equals it). Returns None after printing the formats when the
    'listformats' parameter is set.

    Raises ExtractorError on an invalid URL, invalid or incomplete JSON,
    an empty download list, or an unavailable requested format.
    """
    mobj = re.match(self._VALID_URL, url)
    if mobj is None:
        raise ExtractorError(u'Invalid URL: %s' % url)
    video_id = mobj.group('videoid')

    req = compat_urllib_request.Request(url)
    req.add_header('Cookie', 'age_verified=1')
    webpage = self._download_webpage(req, video_id)

    # Get JSON parameters
    json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);',
                                     webpage, u'JSON parameters')
    try:
        params = json.loads(json_params)
    except ValueError:
        # json.loads signals malformed input with ValueError; a bare
        # except would also swallow KeyboardInterrupt and SystemExit.
        raise ExtractorError(u'Invalid JSON')

    self.report_extraction(video_id)
    try:
        video_title = params['title']
        upload_date = unified_strdate(params['release_date_f'])
        video_description = params['description']
        video_uploader = params['submitted_by']
        thumbnail = params['thumbnails'][0]['image']
    except KeyError as err:
        # Format the exception instead of concatenating it to a string,
        # which raised TypeError (and relied on an unimported 'sys').
        raise ExtractorError(u'Missing JSON parameter: %s' % err)

    # Get all of the formats available
    DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
    download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
                                            webpage, u'download list').strip()

    # Get all of the links from the page
    LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
    links = re.findall(LINK_RE, download_list_html)
    if not links:
        raise ExtractorError(u'ERROR: no known formats available for video')

    self.to_screen(u'Links found: %d' % len(links))

    formats = []
    for link in links:
        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML(link)
        path = compat_urllib_parse_urlparse(video_url).path
        extension = os.path.splitext(path)[1][1:]
        # The fifth path component starts with "<size>_<bitrate>_...";
        # the format label is "<size>-<bitrate>".
        video_format = "-".join(path.split('/')[4].split('_')[:2])
        formats.append({
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': video_description
        })

    if self._downloader.params.get('listformats', None):
        self._print_formats(formats)
        return

    req_format = self._downloader.params.get('format', None)
    self.to_screen(u'Format: %s' % req_format)

    if req_format is None or req_format == 'best':
        return [formats[0]]
    elif req_format == 'worst':
        return [formats[-1]]
    elif req_format in ('-1', 'all'):
        return formats
    else:
        selected = self._specific(req_format, formats)
        # The original tested an undefined name ('result') here, so every
        # explicitly requested format ended in a NameError.
        if selected is None:
            raise ExtractorError(u'Requested format not available')
        return [selected]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'

    def _real_extract(self, url):
        """Extract the info dictionary for the video at url.

        Downloads the video page, follows the embed page linked from it and
        reads the media URL from the embed page.

        Returns a one-element list holding the info dict.
        Raises ExtractorError when the URL does not match or the embed page
        link cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(
            r'<title>(?P<title>.*)', webpage, u'title').strip()

        # Get the embed page; its numeric id replaces the slug taken from
        # the URL as the video id.
        embed_match = re.search(
            r'https?://www\.youjizz\.com/videos/embed/(?P<videoid>[0-9]+)',
            webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        video_url = self._search_regex(
            r'so\.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }

        return [info]
note=u'Downloading song information %s/%s' % (str(i+1), track_count), errnote=u'Failed to download song information') api_data = json.loads(api_json) track_data = api_data[u'set']['track'] info = { 'id': track_data['id'], 'url': track_data['track_file_stream_url'], 'title': track_data['performer'] + u' - ' + track_data['name'], 'raw_title': track_data['name'], 'uploader_id': data['user']['login'], 'ext': 'm4a', } res.append(info) if api_data['set']['at_last_track']: break next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id']) return res class KeekIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' IE_NAME = u'keek' def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') video_url = u'http://cdn.keek.com/keek/video/%s' % video_id thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) video_title = self._html_search_regex(r'[\S\s]+?

(?P.+?)

', webpage, u'uploader', fatal=False) info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': video_title, 'thumbnail': thumbnail, 'uploader': uploader } return [info] class TEDIE(InfoExtractor): _VALID_URL=r'''http://www\.ted\.com/ ( ((?Pplaylists)/(?P\d+)) # We have a playlist | ((?Ptalks)) # We have a simple talk ) (/lang/(.*?))? # The url may contain the language /(?P\w+) # Here goes the name and then ".html" ''' @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" return re.match(cls._VALID_URL, url, re.VERBOSE) is not None def _real_extract(self, url): m=re.match(self._VALID_URL, url, re.VERBOSE) if m.group('type_talk'): return [self._talk_info(url)] else : playlist_id=m.group('playlist_id') name=m.group('name') self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) return [self._playlist_videos_info(url,name,playlist_id)] def _playlist_videos_info(self,url,name,playlist_id=0): '''Returns the videos of the playlist''' video_RE=r'''
(?P.+?)

' webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage') m_videos=re.finditer(video_RE,webpage,re.VERBOSE) m_names=re.finditer(video_name_RE,webpage) playlist_title = self._html_search_regex(r'div class="headline">\s*?

class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""

    _VALID_URL = r'http://www.myspass.de/.*'

    @staticmethod
    def _optional_text(metadata, tag):
        """Return the text of the child element tag, or None if it is absent."""
        element = metadata.find(tag)
        if element is None:
            return None
        return element.text

    def _real_extract(self, url):
        """Extract the info dictionary for the video at url.

        The video id is taken from the URL path and used to download an XML
        metadata document, from which all fields are read.

        Returns a one-element list holding the info dict.
        Raises ExtractorError when the metadata has no 'url_flv' or no
        'title' element.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Without an explicit format id, the file extension is used as the
        # format name.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format_name = extension
        else:
            format_name = format_id_el.text

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format_name,
            'thumbnail': self._optional_text(metadata, 'imagePreview'),
            'description': self._optional_text(metadata, 'description')
        }
        return [info]
_real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('video_id') webpage = self._download_webpage(url, video_id) video_url = self._search_regex(r'file: "(.*?)",', webpage, u'video URL') video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"', webpage, u'title').replace('LiveLeak.com -', '').strip() video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage, u'description', fatal=False) video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>', webpage, u'uploader', fatal=False) info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': video_title, 'description': video_description, 'uploader': video_uploader } return [info] class ARDIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?' _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>' _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)' def _real_extract(self, url): # determine video id from url m = re.match(self._VALID_URL, url) numid = re.search(r'documentId=([0-9]+)', url) if numid: video_id = numid.group(1) else: video_id = m.group('video_id') # determine title and media streams from webpage html = self._download_webpage(url, video_id) title = re.search(self._TITLE, html).group('title') streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] if not streams: assert '"fsk"' in html raise ExtractorError(u'This video is only available after 8:00 pm') # choose default media type and highest quality for now stream = max([s for s in streams if int(s["media_type"]) == 0], key=lambda s: int(s["quality"])) # there's two possibilities: RTMP stream or HTTP download info = {'id': video_id, 'title': title, 'ext': 'mp4'} if 
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDF Mediathek (zdf.de)."""

    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    @staticmethod
    def _pick_stream(streams):
        """Return the preferred 'wstreaming' stream dict, or None.

        Quality 'veryhigh' is preferred over '300'; within one quality the
        first entry wins. Only 'wstreaming' entries are considered because
        the 'hstreaming' (rtsp) variant is noted as not working.
        """
        for wanted_quality in ('veryhigh', '300'):
            for stream in streams:
                if (stream['quality'] == wanted_quality
                        and stream['media_type'] == 'wstreaming'):
                    return stream
        return None

    def _real_extract(self, url):
        """Extract the info dictionary for the video at url.

        Returns a one-element list holding the info dict.
        Raises ExtractorError when the URL does not match, the page lists no
        media stream, no suitable stream is found, or the title, stream URL
        or file extension cannot be extracted.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        stream_ = self._pick_stream(streams)
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        media_link = self._download_webpage(stream_['video_url'], video_id, 'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer an mms:// link; fall back to an rtsp:// one.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # The extension is whatever follows the last dot of the stream URL.
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{
            'id': video_id,
            'url': mms_url,
            'title': title,
            'ext': ext
        }]
(?P<id>\d*?)$', webpage, re.MULTILINE|re.DOTALL).group('id') download_webpage = self._download_webpage(download_link, id, 'Downloading free downloads page') # We get the dictionary of the track from some javascrip code info = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE).group(1) info = json.loads(info)[0] # We pick mp3-320 for now, until format selection can be easily implemented. mp3_info = info[u'downloads'][u'mp3-320'] # If we try to use this url it says the link has expired initial_url = mp3_info[u'url'] re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$' m_url = re.match(re_url, initial_url) #We build the url we will use to get the final track url # This url is build in Bandcamp in the script download_bunde_*.js request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts')) final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url') # If we could correctly generate the .rand field the url would be #in the "download_url" key final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1) track_info = {'id':id, 'title' : info[u'title'], 'ext' : 'mp3', 'url' : final_url, 'thumbnail' : info[u'thumb_url'], 'uploader' : info[u'artist'] } return [track_info] class RedTubeIE(InfoExtractor): """Information Extractor for redtube""" _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)' def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') video_extension = 'mp4' webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL') video_title = self._html_search_regex('<h1 class="videoTitle 
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""

    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Extract the info dictionary for the video at url.

        The video id from the URL is used to download the player's MRSS
        notice, from which the media URL and the title are read.

        Returns a one-element list holding the info dict.
        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Same failure mode as the sibling extractors, instead of an
            # AttributeError on None.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]

(.+?)

', webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': video_title, 'thumbnail': thumbnail, 'uploader': uploader, }] class FlickrIE(InfoExtractor): """Information Extractor for Flickr videos""" _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P[\w\-_@]+)/(?P\d+).*' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') video_uploader_id = mobj.group('uploader_id') webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id webpage = self._download_webpage(webpage_url, video_id) secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret') first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') node_id = self._html_search_regex(r'(\d+-\d+)', first_xml, u'node_id') second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage') self.report_extraction(video_id) mobj = re.search(r'.*)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) url_title = mobj.group('url_title') webpage = self._download_webpage(url, url_title) video_id = self._html_search_regex(r'
(.*?)', data, u'video URL') return [{ 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': video_title, 'thumbnail': thumbnail, 'description': video_description, }] class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P[0-9]+)/.*\.html' def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id webpage = self._download_webpage(mrss_url, video_id) mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': \'(?P[^\']+)\',', webpage) if mobj is None: raise ExtractorError(u'Unable to extract media URL') if len(mobj.group('server')) == 0: video_url = compat_urllib_parse.unquote(mobj.group('file')) else: video_url = mobj.group('server')+'/key='+mobj.group('file') video_extension = video_url.split('.')[-1] video_title = self._html_search_regex(r'(?P<title>.+?) - xHamster\.com', webpage, u'title') # Can't see the description anywhere in the UI # video_description = self._html_search_regex(r'Description: (?P[^<]+)', # webpage, u'description', fatal=False) # if video_description: video_description = unescapeHTML(video_description) mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) if mobj: video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') else: video_upload_date = None self._downloader.report_warning(u'Unable to extract upload date') video_uploader_id = self._html_search_regex(r']+>(?P[^<]+)', webpage, u'uploader id', default=u'anonymous') video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, 'url': video_url, 'ext': video_extension, 'title': video_title, # 'description': video_description, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, 'thumbnail': video_thumbnail }] class 
HypemIE(InfoExtractor): """Information Extractor for hypem""" _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) track_id = mobj.group(1) data = { 'ax': 1, 'ts': time.time() } data_encoded = compat_urllib_parse.urlencode(data) complete_url = url + "?" + data_encoded request = compat_urllib_request.Request(complete_url) response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url') cookie = urlh.headers.get('Set-Cookie', '') self.report_extraction(track_id) html_tracks = self._html_search_regex(r'', response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() try: track_list = json.loads(html_tracks) track = track_list[u'tracks'][0] except ValueError: raise ExtractorError(u'Hypemachine contained invalid JSON.') key = track[u"key"] track_id = track[u"id"] artist = track[u"artist"] title = track[u"song"] serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key)) request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'}) request.add_header('cookie', cookie) song_data_json = self._download_webpage(request, track_id, u'Downloading metadata') try: song_data = json.loads(song_data_json) except ValueError: raise ExtractorError(u'Hypemachine contained invalid JSON.') final_url = song_data[u"url"] return [{ 'id': track_id, 'url': final_url, 'ext': "mp3", 'title': title, 'artist': artist, }] class Vbox7IE(InfoExtractor): """Information Extractor for Vbox7""" _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)' def _real_extract(self,url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group(1) redirect_page, urlh = self._download_webpage_handle(url, video_id) new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, 
u'redirect location') redirect_url = urlh.geturl() + new_location webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page') title = self._html_search_regex(r'(.*)', webpage, u'title').split('/')[0].strip() ext = "flv" info_url = "http://vbox7.com/play/magare.do" data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}) info_request = compat_urllib_request.Request(info_url, data) info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage') if info_response is None: raise ExtractorError(u'Unable to extract the media url') (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) return [{ 'id': video_id, 'url': final_url, 'ext': ext, 'title': title, 'thumbnail': thumbnail_url, }] class GametrailersIE(InfoExtractor): _VALID_URL = r'http://www.gametrailers.com/(?Pvideos|reviews|full-episodes)/(?P.*?)/(?P.*)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') video_type = mobj.group('type') webpage = self._download_webpage(url, video_id) if video_type == 'full-episodes': mgid_re = r'data-video="(?P<mgid>mgid:.*?)"' else: mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\'' mgid = self._search_regex(mgid_re, webpage, u'mgid') data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'}) info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data, video_id, u'Downloading video info') links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' 
def gen_extractors():
    """Build one instance of every supported extractor.

    The position in the returned list is significant: the first extractor
    that matches a URL is the one that handles it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]


def get_info_extractor(ie_name):
    """Return the extractor class whose name is ie_name followed by 'IE'."""
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]