[yahoo] Fix video extraction (fixes #1521)

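There's no need to use two different extraction methods: the video info is now always read from the YVIDEO_INIT_ITEMS JSON embedded in the page.
Videos can now also be downloaded over HTTP when a stream offers it.
The test for RTMP videos is now run as well, but its download is skipped (it requires rtmpdump).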
master
Jaime Marquínez Ferrándiz 11 years ago
parent 123c10608d
commit 9c15e9de84

@ -1,4 +1,3 @@
import datetime
import itertools import itertools
import json import json
import re import re
@ -6,86 +5,85 @@ import re
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
from ..utils import ( from ..utils import (
compat_urllib_parse, compat_urllib_parse,
compat_urlparse,
ExtractorError, determine_ext,
clean_html,
) )
class YahooIE(InfoExtractor): class YahooIE(InfoExtractor):
IE_DESC = u'Yahoo screen' IE_DESC = u'Yahoo screen'
_VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html' _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
_TEST = { _TESTS = [
u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', {
u'file': u'214727115.flv', u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
u'md5': u'2e717f169c1be93d84d3794a00d4a325', u'file': u'214727115.mp4',
u'info_dict': { u'info_dict': {
u"title": u"Julian Smith & Travis Legg Watch Julian Smith" u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
u'description': u'Julian and Travis watch Julian Smith',
},
}, },
u'skip': u'Requires rtmpdump' {
} u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
u'file': u'103000935.flv',
u'info_dict': {
u'title': u'The Cougar Lies with Spanish Moss',
u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
},
u'params': {
# Requires rtmpdump
u'skip_download': True,
},
},
]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id') video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
if m_id is None: items_json = self._search_regex(r'YVIDEO_INIT_ITEMS = ({.*?});$',
# TODO: Check which url parameters are required webpage, u'items', flags=re.MULTILINE)
info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id items = json.loads(items_json)
webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') info = items['mediaItems']['query']['results']['mediaObj'][0]
info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.* meta = info['meta']
<description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
<media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.* formats = []
<media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB" for s in info['streams']:
''' format_info = {
self.report_extraction(video_id) 'width': s.get('width'),
m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL) 'height': s.get('height'),
if m_info is None: 'bitrate': s.get('bitrate'),
raise ExtractorError(u'Unable to extract video info') }
video_title = m_info.group('title')
video_description = m_info.group('description') host = s['host']
video_thumb = m_info.group('thumb') path = s['path']
video_date = m_info.group('date') if host.startswith('rtmp'):
video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d') format_info.update({
'url': host,
# TODO: Find a way to get mp4 videos 'play_path': path,
rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id 'ext': 'flv',
webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage') })
m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage) else:
video_url = m_rest.group('url') format_url = compat_urlparse.urljoin(host, path)
video_path = m_rest.group('path') format_info['url'] = format_url
if m_rest is None: format_info['ext'] = determine_ext(format_url)
raise ExtractorError(u'Unable to extract video url')
formats.append(format_info)
formats = sorted(formats, key=lambda f:(f['height'], f['width']))
info = {
'id': video_id,
'title': meta['title'],
'formats': formats,
'description': clean_html(meta['description']),
'thumbnail': meta['thumbnail'],
}
# TODO: Remove when #980 has been merged
info.update(formats[-1])
else: # We have to use a different method if another id is defined return info
long_id = m_id.group('new_id')
info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
info = json.loads(json_str)
res = info[u'query'][u'results'][u'mediaObj'][0]
stream = res[u'streams'][0]
video_path = stream[u'path']
video_url = stream[u'host']
meta = res[u'meta']
video_title = meta[u'title']
video_description = meta[u'description']
video_thumb = meta[u'thumbnail']
video_date = None # I can't find it
info_dict = {
'id': video_id,
'url': video_url,
'play_path': video_path,
'title':video_title,
'description': video_description,
'thumbnail': video_thumb,
'upload_date': video_date,
'ext': 'flv',
}
return info_dict
class YahooSearchIE(SearchInfoExtractor): class YahooSearchIE(SearchInfoExtractor):
IE_DESC = u'Yahoo screen search' IE_DESC = u'Yahoo screen search'

Loading…
Cancel
Save