[wsj] Add new extractor (Fixes #4854)

master
Philipp Hagemeister 9 years ago
parent 1a6373ef39
commit 9bb8e0a3f9

@ -156,6 +156,9 @@ class TestUtil(unittest.TestCase):
self.assertEqual(
unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False),
'20141126')
self.assertEqual(
unified_strdate('2/2/2015 6:47:40 PM', day_first=False),
'20150202')
def test_find_xpath_attr(self):
testxml = '''<root>

@ -554,6 +554,7 @@ from .wimp import WimpIE
from .wistia import WistiaIE
from .worldstarhiphop import WorldStarHipHopIE
from .wrzuta import WrzutaIE
from .wsj import WSJIE
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
from .xhamster import XHamsterIE

@ -145,6 +145,7 @@ class InfoExtractor(object):
thumbnail: Full URL to a video thumbnail image.
description: Full video description.
uploader: Full name of the video uploader.
creator: The main artist who created the video.
timestamp: UNIX timestamp of the moment the video became available.
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.

@ -0,0 +1,89 @@
# encoding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
)
class WSJIE(InfoExtractor):
    """Extractor for Wall Street Journal videos served by the iframe player.

    Matches ``video-api.wsj.com`` iframe player URLs carrying a ``guid``
    query parameter, looks the video up through the ``find_all_videos.asp``
    JSON endpoint and builds the format list from the fields it returns.
    """

    _VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P<id>[a-zA-Z0-9-]+)'
    IE_DESC = 'Wall Street Journal'
    _TEST = {
        'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
        'md5': '9747d7a6ebc2f4df64b981e1dde9efa9',
        'info_dict': {
            'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
            'ext': 'mp4',
            'upload_date': '20150202',
            'uploader_id': 'bbright',
            'creator': 'bbright',
            'categories': list,  # a long list
            'duration': 90,
            'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the video identified by the URL's guid.

        Downloads the video's metadata record from the WSJ video API and
        assembles formats from three sources: the ``videoURL`` field
        (labelled as an f4m meta URL), the optional ``hls`` playlist, and
        one direct MP4 URL per known bitrate when the API supplies it.

        Raises whatever ``_download_json`` raises on failure, plus
        ``KeyError``/``IndexError`` if the response has no ``items`` entry
        or the entry lacks ``videoURL``.
        """
        video_id = self._match_id(url)

        # Bitrates for which the API exposes a 'video<N>kMP4Url' field.
        # NOTE(review): presumably kbit/s, judging by the 'k' in the field
        # name -- confirm against the API.
        bitrates = [128, 174, 264, 320, 464, 664, 1264]
        mp4_fields = ['video%dkMP4Url' % br for br in bitrates]

        # The endpoint only returns the fields that are explicitly
        # requested, so the per-bitrate MP4 fields are appended to the list.
        api_url = (
            'http://video-api.wsj.com/api-video/find_all_videos.asp?'
            'type=guid&count=1&query=%s&'
            'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,'
            'author,description,name,linkURL,videoStillURL,duration,videoURL,'
            'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,'
            'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,'
            'allthingsd-subsection,sm-section,sm-subsection,provider,'
            'formattedCreationDate,keywords,keywordsOmniture,column,editor,'
            'emailURL,emailPartnerID,showName,omnitureProgramName,'
            'omnitureVideoFormat,linkRelativeURL,touchCastID,'
            'omniturePublishDate,%s') % (video_id, ','.join(mp4_fields))
        info = self._download_json(api_url, video_id)['items'][0]

        # Thumbnails are conveniently in the correct format already
        thumbnails = info.get('thumbnailList')
        creator = info.get('author')
        uploader_id = info.get('editor')
        categories = info.get('keywords')
        duration = int_or_none(info.get('duration'))
        # The test expects a month-first date, hence day_first=False.
        upload_date = unified_strdate(
            info.get('formattedCreationDate'), day_first=False)
        # 'name' is requested explicitly, so the key may be present but
        # empty; fall back to 'titletag' in that case as well, not only
        # when the key is missing.
        title = info.get('name') or info.get('titletag')

        formats = [{
            'format_id': 'f4m',
            'format_note': 'f4m (meta URL)',
            'url': info['videoURL'],
        }]
        if info.get('hls'):
            formats.extend(self._extract_m3u8_formats(
                info['hls'], video_id, ext='mp4',
                preference=0, entry_protocol='m3u8_native'))
        for br, field in zip(bitrates, mp4_fields):
            if info.get(field):
                formats.append({
                    'format_id': 'mp4-%d' % br,
                    'container': 'mp4',
                    'tbr': br,
                    'url': info[field],
                })
        self._sort_formats(formats)

        # Each key appears exactly once; the original literal listed
        # 'formats' twice, the second entry overriding the first.
        return {
            'id': video_id,
            'formats': formats,
            'thumbnails': thumbnails,
            'creator': creator,
            'uploader_id': uploader_id,
            'duration': duration,
            'upload_date': upload_date,
            'title': title,
            'categories': categories,
        }

@ -701,7 +701,7 @@ def unified_strdate(date_str, day_first=True):
# %z (UTC offset) is only supported in python>=3.2
date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
# Remove AM/PM + timezone
date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
format_expressions = [
'%d %B %Y',

Loading…
Cancel
Save