From 42bdd9d0516be5b71c89c9cccde16a880a14b0b1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 12 Dec 2014 02:57:36 +0100 Subject: [PATCH] [cinchcast] Add new extractor (Fixes #4428) --- test/test_utils.py | 3 ++ youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/cinchcast.py | 53 +++++++++++++++++++++++++++++++ youtube_dl/extractor/generic.py | 20 ++++++++++-- youtube_dl/utils.py | 17 +++++++--- 5 files changed, 88 insertions(+), 6 deletions(-) create mode 100644 youtube_dl/extractor/cinchcast.py diff --git a/test/test_utils.py b/test/test_utils.py index aaa293ff8..d42df6d96 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -144,6 +144,9 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011') self.assertEqual(unified_strdate('1968-12-10'), '19681210') self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128') + self.assertEqual( + unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False), + '20141126') def test_find_xpath_attr(self): testxml = ''' diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 982a134bf..746ee69e4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -51,6 +51,7 @@ from .cbsnews import CBSNewsIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chilloutzone import ChilloutzoneIE +from .cinchcast import CinchcastIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE diff --git a/youtube_dl/extractor/cinchcast.py b/youtube_dl/extractor/cinchcast.py new file mode 100644 index 000000000..8ce8b3128 --- /dev/null +++ b/youtube_dl/extractor/cinchcast.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + xpath_text, +) + + +class CinchcastIE(InfoExtractor): + 
_VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)' + _TEST = { + # Actual test is run in generic, look for undergroundwellness + 'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703', + 'only_matching': True, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + doc = self._download_xml( + 'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id, + video_id) + + item = doc.find('.//item') + title = xpath_text(item, './title', fatal=True) + date_str = xpath_text( + item, './{http://developer.longtailvideo.com/trac/}date') + upload_date = unified_strdate(date_str, day_first=False) + # duration is present but wrong + formats = [] + formats.append({ + 'format_id': 'main', + 'url': item.find( + './{http://search.yahoo.com/mrss/}content').attrib['url'], + }) + backup_url = xpath_text( + item, './{http://developer.longtailvideo.com/trac/}backupContent') + if backup_url: + formats.append({ + 'preference': 2, # seems to be more reliable + 'format_id': 'backup', + 'url': backup_url, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'upload_date': upload_date, + 'formats': formats, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 328301de3..2b4d8c62f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -467,8 +467,17 @@ class GenericIE(InfoExtractor): 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' 
] - } - + }, + # Cinchcast embed + { + 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', + 'info_dict': { + 'id': '7141703', + 'ext': 'mp3', + 'upload_date': '20141126', + 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', + } + }, ] def report_following_redirect(self, new_url): @@ -962,6 +971,13 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'SBS') + # Look for embedded Cinchcast player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Cinchcast') + mobj = re.search( r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', webpage) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 75f9594e6..4b0567c93 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -166,7 +166,7 @@ def xpath_text(node, xpath, name=None, fatal=False): xpath = xpath.encode('ascii') n = node.find(xpath) - if n is None: + if n is None or n.text is None: if fatal: name = xpath if name is None else name raise ExtractorError('Could not find XML element %s' % name) @@ -644,17 +644,19 @@ def parse_iso8601(date_str, delimiter='T'): return calendar.timegm(dt.timetuple()) -def unified_strdate(date_str): +def unified_strdate(date_str, day_first=True): """Return a string with the date in the format YYYYMMDD""" if date_str is None: return None - upload_date = None # Replace commas date_str = date_str.replace(',', ' ') # %z (UTC offset) is only supported in python>=3.2 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str) + format_expressions = [ '%d %B %Y', '%d %b %Y', @@ -669,7 +671,6 @@ def unified_strdate(date_str): '%d/%m/%Y', '%d/%m/%y', '%Y/%m/%d %H:%M:%S', - '%d/%m/%Y %H:%M:%S', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', '%d.%m.%Y %H:%M', @@ 
-681,6 +682,14 @@ def unified_strdate(date_str): '%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M', ] + if day_first: + format_expressions.extend([ + '%d/%m/%Y %H:%M:%S', + ]) + else: + format_expressions.extend([ + '%m/%d/%Y %H:%M:%S', + ]) for expression in format_expressions: try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')