From cd85a1bb8b24eaf6a421a32a985d4c3ad4f80597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 19 Sep 2020 06:34:34 +0700 Subject: [PATCH] [pornhub] Extract metadata from JSON-LD (closes #26614) --- youtube_dl/extractor/pornhub.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index c64c870dc..529f3f711 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -17,6 +17,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + merge_dicts, NO_DEFAULT, orderedSet, remove_quotes, @@ -59,13 +60,14 @@ class PornHubIE(PornHubBaseIE): ''' _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', - 'md5': '1e19b41231a02eba417839222ac9d58e', + 'md5': 'a6391306d050e4547f62b3f485dd9ba9', 'info_dict': { 'id': '648719015', 'ext': 'mp4', 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', 'uploader': 'Babes', 'upload_date': '20130628', + 'timestamp': 1372447216, 'duration': 361, 'view_count': int, 'like_count': int, @@ -82,8 +84,8 @@ class PornHubIE(PornHubBaseIE): 'id': '1331683002', 'ext': 'mp4', 'title': '重庆婷婷女王足交', - 'uploader': 'Unknown', 'upload_date': '20150213', + 'timestamp': 1423804862, 'duration': 1753, 'view_count': int, 'like_count': int, @@ -121,6 +123,7 @@ class PornHubIE(PornHubBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'This video has been disabled', }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, @@ -338,7 +341,7 @@ class PornHubIE(PornHubBaseIE): video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', - webpage, 'uploader', fatal=False) + webpage, 'uploader', default=None) view_count = self._extract_count( r'([\d,\.]+) [Vv]iews', webpage, 'view') @@ -356,7 +359,11 @@ class PornHubIE(PornHubBaseIE): if div: return re.findall(r']+\bhref=[^>]+>([^<]+)', div) - return { + info = self._search_json_ld(webpage, video_id, default={}) + # description provided in JSON-LD is irrelevant + info['description'] = None + + return merge_dicts({ 'id': video_id, 'uploader': video_uploader, 'upload_date': upload_date, @@ -372,7 +379,7 @@ class PornHubIE(PornHubBaseIE): 'tags': extract_list('tags'), 'categories': extract_list('categories'), 'subtitles': subtitles, - } + }, info) class PornHubPlaylistBaseIE(PornHubBaseIE):