[extractor/common] Add the encoding parameter

The QQMusic info extractor need forced encoding for correct working.
9 years ago · c9a779695d
parent a685ae511a
commit c9a779695d
2 changed files with 23 additions and 15 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -324,7 +324,7 @@ class InfoExtractor(object):
                self._downloader.report_warning(errmsg)
                return False

-    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
@ -334,14 +334,11 @@ class InfoExtractor(object):
        if urlh is False:
            assert not fatal
            return False
-        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
-        content_type = urlh.headers.get('Content-Type', '')
-        webpage_bytes = urlh.read()
-        if prefix is not None:
-            webpage_bytes = prefix + webpage_bytes
+    @staticmethod
+    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
@ -354,6 +351,16 @@ class InfoExtractor(object):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'
+
+        return encoding
+
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+        content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
+        if prefix is not None:
+            webpage_bytes = prefix + webpage_bytes
+        if not encoding:
+            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
@ -410,13 +417,13 @@ class InfoExtractor(object):

        return content

-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
        """ Returns the data of the page as a string """
        success = False
        try_count = 0
        while success is False:
            try:
-                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
@ -431,10 +438,10 @@ class InfoExtractor(object):

    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
-                      transform_source=None, fatal=True):
+                      transform_source=None, fatal=True, encoding=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal)
+            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
        if xml_string is False:
            return xml_string
        if transform_source:
@ -445,9 +452,10 @@ class InfoExtractor(object):
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
-                       fatal=True):
+                       fatal=True, encoding=None):
        json_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal)
+            url_or_request, video_id, note, errnote, fatal=fatal,
+            encoding=encoding)
        if (not fatal) and json_string is False:
            return None
        return self._parse_json(
--- a/youtube_dl/extractor/qqmusic.py
+++ b/youtube_dl/extractor/qqmusic.py
@ -24,7 +24,7 @@ class QQMusicIE(InfoExtractor):
            'title': '可惜没如果',
            'upload_date': '20141227',
            'creator': '林俊杰',
-            'description': 'md5:242c97c2847e0495583b7b13764f7106',
+            'description': 'md5:4348ff1dd24036906baa7b6f973f8d30',
        }
    }]

@ -41,7 +41,7 @@ class QQMusicIE(InfoExtractor):
        detail_info_page = self._download_webpage(
            'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid,
            mid, note='Download song detail info',
-            errnote='Unable to get song detail info')
+            errnote='Unable to get song detail info', encoding='gbk')

        song_name = self._html_search_regex(
            r"songname:\s*'([^']+)'", detail_info_page, 'song name')