def parse_xml(s):
    """Parse the text string *s* as XML and return its root element.

    DOCTYPE declarations in the document are ignored.

    *s* is encoded to UTF-8 before being handed to the parser, and the
    parser is told to use UTF-8 regardless of any ``encoding="..."``
    attribute in the XML declaration. Malformed input raises the parse
    error of ``xml.etree.ElementTree``.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    kwargs = {}
    # The custom parser is only passed to XML() on Python 2.7 and newer,
    # so there is no point in building it on older interpreters.
    if sys.version_info >= (2, 7):
        # The payload below is always UTF-8 bytes. Without the explicit
        # override, a declaration such as encoding="iso-8859-1" would make
        # expat decode those bytes with the wrong charset and garble any
        # non-ASCII text.
        kwargs['parser'] = xml.etree.ElementTree.XMLParser(
            target=TreeBuilder(), encoding='utf-8')
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)