From 4f195f55f0b734f2897319abf96c5542ce6212d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 2 Nov 2014 17:28:42 +0100 Subject: [PATCH] Do not override stdlib html parser 'locatestarttagend' regex (fixes #4081) With the overridden regex, a tag ending in '/ >' wouldn't be parsed correctly ('/>' worked fine). The override is still needed on Python 2.6 (otherwise, for example, the description of YouTube videos wouldn't be extracted). --- youtube_dl/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 233286de8..bdd637e48 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -152,7 +152,9 @@ def xpath_text(node, xpath, name=None, fatal=False): return n.text -compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix +if sys.version_info < (2, 7): + compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix + class BaseHTMLParser(compat_html_parser.HTMLParser): def __init(self): compat_html_parser.HTMLParser.__init__(self)