Do not override stdlib html parser 'locatestarttagend' regex (fixes #4081)

'<a href="foo" ><img src="bar" / ></a>' wouldn't be parsed right (the problem is '/ >', '/>' worked fine).
We need to change it in python 2.6 (for example the description of youtube videos wouldn't be extracted).
master
Jaime Marquínez Ferrándiz 10 years ago
parent ac35c26686
commit 4f195f55f0

@ -152,7 +152,9 @@ def xpath_text(node, xpath, name=None, fatal=False):
return n.text
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
if sys.version_info < (2, 7):
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
def __init(self):
compat_html_parser.HTMLParser.__init__(self)

Loading…
Cancel
Save