Improve the OpenGraph regex

* Do not accept '>' between the property and content attributes.
* Recognize the properties if the content attribute is before the property attribute using two regexes (fixes the extraction of the description for SlideshareIE).
master
Jaime Marquínez Ferrándiz 11 years ago
parent 85d61685f1
commit ab2d524780

@ -315,13 +315,17 @@ class InfoExtractor(object):
# Helper functions for extracting OpenGraph info # Helper functions for extracting OpenGraph info
@staticmethod @staticmethod
def _og_regex(prop): def _og_regexes(prop):
return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop) esc_prop = re.escape(prop)
return [
r'<meta[^>]+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop,
r'<meta[^>]+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop,
]
def _og_search_property(self, prop, html, name=None, **kargs): def _og_search_property(self, prop, html, name=None, **kargs):
if name is None: if name is None:
name = 'OpenGraph %s' % prop name = 'OpenGraph %s' % prop
escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
if escaped is None: if escaped is None:
return None return None
return unescapeHTML(escaped) return unescapeHTML(escaped)
@ -336,8 +340,8 @@ class InfoExtractor(object):
return self._og_search_property('title', html, **kargs) return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs): def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
regexes = [self._og_regex('video')] regexes = self._og_regexes('video')
if secure: regexes.insert(0, self._og_regex('video:secure_url')) if secure: regexes = self._og_regexes('video:secure_url') + regexes
return self._html_search_regex(regexes, html, name, **kargs) return self._html_search_regex(regexes, html, name, **kargs)
def _rta_search(self, html): def _rta_search(self, html):

Loading…
Cancel
Save