[ustream] Simplify channel extraction

the ChannelParser has been moved to a new function in utils get_meta_content
Instead of the SocialStreamParser now it uses a regex
This commit is contained in:
Jaime Marquínez Ferrándiz 2013-09-13 22:05:29 +02:00
parent 74ac9bdd82
commit a921f40799
3 changed files with 70 additions and 59 deletions

View file

@ -249,7 +249,17 @@ def htmlentity_transform(matchobj):
return (u'&%s;' % entity)
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
class BaseHTMLParser(compat_html_parser.HTMLParser):
def __init(self):
compat_html_parser.HTMLParser.__init__(self)
self.html = None
def loads(self, html):
self.html = html
self.feed(html)
self.close()
class AttrParser(BaseHTMLParser):
"""Modified HTMLParser that isolates a tag with the specified attribute"""
def __init__(self, attribute, value):
self.attribute = attribute
@ -257,10 +267,9 @@ class AttrParser(compat_html_parser.HTMLParser):
self.result = None
self.started = False
self.depth = {}
self.html = None
self.watch_startpos = False
self.error_count = 0
compat_html_parser.HTMLParser.__init__(self)
BaseHTMLParser.__init__(self)
def error(self, message):
if self.error_count > 10 or self.started:
@ -269,11 +278,6 @@ class AttrParser(compat_html_parser.HTMLParser):
self.error_count += 1
self.goahead(1)
def loads(self, html):
self.html = html
self.feed(html)
self.close()
def handle_starttag(self, tag, attrs):
attrs = dict(attrs)
if self.started:
@ -334,6 +338,38 @@ def get_element_by_attribute(attribute, value, html):
pass
return parser.get_result()
class MetaParser(BaseHTMLParser):
"""
Modified HTMLParser that isolates a meta tag with the specified name
attribute.
"""
def __init__(self, name):
BaseHTMLParser.__init__(self)
self.name = name
self.content = None
self.result = None
def handle_starttag(self, tag, attrs):
if tag != 'meta':
return
attrs = dict(attrs)
if attrs.get('name') == self.name:
self.result = attrs.get('content')
def get_result(self):
return self.result
def get_meta_content(name, html):
"""
Return the content attribute from the meta tag with the given name attribute.
"""
parser = MetaParser(name)
try:
parser.loads(html)
except compat_html_parser.HTMLParseError:
pass
return parser.get_result()
def clean_html(html):
"""Clean an HTML snippet into a readable string"""