Merge pull request #8092 from bpfoley/twitter-thumbnail

[utils] Add extract_attributes for extracting html tag attributes
This commit is contained in:
remitamine 2016-03-16 13:16:27 +01:00
commit 83548824c2
3 changed files with 76 additions and 0 deletions

View file

@ -35,6 +35,7 @@ import xml.etree.ElementTree
import zlib
from .compat import (
compat_HTMLParser,
compat_basestring,
compat_chr,
compat_etree_fromstring,
@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html):
return unescapeHTML(res)
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element.

    After markup has been fed, ``attrs`` is a dict built from the
    (name, value) pairs of the FIRST start tag encountered.  It stays an
    empty dict if no start tag has been seen.  Start tags that follow the
    first one (children, siblings) are ignored, so they cannot overwrite
    the attributes of the element being inspected.
    """

    def __init__(self):
        self.attrs = {}
        self._seen_start_tag = False
        compat_HTMLParser.__init__(self)

    def reset(self):
        """Reset parser state so the instance can collect a new element."""
        # Clear the flag so that a reused parser records the next first tag.
        # ``attrs`` itself is left alone until that tag arrives.
        self._seen_start_tag = False
        compat_HTMLParser.reset(self)

    def handle_starttag(self, tag, attrs):
        """Record the attributes of the first start tag only."""
        if self._seen_start_tag:
            return
        self._seen_start_tag = True
        self.attrs = dict(attrs)
def extract_attributes(html_element):
    """Decode the attributes of an HTML element into a dictionary.

    html_element is the string for a single HTML element, for example
        <el
            a="foo" B="bar" c="&#98;az" d=boz
            noval entity="&amp;"
            sq='"' dq="'"
        >
    for which the returned dictionary is
        {
            'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
            'noval': None, 'entity': '&',
            'sq': '"', 'dq': "'"
        }

    The string is fed to an HTMLAttributeParser and whatever that parser
    collected in its ``attrs`` dict is returned.

    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    collector = HTMLAttributeParser()
    collector.feed(html_element)
    # close() makes the parser process any input it is still buffering
    collector.close()
    return collector.attrs
def clean_html(html):
"""Clean an HTML snippet into a readable string"""