removed dependency from lxml: added IDParser

This commit is contained in:
Filippo Valsorda - Campagna 2012-04-10 18:21:00 +02:00
parent d6a9615347
commit c6f45d4314
2 changed files with 155 additions and 76 deletions

View file

@ -67,11 +67,6 @@ try:
except ImportError:
from cgi import parse_qs
try:
import lxml.etree
except ImportError:
pass # Handled below
try:
import xml.etree.ElementTree
except ImportError: # Python<2.5: Not officially supported, but let it slip
@ -198,6 +193,69 @@ except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/tr
raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
return res
class IDParser(HTMLParser.HTMLParser):
"""Modified HTMLParser that isolates a tag with the specified id"""
def __init__(self, id):
self.id = id
self.result = None
self.started = False
self.depth = {}
self.html = None
self.watch_startpos = False
HTMLParser.HTMLParser.__init__(self)
def loads(self, html):
self.html = html
self.feed(html)
self.close()
def handle_starttag(self, tag, attrs):
attrs = dict(attrs)
if self.started:
self.find_startpos(None)
if 'id' in attrs and attrs['id'] == self.id:
self.result = [tag]
self.started = True
self.watch_startpos = True
if self.started:
if not tag in self.depth: self.depth[tag] = 0
self.depth[tag] += 1
def handle_endtag(self, tag):
if self.started:
if tag in self.depth: self.depth[tag] -= 1
if self.depth[self.result[0]] == 0:
self.started = False
self.result.append(self.getpos())
def find_startpos(self, x):
"""Needed to put the start position of the result (self.result[1])
after the opening tag with the requested id"""
if self.watch_startpos:
self.watch_startpos = False
self.result.append(self.getpos())
handle_entityref = handle_charref = handle_data = handle_comment = \
handle_decl = handle_pi = unknown_decl = find_startpos
def get_result(self):
if self.result == None: return None
if len(self.result) != 3: return None
lines = self.html.split('\n')
lines = lines[self.result[1][0]-1:self.result[2][0]]
lines[0] = lines[0][self.result[1][1]:]
if len(lines) == 1:
lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
lines[-1] = lines[-1][:self.result[2][1]]
return '\n'.join(lines).strip()
def get_element_by_id(id, html):
"""Return the content of the tag with the specified id in the passed HTML document"""
parser = IDParser(id)
parser.loads(html)
return parser.get_result()
def preferredencoding():
"""Get preferred encoding.
@ -246,7 +304,7 @@ def clean_html(html):
"""Clean an HTML snippet into a readable string"""
# Newline vs <br />
html = html.replace('\n', ' ')
html = re.sub('<\s*br\s*/?\s*>', '\n', html)
html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
# Strip html tags
html = re.sub('<.*?>', '', html)
# Replace html entities
@ -1432,18 +1490,9 @@ class YoutubeIE(InfoExtractor):
pass
# description
try:
lxml.etree
except NameError:
video_description = u'No description available.'
mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
if mobj is not None:
video_description = mobj.group(1).decode('utf-8')
else:
html_parser = lxml.etree.HTMLParser(encoding='utf-8')
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
# TODO use another parser
video_description = get_element_by_id("eow-description", video_webpage)
if video_description: video_description = clean_html(video_description.decode('utf8'))
else: video_description = ''
# closed captions
video_subtitles = None
@ -2177,18 +2226,9 @@ class VimeoIE(InfoExtractor):
video_thumbnail = config["video"]["thumbnail"]
# Extract video description
try:
lxml.etree
except NameError:
video_description = u'No description available.'
mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
if mobj is not None:
video_description = mobj.group(1)
else:
html_parser = lxml.etree.HTMLParser()
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
# TODO use another parser
video_description = get_element_by_id("description", webpage)
if video_description: video_description = clean_html(video_description.decode('utf8'))
else: video_description = ''
# Extract upload date
video_upload_date = u'NA'