[generic] Add support for BOMs (Fixes #4753)

This commit is contained in:
Philipp Hagemeister 2015-01-23 01:21:30 +01:00
parent 317239b097
commit 61ca9a80b3
3 changed files with 39 additions and 1 deletions

View file

@ -1631,3 +1631,23 @@ def age_restricted(content_limit, age_limit):
if content_limit is None:
return False # Content available for everyone
return age_limit < content_limit
def is_html(first_bytes):
""" Detect whether a file contains HTML by examining its first bytes. """
BOMS = [
(b'\xef\xbb\xbf', 'utf-8'),
(b'\x00\x00\xfe\xff', 'utf-32-be'),
(b'\xff\xfe\x00\x00', 'utf-32-le'),
(b'\xff\xfe', 'utf-16-le'),
(b'\xfe\xff', 'utf-16-be'),
]
for bom, enc in BOMS:
if first_bytes.startswith(bom):
s = first_bytes[len(bom):].decode(enc, 'replace')
break
else:
s = first_bytes.decode('utf-8', 'replace')
return re.match(r'^\s*<', s)