Merge branch 'master' into openload-phantomjs-method

This commit is contained in:
Tithen-Firion 2017-05-04 11:00:06 +02:00 committed by GitHub
commit c89267d31a
99 changed files with 2116 additions and 684 deletions

View file

@ -11,6 +11,7 @@ import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
@ -421,8 +422,8 @@ def clean_html(html):
# Newline vs <br />
html = html.replace('\n', ' ')
html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
# Strip html tags
html = re.sub('<.*?>', '', html)
# Replace html entities
@ -1194,6 +1195,11 @@ def unified_timestamp(date_str, day_first=True):
# Remove AM/PM + timezone
date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
# Remove unrecognized timezones from ISO 8601 alike timestamps
m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
if m:
date_str = date_str[:-len(m.group('tz'))]
for expression in date_formats(day_first):
try:
dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
@ -2092,6 +2098,58 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
return new_req
def try_multipart_encode(data, boundary):
content_type = 'multipart/form-data; boundary=%s' % boundary
out = b''
for k, v in data.items():
out += b'--' + boundary.encode('ascii') + b'\r\n'
if isinstance(k, compat_str):
k = k.encode('utf-8')
if isinstance(v, compat_str):
v = v.encode('utf-8')
# RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
# suggests sending UTF-8 directly. Firefox sends UTF-8, too
content = b'Content-Disposition: form-data; name="%s"\r\n\r\n' % k + v + b'\r\n'
if boundary.encode('ascii') in content:
raise ValueError('Boundary overlaps with data')
out += content
out += b'--' + boundary.encode('ascii') + b'--\r\n'
return out, content_type
def multipart_encode(data, boundary=None):
'''
Encode a dict to RFC 7578-compliant form-data
data:
A dict where keys and values can be either Unicode or bytes-like
objects.
boundary:
If specified a Unicode object, it's used as the boundary. Otherwise
a random boundary is generated.
Reference: https://tools.ietf.org/html/rfc7578
'''
has_specified_boundary = boundary is not None
while True:
if boundary is None:
boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
try:
out, content_type = try_multipart_encode(data, boundary)
break
except ValueError:
if has_specified_boundary:
raise
boundary = None
return out, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
if isinstance(key_or_keys, (list, tuple)):
for key in key_or_keys:
@ -2103,13 +2161,16 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True):
def try_get(src, getter, expected_type=None):
try:
v = getter(src)
except (AttributeError, KeyError, TypeError, IndexError):
pass
else:
if expected_type is None or isinstance(v, expected_type):
return v
if not isinstance(getter, (list, tuple)):
getter = [getter]
for get in getter:
try:
v = get(src)
except (AttributeError, KeyError, TypeError, IndexError):
pass
else:
if expected_type is None or isinstance(v, expected_type):
return v
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
@ -2270,10 +2331,8 @@ def mimetype2ext(mt):
return {
'3gpp': '3gp',
'smptett+xml': 'tt',
'srt': 'srt',
'ttaf+xml': 'dfxp',
'ttml+xml': 'ttml',
'vtt': 'vtt',
'x-flv': 'flv',
'x-mp4-fragmented': 'mp4',
'x-ms-wmv': 'wmv',
@ -2281,11 +2340,11 @@ def mimetype2ext(mt):
'x-mpegurl': 'm3u8',
'vnd.apple.mpegurl': 'm3u8',
'dash+xml': 'mpd',
'f4m': 'f4m',
'f4m+xml': 'f4m',
'hds+xml': 'f4m',
'vnd.ms-sstr+xml': 'ism',
'quicktime': 'mov',
'mp2t': 'ts',
}.get(res, res)
@ -2508,27 +2567,97 @@ def srt_subtitles_timecode(seconds):
def dfxp2srt(dfxp_data):
LEGACY_NAMESPACES = (
('http://www.w3.org/ns/ttml', [
'http://www.w3.org/2004/11/ttaf1',
'http://www.w3.org/2006/04/ttaf1',
'http://www.w3.org/2006/10/ttaf1',
]),
('http://www.w3.org/ns/ttml#styling', [
'http://www.w3.org/ns/ttml#style',
]),
)
SUPPORTED_STYLING = [
'color',
'fontFamily',
'fontSize',
'fontStyle',
'fontWeight',
'textDecoration'
]
_x = functools.partial(xpath_with_ns, ns_map={
'ttml': 'http://www.w3.org/ns/ttml',
'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
'tts': 'http://www.w3.org/ns/ttml#styling',
})
styles = {}
default_style = {}
class TTMLPElementParser(object):
out = ''
_out = ''
_unclosed_elements = []
_applied_styles = []
def start(self, tag, attrib):
if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
self.out += '\n'
if tag in (_x('ttml:br'), 'br'):
self._out += '\n'
else:
unclosed_elements = []
style = {}
element_style_id = attrib.get('style')
if default_style:
style.update(default_style)
if element_style_id:
style.update(styles.get(element_style_id, {}))
for prop in SUPPORTED_STYLING:
prop_val = attrib.get(_x('tts:' + prop))
if prop_val:
style[prop] = prop_val
if style:
font = ''
for k, v in sorted(style.items()):
if self._applied_styles and self._applied_styles[-1].get(k) == v:
continue
if k == 'color':
font += ' color="%s"' % v
elif k == 'fontSize':
font += ' size="%s"' % v
elif k == 'fontFamily':
font += ' face="%s"' % v
elif k == 'fontWeight' and v == 'bold':
self._out += '<b>'
unclosed_elements.append('b')
elif k == 'fontStyle' and v == 'italic':
self._out += '<i>'
unclosed_elements.append('i')
elif k == 'textDecoration' and v == 'underline':
self._out += '<u>'
unclosed_elements.append('u')
if font:
self._out += '<font' + font + '>'
unclosed_elements.append('font')
applied_style = {}
if self._applied_styles:
applied_style.update(self._applied_styles[-1])
applied_style.update(style)
self._applied_styles.append(applied_style)
self._unclosed_elements.append(unclosed_elements)
def end(self, tag):
pass
if tag not in (_x('ttml:br'), 'br'):
unclosed_elements = self._unclosed_elements.pop()
for element in reversed(unclosed_elements):
self._out += '</%s>' % element
if unclosed_elements and self._applied_styles:
self._applied_styles.pop()
def data(self, data):
self.out += data
self._out += data
def close(self):
return self.out.strip()
return self._out.strip()
def parse_node(node):
target = TTMLPElementParser()
@ -2536,13 +2665,45 @@ def dfxp2srt(dfxp_data):
parser.feed(xml.etree.ElementTree.tostring(node))
return parser.close()
for k, v in LEGACY_NAMESPACES:
for ns in v:
dfxp_data = dfxp_data.replace(ns, k)
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
if not paras:
raise ValueError('Invalid dfxp/TTML subtitle')
repeat = False
while True:
for style in dfxp.findall(_x('.//ttml:style')):
style_id = style.get('id')
parent_style_id = style.get('style')
if parent_style_id:
if parent_style_id not in styles:
repeat = True
continue
styles[style_id] = styles[parent_style_id].copy()
for prop in SUPPORTED_STYLING:
prop_val = style.get(_x('tts:' + prop))
if prop_val:
styles.setdefault(style_id, {})[prop] = prop_val
if repeat:
repeat = False
else:
break
for p in ('body', 'div'):
ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
if ele is None:
continue
style = styles.get(ele.get('style'))
if not style:
continue
default_style.update(style)
for para, index in zip(paras, itertools.count(1)):
begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
end_time = parse_dfxp_time_expr(para.attrib.get('end'))
@ -3862,3 +4023,10 @@ class PhantomJSwrapper(object):
return (html, encodeArgument(out))
def random_birthday(year_field, month_field, day_field):
return {
year_field: str(random.randint(1950, 1995)),
month_field: str(random.randint(1, 12)),
day_field: str(random.randint(1, 31)),
}