[utils] Use bytes-like objects in dfxp2srt

This fixes handling of non-UTF-8 TTML subtitles.

Closes #14191
This commit is contained in:
Yen Chi Hsuan 2017-09-16 12:18:38 +08:00
parent 68d43a61b5
commit 3869028ffb
4 changed files with 41 additions and 11 deletions

View file

@ -2572,14 +2572,18 @@ def srt_subtitles_timecode(seconds):
def dfxp2srt(dfxp_data):
'''
@param dfxp_data A bytes-like object containing DFXP data
@returns A unicode object containing converted SRT data
'''
LEGACY_NAMESPACES = (
('http://www.w3.org/ns/ttml', [
'http://www.w3.org/2004/11/ttaf1',
'http://www.w3.org/2006/04/ttaf1',
'http://www.w3.org/2006/10/ttaf1',
(b'http://www.w3.org/ns/ttml', [
b'http://www.w3.org/2004/11/ttaf1',
b'http://www.w3.org/2006/04/ttaf1',
b'http://www.w3.org/2006/10/ttaf1',
]),
('http://www.w3.org/ns/ttml#styling', [
'http://www.w3.org/ns/ttml#style',
(b'http://www.w3.org/ns/ttml#styling', [
b'http://www.w3.org/ns/ttml#style',
]),
)
@ -2674,7 +2678,7 @@ def dfxp2srt(dfxp_data):
for ns in v:
dfxp_data = dfxp_data.replace(ns, k)
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
dfxp = compat_etree_fromstring(dfxp_data)
out = []
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')