Merge branch 'master' into openload-phantomjs-method

2017-05-04 11:00:06 +02:00 · 2017-05-04 11:00:06 +02:00 · c89267d31a
commit c89267d31a
parent 7552f96352 0c26548601
99 changed files with 2116 additions and 684 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -245,6 +245,10 @@ class InfoExtractor(object):
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.
+    chapters:       A list of dictionaries, with the following entries:
+                        * "start_time" - The start time of the chapter in seconds
+                        * "end_time" - The end time of the chapter in seconds
+                        * "title" (optional, string)

    The following fields should only be used when the video belongs to some logical
    chapter or section:
@ -976,6 +980,23 @@ class InfoExtractor(object):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]
+
+        def extract_video_object(e):
+            assert e['@type'] == 'VideoObject'
+            info.update({
+                'url': e.get('contentUrl'),
+                'title': unescapeHTML(e.get('name')),
+                'description': unescapeHTML(e.get('description')),
+                'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
+                'duration': parse_duration(e.get('duration')),
+                'timestamp': unified_timestamp(e.get('uploadDate')),
+                'filesize': float_or_none(e.get('contentSize')),
+                'tbr': int_or_none(e.get('bitrate')),
+                'width': int_or_none(e.get('width')),
+                'height': int_or_none(e.get('height')),
+                'view_count': int_or_none(e.get('interactionCount')),
+            })
+
        for e in json_ld:
            if e.get('@context') == 'http://schema.org':
                item_type = e.get('@type')
@ -1000,18 +1021,11 @@ class InfoExtractor(object):
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
-                    info.update({
-                        'url': e.get('contentUrl'),
-                        'title': unescapeHTML(e.get('name')),
-                        'description': unescapeHTML(e.get('description')),
-                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
-                        'duration': parse_duration(e.get('duration')),
-                        'timestamp': unified_timestamp(e.get('uploadDate')),
-                        'filesize': float_or_none(e.get('contentSize')),
-                        'tbr': int_or_none(e.get('bitrate')),
-                        'width': int_or_none(e.get('width')),
-                        'height': int_or_none(e.get('height')),
-                    })
+                    extract_video_object(e)
+                elif item_type == 'WebPage':
+                    video = e.get('video')
+                    if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+                        extract_video_object(video)
                break
        return dict((k, v) for k, v in info.items() if v is not None)

@ -1303,40 +1317,50 @@ class InfoExtractor(object):
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False):
-
        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
+
        if res is False:
            return []
+
        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

+        return self._parse_m3u8_formats(
+            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
+            preference=preference, m3u8_id=m3u8_id, live=live)
+
+    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
+                            entry_protocol='m3u8', preference=None,
+                            m3u8_id=None, live=False):
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

-        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
+        formats = []

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

-        # We should try extracting formats only from master playlists [1], i.e.
-        # playlists that describe available qualities. On the other hand media
-        # playlists [2] should be returned as is since they contain just the media
-        # without qualities renditions.
+        # References:
+        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
+        # 2. https://github.com/rg3/youtube-dl/issues/12211
+
+        # We should try extracting formats only from master playlists [1, 4.3.4],
+        # i.e. playlists that describe available qualities. On the other hand
+        # media playlists [1, 4.3.3] should be returned as is since they contain
+        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
-        # playlist based on particular tags availability. As of [1, 2] master
-        # playlist tags MUST NOT appear in a media playist and vice versa.
-        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
-        # and MUST NOT appear in master playlist thus we can clearly detect media
-        # playlist with this criterion.
-        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
-        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
-        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
+        # master playlist tags MUST NOT appear in a media playist and vice versa.
+        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
+        # media playlist and MUST NOT appear in master playlist thus we can
+        # clearly detect media playlist with this criterion.
+
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
@ -1345,52 +1369,72 @@ class InfoExtractor(object):
                'protocol': entry_protocol,
                'preference': preference,
            }]
-        audio_in_video_stream = {}
-        last_info = {}
-        last_media = {}
+
+        groups = {}
+        last_stream_inf = {}
+
+        def extract_media(x_media_line):
+            media = parse_m3u8_attributes(x_media_line)
+            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
+            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
+            if not (media_type and group_id and name):
+                return
+            groups.setdefault(group_id, []).append(media)
+            if media_type not in ('VIDEO', 'AUDIO'):
+                return
+            media_url = media.get('URI')
+            if media_url:
+                format_id = []
+                for v in (group_id, name):
+                    if v:
+                        format_id.append(v)
+                f = {
+                    'format_id': '-'.join(format_id),
+                    'url': format_url(media_url),
+                    'manifest_url': m3u8_url,
+                    'language': media.get('LANGUAGE'),
+                    'ext': ext,
+                    'protocol': entry_protocol,
+                    'preference': preference,
+                }
+                if media_type == 'AUDIO':
+                    f['vcodec'] = 'none'
+                formats.append(f)
+
+        def build_stream_name():
+            # Despite specification does not mention NAME attribute for
+            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
+            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
+            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
+            stream_name = last_stream_inf.get('NAME')
+            if stream_name:
+                return stream_name
+            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
+            # from corresponding rendition group
+            stream_group_id = last_stream_inf.get('VIDEO')
+            if not stream_group_id:
+                return
+            stream_group = groups.get(stream_group_id)
+            if not stream_group:
+                return stream_group_id
+            rendition = stream_group[0]
+            return rendition.get('NAME') or stream_group_id
+
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
-                last_info = parse_m3u8_attributes(line)
+                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
-                media = parse_m3u8_attributes(line)
-                media_type = media.get('TYPE')
-                if media_type in ('VIDEO', 'AUDIO'):
-                    group_id = media.get('GROUP-ID')
-                    media_url = media.get('URI')
-                    if media_url:
-                        format_id = []
-                        for v in (group_id, media.get('NAME')):
-                            if v:
-                                format_id.append(v)
-                        f = {
-                            'format_id': '-'.join(format_id),
-                            'url': format_url(media_url),
-                            'language': media.get('LANGUAGE'),
-                            'ext': ext,
-                            'protocol': entry_protocol,
-                            'preference': preference,
-                        }
-                        if media_type == 'AUDIO':
-                            f['vcodec'] = 'none'
-                            if group_id and not audio_in_video_stream.get(group_id):
-                                audio_in_video_stream[group_id] = False
-                        formats.append(f)
-                    else:
-                        # When there is no URI in EXT-X-MEDIA let this tag's
-                        # data be used by regular URI lines below
-                        last_media = media
-                        if media_type == 'AUDIO' and group_id:
-                            audio_in_video_stream[group_id] = True
+                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
-                tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
+                tbr = float_or_none(
+                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
+                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
-                # Despite specification does not mention NAME attribute for
-                # EXT-X-STREAM-INF it still sometimes may be present
-                stream_name = last_info.get('NAME') or last_media.get('NAME')
+                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
@ -1400,14 +1444,14 @@ class InfoExtractor(object):
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
-                    'manifest_url': manifest_url,
+                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
-                    'fps': float_or_none(last_info.get('FRAME-RATE')),
+                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
-                resolution = last_info.get('RESOLUTION')
+                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
@ -1423,13 +1467,26 @@ class InfoExtractor(object):
                        'vbr': vbr,
                        'abr': abr,
                    })
-                f.update(parse_codecs(last_info.get('CODECS')))
-                if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
-                    # TODO: update acodec for audio only formats with the same GROUP-ID
-                    f['acodec'] = 'none'
+                codecs = parse_codecs(last_stream_inf.get('CODECS'))
+                f.update(codecs)
+                audio_group_id = last_stream_inf.get('AUDIO')
+                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
+                # references a rendition group MUST have a CODECS attribute.
+                # However, this is not always respected, for example, [2]
+                # contains EXT-X-STREAM-INF tag which references AUDIO
+                # rendition group but does not have CODECS and despite
+                # referencing audio group an audio group, it represents
+                # a complete (with audio and video) format. So, for such cases
+                # we will ignore references to rendition groups and treat them
+                # as complete formats.
+                if audio_group_id and codecs and f.get('vcodec') != 'none':
+                    audio_group = groups.get(audio_group_id)
+                    if audio_group and audio_group[0].get('URI'):
+                        # TODO: update acodec for audio only formats with
+                        # the same GROUP-ID
+                        f['acodec'] = 'none'
                formats.append(f)
-                last_info = {}
-                last_media = {}
+                last_stream_inf = {}
        return formats

    @staticmethod
@ -1803,7 +1860,7 @@ class InfoExtractor(object):
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
-                            'tbr': int_or_none(bandwidth, 1000),
+                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
@ -2182,7 +2239,7 @@ class InfoExtractor(object):

    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        mobj = re.search(
-            r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
+            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
            webpage)
        if mobj:
            try:
@ -2258,11 +2315,17 @@ class InfoExtractor(object):

    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+        urls = []
        formats = []
        for source in jwplayer_sources_data:
-            source_url = self._proto_relative_url(source['file'])
+            source_url = self._proto_relative_url(source.get('file'))
+            if not source_url:
+                continue
            if base_url:
                source_url = compat_urlparse.urljoin(base_url, source_url)
+            if source_url in urls:
+                continue
+            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':