[BostonGlobe] New. Nonstandard version of Brightcove.

Has a "data-brightcove-video-id" instead of a "data-video-id," otherwise pretty much just Brightcove. Except the Globe isn't all Brightcove videos, so fallback to Generic, too. Also, abstract playlist_from_matches() from generic.py to common.py, and use it here. History of these changes can be found in 51170427d4b1143572a498dedaee61863a5b2c5b.
2017-03-08 18:13:54 -05:00 · 2017-03-08 18:13:54 -05:00 · 46b18f2349
commit 46b18f2349
parent 772b5ff57f
4 changed files with 126 additions and 52 deletions
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -1841,14 +1841,6 @@ class GenericIE(InfoExtractor):
        video_description = self._og_search_description(webpage, default=None)
        video_thumbnail = self._og_search_thumbnail(webpage, default=None)

-        # Helper method
-        def _playlist_from_matches(matches, getter=None, ie=None):
-            urlrs = orderedSet(
-                self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
-                for m in matches)
-            return self.playlist_result(
-                urlrs, playlist_id=video_id, playlist_title=video_title)
-
        # Look for Brightcove Legacy Studio embeds
        bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
        if bc_urls:
@ -1869,28 +1861,28 @@ class GenericIE(InfoExtractor):
        # Look for Brightcove New Studio embeds
        bc_urls = BrightcoveNewIE._extract_urls(webpage)
        if bc_urls:
-            return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
+            return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')

        # Look for ThePlatform embeds
        tp_urls = ThePlatformIE._extract_urls(webpage)
        if tp_urls:
-            return _playlist_from_matches(tp_urls, ie='ThePlatform')
+            return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')

        # Look for Vessel embeds
        vessel_urls = VesselIE._extract_urls(webpage)
        if vessel_urls:
-            return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key())
+            return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key())

        # Look for embedded rtl.nl player
        matches = re.findall(
            r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
            webpage)
        if matches:
-            return _playlist_from_matches(matches, ie='RtlNl')
+            return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')

        vimeo_urls = VimeoIE._extract_urls(url, webpage)
        if vimeo_urls:
-            return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key())
+            return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())

        vid_me_embed_url = self._search_regex(
            r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
@ -1912,25 +1904,25 @@ class GenericIE(InfoExtractor):
                (?:embed|v|p)/.+?)
            \1''', webpage)
        if matches:
-            return _playlist_from_matches(
-                matches, lambda m: unescapeHTML(m[1]))
+            return self.playlist_from_matches(
+                matches, video_id, video_title, lambda m: unescapeHTML(m[1]))

        # Look for lazyYT YouTube embed
        matches = re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
        if matches:
-            return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
+            return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))

        # Look for Wordpress "YouTube Video Importer" plugin
        matches = re.findall(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
        if matches:
-            return _playlist_from_matches(matches, lambda m: m[-1])
+            return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])

        matches = DailymotionIE._extract_urls(webpage)
        if matches:
-            return _playlist_from_matches(matches)
+            return self.playlist_from_matches(matches, video_id, video_title)

        # Look for embedded Dailymotion playlist player (#3822)
        m = re.search(
@ -1939,8 +1931,8 @@ class GenericIE(InfoExtractor):
            playlists = re.findall(
                r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
            if playlists:
-                return _playlist_from_matches(
-                    playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
+                return self.playlist_from_matches(
+                    playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)

        # Look for embedded Wistia player
        match = re.search(
@ -2047,8 +2039,9 @@ class GenericIE(InfoExtractor):
        if mobj is not None:
            embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
            if embeds:
-                return _playlist_from_matches(
-                    embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
+                return self.playlist_from_matches(
+                    embeds, video_id, video_title,
+                    getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')

        # Look for Aparat videos
        mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
@ -2110,13 +2103,13 @@ class GenericIE(InfoExtractor):
        # Look for funnyordie embed
        matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
        if matches:
-            return _playlist_from_matches(
-                matches, getter=unescapeHTML, ie='FunnyOrDie')
+            return self.playlist_from_matches(
+                matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')

        # Look for BBC iPlayer embed
        matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
        if matches:
-            return _playlist_from_matches(matches, ie='BBCCoUk')
+            return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk')

        # Look for embedded RUTV player
        rutv_url = RUTVIE._extract_url(webpage)
@ -2131,32 +2124,32 @@ class GenericIE(InfoExtractor):
        # Look for embedded SportBox player
        sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
        if sportbox_urls:
-            return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
+            return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed')

        # Look for embedded XHamster player
        xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
        if xhamster_urls:
-            return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+            return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed')

        # Look for embedded TNAFlixNetwork player
        tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
        if tnaflix_urls:
-            return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key())
+            return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key())

        # Look for embedded PornHub player
        pornhub_urls = PornHubIE._extract_urls(webpage)
        if pornhub_urls:
-            return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key())
+            return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key())

        # Look for embedded DrTuber player
        drtuber_urls = DrTuberIE._extract_urls(webpage)
        if drtuber_urls:
-            return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key())
+            return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key())

        # Look for embedded RedTube player
        redtube_urls = RedTubeIE._extract_urls(webpage)
        if redtube_urls:
-            return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key())
+            return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key())

        # Look for embedded Tvigle player
        mobj = re.search(
@ -2202,12 +2195,12 @@ class GenericIE(InfoExtractor):
        # Look for embedded soundcloud player
        soundcloud_urls = SoundcloudIE._extract_urls(webpage)
        if soundcloud_urls:
-            return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
+            return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key())

        # Look for tunein player
        tunein_urls = TuneInBaseIE._extract_urls(webpage)
        if tunein_urls:
-            return _playlist_from_matches(tunein_urls)
+            return self.playlist_from_matches(tunein_urls, video_id, video_title)

        # Look for embedded mtvservices player
        mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
@ -2490,35 +2483,35 @@ class GenericIE(InfoExtractor):
        # Look for DBTV embeds
        dbtv_urls = DBTVIE._extract_urls(webpage)
        if dbtv_urls:
-            return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key())
+            return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key())

        # Look for Videa embeds
        videa_urls = VideaIE._extract_urls(webpage)
        if videa_urls:
-            return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key())
+            return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key())

        # Look for 20 minuten embeds
        twentymin_urls = TwentyMinutenIE._extract_urls(webpage)
        if twentymin_urls:
-            return _playlist_from_matches(
-                twentymin_urls, ie=TwentyMinutenIE.ie_key())
+            return self.playlist_from_matches(
+                twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key())

        # Look for Openload embeds
        openload_urls = OpenloadIE._extract_urls(webpage)
        if openload_urls:
-            return _playlist_from_matches(
-                openload_urls, ie=OpenloadIE.ie_key())
+            return self.playlist_from_matches(
+                openload_urls, video_id, video_title, ie=OpenloadIE.ie_key())

        # Look for VideoPress embeds
        videopress_urls = VideoPressIE._extract_urls(webpage)
        if videopress_urls:
-            return _playlist_from_matches(
-                videopress_urls, ie=VideoPressIE.ie_key())
+            return self.playlist_from_matches(
+                videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key())

        # Look for Rutube embeds
        rutube_urls = RutubeIE._extract_urls(webpage)
        if rutube_urls:
-            return _playlist_from_matches(
+            return self.playlist_from_matches(
                rutube_urls, ie=RutubeIE.ie_key())

        # Looking for http://schema.org/VideoObject