Merge remote-tracking branch 'fstirlitz/master'

2014-12-11 17:11:25 +01:00 · 2014-12-11 17:11:25 +01:00 · 69f491f14e
commit 69f491f14e
parent cb007f47c1 ce36339575
2 changed files with 93 additions and 17 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -51,7 +51,6 @@ from .cbsnews import CBSNewsIE
 from .ceskatelevize import CeskaTelevizeIE
 from .channel9 import Channel9IE
 from .chilloutzone import ChilloutzoneIE
 from .cinemassacre import CinemassacreIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
 from .clipsyndicate import ClipsyndicateIE
@@ -336,6 +335,7 @@ from .savefrom import SaveFromIE
 from .sbs import SBSIE
 from .scivee import SciVeeIE
 from .screencast import ScreencastIE
 from .screenwavemedia import ScreenwaveMediaIE
 from .servingsys import ServingSysIE
 from .sexu import SexuIE
 from .sexykarma import SexyKarmaIE
--- a/youtube_dl/extractor/screenwavemedia.py
+++ b/youtube_dl/extractor/screenwavemedia.py
@@ -6,18 +6,23 @@ import re
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
    month_by_name,
    int_or_none,
 )
 class ScreenwaveMediaIE(InfoExtractor):
    _VALID_URL = r'(?:http://)?(?' \
        r':(?P<generic>player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<video_id>.+))' \
        r'|(?P<cinemassacre>(?:www\.)?cinemassacre\.com/(?P<cm_date_Y>[0-9]{4})/(?P<cm_date_m>[0-9]{2})/(?P<cm_date_d>[0-9]{2})/(?P<cm_display_id>[^?#/]+))' \
        r'|(?P<teamfourstar>(?:www\.)?teamfourstar\.com/video/(?P<tfs_display_id>[a-z0-9\-]+)/?)' \
        r')'
 class CinemassacreIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
    _TESTS = [
        {
            'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
            'md5': 'fde81fbafaee331785f58cd6c0d46190',
            'info_dict': {
-                'id': '19911',
+                'id': 'Cinemasssacre-19911',
                'ext': 'mp4',
                'upload_date': '20121110',
                'title': '“Angry Video Game Nerd: The Movie” – Trailer',
@@ -28,7 +33,7 @@ class CinemassacreIE(InfoExtractor):
            'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
            'md5': 'd72f10cd39eac4215048f62ab477a511',
            'info_dict': {
-                'id': '521be8ef82b16',
+                'id': 'Cinemasssacre-521be8ef82b16',
                'ext': 'mp4',
                'upload_date': '20131002',
                'title': 'The Mummy’s Hand (1940)',
@@ -36,18 +41,16 @@ class CinemassacreIE(InfoExtractor):
        }
    ]
-    def _real_extract(self, url):
+    def _cinemassacre_get_info(self, url):
        mobj = re.match(self._VALID_URL, url)
-        display_id = mobj.group('display_id')
+        display_id = mobj.group('cm_display_id')
        webpage = self._download_webpage(url, display_id)
-        video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
+        video_date = mobj.group('cm_date_Y') + mobj.group('cm_date_m') + mobj.group('cm_date_d')
-        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<full_video_id>(?:Cinemassacre-)?(?P<video_id>.+?)))"', webpage)
+        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', webpage)
        if not mobj:
            raise ExtractorError('Can\'t extract embed url and video id')
        playerdata_url = mobj.group('embed_url')
        video_id = mobj.group('video_id')
        full_video_id = mobj.group('full_video_id')
        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?)\|', webpage, 'title')
@@ -56,10 +59,56 @@ class CinemassacreIE(InfoExtractor):
            webpage, 'description', flags=re.DOTALL, fatal=False)
        video_thumbnail = self._og_search_thumbnail(webpage)
-        playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage')
+        return {
            'title': video_title,
            'description': video_description,
            'upload_date': video_date,
            'thumbnail': video_thumbnail,
            '_embed_url': playerdata_url,
        }
    def _teamfourstar_get_info(self, url):
        """Scrape episode metadata from a teamfourstar.com video page.

        Downloads the page at ``url``, locates the embedded
        player.screenwavemedia.com player and reads the title, date,
        description and thumbnail from the page markup.

        Returns a dict with the keys 'title', 'description', 'upload_date',
        'thumbnail' and '_embed_url' (the player URL found in the page).
        'upload_date' is a YYYYMMDD string, or None when the date text on the
        page is not in the expected "Month D, YYYY" form.

        Raises ExtractorError when no player embed is found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('tfs_display_id')

        webpage = self._download_webpage(url, display_id)

        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', webpage)
        if not mobj:
            raise ExtractorError('Can\'t extract embed url and video id')
        playerdata_url = mobj.group('embed_url')

        video_title = self._html_search_regex(
            r'<div class="heroheadingtitle">(?P<title>.+?)</div>', webpage, 'title')

        date_text = self._html_search_regex(
            r'<div class="heroheadingdate">(?P<date>.+?)</div>', webpage, 'date')
        # The upload date is optional metadata: an unexpected date format or
        # an unrecognised month name must not abort the whole extraction.
        video_date = None
        date_mobj = re.match(
            r'(?P<month>[A-Z][a-z]+) (?P<day>\d+), (?P<year>\d+)', date_text)
        if date_mobj:
            # NOTE(review): month_by_name is assumed to return None for a
            # name it does not know -- confirm against ..utils.
            month = month_by_name(date_mobj.group('month'))
            if month is not None:
                video_date = '%04u%02u%02u' % (
                    int(date_mobj.group('year')),
                    month,
                    int(date_mobj.group('day')))

        # Non-fatal, matching the description lookup of the Cinemassacre
        # scraper: a missing description should not fail the extraction.
        video_description = self._html_search_regex(
            r'<div class="postcontent">(?P<description>.+?)</div>', webpage, 'description',
            flags=re.DOTALL, fatal=False)
        video_thumbnail = self._og_search_thumbnail(webpage)

        return {
            'title': video_title,
            'description': video_description,
            'upload_date': video_date,
            'thumbnail': video_thumbnail,
            '_embed_url': playerdata_url,
        }
    def _screenwavemedia_get_info(self, url):
        mobj = re.match(self._VALID_URL, url)
        if not mobj:
            raise ExtractorError('Can\'t extract embed url and video id')
        video_id = mobj.group('video_id')
        playerdata = self._download_webpage(url, video_id, 'Downloading player webpage')
        vidtitle = self._search_regex(
            r'\'vidtitle\'\s*:\s*"([^\']+)"', playerdata, 'vidtitle').replace('\\/', '/')
        vidurl = self._search_regex(
            r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/')
        pageurl = self._search_regex(
            r'\'pageurl\'\s*:\s*"([^\']+)"', playerdata, 'pageurl', fatal=False).replace('\\/', '/')
        videolist_url = None
@@ -67,7 +116,7 @@ class CinemassacreIE(InfoExtractor):
        if mobj:
            videoserver = mobj.group('videoserver')
            mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata)
-            vidid = mobj.group('vidid') if mobj else full_video_id
+            vidid = mobj.group('vidid') if mobj else video_id
            videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
        else:
            mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata)
@@ -110,9 +159,36 @@ class CinemassacreIE(InfoExtractor):
        return {
            'id': video_id,
-            'title': video_title,
+            'title': vidtitle,
            'formats': formats,
-            'description': video_description,
+            '_episode_page': pageurl,
            'upload_date': video_date,
            'thumbnail': video_thumbnail,
        }
    def _real_extract(self, url):
        """Extract video info for any URL accepted by ``_VALID_URL``.

        Two sources of metadata are combined:

        * the Screenwave Media player page (formats, player-side title), read
          by ``_screenwavemedia_get_info``;
        * the hosting site's episode page (cinemassacre.com or
          teamfourstar.com), read by the matching ``_*_get_info`` helper.

        A direct player URL is resolved to its episode page when the player
        advertises one; a site URL is resolved to its embedded player. Values
        from the site page override the player's values for the same key.

        Raises ExtractorError when no player metadata could be obtained.
        """
        mobj = re.match(self._VALID_URL, url)

        swm_info = None
        site_info = None

        if mobj.group('generic'):
            swm_info = self._screenwavemedia_get_info(url)
            # The episode page is optional: when the player page does not
            # name one, skip the site lookup instead of handing None to
            # re.match (which would raise TypeError).
            url = swm_info.get('_episode_page')
            mobj = re.match(self._VALID_URL, url) if url else None

        if mobj:
            if mobj.group('cinemassacre'):
                site_info = self._cinemassacre_get_info(url)
            elif mobj.group('teamfourstar'):
                site_info = self._teamfourstar_get_info(url)

        # Started from a site page: follow its embed to the player.
        if not swm_info and site_info:
            swm_info = self._screenwavemedia_get_info(site_info['_embed_url'])

        if not swm_info:
            raise ExtractorError("Failed to extract metadata for this URL")

        if site_info:
            swm_info.update(site_info)

        return swm_info