Merge branch 'master' of github.com-rndusr:rg3/youtube-dl into fix/str-item-assignment

This commit is contained in:
Random User 2017-03-25 21:36:59 +01:00
commit 4f06c1c9fc
75 changed files with 2462 additions and 957 deletions

View file

@ -25,7 +25,8 @@ class AddAnimeIE(InfoExtractor):
'ext': 'mp4',
'description': 'One Piece 606',
'title': 'One Piece 606',
}
},
'skip': 'Video is gone',
}, {
'url': 'http://add-anime.net/video/MDUGWYKNGBD8/One-Piece-687',
'only_matching': True,

View file

@ -36,6 +36,11 @@ MSO_INFO = {
'username_field': 'Ecom_User_ID',
'password_field': 'Ecom_Password',
},
'Charter_Direct': {
'name': 'Charter Spectrum',
'username_field': 'IDToken1',
'password_field': 'IDToken2',
},
'thr030': {
'name': '3 Rivers Communications'
},
@ -1453,6 +1458,8 @@ class AdobePassIE(InfoExtractor):
self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {})
count += 1
continue
if '<error' in authorize:
raise ExtractorError(xml_text(authorize, 'details'), expected=True)
authz_token = unescapeHTML(xml_text(authorize, 'authzToken'))
requestor_info[guid] = authz_token
self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info)

View file

@ -4,15 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse_urlparse,
compat_urlparse,
)
from ..compat import compat_xpath
from ..utils import (
ExtractorError,
int_or_none,
update_url_query,
xpath_element,
xpath_text,
)
@ -43,7 +38,8 @@ class AfreecaTVIE(InfoExtractor):
'uploader': 'dailyapril',
'uploader_id': 'dailyapril',
'upload_date': '20160503',
}
},
'skip': 'Video is gone',
}, {
'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867',
'info_dict': {
@ -71,6 +67,19 @@ class AfreecaTVIE(InfoExtractor):
'upload_date': '20160502',
},
}],
'skip': 'Video is gone',
}, {
'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793',
'info_dict': {
'id': '18650793',
'ext': 'flv',
'uploader': '윈아디',
'uploader_id': 'badkids',
'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!',
},
'params': {
'skip_download': True, # requires rtmpdump
},
}, {
'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
'only_matching': True,
@ -90,40 +99,33 @@ class AfreecaTVIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
parsed_url = compat_urllib_parse_urlparse(url)
info_url = compat_urlparse.urlunparse(parsed_url._replace(
netloc='afbbs.afreecatv.com:8080',
path='/api/video/get_video_info.php'))
video_xml = self._download_xml(
update_url_query(info_url, {'nTitleNo': video_id}), video_id)
'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php',
video_id, query={'nTitleNo': video_id})
if xpath_element(video_xml, './track/video/file') is None:
video_element = video_xml.findall(compat_xpath('./track/video'))[1]
if video_element is None or video_element.text is None:
raise ExtractorError('Specified AfreecaTV video does not exist',
expected=True)
title = xpath_text(video_xml, './track/title', 'title')
video_url_raw = video_element.text
app, playpath = video_url_raw.split('mp4:')
title = xpath_text(video_xml, './track/title', 'title', fatal=True)
uploader = xpath_text(video_xml, './track/nickname', 'uploader')
uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id')
duration = int_or_none(xpath_text(video_xml, './track/duration',
'duration'))
thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail')
entries = []
for i, video_file in enumerate(video_xml.findall('./track/video/file')):
video_key = self.parse_video_key(video_file.get('key', ''))
if not video_key:
continue
entries.append({
'id': '%s_%s' % (video_id, video_key.get('part', i + 1)),
'title': title,
'upload_date': video_key.get('upload_date'),
'duration': int_or_none(video_file.get('duration')),
'url': video_file.text,
})
info = {
return {
'id': video_id,
'url': app,
'ext': 'flv',
'play_path': 'mp4:' + playpath,
'rtmp_live': True, # downloading won't end without this
'title': title,
'uploader': uploader,
'uploader_id': uploader_id,
@ -131,20 +133,6 @@ class AfreecaTVIE(InfoExtractor):
'thumbnail': thumbnail,
}
if len(entries) > 1:
info['_type'] = 'multi_video'
info['entries'] = entries
elif len(entries) == 1:
info['url'] = entries[0]['url']
info['upload_date'] = entries[0].get('upload_date')
else:
raise ExtractorError(
'No files found for the specified AfreecaTV video, either'
' the URL is incorrect or the video has been made private.',
expected=True)
return info
class AfreecaTVGlobalIE(AfreecaTVIE):
IE_NAME = 'afreecatv:global'

View file

@ -93,8 +93,7 @@ class ArkenaIE(InfoExtractor):
exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None))
if kind == 'm3u8' or 'm3u8' in exts:
formats.extend(self._extract_m3u8_formats(
f_url, video_id, 'mp4',
entry_protocol='m3u8' if is_live else 'm3u8_native',
f_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=kind, fatal=False, live=is_live))
elif kind == 'flash' or 'f4m' in exts:
formats.extend(self._extract_f4m_formats(

View file

@ -90,7 +90,8 @@ class AtresPlayerIE(InfoExtractor):
request, None, 'Logging in as %s' % username)
error = self._html_search_regex(
r'(?s)<ul class="list_error">(.+?)</ul>', response, 'error', default=None)
r'(?s)<ul[^>]+class="[^"]*\blist_error\b[^"]*">(.+?)</ul>',
response, 'error', default=None)
if error:
raise ExtractorError(
'Unable to login: %s' % error, expected=True)
@ -155,13 +156,17 @@ class AtresPlayerIE(InfoExtractor):
if format_id == 'token' or not video_url.startswith('http'):
continue
if 'geodeswowsmpra3player' in video_url:
f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0]
f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path)
# f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0]
# f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path)
# this videos are protected by DRM, the f4m downloader doesn't support them
continue
else:
f4m_url = video_url[:-9] + '/manifest.f4m'
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
video_url_hd = video_url.replace('free_es', 'es')
formats.extend(self._extract_f4m_formats(
video_url_hd[:-9] + '/manifest.f4m', video_id, f4m_id='hds',
fatal=False))
formats.extend(self._extract_mpd_formats(
video_url_hd[:-9] + '/manifest.mpd', video_id, mpd_id='dash',
fatal=False))
self._sort_formats(formats)
path_data = player.get('pathData')

View file

@ -0,0 +1,73 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
int_or_none,
unescapeHTML,
)
class ATVAtIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?atv\.at/(?:[^/]+/){2}(?P<id>[dv]\d+)'
_TESTS = [{
'url': 'http://atv.at/aktuell/di-210317-2005-uhr/v1698449/',
'md5': 'c3b6b975fb3150fc628572939df205f2',
'info_dict': {
'id': '1698447',
'ext': 'mp4',
'title': 'DI, 21.03.17 | 20:05 Uhr 1/1',
}
}, {
'url': 'http://atv.at/aktuell/meinrad-knapp/d8416/',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_data = self._parse_json(unescapeHTML(self._search_regex(
r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="([^"]+)"',
webpage, 'player data')), display_id)['config']['initial_video']
video_id = video_data['id']
video_title = video_data['title']
parts = []
for part in video_data.get('parts', []):
part_id = part['id']
part_title = part['title']
formats = []
for source in part.get('sources', []):
source_url = source.get('src')
if not source_url:
continue
ext = determine_ext(source_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
source_url, part_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
else:
formats.append({
'format_id': source.get('delivery'),
'url': source_url,
})
self._sort_formats(formats)
parts.append({
'id': part_id,
'title': part_title,
'thumbnail': part.get('preview_image_url'),
'duration': int_or_none(part.get('duration')),
'is_live': part.get('is_livestream'),
'formats': formats,
})
return {
'_type': 'multi_video',
'id': video_id,
'title': video_title,
'entries': parts,
}

View file

@ -21,10 +21,11 @@ class BellMediaIE(InfoExtractor):
animalplanet|
bravo|
mtv|
space
space|
etalk
)\.ca|
much\.com
)/.*?(?:\bvid=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
)/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
_TESTS = [{
'url': 'http://www.ctv.ca/video/player?vid=706966',
'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0',
@ -58,6 +59,9 @@ class BellMediaIE(InfoExtractor):
}, {
'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430',
'only_matching': True,
}, {
'url': 'http://www.etalk.ca/video?videoid=663455',
'only_matching': True,
}]
_DOMAINS = {
'thecomedynetwork': 'comedy',
@ -65,6 +69,7 @@ class BellMediaIE(InfoExtractor):
'sciencechannel': 'discsci',
'investigationdiscovery': 'invdisc',
'animalplanet': 'aniplan',
'etalk': 'ctv',
}
def _real_extract(self, url):

View file

@ -0,0 +1,72 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
extract_attributes,
)
class BostonGlobeIE(InfoExtractor):
_VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?'
_TESTS = [
{
'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html',
'md5': '0a62181079c85c2d2b618c9a738aedaf',
'info_dict': {
'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood',
'id': '5320421710001',
'ext': 'mp4',
'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.',
'timestamp': 1486877593,
'upload_date': '20170212',
'uploader_id': '245991542',
},
},
{
# Embedded youtube video; we hand it off to the Generic extractor.
'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html',
'md5': '582b40327089d5c0c949b3c54b13c24b',
'info_dict': {
'title': "Who Is Matt Damon's Favorite Batman?",
'id': 'ZW1QCnlA6Qc',
'ext': 'mp4',
'upload_date': '20170217',
'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb',
'uploader': 'The Late Late Show with James Corden',
'uploader_id': 'TheLateLateShow',
},
'expected_warnings': ['404'],
},
]
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
page_title = self._og_search_title(webpage, default=None)
# <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject">
entries = []
for video in re.findall(r'(?i)(<video[^>]+>)', webpage):
attrs = extract_attributes(video)
video_id = attrs.get('data-brightcove-video-id')
account_id = attrs.get('data-account')
player_id = attrs.get('data-player')
embed = attrs.get('data-embed')
if video_id and account_id and player_id and embed:
entries.append(
'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
% (account_id, player_id, embed, video_id))
if len(entries) == 0:
return self.url_result(url, 'Generic')
elif len(entries) == 1:
return self.url_result(entries[0], 'BrightcoveNew')
else:
return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew')

View file

@ -193,7 +193,13 @@ class BrightcoveLegacyIE(InfoExtractor):
if videoPlayer is not None:
if isinstance(videoPlayer, list):
videoPlayer = videoPlayer[0]
if not (videoPlayer.isdigit() or videoPlayer.startswith('ref:')):
videoPlayer = videoPlayer.strip()
# UUID is also possible for videoPlayer (e.g.
# http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd
# or http://www8.hp.com/cn/zh/home.html)
if not (re.match(
r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$',
videoPlayer) or videoPlayer.startswith('ref:')):
return None
params['@videoPlayer'] = videoPlayer
linkBase = find_param('linkBaseURL')
@ -515,6 +521,9 @@ class BrightcoveNewIE(InfoExtractor):
return entries
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
self._initialize_geo_bypass(smuggled_data.get('geo_countries'))
account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(
@ -544,8 +553,10 @@ class BrightcoveNewIE(InfoExtractor):
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
json_data = self._parse_json(e.cause.read().decode(), video_id)[0]
raise ExtractorError(
json_data.get('message') or json_data['error_code'], expected=True)
message = json_data.get('message') or json_data['error_code']
if json_data.get('error_subcode') == 'CLIENT_GEO':
self.raise_geo_restricted(msg=message)
raise ExtractorError(message, expected=True)
raise
title = json_data['name'].strip()

View file

@ -160,8 +160,7 @@ class CeskaTelevizeIE(InfoExtractor):
for format_id, stream_url in item.get('streamUrls', {}).items():
if 'playerType=flash' in stream_url:
stream_formats = self._extract_m3u8_formats(
stream_url, playlist_id, 'mp4',
entry_protocol='m3u8' if is_live else 'm3u8_native',
stream_url, playlist_id, 'mp4', 'm3u8_native',
m3u8_id='hls-%s' % format_id, fatal=False)
else:
stream_formats = self._extract_mpd_formats(

View file

@ -4,62 +4,62 @@ import re
from .common import InfoExtractor
from ..utils import (
clean_html,
ExtractorError,
parse_filesize,
int_or_none,
parse_iso8601,
qualities,
unescapeHTML,
)
class Channel9IE(InfoExtractor):
'''
Common extractor for channel9.msdn.com.
The type of provided URL (video or playlist) is determined according to
meta Search.PageType from web page HTML rather than URL itself, as it is
not always possible to do.
'''
IE_DESC = 'Channel 9'
IE_NAME = 'channel9'
_VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
_VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
_TESTS = [{
'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
'md5': '32083d4eaf1946db6d454313f44510ca',
'info_dict': {
'id': 'Events/TechEd/Australia/2013/KOS002',
'ext': 'mp4',
'id': '6c413323-383a-49dc-88f9-a22800cab024',
'ext': 'wmv',
'title': 'Developer Kick-Off Session: Stuff We Love',
'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
'duration': 4576,
'thumbnail': r're:http://.*\.jpg',
'thumbnail': r're:https?://.*\.jpg',
'timestamp': 1377717420,
'upload_date': '20130828',
'session_code': 'KOS002',
'session_day': 'Day 1',
'session_room': 'Arena 1A',
'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
'Mads Kristensen'],
'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
},
}, {
'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
'info_dict': {
'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
'ext': 'mp4',
'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
'ext': 'wmv',
'title': 'Self-service BI with Power BI - nuclear testing',
'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
'duration': 1540,
'thumbnail': r're:http://.*\.jpg',
'thumbnail': r're:https?://.*\.jpg',
'timestamp': 1386381991,
'upload_date': '20131207',
'authors': ['Mike Wilmot'],
},
}, {
# low quality mp4 is best
'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
'info_dict': {
'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
'ext': 'mp4',
'title': 'Ranges for the Standard Library',
'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
'duration': 5646,
'thumbnail': r're:http://.*\.jpg',
'thumbnail': r're:https?://.*\.jpg',
'upload_date': '20150930',
'timestamp': 1443640735,
},
'params': {
'skip_download': True,
@ -70,7 +70,7 @@ class Channel9IE(InfoExtractor):
'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
'title': 'Channel 9',
},
'playlist_count': 2,
'playlist_mincount': 100,
}, {
'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
'only_matching': True,
@ -81,189 +81,6 @@ class Channel9IE(InfoExtractor):
_RSS_URL = 'http://channel9.msdn.com/%s/RSS'
def _formats_from_html(self, html):
FORMAT_REGEX = r'''
(?x)
<a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
<span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
(?:<div\s+class="popup\s+rounded">\s*
<h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
</div>)? # File size part may be missing
'''
quality = qualities((
'MP3', 'MP4',
'Low Quality WMV', 'Low Quality MP4',
'Mid Quality WMV', 'Mid Quality MP4',
'High Quality WMV', 'High Quality MP4'))
formats = [{
'url': x.group('url'),
'format_id': x.group('quality'),
'format_note': x.group('note'),
'format': '%s (%s)' % (x.group('quality'), x.group('note')),
'filesize_approx': parse_filesize(x.group('filesize')),
'quality': quality(x.group('quality')),
'vcodec': 'none' if x.group('note') == 'Audio only' else None,
} for x in list(re.finditer(FORMAT_REGEX, html))]
self._sort_formats(formats)
return formats
def _extract_title(self, html):
title = self._html_search_meta('title', html, 'title')
if title is None:
title = self._og_search_title(html)
TITLE_SUFFIX = ' (Channel 9)'
if title is not None and title.endswith(TITLE_SUFFIX):
title = title[:-len(TITLE_SUFFIX)]
return title
def _extract_description(self, html):
DESCRIPTION_REGEX = r'''(?sx)
<div\s+class="entry-content">\s*
<div\s+id="entry-body">\s*
(?P<description>.+?)\s*
</div>\s*
</div>
'''
m = re.search(DESCRIPTION_REGEX, html)
if m is not None:
return m.group('description')
return self._html_search_meta('description', html, 'description')
def _extract_duration(self, html):
m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
def _extract_slides(self, html):
m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
return m.group('slidesurl') if m is not None else None
def _extract_zip(self, html):
m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
return m.group('zipurl') if m is not None else None
def _extract_avg_rating(self, html):
m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
return float(m.group('avgrating')) if m is not None else 0
def _extract_rating_count(self, html):
m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
def _extract_view_count(self, html):
m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
def _extract_comment_count(self, html):
m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
def _fix_count(self, count):
return int(str(count).replace(',', '')) if count is not None else None
def _extract_authors(self, html):
m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
if m is None:
return None
return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
def _extract_session_code(self, html):
m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
return m.group('code') if m is not None else None
def _extract_session_day(self, html):
m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
return m.group('day').strip() if m is not None else None
def _extract_session_room(self, html):
m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
return m.group('room') if m is not None else None
def _extract_session_speakers(self, html):
return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
def _extract_content(self, html, content_path):
# Look for downloadable content
formats = self._formats_from_html(html)
slides = self._extract_slides(html)
zip_ = self._extract_zip(html)
# Nothing to download
if len(formats) == 0 and slides is None and zip_ is None:
self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path)
return
# Extract meta
title = self._extract_title(html)
description = self._extract_description(html)
thumbnail = self._og_search_thumbnail(html)
duration = self._extract_duration(html)
avg_rating = self._extract_avg_rating(html)
rating_count = self._extract_rating_count(html)
view_count = self._extract_view_count(html)
comment_count = self._extract_comment_count(html)
common = {
'_type': 'video',
'id': content_path,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'avg_rating': avg_rating,
'rating_count': rating_count,
'view_count': view_count,
'comment_count': comment_count,
}
result = []
if slides is not None:
d = common.copy()
d.update({'title': title + '-Slides', 'url': slides})
result.append(d)
if zip_ is not None:
d = common.copy()
d.update({'title': title + '-Zip', 'url': zip_})
result.append(d)
if len(formats) > 0:
d = common.copy()
d.update({'title': title, 'formats': formats})
result.append(d)
return result
def _extract_entry_item(self, html, content_path):
contents = self._extract_content(html, content_path)
if contents is None:
return contents
if len(contents) > 1:
raise ExtractorError('Got more than one entry')
result = contents[0]
result['authors'] = self._extract_authors(html)
return result
def _extract_session(self, html, content_path):
contents = self._extract_content(html, content_path)
if contents is None:
return contents
session_meta = {
'session_code': self._extract_session_code(html),
'session_day': self._extract_session_day(html),
'session_room': self._extract_session_room(html),
'session_speakers': self._extract_session_speakers(html),
}
for content in contents:
content.update(session_meta)
return self.playlist_result(contents)
def _extract_list(self, video_id, rss_url=None):
if not rss_url:
rss_url = self._RSS_URL % video_id
@ -274,9 +91,7 @@ class Channel9IE(InfoExtractor):
return self.playlist_result(entries, video_id, title_text)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
content_path = mobj.group('contentpath')
rss = mobj.group('rss')
content_path, rss = re.match(self._VALID_URL, url).groups()
if rss:
return self._extract_list(content_path, url)
@ -284,17 +99,158 @@ class Channel9IE(InfoExtractor):
webpage = self._download_webpage(
url, content_path, 'Downloading web page')
page_type = self._search_regex(
r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
webpage, 'page type', default=None, group='pagetype')
if page_type:
if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content
return self._extract_entry_item(webpage, content_path)
elif page_type == 'Session': # Event session page, may contain downloadable content
return self._extract_session(webpage, content_path)
elif page_type == 'Event':
return self._extract_list(content_path)
episode_data = self._search_regex(
r"data-episode='([^']+)'", webpage, 'episode data', default=None)
if episode_data:
episode_data = self._parse_json(unescapeHTML(
episode_data), content_path)
content_id = episode_data['contentId']
is_session = '/Sessions(' in episode_data['api']
content_url = 'https://channel9.msdn.com/odata' + episode_data['api']
if is_session:
content_url += '?$expand=Speakers'
else:
raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
else: # Assuming list
content_url += '?$expand=Authors'
content_data = self._download_json(content_url, content_id)
title = content_data['Title']
QUALITIES = (
'mp3',
'wmv', 'mp4',
'wmv-low', 'mp4-low',
'wmv-mid', 'mp4-mid',
'wmv-high', 'mp4-high',
)
quality_key = qualities(QUALITIES)
def quality(quality_id, format_url):
return (len(QUALITIES) if '_Source.' in format_url
else quality_key(quality_id))
formats = []
urls = set()
SITE_QUALITIES = {
'MP3': 'mp3',
'MP4': 'mp4',
'Low Quality WMV': 'wmv-low',
'Low Quality MP4': 'mp4-low',
'Mid Quality WMV': 'wmv-mid',
'Mid Quality MP4': 'mp4-mid',
'High Quality WMV': 'wmv-high',
'High Quality MP4': 'mp4-high',
}
formats_select = self._search_regex(
r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage,
'formats select', default=None)
if formats_select:
for mobj in re.finditer(
r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<',
formats_select):
format_url = mobj.group('url')
if format_url in urls:
continue
urls.add(format_url)
format_id = mobj.group('format')
quality_id = SITE_QUALITIES.get(format_id, format_id)
formats.append({
'url': format_url,
'format_id': quality_id,
'quality': quality(quality_id, format_url),
'vcodec': 'none' if quality_id == 'mp3' else None,
})
API_QUALITIES = {
'VideoMP4Low': 'mp4-low',
'VideoWMV': 'wmv-mid',
'VideoMP4Medium': 'mp4-mid',
'VideoMP4High': 'mp4-high',
'VideoWMVHQ': 'wmv-hq',
}
for format_id, q in API_QUALITIES.items():
q_url = content_data.get(format_id)
if not q_url or q_url in urls:
continue
urls.add(q_url)
formats.append({
'url': q_url,
'format_id': q,
'quality': quality(q, q_url),
})
self._sort_formats(formats)
slides = content_data.get('Slides')
zip_file = content_data.get('ZipFile')
if not formats and not slides and not zip_file:
raise ExtractorError(
'None of recording, slides or zip are available for %s' % content_path)
subtitles = {}
for caption in content_data.get('Captions', []):
caption_url = caption.get('Url')
if not caption_url:
continue
subtitles.setdefault(caption.get('Language', 'en'), []).append({
'url': caption_url,
'ext': 'vtt',
})
common = {
'id': content_id,
'title': title,
'description': clean_html(content_data.get('Description') or content_data.get('Body')),
'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
'timestamp': parse_iso8601(content_data.get('PublishedDate')),
'avg_rating': int_or_none(content_data.get('Rating')),
'rating_count': int_or_none(content_data.get('RatingCount')),
'view_count': int_or_none(content_data.get('Views')),
'comment_count': int_or_none(content_data.get('CommentCount')),
'subtitles': subtitles,
}
if is_session:
speakers = []
for s in content_data.get('Speakers', []):
speaker_name = s.get('FullName')
if not speaker_name:
continue
speakers.append(speaker_name)
common.update({
'session_code': content_data.get('Code'),
'session_room': content_data.get('Room'),
'session_speakers': speakers,
})
else:
authors = []
for a in content_data.get('Authors', []):
author_name = a.get('DisplayName')
if not author_name:
continue
authors.append(author_name)
common['authors'] = authors
contents = []
if slides:
d = common.copy()
d.update({'title': title + '-Slides', 'url': slides})
contents.append(d)
if zip_file:
d = common.copy()
d.update({'title': title + '-Zip', 'url': zip_file})
contents.append(d)
if formats:
d = common.copy()
d.update({'title': title, 'formats': formats})
contents.append(d)
return self.playlist_result(contents)
else:
return self._extract_list(content_path)

View file

@ -1,97 +1,56 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_HTTPError,
)
from ..utils import (
ExtractorError,
HEADRequest,
remove_end,
str_to_int,
unified_strdate,
)
class CloudyIE(InfoExtractor):
_IE_DESC = 'cloudy.ec'
_VALID_URL = r'''(?x)
https?://(?:www\.)?cloudy\.ec/
(?:v/|embed\.php\?id=)
(?P<id>[A-Za-z0-9]+)
'''
_EMBED_URL = 'http://www.cloudy.ec/embed.php?id=%s'
_API_URL = 'http://www.cloudy.ec/api/player.api.php'
_MAX_TRIES = 2
_TEST = {
_VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'https://www.cloudy.ec/v/af511e2527aac',
'md5': '5cb253ace826a42f35b4740539bedf07',
'md5': '29832b05028ead1b58be86bf319397ca',
'info_dict': {
'id': 'af511e2527aac',
'ext': 'flv',
'ext': 'mp4',
'title': 'Funny Cats and Animals Compilation june 2013',
'upload_date': '20130913',
'view_count': int,
}
}
def _extract_video(self, video_id, file_key, error_url=None, try_num=0):
if try_num > self._MAX_TRIES - 1:
raise ExtractorError('Unable to extract video URL', expected=True)
form = {
'file': video_id,
'key': file_key,
}
if error_url:
form.update({
'numOfErrors': try_num,
'errorCode': '404',
'errorUrl': error_url,
})
player_data = self._download_webpage(
self._API_URL, video_id, 'Downloading player data', query=form)
data = compat_parse_qs(player_data)
try_num += 1
if 'error' in data:
raise ExtractorError(
'%s error: %s' % (self.IE_NAME, ' '.join(data['error_msg'])),
expected=True)
title = data.get('title', [None])[0]
if title:
title = remove_end(title, '&asdasdas').strip()
video_url = data.get('url', [None])[0]
if video_url:
try:
self._request_webpage(HEADRequest(video_url), video_id, 'Checking video URL')
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in [404, 410]:
self.report_warning('Invalid video URL, requesting another', video_id)
return self._extract_video(video_id, file_key, video_url, try_num)
return {
'id': video_id,
'url': video_url,
'title': title,
}
}, {
'url': 'http://www.cloudy.ec/embed.php?autoplay=1&id=af511e2527aac',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_id = self._match_id(url)
url = self._EMBED_URL % video_id
webpage = self._download_webpage(url, video_id)
webpage = self._download_webpage(
'http://www.cloudy.ec/embed.php?id=%s' % video_id, video_id)
file_key = self._search_regex(
[r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'],
webpage, 'file_key')
info = self._parse_html5_media_entries(url, webpage, video_id)[0]
return self._extract_video(video_id, file_key)
webpage = self._download_webpage(
'https://www.cloudy.ec/v/%s' % video_id, video_id, fatal=False)
if webpage:
info.update({
'title': self._search_regex(
r'<h\d[^>]*>([^<]+)<', webpage, 'title'),
'upload_date': unified_strdate(self._search_regex(
r'>Published at (\d{4}-\d{1,2}-\d{1,2})', webpage,
'upload date', fatal=False)),
'view_count': str_to_int(self._search_regex(
r'([\d,.]+) views<', webpage, 'view count', fatal=False)),
})
if not info.get('title'):
info['title'] = video_id
info['id'] = video_id
return info

View file

@ -36,34 +36,35 @@ from ..utils import (
clean_html,
compiled_regex_type,
determine_ext,
determine_protocol,
error_to_compat_str,
ExtractorError,
extract_attributes,
fix_xml_ampersands,
float_or_none,
GeoRestrictedError,
GeoUtils,
int_or_none,
js_to_json,
mimetype2ext,
orderedSet,
parse_codecs,
parse_duration,
parse_iso8601,
parse_m3u8_attributes,
RegexNotFoundError,
sanitize_filename,
sanitized_Request,
sanitize_filename,
unescapeHTML,
unified_strdate,
unified_timestamp,
update_Request,
update_url_query,
urljoin,
url_basename,
xpath_element,
xpath_text,
xpath_with_ns,
determine_protocol,
parse_duration,
mimetype2ext,
update_Request,
update_url_query,
parse_m3u8_attributes,
extract_attributes,
parse_codecs,
urljoin,
)
@ -714,6 +715,13 @@ class InfoExtractor(object):
video_info['title'] = video_title
return video_info
def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
urlrs = orderedSet(
self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
for m in matches)
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
"""Returns a playlist"""
@ -2204,56 +2212,9 @@ class InfoExtractor(object):
this_video_id = video_id or video_data['mediaid']
formats = []
for source in video_data['sources']:
source_url = self._proto_relative_url(source['file'])
if base_url:
source_url = compat_urlparse.urljoin(base_url, source_url)
source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url)
if source_type == 'hls' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
source_url, this_video_id, mpd_id=mpd_id, fatal=False))
# https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
formats.append({
'url': source_url,
'vcodec': 'none',
'ext': ext,
})
else:
height = int_or_none(source.get('height'))
if height is None:
# Often no height is provided but there is a label in
# format like 1080p.
height = int_or_none(self._search_regex(
r'^(\d{3,})[pP]$', source.get('label') or '',
'height', default=None))
a_format = {
'url': source_url,
'width': int_or_none(source.get('width')),
'height': height,
'ext': ext,
}
if source_url.startswith('rtmp'):
a_format['ext'] = 'flv'
# See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
# of jwplayer.flash.swf
rtmp_url_parts = re.split(
r'((?:mp4|mp3|flv):)', source_url, 1)
if len(rtmp_url_parts) == 3:
rtmp_url, prefix, play_path = rtmp_url_parts
a_format.update({
'url': rtmp_url,
'play_path': prefix + play_path,
})
if rtmp_params:
a_format.update(rtmp_params)
formats.append(a_format)
formats = self._parse_jwplayer_formats(
video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
self._sort_formats(formats)
subtitles = {}
@ -2284,6 +2245,65 @@ class InfoExtractor(object):
else:
return self.playlist_result(entries)
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
formats = []
for source in jwplayer_sources_data:
source_url = self._proto_relative_url(source['file'])
if base_url:
source_url = compat_urlparse.urljoin(base_url, source_url)
source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url)
if source_type == 'hls' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=m3u8_id, fatal=False))
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
source_url, video_id, mpd_id=mpd_id, fatal=False))
elif ext == 'smil':
formats.extend(self._extract_smil_formats(
source_url, video_id, fatal=False))
# https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
elif source_type.startswith('audio') or ext in (
'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
formats.append({
'url': source_url,
'vcodec': 'none',
'ext': ext,
})
else:
height = int_or_none(source.get('height'))
if height is None:
# Often no height is provided but there is a label in
# format like "1080p", "720p SD", or 1080.
height = int_or_none(self._search_regex(
r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
'height', default=None))
a_format = {
'url': source_url,
'width': int_or_none(source.get('width')),
'height': height,
'tbr': int_or_none(source.get('bitrate')),
'ext': ext,
}
if source_url.startswith('rtmp'):
a_format['ext'] = 'flv'
# See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
# of jwplayer.flash.swf
rtmp_url_parts = re.split(
r'((?:mp4|mp3|flv):)', source_url, 1)
if len(rtmp_url_parts) == 3:
rtmp_url, prefix, play_path = rtmp_url_parts
a_format.update({
'url': rtmp_url,
'play_path': prefix + play_path,
})
if rtmp_params:
a_format.update(rtmp_params)
formats.append(a_format)
return formats
def _live_title(self, name):
""" Generate the title for a live video """
now = datetime.datetime.now()

View file

@ -9,13 +9,14 @@ from ..compat import (
compat_urlparse,
)
from ..utils import (
orderedSet,
remove_end,
extract_attributes,
mimetype2ext,
determine_ext,
extract_attributes,
int_or_none,
js_to_json,
mimetype2ext,
orderedSet,
parse_iso8601,
remove_end,
)
@ -66,6 +67,16 @@ class CondeNastIE(InfoExtractor):
'upload_date': '20130314',
'timestamp': 1363219200,
}
}, {
'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series',
'info_dict': {
'id': '58d1865bfd2e6126e2000015',
'ext': 'mp4',
'title': 'The Only True Surprise? Trumps an Idiot',
'uploader': 'gq',
'upload_date': '20170321',
'timestamp': 1490126427,
},
}, {
# JS embed
'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js',
@ -114,26 +125,33 @@ class CondeNastIE(InfoExtractor):
})
video_id = query['videoId']
video_info = None
info_page = self._download_webpage(
info_page = self._download_json(
'http://player.cnevids.com/player/video.js',
video_id, 'Downloading video info', query=query, fatal=False)
video_id, 'Downloading video info', fatal=False, query=query)
if info_page:
video_info = self._parse_json(self._search_regex(
r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video']
else:
video_info = info_page.get('video')
if not video_info:
info_page = self._download_webpage(
'http://player.cnevids.com/player/loader.js',
video_id, 'Downloading loader info', query=query)
video_info = self._parse_json(self._search_regex(
r'var\s+video\s*=\s*({.+?});', info_page, 'video info'), video_id)
video_info = self._parse_json(
self._search_regex(
r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'),
video_id, transform_source=js_to_json)['video']
title = video_info['title']
formats = []
for fdata in video_info.get('sources', [{}])[0]:
for fdata in video_info['sources']:
src = fdata.get('src')
if not src:
continue
ext = mimetype2ext(fdata.get('type')) or determine_ext(src)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
src, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
continue
quality = fdata.get('quality')
formats.append({
'format_id': ext + ('-%s' % quality if quality else ''),
@ -169,7 +187,6 @@ class CondeNastIE(InfoExtractor):
path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))
url_type = 'embed'
self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site])
webpage = self._download_webpage(url, item_id)
if url_type == 'series':

View file

@ -177,6 +177,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'uploader': 'Kadokawa Pictures Inc.',
'upload_date': '20170118',
'series': "KONOSUBA -God's blessing on this wonderful world!",
'season': "KONOSUBA -God's blessing on this wonderful world! 2",
'season_number': 2,
'episode': 'Give Me Deliverance from this Judicial Injustice!',
'episode_number': 1,
@ -222,6 +223,23 @@ class CrunchyrollIE(CrunchyrollBaseIE):
# just test metadata extraction
'skip_download': True,
},
}, {
# A video with a vastly different season name compared to the series name
'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532',
'info_dict': {
'id': '590532',
'ext': 'mp4',
'title': 'Haiyoru! Nyaruani (ONA) Episode 1 Test',
'description': 'Mahiro and Nyaruko talk about official certification.',
'uploader': 'TV TOKYO',
'upload_date': '20120305',
'series': 'Nyarko-san: Another Crawling Chaos',
'season': 'Haiyoru! Nyaruani (ONA)',
},
'params': {
# Just test metadata extraction
'skip_download': True,
},
}]
_FORMAT_IDS = {
@ -491,7 +509,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
# webpage provide more accurate data than series_title from XML
series = self._html_search_regex(
r'id=["\']showmedia_about_episode_num[^>]+>\s*<a[^>]+>([^<]+)',
webpage, 'series', default=xpath_text(metadata, 'series_title'))
webpage, 'series', fatal=False)
season = xpath_text(metadata, 'series_title')
episode = xpath_text(metadata, 'episode_title')
episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
@ -508,6 +527,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'uploader': video_uploader,
'upload_date': video_upload_date,
'series': series,
'season': season,
'season_number': season_number,
'episode': episode,
'episode_number': episode_number,

View file

@ -1,17 +1,21 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
extract_attributes,
ExtractorError,
int_or_none,
parse_age_limit,
ExtractorError,
remove_end,
unescapeHTML,
)
class DiscoveryGoIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://(?:www\.)?(?:
class DiscoveryGoBaseIE(InfoExtractor):
_VALID_URL_TEMPLATE = r'''(?x)https?://(?:www\.)?(?:
discovery|
investigationdiscovery|
discoverylife|
@ -21,18 +25,23 @@ class DiscoveryGoIE(InfoExtractor):
sciencechannel|
tlc|
velocitychannel
)go\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'''
)go\.com/%s(?P<id>[^/?#&]+)'''
class DiscoveryGoIE(DiscoveryGoBaseIE):
_VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+'
_GEO_COUNTRIES = ['US']
_TEST = {
'url': 'https://www.discoverygo.com/love-at-first-kiss/kiss-first-ask-questions-later/',
'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/',
'info_dict': {
'id': '57a33c536b66d1cd0345eeb1',
'id': '58c167d86b66d12f2addeb01',
'ext': 'mp4',
'title': 'Kiss First, Ask Questions Later!',
'description': 'md5:fe923ba34050eae468bffae10831cb22',
'duration': 2579,
'series': 'Love at First Kiss',
'season_number': 1,
'episode_number': 1,
'title': 'Reaper Madness',
'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78',
'duration': 2519,
'series': 'Bering Sea Gold',
'season_number': 8,
'episode_number': 6,
'age_limit': 14,
},
}
@ -113,3 +122,46 @@ class DiscoveryGoIE(InfoExtractor):
'formats': formats,
'subtitles': subtitles,
}
class DiscoveryGoPlaylistIE(DiscoveryGoBaseIE):
_VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % ''
_TEST = {
'url': 'https://www.discoverygo.com/bering-sea-gold/',
'info_dict': {
'id': 'bering-sea-gold',
'title': 'Bering Sea Gold',
'description': 'md5:cc5c6489835949043c0cc3ad66c2fa0e',
},
'playlist_mincount': 6,
}
@classmethod
def suitable(cls, url):
return False if DiscoveryGoIE.suitable(url) else super(
DiscoveryGoPlaylistIE, cls).suitable(url)
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
entries = []
for mobj in re.finditer(r'data-json=(["\'])(?P<json>{.+?})\1', webpage):
data = self._parse_json(
mobj.group('json'), display_id,
transform_source=unescapeHTML, fatal=False)
if not isinstance(data, dict) or data.get('type') != 'episode':
continue
episode_url = data.get('socialUrl')
if not episode_url:
continue
entries.append(self.url_result(
episode_url, ie=DiscoveryGoIE.ie_key(),
video_id=data.get('id')))
return self.playlist_result(
entries, display_id,
remove_end(self._og_search_title(
webpage, fatal=False), ' | Discovery GO'),
self._og_search_description(webpage))

View file

@ -9,13 +9,13 @@ from ..compat import (
compat_parse_qs,
compat_urlparse,
)
from ..utils import smuggle_url
class TlcDeIE(InfoExtractor):
IE_NAME = 'tlc.de'
_VALID_URL = r'https?://(?:www\.)?tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?'
class DiscoveryNetworksDeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:discovery|tlc|animalplanet|dmax)\.de/(?:.*#(?P<id>\d+)|(?:[^/]+/)*videos/(?P<title>[^/?#]+))'
_TEST = {
_TESTS = [{
'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001',
'info_dict': {
'id': '3235167922001',
@ -29,7 +29,13 @@ class TlcDeIE(InfoExtractor):
'upload_date': '20140404',
'uploader_id': '1659832546',
},
}
}, {
'url': 'http://www.dmax.de/programme/storage-hunters-uk/videos/storage-hunters-uk-episode-6/',
'only_matching': True,
}, {
'url': 'http://www.discovery.de/#5332316765001',
'only_matching': True,
}]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1659832546/default_default/index.html?videoId=%s'
def _real_extract(self, url):
@ -39,5 +45,8 @@ class TlcDeIE(InfoExtractor):
title = mobj.group('title')
webpage = self._download_webpage(url, title)
brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0]
return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
brightcove_id = compat_parse_qs(compat_urlparse.urlparse(
brightcove_legacy_url).query)['@videoPlayer'][0]
return self.url_result(smuggle_url(
self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {'geo_countries': ['DE']}),
'BrightcoveNew', brightcove_id)

View file

@ -1,6 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
import time
import hashlib
from .common import InfoExtractor
from ..utils import (
ExtractorError,
@ -16,7 +19,7 @@ class DouyuTVIE(InfoExtractor):
'info_dict': {
'id': '17732',
'display_id': 'iseven',
'ext': 'mp4',
'ext': 'flv',
'title': 're:^清晨醒脑T-ARA根本停不下来 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.jpg$',
@ -31,7 +34,7 @@ class DouyuTVIE(InfoExtractor):
'info_dict': {
'id': '85982',
'display_id': '85982',
'ext': 'mp4',
'ext': 'flv',
'title': 're:^小漠从零单排记——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': 'md5:746a2f7a253966a06755a912f0acc0d2',
'thumbnail': r're:^https?://.*\.jpg$',
@ -47,7 +50,7 @@ class DouyuTVIE(InfoExtractor):
'info_dict': {
'id': '17732',
'display_id': '17732',
'ext': 'mp4',
'ext': 'flv',
'title': 're:^清晨醒脑T-ARA根本停不下来 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.jpg$',
@ -66,10 +69,6 @@ class DouyuTVIE(InfoExtractor):
'only_matching': True,
}]
# Decompile core.swf in webpage by ffdec "Search SWFs in memory". core.swf
# is encrypted originally, but ffdec can dump memory to get the decrypted one.
_API_KEY = 'A12Svb&%1UUmf@hC'
def _real_extract(self, url):
video_id = self._match_id(url)
@ -80,6 +79,7 @@ class DouyuTVIE(InfoExtractor):
room_id = self._html_search_regex(
r'"room_id\\?"\s*:\s*(\d+),', page, 'room id')
# Grab metadata from mobile API
room = self._download_json(
'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id,
note='Downloading room info')['data']
@ -88,8 +88,19 @@ class DouyuTVIE(InfoExtractor):
if room.get('show_status') == '2':
raise ExtractorError('Live stream is offline', expected=True)
formats = self._extract_m3u8_formats(
room['hls_url'], video_id, ext='mp4')
# Grab the URL from PC client API
# The m3u8 url from mobile API requires re-authentication every 5 minutes
tt = int(time.time())
signContent = 'lapi/live/thirdPart/getPlay/%s?aid=pcclient&rate=0&time=%d9TUk5fjjUjg9qIMH3sdnh' % (room_id, tt)
sign = hashlib.md5(signContent.encode('ascii')).hexdigest()
video_url = self._download_json(
'http://coapi.douyucdn.cn/lapi/live/thirdPart/getPlay/' + room_id,
video_id, note='Downloading video URL info',
query={'rate': 0}, headers={
'auth': sign,
'time': str(tt),
'aid': 'pcclient'
})['data']['live_url']
title = self._live_title(unescapeHTML(room['room_name']))
description = room.get('show_details')
@ -99,7 +110,7 @@ class DouyuTVIE(InfoExtractor):
return {
'id': room_id,
'display_id': video_id,
'formats': formats,
'url': video_url,
'title': title,
'description': description,
'thumbnail': thumbnail,

View file

@ -6,37 +6,24 @@ import re
import time
from .common import InfoExtractor
from ..compat import compat_urlparse
from ..compat import (
compat_urlparse,
compat_HTTPError,
)
from ..utils import (
USER_AGENTS,
ExtractorError,
int_or_none,
unified_strdate,
remove_end,
update_url_query,
)
class DPlayIE(InfoExtractor):
_VALID_URL = r'https?://(?P<domain>it\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)'
_VALID_URL = r'https?://(?P<domain>www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)'
_TESTS = [{
# geo restricted, via direct unsigned hls URL
'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/',
'info_dict': {
'id': '1255600',
'display_id': 'stagione-1-episodio-25',
'ext': 'mp4',
'title': 'Episodio 25',
'description': 'md5:cae5f40ad988811b197d2d27a53227eb',
'duration': 2761,
'timestamp': 1454701800,
'upload_date': '20160205',
'creator': 'RTIT',
'series': 'Take me out',
'season_number': 1,
'episode_number': 25,
'age_limit': 0,
},
'expected_warnings': ['Unable to download f4m manifest'],
}, {
# non geo restricted, via secure api, unsigned download hls URL
'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/',
'info_dict': {
@ -168,3 +155,90 @@ class DPlayIE(InfoExtractor):
'formats': formats,
'subtitles': subtitles,
}
class DPlayItIE(InfoExtractor):
_VALID_URL = r'https?://it\.dplay\.com/[^/]+/[^/]+/(?P<id>[^/?#]+)'
_GEO_COUNTRIES = ['IT']
_TEST = {
'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/',
'md5': '2b808ffb00fc47b884a172ca5d13053c',
'info_dict': {
'id': '6918',
'display_id': 'luigi-di-maio-la-psicosi-di-stanislawskij',
'ext': 'mp4',
'title': 'Biografie imbarazzanti: Luigi Di Maio: la psicosi di Stanislawskij',
'description': 'md5:3c7a4303aef85868f867a26f5cc14813',
'thumbnail': r're:^https?://.*\.jpe?g',
'upload_date': '20160524',
'series': 'Biografie imbarazzanti',
'season_number': 1,
'episode': 'Luigi Di Maio: la psicosi di Stanislawskij',
'episode_number': 1,
},
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
info_url = self._search_regex(
r'url\s*:\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)',
webpage, 'video id')
title = remove_end(self._og_search_title(webpage), ' | Dplay')
try:
info = self._download_json(
info_url, display_id, headers={
'Authorization': 'Bearer %s' % self._get_cookies(url).get(
'dplayit_token').value,
'Referer': url,
})
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403):
info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
error = info['errors'][0]
if error.get('code') == 'access.denied.geoblocked':
self.raise_geo_restricted(
msg=error.get('detail'), countries=self._GEO_COUNTRIES)
raise ExtractorError(info['errors'][0]['detail'], expected=True)
raise
hls_url = info['data']['attributes']['streaming']['hls']['url']
formats = self._extract_m3u8_formats(
hls_url, display_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id='hls')
series = self._html_search_regex(
r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>',
webpage, 'series', fatal=False)
episode = self._search_regex(
r'<p[^>]+class=["\'].*?\bdesc_ep\b.*?["\'][^>]*>\s*<br/>\s*<b>([^<]+)',
webpage, 'episode', fatal=False)
mobj = re.search(
r'(?s)<span[^>]+class=["\']dates["\'][^>]*>.+?\bS\.(?P<season_number>\d+)\s+E\.(?P<episode_number>\d+)\s*-\s*(?P<upload_date>\d{2}/\d{2}/\d{4})',
webpage)
if mobj:
season_number = int(mobj.group('season_number'))
episode_number = int(mobj.group('episode_number'))
upload_date = unified_strdate(mobj.group('upload_date'))
else:
season_number = episode_number = upload_date = None
return {
'id': info_url.rpartition('/')[-1],
'display_id': display_id,
'title': title,
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'series': series,
'season_number': season_number,
'episode': episode,
'episode_number': episode_number,
'upload_date': upload_date,
'formats': formats,
}

View file

@ -15,6 +15,8 @@ from ..utils import (
class DRTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio/ondemand)/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)'
_GEO_BYPASS = False
_GEO_COUNTRIES = ['DK']
IE_NAME = 'drtv'
_TESTS = [{
'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10',
@ -137,7 +139,7 @@ class DRTVIE(InfoExtractor):
if not formats and restricted_to_denmark:
self.raise_geo_restricted(
'Unfortunately, DR is not allowed to show this program outside Denmark.',
expected=True)
countries=self._GEO_COUNTRIES)
self._sort_formats(formats)
@ -156,6 +158,7 @@ class DRTVIE(InfoExtractor):
class DRTVLiveIE(InfoExtractor):
IE_NAME = 'drtv:live'
_VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)'
_GEO_COUNTRIES = ['DK']
_TEST = {
'url': 'https://www.dr.dk/tv/live/dr1',
'info_dict': {

View file

@ -71,6 +71,7 @@ from .arte import (
)
from .atresplayer import AtresPlayerIE
from .atttechchannel import ATTTechChannelIE
from .atvat import ATVAtIE
from .audimedia import AudiMediaIE
from .audioboom import AudioBoomIE
from .audiomack import AudiomackIE, AudiomackAlbumIE
@ -117,6 +118,7 @@ from .bleacherreport import (
from .blinkx import BlinkxIE
from .bloomberg import BloombergIE
from .bokecc import BokeCCIE
from .bostonglobe import BostonGlobeIE
from .bpb import BpbIE
from .br import BRIE
from .bravotv import BravoTVIE
@ -246,7 +248,10 @@ from .dfb import DFBIE
from .dhm import DHMIE
from .dotsub import DotsubIE
from .douyutv import DouyuTVIE
from .dplay import DPlayIE
from .dplay import (
DPlayIE,
DPlayItIE,
)
from .dramafever import (
DramaFeverIE,
DramaFeverSeriesIE,
@ -262,7 +267,11 @@ from .dvtv import DVTVIE
from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE
from .discoverygo import DiscoveryGoIE
from .discoverygo import (
DiscoveryGoIE,
DiscoveryGoPlaylistIE,
)
from .discoverynetworks import DiscoveryNetworksDeIE
from .disney import DisneyIE
from .dispeak import DigitallySpeakingIE
from .dropbox import DropboxIE
@ -793,6 +802,7 @@ from .rai import (
)
from .rbmaradio import RBMARadioIE
from .rds import RDSIE
from .redbulltv import RedBullTVIE
from .redtube import RedTubeIE
from .regiotv import RegioTVIE
from .rentv import (
@ -966,7 +976,6 @@ from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
from .threeqsdn import ThreeQSDNIE
from .tinypic import TinyPicIE
from .tlc import TlcDeIE
from .tmz import (
TMZIE,
TMZArticleIE,
@ -979,6 +988,7 @@ from .tnaflix import (
)
from .toggle import ToggleIE
from .tonline import TOnlineIE
from .toongoggles import ToonGogglesIE
from .toutv import TouTvIE
from .toypics import ToypicsUserIE, ToypicsIE
from .traileraddict import TrailerAddictIE
@ -999,6 +1009,7 @@ from .tunein import (
TuneInTopicIE,
TuneInShortenerIE,
)
from .tunepk import TunePkIE
from .turbo import TurboIE
from .tutv import TutvIE
from .tv2 import (
@ -1165,6 +1176,8 @@ from .voicerepublic import VoiceRepublicIE
from .voxmedia import VoxMediaIE
from .vporn import VpornIE
from .vrt import VRTIE
from .vrak import VrakIE
from .medialaan import MedialaanIE
from .vube import VubeIE
from .vuclip import VuClipIE
from .vvvvid import VVVVIDIE

View file

@ -54,7 +54,7 @@ class EyedoTVIE(InfoExtractor):
'id': video_id,
'title': title,
'formats': self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8' if is_live else 'm3u8_native'),
m3u8_url, video_id, 'mp4', 'm3u8_native'),
'description': xpath_text(video_data, _add_ns('Description')),
'duration': parse_duration(xpath_text(video_data, _add_ns('Duration'))),
'uploader': xpath_text(video_data, _add_ns('Createur')),

View file

@ -196,6 +196,10 @@ class FacebookIE(InfoExtractor):
}, {
'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670',
'only_matching': True,
}, {
# no title
'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/',
'only_matching': True,
}]
@staticmethod
@ -303,7 +307,7 @@ class FacebookIE(InfoExtractor):
if not video_data:
server_js_data = self._parse_json(
self._search_regex(
r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall)',
r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)',
webpage, 'js data', default='{}'),
video_id, transform_source=js_to_json, fatal=False)
if server_js_data:
@ -353,15 +357,15 @@ class FacebookIE(InfoExtractor):
self._sort_formats(formats)
video_title = self._html_search_regex(
r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
default=None)
r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
'title', default=None)
if not video_title:
video_title = self._html_search_regex(
r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
webpage, 'alternative title', default=None)
if not video_title:
video_title = self._html_search_meta(
'description', webpage, 'title')
'description', webpage, 'title', default=None)
if video_title:
video_title = limit_length(video_title, 80)
else:

View file

@ -47,9 +47,12 @@ class FOXIE(AdobePassIE):
resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating)
query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource)
return {
info = self._search_json_ld(webpage, video_id, fatal=False)
info.update({
'_type': 'url_transparent',
'ie_key': 'ThePlatform',
'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}),
'id': video_id,
}
})
return info

View file

@ -4,7 +4,8 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
unified_strdate,
extract_attributes,
int_or_none,
)
@ -19,6 +20,7 @@ class FranceCultureIE(InfoExtractor):
'title': 'Rendez-vous au pays des geeks',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20140301',
'timestamp': 1393642916,
'vcodec': 'none',
}
}
@ -28,30 +30,34 @@ class FranceCultureIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
video_url = self._search_regex(
r'(?s)<div[^>]+class="[^"]*?title-zone-diffusion[^"]*?"[^>]*>.*?<button[^>]+data-asset-source="([^"]+)"',
webpage, 'video path')
video_data = extract_attributes(self._search_regex(
r'(?s)<div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>.*?(<button[^>]+data-asset-source="[^"]+"[^>]+>)',
webpage, 'video data'))
title = self._og_search_title(webpage)
video_url = video_data['data-asset-source']
title = video_data.get('data-asset-title') or self._og_search_title(webpage)
upload_date = unified_strdate(self._search_regex(
'(?s)<div[^>]+class="date"[^>]*>.*?<span[^>]+class="inner"[^>]*>([^<]+)<',
webpage, 'upload date', fatal=False))
description = self._html_search_regex(
r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>',
webpage, 'description', default=None)
thumbnail = self._search_regex(
r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+data-dejavu-src="([^"]+)"',
r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"',
webpage, 'thumbnail', fatal=False)
uploader = self._html_search_regex(
r'(?s)<div id="emission".*?<span class="author">(.*?)</span>',
r'(?s)<span class="author">(.*?)</span>',
webpage, 'uploader', default=None)
vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None
ext = determine_ext(video_url.lower())
return {
'id': display_id,
'display_id': display_id,
'url': video_url,
'title': title,
'description': description,
'thumbnail': thumbnail,
'vcodec': vcodec,
'ext': ext,
'vcodec': 'none' if ext == 'mp3' else None,
'uploader': uploader,
'upload_date': upload_date,
'timestamp': int_or_none(video_data.get('data-asset-created-date')),
'duration': int_or_none(video_data.get('data-duration')),
}

View file

@ -56,9 +56,8 @@ class FreshLiveIE(InfoExtractor):
is_live = info.get('liveStreamUrl') is not None
formats = self._extract_m3u8_formats(
stream_url, video_id, ext='mp4',
entry_protocol='m3u8' if is_live else 'm3u8_native',
m3u8_id='hls')
stream_url, video_id, 'mp4',
'm3u8_native', m3u8_id='hls')
if is_live:
title = self._live_title(title)

View file

@ -84,6 +84,7 @@ from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE
from .openload import OpenloadIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
class GenericIE(InfoExtractor):
@ -448,6 +449,23 @@ class GenericIE(InfoExtractor):
},
}],
},
{
# Brightcove with UUID in videoPlayer
'url': 'http://www8.hp.com/cn/zh/home.html',
'info_dict': {
'id': '5255815316001',
'ext': 'mp4',
'title': 'Sprocket Video - China',
'description': 'Sprocket Video - China',
'uploader': 'HP-Video Gallery',
'timestamp': 1482263210,
'upload_date': '20161220',
'uploader_id': '1107601872001',
},
'params': {
'skip_download': True, # m3u8 download
},
},
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
@ -884,12 +902,13 @@ class GenericIE(InfoExtractor):
},
# LazyYT
{
'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
'url': 'https://skiplagged.com/',
'info_dict': {
'id': '1986',
'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
'id': 'skiplagged',
'title': 'Skiplagged: The smart way to find cheap flights',
},
'playlist_mincount': 2,
'playlist_mincount': 1,
'add_ie': ['Youtube'],
},
# Cinchcast embed
{
@ -1516,11 +1535,39 @@ class GenericIE(InfoExtractor):
},
'add_ie': [VideoPressIE.ie_key()],
},
{
# Rutube embed
'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2',
'info_dict': {
'id': '9b3d5bee0a8740bf70dfd29d3ea43541',
'ext': 'flv',
'title': 'Магаззино: Казань 2',
'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a',
'uploader': 'Магаззино',
'upload_date': '20170228',
'uploader_id': '996642',
},
'params': {
'skip_download': True,
},
'add_ie': [RutubeIE.ie_key()],
},
{
# ThePlatform embedded with whitespaces in URLs
'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm',
'only_matching': True,
},
{
# Senate ISVP iframe https
'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security',
'md5': 'fb8c70b0b515e5037981a2492099aab8',
'info_dict': {
'id': 'govtaff020316',
'ext': 'mp4',
'title': 'Integrated Senate Video Player',
},
'add_ie': [SenateISVPIE.ie_key()],
},
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@ -1820,14 +1867,6 @@ class GenericIE(InfoExtractor):
video_description = self._og_search_description(webpage, default=None)
video_thumbnail = self._og_search_thumbnail(webpage, default=None)
# Helper method
def _playlist_from_matches(matches, getter=None, ie=None):
urlrs = orderedSet(
self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
for m in matches)
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
# Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls:
@ -1848,28 +1887,28 @@ class GenericIE(InfoExtractor):
# Look for Brightcove New Studio embeds
bc_urls = BrightcoveNewIE._extract_urls(webpage)
if bc_urls:
return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
# Look for ThePlatform embeds
tp_urls = ThePlatformIE._extract_urls(webpage)
if tp_urls:
return _playlist_from_matches(tp_urls, ie='ThePlatform')
return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')
# Look for Vessel embeds
vessel_urls = VesselIE._extract_urls(webpage)
if vessel_urls:
return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key())
return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key())
# Look for embedded rtl.nl player
matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
webpage)
if matches:
return _playlist_from_matches(matches, ie='RtlNl')
return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
vimeo_urls = VimeoIE._extract_urls(url, webpage)
if vimeo_urls:
return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key())
return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
vid_me_embed_url = self._search_regex(
r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
@ -1891,25 +1930,25 @@ class GenericIE(InfoExtractor):
(?:embed|v|p)/.+?)
\1''', webpage)
if matches:
return _playlist_from_matches(
matches, lambda m: unescapeHTML(m[1]))
return self.playlist_from_matches(
matches, video_id, video_title, lambda m: unescapeHTML(m[1]))
# Look for lazyYT YouTube embed
matches = re.findall(
r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
if matches:
return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))
# Look for Wordpress "YouTube Video Importer" plugin
matches = re.findall(r'''(?x)<div[^>]+
class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
if matches:
return _playlist_from_matches(matches, lambda m: m[-1])
return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])
matches = DailymotionIE._extract_urls(webpage)
if matches:
return _playlist_from_matches(matches)
return self.playlist_from_matches(matches, video_id, video_title)
# Look for embedded Dailymotion playlist player (#3822)
m = re.search(
@ -1918,8 +1957,8 @@ class GenericIE(InfoExtractor):
playlists = re.findall(
r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
if playlists:
return _playlist_from_matches(
playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
return self.playlist_from_matches(
playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
# Look for embedded Wistia player
match = re.search(
@ -2026,8 +2065,9 @@ class GenericIE(InfoExtractor):
if mobj is not None:
embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
if embeds:
return _playlist_from_matches(
embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
return self.playlist_from_matches(
embeds, video_id, video_title,
getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
# Look for Aparat videos
mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
@ -2089,13 +2129,13 @@ class GenericIE(InfoExtractor):
# Look for funnyordie embed
matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
if matches:
return _playlist_from_matches(
matches, getter=unescapeHTML, ie='FunnyOrDie')
return self.playlist_from_matches(
matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
# Look for BBC iPlayer embed
matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
if matches:
return _playlist_from_matches(matches, ie='BBCCoUk')
return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk')
# Look for embedded RUTV player
rutv_url = RUTVIE._extract_url(webpage)
@ -2110,32 +2150,32 @@ class GenericIE(InfoExtractor):
# Look for embedded SportBox player
sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
if sportbox_urls:
return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed')
# Look for embedded XHamster player
xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
if xhamster_urls:
return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed')
# Look for embedded TNAFlixNetwork player
tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
if tnaflix_urls:
return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key())
return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key())
# Look for embedded PornHub player
pornhub_urls = PornHubIE._extract_urls(webpage)
if pornhub_urls:
return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key())
return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key())
# Look for embedded DrTuber player
drtuber_urls = DrTuberIE._extract_urls(webpage)
if drtuber_urls:
return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key())
return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key())
# Look for embedded RedTube player
redtube_urls = RedTubeIE._extract_urls(webpage)
if redtube_urls:
return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key())
return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key())
# Look for embedded Tvigle player
mobj = re.search(
@ -2181,12 +2221,12 @@ class GenericIE(InfoExtractor):
# Look for embedded soundcloud player
soundcloud_urls = SoundcloudIE._extract_urls(webpage)
if soundcloud_urls:
return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
# Look for tunein player
tunein_urls = TuneInBaseIE._extract_urls(webpage)
if tunein_urls:
return _playlist_from_matches(tunein_urls)
return self.playlist_from_matches(tunein_urls, video_id, video_title)
# Look for embedded mtvservices player
mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
@ -2469,30 +2509,36 @@ class GenericIE(InfoExtractor):
# Look for DBTV embeds
dbtv_urls = DBTVIE._extract_urls(webpage)
if dbtv_urls:
return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key())
return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key())
# Look for Videa embeds
videa_urls = VideaIE._extract_urls(webpage)
if videa_urls:
return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key())
return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key())
# Look for 20 minuten embeds
twentymin_urls = TwentyMinutenIE._extract_urls(webpage)
if twentymin_urls:
return _playlist_from_matches(
twentymin_urls, ie=TwentyMinutenIE.ie_key())
return self.playlist_from_matches(
twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key())
# Look for Openload embeds
openload_urls = OpenloadIE._extract_urls(webpage)
if openload_urls:
return _playlist_from_matches(
openload_urls, ie=OpenloadIE.ie_key())
return self.playlist_from_matches(
openload_urls, video_id, video_title, ie=OpenloadIE.ie_key())
# Look for VideoPress embeds
videopress_urls = VideoPressIE._extract_urls(webpage)
if videopress_urls:
return _playlist_from_matches(
videopress_urls, ie=VideoPressIE.ie_key())
return self.playlist_from_matches(
videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key())
# Look for Rutube embeds
rutube_urls = RutubeIE._extract_urls(webpage)
if rutube_urls:
return self.playlist_from_matches(
rutube_urls, ie=RutubeIE.ie_key())
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
@ -2521,7 +2567,11 @@ class GenericIE(InfoExtractor):
jwplayer_data = self._find_jwplayer_data(
webpage, video_id, transform_source=js_to_json)
if jwplayer_data:
return self._parse_jwplayer_data(jwplayer_data, video_id)
info = self._parse_jwplayer_data(
jwplayer_data, video_id, require_title=False)
if not info.get('title'):
info['title'] = video_title
return info
def check_video(vurl):
if YoutubeIE.suitable(vurl):
@ -2596,11 +2646,14 @@ class GenericIE(InfoExtractor):
found = re.search(REDIRECT_REGEX, refresh_header)
if found:
new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
self.report_following_redirect(new_url)
return {
'_type': 'url',
'url': new_url,
}
if new_url != url:
self.report_following_redirect(new_url)
return {
'_type': 'url',
'url': new_url,
}
else:
found = None
if not found:
# twitter:player is a https URL to iframe player that may or may not

View file

@ -36,7 +36,7 @@ class GoIE(AdobePassIE):
'requestor_id': 'DisneyXD',
}
}
_VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|season-\d+/\d+-(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
_VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
_TESTS = [{
'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx',
'info_dict': {
@ -52,6 +52,12 @@ class GoIE(AdobePassIE):
}, {
'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601',
'only_matching': True,
}, {
'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',
'only_matching': True,
}, {
'url': 'http://abc.go.com/shows/world-news-tonight/episode-guide/2017-02/17-021717-intense-stand-off-between-man-with-rifle-and-police-in-oakland',
'only_matching': True,
}]
def _real_extract(self, url):

View file

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
xpath_text,
xpath_element,
@ -14,14 +15,26 @@ from ..utils import (
class HBOBaseIE(InfoExtractor):
_FORMATS_INFO = {
'pro7': {
'width': 1280,
'height': 720,
},
'1920': {
'width': 1280,
'height': 720,
},
'pro6': {
'width': 768,
'height': 432,
},
'640': {
'width': 768,
'height': 432,
},
'pro5': {
'width': 640,
'height': 360,
},
'highwifi': {
'width': 640,
'height': 360,
@ -78,6 +91,17 @@ class HBOBaseIE(InfoExtractor):
formats.extend(self._extract_m3u8_formats(
video_url.replace('.tar', '/base_index_w8.m3u8'),
video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
elif source.tag == 'hls':
# #EXT-X-BYTERANGE is not supported by native hls downloader
# and ffmpeg (#10955)
# formats.extend(self._extract_m3u8_formats(
# video_url.replace('.tar', '/base_index.m3u8'),
# video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
continue
elif source.tag == 'dash':
formats.extend(self._extract_mpd_formats(
video_url.replace('.tar', '/manifest.mpd'),
video_id, mpd_id='dash', fatal=False))
else:
format_info = self._FORMATS_INFO.get(source.tag, {})
formats.append({
@ -112,10 +136,11 @@ class HBOBaseIE(InfoExtractor):
class HBOIE(HBOBaseIE):
IE_NAME = 'hbo'
_VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839',
'md5': '1c33253f0c7782142c993c0ba62a8753',
'md5': '2c6a6bc1222c7e91cb3334dad1746e5a',
'info_dict': {
'id': '1437839',
'ext': 'mp4',
@ -131,11 +156,12 @@ class HBOIE(HBOBaseIE):
class HBOEpisodeIE(HBOBaseIE):
_VALID_URL = r'https?://(?:www\.)?hbo\.com/(?!video)([^/]+/)+video/(?P<id>[0-9a-z-]+)\.html'
IE_NAME = 'hbo:episode'
_VALID_URL = r'https?://(?:www\.)?hbo\.com/(?P<path>(?!video)(?:(?:[^/]+/)+video|watch-free-episodes)/(?P<id>[0-9a-z-]+))(?:\.html)?'
_TESTS = [{
'url': 'http://www.hbo.com/girls/episodes/5/52-i-love-you-baby/video/ep-52-inside-the-episode.html?autoplay=true',
'md5': '689132b253cc0ab7434237fc3a293210',
'md5': '61ead79b9c0dfa8d3d4b07ef4ac556fb',
'info_dict': {
'id': '1439518',
'display_id': 'ep-52-inside-the-episode',
@ -147,16 +173,19 @@ class HBOEpisodeIE(HBOBaseIE):
}, {
'url': 'http://www.hbo.com/game-of-thrones/about/video/season-5-invitation-to-the-set.html?autoplay=true',
'only_matching': True,
}, {
'url': 'http://www.hbo.com/watch-free-episodes/last-week-tonight-with-john-oliver',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
path, display_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id)
content = self._download_json(
'http://www.hbo.com/api/content/' + path, display_id)['content']
video_id = self._search_regex(
r'(?P<q1>[\'"])videoId(?P=q1)\s*:\s*(?P<q2>[\'"])(?P<video_id>\d+)(?P=q2)',
webpage, 'video ID', group='video_id')
video_id = compat_str((content.get('parsed', {}).get(
'common:FullBleedVideo', {}) or content['selectedEpisode'])['videoId'])
info_dict = self._extract_from_id(video_id)
info_dict['display_id'] = display_id

View file

@ -119,7 +119,8 @@ class LivestreamIE(InfoExtractor):
m3u8_url = video_data.get('m3u8_url')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
f4m_url = video_data.get('f4m_url')
if f4m_url:
@ -158,11 +159,11 @@ class LivestreamIE(InfoExtractor):
if smil_url:
formats.extend(self._extract_smil_formats(smil_url, broadcast_id))
entry_protocol = 'm3u8' if is_live else 'm3u8_native'
m3u8_url = stream_info.get('m3u8_url')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, broadcast_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False))
m3u8_url, broadcast_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
rtsp_url = stream_info.get('rtsp_url')
if rtsp_url:
@ -276,7 +277,7 @@ class LivestreamOriginalIE(InfoExtractor):
'view_count': view_count,
}
def _extract_video_formats(self, video_data, video_id, entry_protocol):
def _extract_video_formats(self, video_data, video_id):
formats = []
progressive_url = video_data.get('progressiveUrl')
@ -289,7 +290,8 @@ class LivestreamOriginalIE(InfoExtractor):
m3u8_url = video_data.get('httpUrl')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False))
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
rtsp_url = video_data.get('rtspUrl')
if rtsp_url:
@ -340,11 +342,10 @@ class LivestreamOriginalIE(InfoExtractor):
}
video_data = self._download_json(stream_url, content_id)
is_live = video_data.get('isLive')
entry_protocol = 'm3u8' if is_live else 'm3u8_native'
info.update({
'id': content_id,
'title': self._live_title(info['title']) if is_live else info['title'],
'formats': self._extract_video_formats(video_data, content_id, entry_protocol),
'formats': self._extract_video_formats(video_data, content_id),
'is_live': is_live,
})
return info

View file

@ -0,0 +1,259 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
parse_duration,
try_get,
unified_timestamp,
urlencode_postdata,
)
class MedialaanIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:www\.)?
(?:
(?P<site_id>vtm|q2|vtmkzoom)\.be/
(?:
video(?:/[^/]+/id/|/?\?.*?\baid=)|
(?:[^/]+/)*
)
)
(?P<id>[^/?#&]+)
'''
_NETRC_MACHINE = 'medialaan'
_APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-'
_SITE_TO_APP_ID = {
'vtm': 'vtm_watch',
'q2': 'q2',
'vtmkzoom': 'vtmkzoom',
}
_TESTS = [{
# vod
'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch',
'info_dict': {
'id': 'vtm_20170219_VM0678361_vtmwatch',
'ext': 'mp4',
'title': 'Allemaal Chris afl. 6',
'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2',
'timestamp': 1487533280,
'upload_date': '20170219',
'duration': 2562,
'series': 'Allemaal Chris',
'season': 'Allemaal Chris',
'season_number': 1,
'season_id': '256936078124527',
'episode': 'Allemaal Chris afl. 6',
'episode_number': 6,
'episode_id': '256936078591527',
},
'params': {
'skip_download': True,
},
'skip': 'Requires account credentials',
}, {
# clip
'url': 'http://vtm.be/video?aid=168332',
'info_dict': {
'id': '168332',
'ext': 'mp4',
'title': '"Veronique liegt!"',
'description': 'md5:1385e2b743923afe54ba4adc38476155',
'timestamp': 1489002029,
'upload_date': '20170308',
'duration': 96,
},
}, {
# vod
'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000',
'only_matching': True,
}, {
# vod
'url': 'http://vtm.be/video?aid=163157',
'only_matching': True,
}, {
# vod
'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2',
'only_matching': True,
}, {
# clip
'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio',
'only_matching': True,
}]
def _real_initialize(self):
self._logged_in = False
def _login(self):
username, password = self._get_login_info()
if username is None:
self.raise_login_required()
auth_data = {
'APIKey': self._APIKEY,
'sdk': 'js_6.1',
'format': 'json',
'loginID': username,
'password': password,
}
auth_info = self._download_json(
'https://accounts.eu1.gigya.com/accounts.login', None,
note='Logging in', errnote='Unable to log in',
data=urlencode_postdata(auth_data))
error_message = auth_info.get('errorDetails') or auth_info.get('errorMessage')
if error_message:
raise ExtractorError(
'Unable to login: %s' % error_message, expected=True)
self._uid = auth_info['UID']
self._uid_signature = auth_info['UIDSignature']
self._signature_timestamp = auth_info['signatureTimestamp']
self._logged_in = True
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id, site_id = mobj.group('id', 'site_id')
webpage = self._download_webpage(url, video_id)
config = self._parse_json(
self._search_regex(
r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);',
webpage, 'config', default='{}'), video_id,
transform_source=lambda s: s.replace(
'\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'"))
vod_id = config.get('vodId') or self._search_regex(
(r'\\"vodId\\"\s*:\s*\\"(.+?)\\"',
r'<[^>]+id=["\']vod-(\d+)'),
webpage, 'video_id', default=None)
# clip, no authentication required
if not vod_id:
player = self._parse_json(
self._search_regex(
r'vmmaplayer\(({.+?})\);', webpage, 'vmma player',
default=''),
video_id, transform_source=lambda s: '[%s]' % s, fatal=False)
if player:
video = player[-1]
info = {
'id': video_id,
'url': video['videoUrl'],
'title': video['title'],
'thumbnail': video.get('imageUrl'),
'timestamp': int_or_none(video.get('createdDate')),
'duration': int_or_none(video.get('duration')),
}
else:
info = self._parse_html5_media_entries(
url, webpage, video_id, m3u8_id='hls')[0]
info.update({
'id': video_id,
'title': self._html_search_meta('description', webpage),
'duration': parse_duration(self._html_search_meta('duration', webpage)),
})
# vod, authentication required
else:
if not self._logged_in:
self._login()
settings = self._parse_json(
self._search_regex(
r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
webpage, 'drupal settings', default='{}'),
video_id)
def get(container, item):
return try_get(
settings, lambda x: x[container][item],
compat_str) or self._search_regex(
r'"%s"\s*:\s*"([^"]+)' % item, webpage, item,
default=None)
app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch')
sso = get('vod', 'gigyaDatabase') or 'vtm-sso'
data = self._download_json(
'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id,
video_id, query={
'app_id': app_id,
'user_network': sso,
'UID': self._uid,
'UIDSignature': self._uid_signature,
'signatureTimestamp': self._signature_timestamp,
})
formats = self._extract_m3u8_formats(
data['response']['uri'], video_id, entry_protocol='m3u8_native',
ext='mp4', m3u8_id='hls')
self._sort_formats(formats)
info = {
'id': vod_id,
'formats': formats,
}
api_key = get('vod', 'apiKey')
channel = get('medialaanGigya', 'channel')
if api_key:
videos = self._download_json(
'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False,
query={
'channels': channel,
'ids': vod_id,
'limit': 1,
'apikey': api_key,
})
if videos:
video = try_get(
videos, lambda x: x['response']['videos'][0], dict)
if video:
def get(container, item, expected_type=None):
return try_get(
video, lambda x: x[container][item], expected_type)
def get_string(container, item):
return get(container, item, compat_str)
info.update({
'series': get_string('program', 'title'),
'season': get_string('season', 'title'),
'season_number': int_or_none(get('season', 'number')),
'season_id': get_string('season', 'id'),
'episode': get_string('episode', 'title'),
'episode_number': int_or_none(get('episode', 'number')),
'episode_id': get_string('episode', 'id'),
'duration': int_or_none(
video.get('duration')) or int_or_none(
video.get('durationMillis'), scale=1000),
'title': get_string('episode', 'title'),
'description': get_string('episode', 'text'),
'timestamp': unified_timestamp(get_string(
'publication', 'begin')),
})
if not info.get('title'):
info['title'] = try_get(
config, lambda x: x['videoConfig']['title'],
compat_str) or self._html_search_regex(
r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title',
default=None) or self._og_search_title(webpage)
if not info.get('description'):
info['description'] = self._html_search_regex(
r'<div[^>]+class="field-item\s+even">\s*<p>(.+?)</p>',
webpage, 'description', default=None)
return info

View file

@ -51,6 +51,7 @@ class MioMioIE(InfoExtractor):
'ext': 'mp4',
'title': 'マツコの知らない世界【劇的進化SPビニール傘冷凍食品2016】 1_2 - 16 05 31',
},
'skip': 'Unable to load videos',
}]
def _extract_mioplayer(self, webpage, video_id, title, http_headers):
@ -94,9 +95,18 @@ class MioMioIE(InfoExtractor):
return entries
def _download_chinese_webpage(self, *args, **kwargs):
# Requests with English locales return garbage
headers = {
'Accept-Language': 'zh-TW,en-US;q=0.7,en;q=0.3',
}
kwargs.setdefault('headers', {}).update(headers)
return self._download_webpage(*args, **kwargs)
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
webpage = self._download_chinese_webpage(
url, video_id)
title = self._html_search_meta(
'description', webpage, 'title', fatal=True)
@ -106,7 +116,7 @@ class MioMioIE(InfoExtractor):
if '_h5' in mioplayer_path:
player_url = compat_urlparse.urljoin(url, mioplayer_path)
player_webpage = self._download_webpage(
player_webpage = self._download_chinese_webpage(
player_url, video_id,
note='Downloading player webpage', headers={'Referer': url})
entries = self._parse_html5_media_entries(player_url, player_webpage, video_id)

View file

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import uuid
from .common import InfoExtractor
from .ooyala import OoyalaIE
from ..compat import (
compat_str,
compat_urllib_parse_urlencode,
@ -24,6 +25,9 @@ class MiTeleBaseIE(InfoExtractor):
r'(?s)(<ms-video-player.+?</ms-video-player>)',
webpage, 'ms video player'))
video_id = player_data['data-media-id']
if player_data.get('data-cms-id') == 'ooyala':
return self.url_result(
'ooyala:%s' % video_id, ie=OoyalaIE.ie_key(), video_id=video_id)
config_url = compat_urlparse.urljoin(url, player_data['data-config'])
config = self._download_json(
config_url, video_id, 'Downloading config JSON')

View file

@ -34,12 +34,6 @@ class NineCNineMediaStackIE(NineCNineMediaBaseIE):
formats.extend(self._extract_f4m_formats(
stack_base_url + 'f4m', stack_id,
f4m_id='hds', fatal=False))
mp4_url = self._download_webpage(stack_base_url + 'pd', stack_id, fatal=False)
if mp4_url:
formats.append({
'url': mp4_url,
'format_id': 'mp4',
})
self._sort_formats(formats)
return {

View file

@ -3,41 +3,27 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..compat import (
compat_HTTPError,
compat_str,
)
from ..utils import (
determine_ext,
ExtractorError,
fix_xml_ampersands,
orderedSet,
parse_duration,
qualities,
strip_jsonp,
unified_strdate,
ExtractorError,
)
class NPOBaseIE(InfoExtractor):
def _get_token(self, video_id):
token_page = self._download_webpage(
'http://ida.omroep.nl/npoplayer/i.js',
video_id, note='Downloading token')
token = self._search_regex(
r'npoplayer\.token = "(.+?)"', token_page, 'token')
# Decryption algorithm extracted from http://npoplayer.omroep.nl/csjs/npoplayer-min.js
token_l = list(token)
first = second = None
for i in range(5, len(token_l) - 4):
if token_l[i].isdigit():
if first is None:
first = i
elif second is None:
second = i
if first is None or second is None:
first = 12
second = 13
token_l[first], token_l[second] = token_l[second], token_l[first]
return ''.join(token_l)
return self._download_json(
'http://ida.omroep.nl/app.php/auth', video_id,
note='Downloading token')['token']
class NPOIE(NPOBaseIE):
@ -58,103 +44,113 @@ class NPOIE(NPOBaseIE):
(?P<id>[^/?#]+)
'''
_TESTS = [
{
'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719',
'md5': '4b3f9c429157ec4775f2c9cb7b911016',
'info_dict': {
'id': 'VPWON_1220719',
'ext': 'm4v',
'title': 'Nieuwsuur',
'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.',
'upload_date': '20140622',
},
_TESTS = [{
'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719',
'md5': '4b3f9c429157ec4775f2c9cb7b911016',
'info_dict': {
'id': 'VPWON_1220719',
'ext': 'm4v',
'title': 'Nieuwsuur',
'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.',
'upload_date': '20140622',
},
{
'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800',
'md5': 'da50a5787dbfc1603c4ad80f31c5120b',
'info_dict': {
'id': 'VARA_101191800',
'ext': 'm4v',
'title': 'De Mega Mike & Mega Thomas show: The best of.',
'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
'upload_date': '20090227',
'duration': 2400,
},
}, {
'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800',
'md5': 'da50a5787dbfc1603c4ad80f31c5120b',
'info_dict': {
'id': 'VARA_101191800',
'ext': 'm4v',
'title': 'De Mega Mike & Mega Thomas show: The best of.',
'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
'upload_date': '20090227',
'duration': 2400,
},
{
'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289',
'md5': 'f8065e4e5a7824068ed3c7e783178f2c',
'info_dict': {
'id': 'VPWON_1169289',
'ext': 'm4v',
'title': 'Tegenlicht: De toekomst komt uit Afrika',
'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
'upload_date': '20130225',
'duration': 3000,
},
}, {
'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289',
'md5': 'f8065e4e5a7824068ed3c7e783178f2c',
'info_dict': {
'id': 'VPWON_1169289',
'ext': 'm4v',
'title': 'Tegenlicht: Zwart geld. De toekomst komt uit Afrika',
'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
'upload_date': '20130225',
'duration': 3000,
},
{
'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706',
'info_dict': {
'id': 'WO_VPRO_043706',
'ext': 'wmv',
'title': 'De nieuwe mens - Deel 1',
'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b',
'duration': 4680,
},
'params': {
# mplayer mms download
'skip_download': True,
}
}, {
'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706',
'info_dict': {
'id': 'WO_VPRO_043706',
'ext': 'm4v',
'title': 'De nieuwe mens - Deel 1',
'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b',
'duration': 4680,
},
# non asf in streams
{
'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771',
'md5': 'b3da13de374cbe2d5332a7e910bef97f',
'info_dict': {
'id': 'WO_NOS_762771',
'ext': 'mp4',
'title': 'Hoe gaat Europa verder na Parijs?',
},
},
{
'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content',
'md5': '01c6a2841675995da1f0cf776f03a9c3',
'info_dict': {
'id': 'VPWON_1233944',
'ext': 'm4v',
'title': 'Aap, poot, pies',
'description': 'md5:c9c8005d1869ae65b858e82c01a91fde',
'upload_date': '20150508',
'duration': 599,
},
},
{
'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698',
'md5': 'd30cd8417b8b9bca1fdff27428860d08',
'info_dict': {
'id': 'POW_00996502',
'ext': 'm4v',
'title': '''"Dit is wel een 'landslide'..."''',
'description': 'md5:f8d66d537dfb641380226e31ca57b8e8',
'upload_date': '20150508',
'duration': 462,
},
},
{
'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547',
'only_matching': True,
},
{
'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118',
'only_matching': True,
},
{
'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990',
'only_matching': True,
'params': {
'skip_download': True,
}
]
}, {
# non asf in streams
'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771',
'info_dict': {
'id': 'WO_NOS_762771',
'ext': 'mp4',
'title': 'Hoe gaat Europa verder na Parijs?',
},
'params': {
'skip_download': True,
}
}, {
'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content',
'info_dict': {
'id': 'VPWON_1233944',
'ext': 'm4v',
'title': 'Aap, poot, pies',
'description': 'md5:c9c8005d1869ae65b858e82c01a91fde',
'upload_date': '20150508',
'duration': 599,
},
'params': {
'skip_download': True,
}
}, {
'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698',
'info_dict': {
'id': 'POW_00996502',
'ext': 'm4v',
'title': '''"Dit is wel een 'landslide'..."''',
'description': 'md5:f8d66d537dfb641380226e31ca57b8e8',
'upload_date': '20150508',
'duration': 462,
},
'params': {
'skip_download': True,
}
}, {
# audio
'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437',
'info_dict': {
'id': 'RBX_FUNX_6683215',
'ext': 'mp3',
'title': 'Jouw Stad Rotterdam',
'description': 'md5:db251505244f097717ec59fabc372d9f',
},
'params': {
'skip_download': True,
}
}, {
'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547',
'only_matching': True,
}, {
'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118',
'only_matching': True,
}, {
'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990',
'only_matching': True,
}, {
# live stream
'url': 'npo:LI_NL1_4188102',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
@ -183,70 +179,115 @@ class NPOIE(NPOBaseIE):
token = self._get_token(video_id)
formats = []
urls = set()
pubopties = metadata.get('pubopties')
if pubopties:
quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std'])
for format_id in pubopties:
format_info = self._download_json(
'http://ida.omroep.nl/odi/?prid=%s&puboptions=%s&adaptive=yes&token=%s'
% (video_id, format_id, token),
video_id, 'Downloading %s JSON' % format_id)
if format_info.get('error_code', 0) or format_info.get('errorcode', 0):
quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std'])
items = self._download_json(
'http://ida.omroep.nl/app.php/%s' % video_id, video_id,
'Downloading formats JSON', query={
'adaptive': 'yes',
'token': token,
})['items'][0]
for num, item in enumerate(items):
item_url = item.get('url')
if not item_url or item_url in urls:
continue
urls.add(item_url)
format_id = self._search_regex(
r'video/ida/([^/]+)', item_url, 'format id',
default=None)
def add_format_url(format_url):
formats.append({
'url': format_url,
'format_id': format_id,
'quality': quality(format_id),
})
# Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706
if item.get('contentType') in ('url', 'audio'):
add_format_url(item_url)
continue
try:
stream_info = self._download_json(
item_url + '&type=json', video_id,
'Downloading %s stream JSON'
% item.get('label') or item.get('format') or format_id or num)
except ExtractorError as ee:
if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
error = (self._parse_json(
ee.cause.read().decode(), video_id,
fatal=False) or {}).get('errorstring')
if error:
raise ExtractorError(error, expected=True)
raise
# Stream URL instead of JSON, example: npo:LI_NL1_4188102
if isinstance(stream_info, compat_str):
if not stream_info.startswith('http'):
continue
streams = format_info.get('streams')
if streams:
try:
video_info = self._download_json(
streams[0] + '&type=json',
video_id, 'Downloading %s stream JSON' % format_id)
except ExtractorError as ee:
if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
error = (self._parse_json(ee.cause.read().decode(), video_id, fatal=False) or {}).get('errorstring')
if error:
raise ExtractorError(error, expected=True)
raise
else:
video_info = format_info
video_url = video_info.get('url')
if not video_url:
video_url = stream_info
# JSON
else:
video_url = stream_info.get('url')
if not video_url or video_url in urls:
continue
urls.add(item_url)
if determine_ext(video_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
video_url, video_id, ext='mp4',
entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
else:
add_format_url(video_url)
is_live = metadata.get('medium') == 'live'
if not is_live:
for num, stream in enumerate(metadata.get('streams', [])):
stream_url = stream.get('url')
if not stream_url or stream_url in urls:
continue
if format_id == 'adaptive':
formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4'))
else:
urls.add(stream_url)
# smooth streaming is not supported
stream_type = stream.get('type', '').lower()
if stream_type in ['ss', 'ms']:
continue
if stream_type == 'hds':
f4m_formats = self._extract_f4m_formats(
stream_url, video_id, fatal=False)
# f4m downloader downloads only piece of live stream
for f4m_format in f4m_formats:
f4m_format['preference'] = -1
formats.extend(f4m_formats)
elif stream_type == 'hls':
formats.extend(self._extract_m3u8_formats(
stream_url, video_id, ext='mp4', fatal=False))
# Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706
elif '.asf' in stream_url:
asx = self._download_xml(
stream_url, video_id,
'Downloading stream %d ASX playlist' % num,
transform_source=fix_xml_ampersands, fatal=False)
if not asx:
continue
ref = asx.find('./ENTRY/Ref')
if ref is None:
continue
video_url = ref.get('href')
if not video_url or video_url in urls:
continue
urls.add(video_url)
formats.append({
'url': video_url,
'format_id': format_id,
'quality': quality(format_id),
'ext': stream.get('formaat', 'asf'),
'quality': stream.get('kwaliteit'),
'preference': -10,
})
streams = metadata.get('streams')
if streams:
for i, stream in enumerate(streams):
stream_url = stream.get('url')
if not stream_url:
continue
if '.asf' not in stream_url:
else:
formats.append({
'url': stream_url,
'quality': stream.get('kwaliteit'),
})
continue
asx = self._download_xml(
stream_url, video_id,
'Downloading stream %d ASX playlist' % i,
transform_source=fix_xml_ampersands)
ref = asx.find('./ENTRY/Ref')
if ref is None:
continue
video_url = ref.get('href')
if not video_url:
continue
formats.append({
'url': video_url,
'ext': stream.get('formaat', 'asf'),
'quality': stream.get('kwaliteit'),
})
self._sort_formats(formats)
@ -259,28 +300,28 @@ class NPOIE(NPOBaseIE):
return {
'id': video_id,
'title': title,
'title': self._live_title(title) if is_live else title,
'description': metadata.get('info'),
'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
'upload_date': unified_strdate(metadata.get('gidsdatum')),
'duration': parse_duration(metadata.get('tijdsduur')),
'formats': formats,
'subtitles': subtitles,
'is_live': is_live,
}
class NPOLiveIE(NPOBaseIE):
IE_NAME = 'npo.nl:live'
_VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>.+)'
_VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'http://www.npo.nl/live/npo-1',
'info_dict': {
'id': 'LI_NEDERLAND1_136692',
'id': 'LI_NL1_4188102',
'display_id': 'npo-1',
'ext': 'mp4',
'title': 're:^Nederland 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': 'Livestream',
'title': 're:^NPO 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'is_live': True,
},
'params': {
@ -296,58 +337,12 @@ class NPOLiveIE(NPOBaseIE):
live_id = self._search_regex(
r'data-prid="([^"]+)"', webpage, 'live id')
metadata = self._download_json(
'http://e.omroep.nl/metadata/%s' % live_id,
display_id, transform_source=strip_jsonp)
token = self._get_token(display_id)
formats = []
streams = metadata.get('streams')
if streams:
for stream in streams:
stream_type = stream.get('type').lower()
# smooth streaming is not supported
if stream_type in ['ss', 'ms']:
continue
stream_info = self._download_json(
'http://ida.omroep.nl/aapi/?stream=%s&token=%s&type=jsonp'
% (stream.get('url'), token),
display_id, 'Downloading %s JSON' % stream_type)
if stream_info.get('error_code', 0) or stream_info.get('errorcode', 0):
continue
stream_url = self._download_json(
stream_info['stream'], display_id,
'Downloading %s URL' % stream_type,
'Unable to download %s URL' % stream_type,
transform_source=strip_jsonp, fatal=False)
if not stream_url:
continue
if stream_type == 'hds':
f4m_formats = self._extract_f4m_formats(stream_url, display_id)
# f4m downloader downloads only piece of live stream
for f4m_format in f4m_formats:
f4m_format['preference'] = -1
formats.extend(f4m_formats)
elif stream_type == 'hls':
formats.extend(self._extract_m3u8_formats(stream_url, display_id, 'mp4'))
else:
formats.append({
'url': stream_url,
'preference': -10,
})
self._sort_formats(formats)
return {
'_type': 'url_transparent',
'url': 'npo:%s' % live_id,
'ie_key': NPOIE.ie_key(),
'id': live_id,
'display_id': display_id,
'title': self._live_title(metadata['titel']),
'description': metadata['info'],
'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
'formats': formats,
'is_live': True,
}

View file

@ -75,22 +75,51 @@ class OpenloadIE(InfoExtractor):
'<span[^>]+id="[^"]+"[^>]*>([0-9A-Za-z]+)</span>',
webpage, 'openload ID')
first_char = int(ol_id[0])
urlcode = []
num = 1
video_url_chars = []
while num < len(ol_id):
i = ord(ol_id[num])
key = 0
if i <= 90:
key = i - 65
elif i >= 97:
key = 25 + i - 97
urlcode.append((key, compat_chr(int(ol_id[num + 2:num + 5]) // int(ol_id[num + 1]) - first_char)))
num += 5
first_char = ord(ol_id[0])
key = first_char - 55
maxKey = max(2, key)
key = min(maxKey, len(ol_id) - 38)
t = ol_id[key:key + 36]
video_url = 'https://openload.co/stream/' + ''.join(
[value for _, value in sorted(urlcode, key=lambda x: x[0])])
hashMap = {}
v = ol_id.replace(t, '')
h = 0
while h < len(t):
f = t[h:h + 3]
i = int(f, 8)
hashMap[h / 3] = i
h += 3
h = 0
H = 0
while h < len(v):
B = ''
C = ''
if len(v) >= h + 2:
B = v[h:h + 2]
if len(v) >= h + 3:
C = v[h:h + 3]
i = int(B, 16)
h += 2
if H % 3 == 0:
i = int(C, 8)
h += 1
elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60:
i = int(C, 10)
h += 1
index = H % 7
A = hashMap[index]
i ^= 213
i ^= A
video_url_chars.append(compat_chr(i))
H += 1
video_url = 'https://openload.co/stream/%s?mime=true'
video_url = video_url % (''.join(video_url_chars))
title = self._og_search_title(webpage, default=None) or self._search_regex(
r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,

View file

@ -40,7 +40,7 @@ class PluralsightIE(PluralsightBaseIE):
'info_dict': {
'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04',
'ext': 'mp4',
'title': 'Management of SQL Server - Demo Monitoring',
'title': 'Demo Monitoring',
'duration': 338,
},
'skip': 'Requires pluralsight account credentials',
@ -169,11 +169,10 @@ class PluralsightIE(PluralsightBaseIE):
collection = course['modules']
module, clip = None, None
clip = None
for module_ in collection:
if name in (module_.get('moduleName'), module_.get('name')):
module = module_
for clip_ in module_.get('clips', []):
clip_index = clip_.get('clipIndex')
if clip_index is None:
@ -187,7 +186,7 @@ class PluralsightIE(PluralsightBaseIE):
if not clip:
raise ExtractorError('Unable to resolve clip')
title = '%s - %s' % (module['title'], clip['title'])
title = clip['title']
QUALITIES = {
'low': {'width': 640, 'height': 480},

View file

@ -1,7 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
import functools
import itertools
import operator
# import os
import re
@ -18,6 +20,7 @@ from ..utils import (
js_to_json,
orderedSet,
# sanitized_Request,
remove_quotes,
str_to_int,
)
# from ..aes import (
@ -129,9 +132,32 @@ class PornHubIE(InfoExtractor):
tv_webpage = dl_webpage('tv')
video_url = self._search_regex(
r'<video[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//.+?)\1', tv_webpage,
'video url', group='url')
assignments = self._search_regex(
r'(var.+?mediastring.+?)</script>', tv_webpage,
'encoded url').split(';')
js_vars = {}
def parse_js_value(inp):
inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
if '+' in inp:
inps = inp.split('+')
return functools.reduce(
operator.concat, map(parse_js_value, inps))
inp = inp.strip()
if inp in js_vars:
return js_vars[inp]
return remove_quotes(inp)
for assn in assignments:
assn = assn.strip()
if not assn:
continue
assn = re.sub(r'var\s+', '', assn)
vname, value = assn.split('=', 1)
js_vars[vname] = parse_js_value(value)
video_url = js_vars['mediastring']
title = self._search_regex(
r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)

View file

@ -300,6 +300,21 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):
'skip_download': True,
},
},
{
# title in <h2 class="subtitle">
'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip',
'info_dict': {
'id': '4895826',
'ext': 'mp4',
'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe',
'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9',
'upload_date': '20170302',
},
'params': {
'skip_download': True,
},
'skip': 'geo restricted to Germany',
},
{
# geo restricted to Germany
'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge',
@ -338,6 +353,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):
r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>',
r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>',
r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>',
]
_DESCRIPTION_REGEXES = [
r'<p itemprop="description">\s*(.+?)</p>',
@ -369,7 +385,9 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):
def _extract_clip(self, url, webpage):
clip_id = self._html_search_regex(
self._CLIPID_REGEXES, webpage, 'clip id')
title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title')
title = self._html_search_regex(
self._TITLE_REGEXES, webpage, 'title',
default=None) or self._og_search_title(webpage)
info = self._extract_video_info(url, clip_id)
description = self._html_search_regex(
self._DESCRIPTION_REGEXES, webpage, 'description', default=None)

View file

@ -0,0 +1,122 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
float_or_none,
int_or_none,
try_get,
# unified_timestamp,
ExtractorError,
)
class RedBullTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film)/(?P<id>AP-\w+)'
_TESTS = [{
# film
'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc',
'md5': 'fb0445b98aa4394e504b413d98031d1f',
'info_dict': {
'id': 'AP-1Q756YYX51W11',
'ext': 'mp4',
'title': 'ABC of...WRC',
'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31',
'duration': 1582.04,
# 'timestamp': 1488405786,
# 'upload_date': '20170301',
},
}, {
# episode
'url': 'https://www.redbull.tv/video/AP-1PMT5JCWH1W11/grime?playlist=shows:shows-playall:web',
'info_dict': {
'id': 'AP-1PMT5JCWH1W11',
'ext': 'mp4',
'title': 'Grime - Hashtags S2 E4',
'description': 'md5:334b741c8c1ce65be057eab6773c1cf5',
'duration': 904.6,
# 'timestamp': 1487290093,
# 'upload_date': '20170217',
'series': 'Hashtags',
'season_number': 2,
'episode_number': 4,
},
}, {
'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
session = self._download_json(
'https://api-v2.redbull.tv/session', video_id,
note='Downloading access token', query={
'build': '4.370.0',
'category': 'personal_computer',
'os_version': '1.0',
'os_family': 'http',
})
if session.get('code') == 'error':
raise ExtractorError('%s said: %s' % (
self.IE_NAME, session['message']))
auth = '%s %s' % (session.get('token_type', 'Bearer'), session['access_token'])
try:
info = self._download_json(
'https://api-v2.redbull.tv/content/%s' % video_id,
video_id, note='Downloading video information',
headers={'Authorization': auth}
)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
error_message = self._parse_json(
e.cause.read().decode(), video_id)['message']
raise ExtractorError('%s said: %s' % (
self.IE_NAME, error_message), expected=True)
raise
video = info['video_product']
title = info['title'].strip()
formats = self._extract_m3u8_formats(
video['url'], video_id, 'mp4', 'm3u8_native')
self._sort_formats(formats)
subtitles = {}
for _, captions in (try_get(
video, lambda x: x['attachments']['captions'],
dict) or {}).items():
if not captions or not isinstance(captions, list):
continue
for caption in captions:
caption_url = caption.get('url')
if not caption_url:
continue
ext = caption.get('format')
if ext == 'xml':
ext = 'ttml'
subtitles.setdefault(caption.get('lang') or 'en', []).append({
'url': caption_url,
'ext': ext,
})
subheading = info.get('subheading')
if subheading:
title += ' - %s' % subheading
return {
'id': video_id,
'title': title,
'description': info.get('long_description') or info.get(
'short_description'),
'duration': float_or_none(video.get('duration'), scale=1000),
# 'timestamp': unified_timestamp(info.get('published')),
'series': info.get('show_title'),
'season_number': int_or_none(info.get('season_number')),
'episode_number': int_or_none(info.get('episode_number')),
'formats': formats,
'subtitles': subtitles,
}

View file

@ -17,7 +17,7 @@ from ..utils import (
class RutubeIE(InfoExtractor):
IE_NAME = 'rutube'
IE_DESC = 'Rutube videos'
_VALID_URL = r'https?://rutube\.ru/(?:video|play/embed)/(?P<id>[\da-z]{32})'
_VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})'
_TESTS = [{
'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
@ -39,8 +39,17 @@ class RutubeIE(InfoExtractor):
}, {
'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
'only_matching': True,
}, {
'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661',
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return [mobj.group('url') for mobj in re.finditer(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1',
webpage)]
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json(

View file

@ -82,6 +82,9 @@ class RuutuIE(InfoExtractor):
formats.extend(self._extract_f4m_formats(
video_url, video_id, f4m_id='hds', fatal=False))
elif ext == 'mpd':
# video-only and audio-only streams are of different
# duration resulting in out of sync issue
continue
formats.extend(self._extract_mpd_formats(
video_url, video_id, mpd_id='dash', fatal=False))
else:

View file

@ -89,7 +89,7 @@ class SenateISVPIE(InfoExtractor):
@staticmethod
def _search_iframe_url(webpage):
mobj = re.search(
r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
webpage)
if mobj:
return mobj.group('url')

View file

@ -121,7 +121,7 @@ class SoundcloudIE(InfoExtractor):
},
]
_CLIENT_ID = 'fDoItMDbsbZz8dY16ZzARCZmzgHBPotA'
_CLIENT_ID = '2t9loNQH90kzJcsFCODdigxfp325aq4z'
_IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
@staticmethod

View file

@ -65,7 +65,7 @@ class StreamableIE(InfoExtractor):
# to return video info like the title properly sometimes, and doesn't
# include info like the video duration
video = self._download_json(
'https://streamable.com/ajax/videos/%s' % video_id, video_id)
'https://ajax.streamable.com/videos/%s' % video_id, video_id)
# Format IDs:
# 0 The video is being uploaded

View file

@ -44,6 +44,10 @@ class TelecincoIE(MiTeleBaseIE):
}, {
'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html',
'only_matching': True,
}, {
# ooyala video
'url': 'http://www.cuatro.com/chesterinlove/a-carta/chester-chester_in_love-chester_edu_2_2331030022.html',
'only_matching': True,
}]
def _real_extract(self, url):

View file

@ -2,15 +2,17 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
smuggle_url,
try_get,
)
class TeleQuebecIE(InfoExtractor):
_VALID_URL = r'https?://zonevideo\.telequebec\.tv/media/(?P<id>\d+)'
_TEST = {
_TESTS = [{
'url': 'http://zonevideo.telequebec.tv/media/20984/le-couronnement-de-new-york/couronnement-de-new-york',
'md5': 'fe95a0957e5707b1b01f5013e725c90f',
'info_dict': {
@ -18,10 +20,14 @@ class TeleQuebecIE(InfoExtractor):
'ext': 'mp4',
'title': 'Le couronnement de New York',
'description': 'md5:f5b3d27a689ec6c1486132b2d687d432',
'upload_date': '20160220',
'timestamp': 1455965438,
'upload_date': '20170201',
'timestamp': 1485972222,
}
}
}, {
# no description
'url': 'http://zonevideo.telequebec.tv/media/30261',
'only_matching': True,
}]
def _real_extract(self, url):
media_id = self._match_id(url)
@ -31,9 +37,13 @@ class TeleQuebecIE(InfoExtractor):
return {
'_type': 'url_transparent',
'id': media_id,
'url': smuggle_url('limelight:media:' + media_data['streamInfo']['sourceId'], {'geo_countries': ['CA']}),
'url': smuggle_url(
'limelight:media:' + media_data['streamInfo']['sourceId'],
{'geo_countries': ['CA']}),
'title': media_data['title'],
'description': media_data.get('descriptions', [{'text': None}])[0].get('text'),
'duration': int_or_none(media_data.get('durationInMilliseconds'), 1000),
'description': try_get(
media_data, lambda x: x['descriptions'][0]['text'], compat_str),
'duration': int_or_none(
media_data.get('durationInMilliseconds'), 1000),
'ie_key': 'LimelightMedia',
}

View file

@ -0,0 +1,81 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_duration,
)
class ToonGogglesIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?toongoggles\.com/shows/(?P<show_id>\d+)(?:/[^/]+/episodes/(?P<episode_id>\d+))?'
_TESTS = [{
'url': 'http://www.toongoggles.com/shows/217143/bernard-season-2/episodes/217147/football',
'md5': '18289fc2b951eff6b953a9d8f01e6831',
'info_dict': {
'id': '217147',
'ext': 'mp4',
'title': 'Football',
'uploader_id': '1',
'description': 'Bernard decides to play football in order to be better than Lloyd and tries to beat him no matter how, he even cheats.',
'upload_date': '20160718',
'timestamp': 1468879330,
}
}, {
'url': 'http://www.toongoggles.com/shows/227759/om-nom-stories-around-the-world',
'info_dict': {
'id': '227759',
'title': 'Om Nom Stories Around The World',
},
'playlist_mincount': 11,
}]
def _call_api(self, action, page_id, query):
query.update({
'for_ng': 1,
'for_web': 1,
'show_meta': 1,
'version': 7.0,
})
return self._download_json('http://api.toongoggles.com/' + action, page_id, query=query)
def _parse_episode_data(self, episode_data):
title = episode_data['episode_name']
return {
'_type': 'url_transparent',
'id': episode_data['episode_id'],
'title': title,
'url': 'kaltura:513551:' + episode_data['entry_id'],
'thumbnail': episode_data.get('thumbnail_url'),
'description': episode_data.get('description'),
'duration': parse_duration(episode_data.get('hms')),
'series': episode_data.get('show_name'),
'season_number': int_or_none(episode_data.get('season_num')),
'episode_id': episode_data.get('episode_id'),
'episode': title,
'episode_number': int_or_none(episode_data.get('episode_num')),
'categories': episode_data.get('categories'),
'ie_key': 'Kaltura',
}
def _real_extract(self, url):
show_id, episode_id = re.match(self._VALID_URL, url).groups()
if episode_id:
episode_data = self._call_api('search', episode_id, {
'filter': 'episode',
'id': episode_id,
})['objects'][0]
return self._parse_episode_data(episode_data)
else:
show_data = self._call_api('getepisodesbyshow', show_id, {
'max': 1000000000,
'showid': show_id,
})
entries = []
for episode_data in show_data.get('objects', []):
entries.append(self._parse_episode_data(episode_data))
return self.playlist_result(entries, show_id, show_data.get('show_name'))

View file

@ -0,0 +1,90 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
try_get,
unified_timestamp,
)
class TunePkIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
(?:www\.)?tune\.pk/(?:video/|player/embed_player.php?.*?\bvid=)|
embed\.tune\.pk/play/
)
(?P<id>\d+)
'''
_TESTS = [{
'url': 'https://tune.pk/video/6919541/maudie-2017-international-trailer-1-ft-ethan-hawke-sally-hawkins',
'md5': '0c537163b7f6f97da3c5dd1e3ef6dd55',
'info_dict': {
'id': '6919541',
'ext': 'mp4',
'title': 'Maudie (2017) | International Trailer # 1 ft Ethan Hawke, Sally Hawkins',
'description': 'md5:eb5a04114fafef5cec90799a93a2d09c',
'thumbnail': r're:^https?://.*\.jpg$',
'timestamp': 1487327564,
'upload_date': '20170217',
'uploader': 'Movie Trailers',
'duration': 107,
'view_count': int,
}
}, {
'url': 'https://tune.pk/player/embed_player.php?vid=6919541&folder=2017/02/17/&width=600&height=350&autoplay=no',
'only_matching': True,
}, {
'url': 'https://embed.tune.pk/play/6919541?autoplay=no&ssl=yes&inline=true',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'https://tune.pk/video/%s' % video_id, video_id)
details = self._parse_json(
self._search_regex(
r'new\s+TunePlayer\(({.+?})\)\s*;\s*\n', webpage, 'tune player'),
video_id)['details']
video = details['video']
title = video.get('title') or self._og_search_title(
webpage, default=None) or self._html_search_meta(
'title', webpage, 'title', fatal=True)
formats = self._parse_jwplayer_formats(
details['player']['sources'], video_id)
self._sort_formats(formats)
description = self._og_search_description(
webpage, default=None) or self._html_search_meta(
'description', webpage, 'description')
thumbnail = video.get('thumb') or self._og_search_thumbnail(
webpage, default=None) or self._html_search_meta(
'thumbnail', webpage, 'thumbnail')
timestamp = unified_timestamp(video.get('date_added'))
uploader = try_get(
video, lambda x: x['uploader']['name'],
compat_str) or self._html_search_meta('author', webpage, 'author')
duration = int_or_none(video.get('duration'))
view_count = int_or_none(video.get('views'))
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'timestamp': timestamp,
'uploader': uploader,
'duration': duration,
'view_count': view_count,
'formats': formats,
}

View file

@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
parse_iso8601,
@ -12,7 +14,7 @@ from ..utils import (
class TwentyFourVideoIE(InfoExtractor):
IE_NAME = '24video'
_VALID_URL = r'https?://(?:www\.)?24video\.(?:net|me|xxx|sex|tube)/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)'
_VALID_URL = r'https?://(?P<host>(?:www\.)?24video\.(?:net|me|xxx|sex|tube))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.24video.net/video/view/1044982',
@ -43,10 +45,12 @@ class TwentyFourVideoIE(InfoExtractor):
}]
def _real_extract(self, url):
video_id = self._match_id(url)
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
host = mobj.group('host')
webpage = self._download_webpage(
'http://www.24video.sex/video/view/%s' % video_id, video_id)
'http://%s/video/view/%s' % (host, video_id), video_id)
title = self._og_search_title(webpage)
description = self._html_search_regex(
@ -72,11 +76,11 @@ class TwentyFourVideoIE(InfoExtractor):
# Sets some cookies
self._download_xml(
r'http://www.24video.sex/video/xml/%s?mode=init' % video_id,
r'http://%s/video/xml/%s?mode=init' % (host, video_id),
video_id, 'Downloading init XML')
video_xml = self._download_xml(
'http://www.24video.sex/video/xml/%s?mode=play' % video_id,
'http://%s/video/xml/%s?mode=play' % (host, video_id),
video_id, 'Downloading video XML')
video = xpath_element(video_xml, './/video', 'video', fatal=True)

View file

@ -12,7 +12,6 @@ from ..compat import (
compat_str,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
compat_urlparse,
)
from ..utils import (
clean_html,
@ -24,6 +23,7 @@ from ..utils import (
parse_iso8601,
update_url_query,
urlencode_postdata,
urljoin,
)
@ -32,7 +32,7 @@ class TwitchBaseIE(InfoExtractor):
_API_BASE = 'https://api.twitch.tv'
_USHER_BASE = 'https://usher.ttvnw.net'
_LOGIN_URL = 'http://www.twitch.tv/login'
_LOGIN_URL = 'https://www.twitch.tv/login'
_CLIENT_ID = 'jzkbprff40iqj646a697cyrvl0zt2m6'
_NETRC_MACHINE = 'twitch'
@ -64,6 +64,35 @@ class TwitchBaseIE(InfoExtractor):
raise ExtractorError(
'Unable to login. Twitch said: %s' % message, expected=True)
def login_step(page, urlh, note, data):
form = self._hidden_inputs(page)
form.update(data)
page_url = urlh.geturl()
post_url = self._search_regex(
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page,
'post url', default=page_url, group='url')
post_url = urljoin(page_url, post_url)
headers = {'Referer': page_url}
try:
response = self._download_json(
post_url, None, note,
data=urlencode_postdata(form),
headers=headers)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
response = self._parse_json(
e.cause.read().decode('utf-8'), None)
fail(response['message'])
raise
redirect_url = urljoin(post_url, response['redirect'])
return self._download_webpage_handle(
redirect_url, None, 'Downloading login redirect page',
headers=headers)
login_page, handle = self._download_webpage_handle(
self._LOGIN_URL, None, 'Downloading login page')
@ -71,40 +100,19 @@ class TwitchBaseIE(InfoExtractor):
if 'blacklist_message' in login_page:
fail(clean_html(login_page))
login_form = self._hidden_inputs(login_page)
redirect_page, handle = login_step(
login_page, handle, 'Logging in as %s' % username, {
'username': username,
'password': password,
})
login_form.update({
'username': username,
'password': password,
})
redirect_url = handle.geturl()
post_url = self._search_regex(
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
'post url', default=redirect_url, group='url')
if not post_url.startswith('http'):
post_url = compat_urlparse.urljoin(redirect_url, post_url)
headers = {'Referer': redirect_url}
try:
response = self._download_json(
post_url, None, 'Logging in as %s' % username,
data=urlencode_postdata(login_form),
headers=headers)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
response = self._parse_json(
e.cause.read().decode('utf-8'), None)
fail(response['message'])
raise
if response.get('redirect'):
self._download_webpage(
response['redirect'], None, 'Downloading login redirect page',
headers=headers)
if re.search(r'(?i)<form[^>]+id="two-factor-submit"', redirect_page) is not None:
# TODO: Add mechanism to request an SMS or phone call
tfa_token = self._get_tfa_info('two-factor authentication token')
login_step(redirect_page, handle, 'Submitting TFA token', {
'authy_token': tfa_token,
'remember_2fa': 'true',
})
def _prefer_source(self, formats):
try:

View file

@ -9,7 +9,7 @@ from .common import InfoExtractor
class VierIE(InfoExtractor):
IE_NAME = 'vier'
_VALID_URL = r'https?://(?:www\.)?vier\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
_VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
_TESTS = [{
'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
'info_dict': {
@ -23,6 +23,19 @@ class VierIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614',
'info_dict': {
'id': '2561614',
'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',
'ext': 'mp4',
'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s',
'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.',
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen',
'only_matching': True,
@ -35,6 +48,7 @@ class VierIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
embed_id = mobj.group('embed_id')
display_id = mobj.group('display_id') or embed_id
site = mobj.group('site')
webpage = self._download_webpage(url, display_id)
@ -43,7 +57,7 @@ class VierIE(InfoExtractor):
webpage, 'video id')
application = self._search_regex(
[r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
webpage, 'application', default='vier_vod')
webpage, 'application', default=site + '_vod')
filename = self._search_regex(
[r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
webpage, 'filename')
@ -68,13 +82,19 @@ class VierIE(InfoExtractor):
class VierVideosIE(InfoExtractor):
IE_NAME = 'vier:videos'
_VALID_URL = r'https?://(?:www\.)?vier\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)'
_VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)'
_TESTS = [{
'url': 'http://www.vier.be/demoestuin/videos',
'info_dict': {
'id': 'demoestuin',
},
'playlist_mincount': 153,
}, {
'url': 'http://www.vijf.be/temptationisland/videos',
'info_dict': {
'id': 'temptationisland',
},
'playlist_mincount': 159,
}, {
'url': 'http://www.vier.be/demoestuin/videos?page=6',
'info_dict': {
@ -92,6 +112,7 @@ class VierVideosIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
program = mobj.group('program')
site = mobj.group('site')
page_id = mobj.group('page')
if page_id:
@ -105,13 +126,13 @@ class VierVideosIE(InfoExtractor):
entries = []
for current_page_id in itertools.count(start_page):
current_page = self._download_webpage(
'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id),
'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id),
program,
'Downloading page %d' % (current_page_id + 1))
page_entries = [
self.url_result('http://www.vier.be' + video_url, 'Vier')
self.url_result('http://www.' + site + '.be' + video_url, 'Vier')
for video_url in re.findall(
r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]
r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]
entries.extend(page_entries)
if page_id or '>Meer<' not in current_page:
break

View file

@ -44,7 +44,7 @@ class ViuBaseIE(InfoExtractor):
class ViuIE(ViuBaseIE):
_VALID_URL = r'(?:viu:|https?://www\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)'
_VALID_URL = r'(?:viu:|https?://[^/]+\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.viu.com/en/media/1116705532?containerId=playlist-22168059',
'info_dict': {
@ -69,6 +69,9 @@ class ViuIE(ViuBaseIE):
'skip_download': 'm3u8 download',
},
'skip': 'Geo-restricted to Indonesia',
}, {
'url': 'https://india.viu.com/en/media/1126286865',
'only_matching': True,
}]
def _real_extract(self, url):

View file

@ -432,8 +432,7 @@ class VKIE(VKBaseIE):
})
elif format_id == 'hls':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4',
entry_protocol='m3u8' if is_live else 'm3u8_native',
format_url, video_id, 'mp4', 'm3u8_native',
m3u8_id=format_id, fatal=False, live=is_live))
elif format_id == 'rtmp':
formats.append({

View file

@ -0,0 +1,80 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .brightcove import BrightcoveNewIE
from ..utils import (
int_or_none,
parse_age_limit,
smuggle_url,
unescapeHTML,
)
class VrakIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?vrak\.tv/videos\?.*?\btarget=(?P<id>[\d.]+)'
_TEST = {
'url': 'http://www.vrak.tv/videos?target=1.2306782&filtre=emission&id=1.1806721',
'info_dict': {
'id': '5345661243001',
'ext': 'mp4',
'title': 'Obésité, film de hockey et Roseline Filion',
'timestamp': 1488492126,
'upload_date': '20170302',
'uploader_id': '2890187628001',
'creator': 'VRAK.TV',
'age_limit': 8,
'series': 'ALT (Actualité Légèrement Tordue)',
'episode': 'Obésité, film de hockey et Roseline Filion',
'tags': list,
},
'params': {
'skip_download': True,
},
}
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2890187628001/default_default/index.html?videoId=%s'
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
r'<h\d\b[^>]+\bclass=["\']videoTitle["\'][^>]*>([^<]+)',
webpage, 'title', default=None) or self._og_search_title(webpage)
content = self._parse_json(
self._search_regex(
r'data-player-options-content=(["\'])(?P<content>{.+?})\1',
webpage, 'content', default='{}', group='content'),
video_id, transform_source=unescapeHTML)
ref_id = content.get('refId') or self._search_regex(
r'refId&quot;:&quot;([^&]+)&quot;', webpage, 'ref id')
brightcove_id = self._search_regex(
r'''(?x)
java\.lang\.String\s+value\s*=\s*["']brightcove\.article\.\d+\.%s
[^>]*
java\.lang\.String\s+value\s*=\s*["'](\d+)
''' % re.escape(ref_id), webpage, 'brightcove id')
return {
'_type': 'url_transparent',
'ie_key': BrightcoveNewIE.ie_key(),
'url': smuggle_url(
self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
{'geo_countries': ['CA']}),
'id': brightcove_id,
'description': content.get('description'),
'creator': content.get('brand'),
'age_limit': parse_age_limit(content.get('rating')),
'series': content.get('showName') or content.get(
'episodeName'), # this is intentional
'season_number': int_or_none(content.get('seasonNumber')),
'episode': title,
'episode_number': int_or_none(content.get('episodeNumber')),
'tags': content.get('tags', []),
}

View file

@ -19,9 +19,10 @@ class WDRBaseIE(InfoExtractor):
def _extract_wdr_video(self, webpage, display_id):
# for wdr.de the data-extension is in a tag with the class "mediaLink"
# for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
# for wdrmaus its in a link to the page in a multiline "videoLink"-tag
# for wdrmaus, in a tag with the class "videoButton" (previously a link
# to the page in a multiline "videoLink"-tag)
json_metadata = self._html_search_regex(
r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"',
r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"',
webpage, 'media link', default=None, flags=re.MULTILINE)
if not json_metadata:
@ -32,7 +33,7 @@ class WDRBaseIE(InfoExtractor):
jsonp_url = media_link_obj['mediaObj']['url']
metadata = self._download_json(
jsonp_url, 'metadata', transform_source=strip_jsonp)
jsonp_url, display_id, transform_source=strip_jsonp)
metadata_tracker_data = metadata['trackerData']
metadata_media_resource = metadata['mediaResource']
@ -161,23 +162,23 @@ class WDRIE(WDRBaseIE):
{
'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
'info_dict': {
'id': 'mdb-1096487',
'ext': 'flv',
'id': 'mdb-1323501',
'ext': 'mp4',
'upload_date': 're:^[0-9]{8}$',
'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$',
'description': '- Die Sendung mit der Maus -',
'description': 'Die Seite mit der Maus -',
},
'skip': 'The id changes from week to week because of the new episode'
},
{
'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5',
'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5',
'md5': '803138901f6368ee497b4d195bb164f2',
'info_dict': {
'id': 'mdb-186083',
'ext': 'mp4',
'upload_date': '20130919',
'title': 'Sachgeschichte - Achterbahn ',
'description': '- Die Sendung mit der Maus -',
'description': 'Die Seite mit der Maus -',
},
},
{
@ -186,7 +187,7 @@ class WDRIE(WDRBaseIE):
'info_dict': {
'id': 'mdb-869971',
'ext': 'flv',
'title': 'Funkhaus Europa Livestream',
'title': 'COSMO Livestream',
'description': 'md5:2309992a6716c347891c045be50992e4',
'upload_date': '20160101',
},

View file

@ -59,6 +59,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
_PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}'
def _set_language(self):
self._set_cookie(
'.youtube.com', 'PREF', 'f1=50000000&hl=en',
@ -265,9 +267,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
)
)? # all until now is optional -> you can pass the naked ID
([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
(?!.*?\blist=) # combined list/video URLs are handled by the playlist IE
(?!.*?\blist=
(?:
%(playlist_id)s| # combined list/video URLs are handled by the playlist IE
WL # WL are handled by the watch later IE
)
)
(?(1).+)? # if we found the ID, everything can follow
$"""
$""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
_formats = {
'5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
@ -924,6 +931,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'sJL6WA-aGkQ',
'only_matching': True,
},
{
'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
'only_matching': True,
},
]
def __init__(self, *args, **kwargs):
@ -1454,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Check for "rental" videos
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
raise ExtractorError('"rental" videos not supported')
raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)
# Start extracting information
self.report_information_extraction(video_id)
@ -1864,8 +1875,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
)
.*
|
((?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,})
)"""
(%(playlist_id)s)
)""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true'
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
IE_NAME = 'youtube:playlist'