2016-10-14 22:16:43 -04:00
# coding: utf-8
2014-05-03 02:28:38 +07:00
from __future__ import unicode_literals
2016-09-01 19:07:41 +01:00
import hmac
import hashlib
import base64
2014-05-03 02:28:38 +07:00
from . common import InfoExtractor
2015-03-19 21:23:52 +06:00
from . . utils import (
float_or_none ,
int_or_none ,
2016-10-14 22:16:43 -04:00
js_to_json ,
2015-03-19 21:23:52 +06:00
parse_iso8601 ,
2016-09-01 19:07:41 +01:00
mimetype2ext ,
determine_ext ,
2015-03-19 21:23:52 +06:00
)
2014-05-03 02:28:38 +07:00
2015-05-04 22:32:57 +08:00
class NYTimesBaseIE(InfoExtractor):
    """Shared helpers for the nytimes.com extractors.

    Provides video extraction through the NYT video JSON API and podcast
    extraction from a JavaScript object embedded in article pages.
    """

    # Key for the HMAC-SHA512 signature sent in the Authorization header of
    # v3 video API requests.
    _SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v'

    def _extract_video_from_id(self, video_id):
        """Return the info dict for the NYT video identified by *video_id*.

        The signed v3 API endpoint is tried first with ``fatal=False``; when
        it yields nothing, the v2 endpoint is queried without a signature.
        """
        # Authorization generation algorithm is reverse engineered from `signer` in
        # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js
        path = '/svc/video/api/v3/video/' + video_id
        hm = hmac.new(
            self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest()
        video_data = self._download_json(
            'http://www.nytimes.com' + path, video_id,
            'Downloading video JSON', headers={
                'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(),
                'X-NYTV': 'vhs',
            }, fatal=False)
        if not video_data:
            video_data = self._download_json(
                'http://www.nytimes.com/svc/video/api/v2/video/' + video_id,
                video_id, 'Downloading video JSON')

        title = video_data['headline']

        def get_file_size(file_size):
            # Sizes are accepted either as a bare integer or wrapped in a
            # dict under 'value'; anything else means "unknown".
            if isinstance(file_size, int):
                return file_size
            if isinstance(file_size, dict):
                # A missing or unparsable 'value' yields None rather than
                # raising or advertising a size of 0 bytes.
                return int_or_none(file_size.get('value'))
            return None

        seen_urls = set()
        formats = []
        # `or []` also covers an explicit null in the JSON payload.
        for video in video_data.get('renditions') or []:
            video_url = video.get('url')
            format_id = video.get('type')
            if not video_url or format_id == 'thumbs' or video_url in seen_urls:
                continue
            seen_urls.add(video_url)
            ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    video_url, video_id, 'mp4', 'm3u8_native',
                    m3u8_id=format_id or 'hls', fatal=False))
            elif ext == 'mpd':
                # DASH manifests are currently not extracted.
                continue
            else:
                formats.append({
                    'url': video_url,
                    'format_id': format_id,
                    'vcodec': video.get('videoencoding') or video.get('video_codec'),
                    'width': int_or_none(video.get('width')),
                    'height': int_or_none(video.get('height')),
                    'filesize': get_file_size(
                        video.get('file_size') or video.get('fileSize')),
                    'tbr': int_or_none(video.get('bitrate'), 1000),
                    'ext': ext,
                })
        self._sort_formats(formats)

        thumbnails = []
        for image in video_data.get('images') or []:
            image_url = image.get('url')
            if not image_url:
                continue
            thumbnails.append({
                # Image URLs are joined onto the site root.
                'url': 'http://www.nytimes.com/' + image_url,
                'width': int_or_none(image.get('width')),
                'height': int_or_none(image.get('height')),
            })

        publication_date = video_data.get('publication_date')
        # NOTE(review): the last 8 characters are dropped before parsing,
        # presumably a suffix parse_iso8601 cannot handle; confirm against a
        # live payload.
        timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None

        return {
            'id': video_id,
            'title': title,
            'description': video_data.get('summary'),
            'timestamp': timestamp,
            'uploader': video_data.get('byline'),
            'duration': float_or_none(video_data.get('duration'), 1000),
            'formats': formats,
            'thumbnails': thumbnails,
        }

    def _extract_podcast_from_json(self, json, page_id, webpage):
        """Build the info dict for a podcast episode embedded in an article.

        *json* is the JavaScript object literal found in the page; it is run
        through ``js_to_json`` before parsing. *page_id* is passed to the
        parser and serves as the fallback id. *webpage* supplies the fallback
        description from its ``og:description`` / ``twitter:description``
        meta tags.
        """
        audio_data = self._parse_json(
            json, page_id, transform_source=js_to_json)['data']
        track = audio_data['track']
        # Tolerate a missing or null podcast section; only the track is
        # required to build a playable entry.
        podcast = audio_data.get('podcast') or {}

        episode_title = track['title']
        description = track.get('description') or self._html_search_meta(
            ['og:description', 'twitter:description'], webpage)

        # The last whitespace-separated token of the episode label is used as
        # the id, and as the episode number when it parses as an integer.
        episode = (podcast.get('episode') or '').split()
        if episode:
            episode_number = int_or_none(episode[-1])
            video_id = episode[-1]
        else:
            episode_number = None
            video_id = page_id

        podcast_title = podcast.get('title')
        if podcast_title:
            title = '%s: %s' % (podcast_title, episode_title)
        else:
            title = episode_title

        return {
            'id': video_id,
            'title': title,
            'creator': track.get('credit'),
            'series': podcast_title,
            'episode': episode_title,
            'episode_number': episode_number,
            'url': track['source'],
            'duration': int_or_none(track.get('duration')),
            'description': description,
        }
2015-05-04 22:32:57 +08:00
class NYTimesIE(NYTimesBaseIE):
    """Extractor for nytimes.com video pages and graphics8 iframe embeds.

    The numeric video id is taken directly from the URL.
    """

    _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
        'md5': 'd665342765db043f7e225cff19df0f2d',
        'info_dict': {
            'id': '100000002847155',
            'ext': 'mov',
            'title': 'Verbatim: What Is a Photocopier?',
            'description': 'md5:93603dada88ddbda9395632fdc5da260',
            'timestamp': 1398631707,
            'upload_date': '20140427',
            'uploader': 'Brett Weiner',
            'duration': 419,
        }
    }, {
        'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the video whose id is captured from *url* by _VALID_URL."""
        video_id = self._match_id(url)
        return self._extract_video_from_id(video_id)
class NYTimesArticleIE(NYTimesBaseIE):
    """Extractor for nytimes.com article pages that embed media.

    An article carrying a ``data-videoid`` attribute is handled as a video;
    otherwise the page is expected to contain podcast data pushed onto
    ``NYTD.FlexTypes``.
    """

    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'

    _TESTS = [{
        'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
        'md5': 'e2076d58b4da18e6a001d53fd56db3c9',
        'info_dict': {
            'id': '100000003628438',
            'ext': 'mov',
            'title': 'New Minimum Wage: $70,000 a Year',
            'description': 'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.',
            'timestamp': 1429033037,
            'upload_date': '20150414',
            'uploader': 'Matthew Williams',
        }
    }, {
        'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html',
        'md5': 'e0d52040cafb07662acf3c9132db3575',
        'info_dict': {
            'id': '20',
            'title': "The Run-Up: \u2018He Was Like an Octopus\u2019",
            'ext': 'mp3',
            'description': 'We go behind the story of the two women who told us that Donald Trump touched them inappropriately (which he denies) and check in on Hillary Clinton’s campaign.',
        }
    }, {
        'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html',
        'md5': '66fb5471d7ef15da98af176dc1af4cb9',
        'info_dict': {
            'id': 'inside-the-new-york-times-book-review-the-rise-of-hitler',
            'title': "The Rise of Hitler",
            'ext': 'mp3',
            'description': 'Adam Kirsch discusses Volker Ullrich\'s new biography of Hitler; Billy Collins talks about his latest collection of poems; and iO Tillett Wright on his new memoir, "Darling Days."',
        }
    }, {
        'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the video or podcast embedded in the article at *url*."""
        page_id = self._match_id(url)

        webpage = self._download_webpage(url, page_id)

        # Supplying a default makes a miss return None, so the podcast path
        # below can be tried.
        video_id = self._html_search_regex(
            r'data-videoid="(\d+)"', webpage, 'video id', default=None)
        if video_id is not None:
            return self._extract_video_from_id(video_id)

        # No embedded video: the page must carry podcast data instead.
        data_json = self._html_search_regex(
            r'NYTD\.FlexTypes\.push\(({.*})\);', webpage, 'json data')
        return self._extract_podcast_from_json(data_json, page_id, webpage)