[ytsearch] Fix extraction (closes #26920)
This commit is contained in:
parent
48c5663c5f
commit
416da574ec
1 changed file with 78 additions and 38 deletions
|
@@ -3181,54 +3181,94 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
|
||||||
_MAX_RESULTS = float('inf')
|
_MAX_RESULTS = float('inf')
|
||||||
IE_NAME = 'youtube:search'
|
IE_NAME = 'youtube:search'
|
||||||
_SEARCH_KEY = 'ytsearch'
|
_SEARCH_KEY = 'ytsearch'
|
||||||
_EXTRA_QUERY_ARGS = {}
|
_SEARCH_PARAMS = None
|
||||||
_TESTS = []
|
_TESTS = []
|
||||||
|
|
||||||
|
def _entries(self, query, n):
|
||||||
|
data = {
|
||||||
|
'context': {
|
||||||
|
'client': {
|
||||||
|
'clientName': 'WEB',
|
||||||
|
'clientVersion': '2.20201021.03.00',
|
||||||
|
}
|
||||||
|
},
|
||||||
|
'query': query,
|
||||||
|
}
|
||||||
|
if self._SEARCH_PARAMS:
|
||||||
|
data['params'] = self._SEARCH_PARAMS
|
||||||
|
total = 0
|
||||||
|
for page_num in itertools.count(1):
|
||||||
|
search = self._download_json(
|
||||||
|
'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
|
||||||
|
video_id='query "%s"' % query,
|
||||||
|
note='Downloading page %s' % page_num,
|
||||||
|
errnote='Unable to download API page', fatal=False,
|
||||||
|
data=json.dumps(data).encode('utf8'),
|
||||||
|
headers={'content-type': 'application/json'})
|
||||||
|
if not search:
|
||||||
|
break
|
||||||
|
slr_contents = try_get(
|
||||||
|
search,
|
||||||
|
(lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
|
||||||
|
lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
|
||||||
|
list)
|
||||||
|
if not slr_contents:
|
||||||
|
break
|
||||||
|
isr_contents = try_get(
|
||||||
|
slr_contents,
|
||||||
|
lambda x: x[0]['itemSectionRenderer']['contents'],
|
||||||
|
list)
|
||||||
|
if not isr_contents:
|
||||||
|
break
|
||||||
|
for content in isr_contents:
|
||||||
|
if not isinstance(content, dict):
|
||||||
|
continue
|
||||||
|
video = content.get('videoRenderer')
|
||||||
|
if not isinstance(video, dict):
|
||||||
|
continue
|
||||||
|
video_id = video.get('videoId')
|
||||||
|
if not video_id:
|
||||||
|
continue
|
||||||
|
title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
|
||||||
|
description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
|
||||||
|
duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
|
||||||
|
view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
|
||||||
|
view_count = int_or_none(self._search_regex(
|
||||||
|
r'^(\d+)', re.sub(r'\s', '', view_count_text),
|
||||||
|
'view count', default=None))
|
||||||
|
uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
|
||||||
|
total += 1
|
||||||
|
yield {
|
||||||
|
'_type': 'url_transparent',
|
||||||
|
'ie_key': YoutubeIE.ie_key(),
|
||||||
|
'id': video_id,
|
||||||
|
'url': video_id,
|
||||||
|
'title': title,
|
||||||
|
'description': description,
|
||||||
|
'duration': duration,
|
||||||
|
'view_count': view_count,
|
||||||
|
'uploader': uploader,
|
||||||
|
}
|
||||||
|
if total == n:
|
||||||
|
return
|
||||||
|
token = try_get(
|
||||||
|
slr_contents,
|
||||||
|
lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
|
||||||
|
compat_str)
|
||||||
|
if not token:
|
||||||
|
break
|
||||||
|
data['continuation'] = token
|
||||||
|
|
||||||
def _get_n_results(self, query, n):
|
def _get_n_results(self, query, n):
|
||||||
"""Get a specified number of results for a query"""
|
"""Get a specified number of results for a query"""
|
||||||
|
return self.playlist_result(self._entries(query, n), query)
|
||||||
videos = []
|
|
||||||
limit = n
|
|
||||||
|
|
||||||
url_query = {
|
|
||||||
'search_query': query.encode('utf-8'),
|
|
||||||
}
|
|
||||||
url_query.update(self._EXTRA_QUERY_ARGS)
|
|
||||||
result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)
|
|
||||||
|
|
||||||
for pagenum in itertools.count(1):
|
|
||||||
data = self._download_json(
|
|
||||||
result_url, video_id='query "%s"' % query,
|
|
||||||
note='Downloading page %s' % pagenum,
|
|
||||||
errnote='Unable to download API page',
|
|
||||||
query={'spf': 'navigate'})
|
|
||||||
html_content = data[1]['body']['content']
|
|
||||||
|
|
||||||
if 'class="search-message' in html_content:
|
|
||||||
raise ExtractorError(
|
|
||||||
'[youtube] No video results', expected=True)
|
|
||||||
|
|
||||||
new_videos = list(self._process_page(html_content))
|
|
||||||
videos += new_videos
|
|
||||||
if not new_videos or len(videos) > limit:
|
|
||||||
break
|
|
||||||
next_link = self._html_search_regex(
|
|
||||||
r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
|
|
||||||
html_content, 'next link', default=None)
|
|
||||||
if next_link is None:
|
|
||||||
break
|
|
||||||
result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)
|
|
||||||
|
|
||||||
if len(videos) > n:
|
|
||||||
videos = videos[:n]
|
|
||||||
return self.playlist_result(videos, query)
|
|
||||||
|
|
||||||
|
|
||||||
class YoutubeSearchDateIE(YoutubeSearchIE):
|
class YoutubeSearchDateIE(YoutubeSearchIE):
|
||||||
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
|
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
|
||||||
_SEARCH_KEY = 'ytsearchdate'
|
_SEARCH_KEY = 'ytsearchdate'
|
||||||
IE_DESC = 'YouTube.com searches, newest videos first'
|
IE_DESC = 'YouTube.com searches, newest videos first'
|
||||||
_EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
|
_SEARCH_PARAMS = 'CAI%3D'
|
||||||
|
|
||||||
|
|
||||||
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
|
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
|
||||||
|
|
Loading…
Reference in a new issue