1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-07-19 01:16:40 +00:00

[ie] Add _search_nextjs_v13_data helper (#13398)

* Fixes FranceTVSiteIE livestream extraction
* Fixes GoPlayIE metadata extraction

Authored by: bashonly
This commit is contained in:
bashonly 2025-07-12 17:12:46 -05:00 committed by GitHub
parent 3ae61e0f31
commit 5245231e4a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 120 additions and 66 deletions

View File

@ -1959,6 +1959,32 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
with self.assertWarns(DeprecationWarning): with self.assertWarns(DeprecationWarning):
self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {}) self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})
def test_search_nextjs_v13_data(self):
HTML = R'''
<script>(self.__next_f=self.__next_f||[]).push([0])</script>
<script>self.__next_f.push([2,"0:[\"$\",\"$L0\",null,{\"do_not_add_this\":\"fail\"}]\n"])</script>
<script>self.__next_f.push([1,"1:I[46975,[],\"HTTPAccessFallbackBoundary\"]\n2:I[32630,[\"8183\",\"static/chunks/8183-768193f6a9e33cdd.js\"]]\n"])</script>
<script nonce="abc123">self.__next_f.push([1,"e:[false,[\"$\",\"div\",null,{\"children\":[\"$\",\"$L18\",null,{\"foo\":\"bar\"}]}],false]\n "])</script>
<script>self.__next_f.push([1,"2a:[[\"$\",\"div\",null,{\"className\":\"flex flex-col\",\"children\":[]}],[\"$\",\"$L16\",null,{\"meta\":{\"dateCreated\":1730489700,\"uuid\":\"40cac41d-8d29-4ef5-aa11-75047b9f0907\"}}]]\n"])</script>
<script>self.__next_f.push([1,"df:[\"$undefined\",[\"$\",\"div\",null,{\"children\":[\"$\",\"$L17\",null,{}],\"do_not_include_this_field\":\"fail\"}],[\"$\",\"div\",null,{\"children\":[[\"$\",\"$L19\",null,{\"duplicated_field_name\":{\"x\":1}}],[\"$\",\"$L20\",null,{\"duplicated_field_name\":{\"y\":2}}]]}],\"$undefined\"]\n"])</script>
<script>self.__next_f.push([3,"MzM6WyIkIiwiJEwzMiIsbnVsbCx7ImRlY29kZWQiOiJzdWNjZXNzIn1d"])</script>
'''
EXPECTED = [{
'foo': 'bar',
}, {
'meta': {
'dateCreated': 1730489700,
'uuid': '40cac41d-8d29-4ef5-aa11-75047b9f0907',
},
}, {
'duplicated_field_name': {'x': 1},
}, {
'duplicated_field_name': {'y': 2},
}]
self.assertEqual(self.ie._search_nextjs_v13_data(HTML, None), EXPECTED)
self.assertEqual(self.ie._search_nextjs_v13_data('', None, fatal=False), [])
self.assertEqual(self.ie._search_nextjs_v13_data(None, None, fatal=False), [])
def test_search_nuxt_json(self): def test_search_nuxt_json(self):
HTML_TMPL = '<script data-ssr="true" id="__NUXT_DATA__" type="application/json">[{}]</script>' HTML_TMPL = '<script data-ssr="true" id="__NUXT_DATA__" type="application/json">[{}]</script>'
VALID_DATA = ''' VALID_DATA = '''

View File

@ -1783,6 +1783,53 @@ class InfoExtractor:
r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data', r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
video_id, end_pattern='</script>', fatal=fatal, default=default, **kw) video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
def _search_nextjs_v13_data(self, webpage, video_id, fatal=True):
"""Parses Next.js app router flight data that was introduced in Next.js v13"""
nextjs_data = []
if not fatal and not isinstance(webpage, str):
return nextjs_data
def flatten(flight_data):
if not isinstance(flight_data, list):
return
if len(flight_data) == 4 and flight_data[0] == '$':
_, name, _, data = flight_data
if not isinstance(data, dict):
return
children = data.pop('children', None)
if data and isinstance(name, str) and name.startswith('$'):
# It is useful hydration JSON data
nextjs_data.append(data)
flatten(children)
return
for f in flight_data:
flatten(f)
flight_text = ''
# The pattern for the surrounding JS/tag should be strict as it's a hardcoded string in the next.js source
# Ref: https://github.com/vercel/next.js/blob/5a4a08fdc/packages/next/src/server/app-render/use-flight-response.tsx#L189
for flight_segment in re.findall(r'<script\b[^>]*>self\.__next_f\.push\((\[.+?\])\)</script>', webpage):
segment = self._parse_json(flight_segment, video_id, fatal=fatal, errnote=None if fatal else False)
# Some earlier versions of next.js "optimized" away this array structure; this is unsupported
# Ref: https://github.com/vercel/next.js/commit/0123a9d5c9a9a77a86f135b7ae30b46ca986d761
if not isinstance(segment, list) or len(segment) != 2:
self.write_debug(
f'{video_id}: Unsupported next.js flight data structure detected', only_once=True)
continue
# Only use the relevant payload type (1 == data)
# Ref: https://github.com/vercel/next.js/blob/5a4a08fdc/packages/next/src/server/app-render/use-flight-response.tsx#L11-L14
payload_type, chunk = segment
if payload_type == 1:
flight_text += chunk
for f in flight_text.splitlines():
prefix, _, body = f.partition(':')
if body.startswith('[') and body.endswith(']') and re.fullmatch(r'[0-9a-f]{1,3}', prefix.lstrip()):
# The body isn't necessarily valid JSON, so this should always be non-fatal
flatten(self._parse_json(body, video_id, fatal=False, errnote=False))
return nextjs_data
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
rectx = re.escape(context_name) rectx = re.escape(context_name)

View File

@ -1,4 +1,3 @@
import json
import re import re
import urllib.parse import urllib.parse
@ -19,7 +18,11 @@ from ..utils import (
unsmuggle_url, unsmuggle_url,
url_or_none, url_or_none,
) )
from ..utils.traversal import find_element, traverse_obj from ..utils.traversal import (
find_element,
get_first,
traverse_obj,
)
class FranceTVBaseInfoExtractor(InfoExtractor): class FranceTVBaseInfoExtractor(InfoExtractor):
@ -258,7 +261,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
'info_dict': { 'info_dict': {
'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', # old: c5bda21d-2c6f-4470-8849-3d8327adb2ba' 'id': 'b2cf9fd8-e971-4757-8651-848f2772df61', # old: ec217ecc-0733-48cf-ac06-af1347b849d1
'ext': 'mp4', 'ext': 'mp4',
'title': '13h15, le dimanche... - Les mystères de Jésus', 'title': '13h15, le dimanche... - Les mystères de Jésus',
'timestamp': 1502623500, 'timestamp': 1502623500,
@ -269,7 +272,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
'add_ie': [FranceTVIE.ie_key()], 'skip': 'Unfortunately, this video is no longer available',
}, { }, {
# geo-restricted # geo-restricted
'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html', 'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html',
@ -287,7 +290,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1441, 'duration': 1441,
}, },
'skip': 'No longer available', 'skip': 'Unfortunately, this video is no longer available',
}, { }, {
# geo-restricted livestream (workflow == 'token-akamai') # geo-restricted livestream (workflow == 'token-akamai')
'url': 'https://www.france.tv/france-4/direct.html', 'url': 'https://www.france.tv/france-4/direct.html',
@ -308,6 +311,19 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
'live_status': 'is_live', 'live_status': 'is_live',
}, },
'params': {'skip_download': 'livestream'}, 'params': {'skip_download': 'livestream'},
}, {
# Not geo-restricted
'url': 'https://www.france.tv/france-2/la-maison-des-maternelles/5574051-nous-sommes-amis-et-nous-avons-fait-un-enfant-ensemble.html',
'info_dict': {
'id': 'b448bfe4-9fe7-11ee-97d8-2ba3426fa3df',
'ext': 'mp4',
'title': 'Nous sommes amis et nous avons fait un enfant ensemble - Émission du jeudi 21 décembre 2023',
'duration': 1065,
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1703147921,
'upload_date': '20231221',
},
'params': {'skip_download': 'm3u8'},
}, { }, {
# france3 # france3
'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',
@ -342,30 +358,16 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
# XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.goplay
def _find_json(self, s):
return self._search_json(
r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
nextjs_data = self._search_nextjs_v13_data(webpage, display_id)
nextjs_data = traverse_obj( if get_first(nextjs_data, ('isLive', {bool})):
re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage),
(..., {json.loads}, ..., {self._find_json}, ..., 'children', ..., ..., 'children', ..., ..., 'children'))
if traverse_obj(nextjs_data, (..., ..., 'children', ..., 'isLive', {bool}, any)):
# For livestreams we need the id of the stream instead of the currently airing episode id # For livestreams we need the id of the stream instead of the currently airing episode id
video_id = traverse_obj(nextjs_data, ( video_id = get_first(nextjs_data, ('options', 'id', {str}))
..., ..., 'children', ..., 'children', ..., 'children', ..., 'children', ..., ...,
'children', ..., ..., 'children', ..., ..., 'children', (..., (..., ...)),
'options', 'id', {str}, any))
else: else:
video_id = traverse_obj(nextjs_data, ( video_id = get_first(nextjs_data, ('video', ('playerReplayId', 'siId'), {str}))
..., ..., ..., 'children',
lambda _, v: v['video']['url'] == urllib.parse.urlparse(url).path,
'video', ('playerReplayId', 'siId'), {str}, any))
if not video_id: if not video_id:
raise ExtractorError('Unable to extract video ID') raise ExtractorError('Unable to extract video ID')

View File

@ -5,16 +5,11 @@ import hashlib
import hmac import hmac
import json import json
import os import os
import re
import urllib.parse import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import ExtractorError, int_or_none
ExtractorError, from ..utils.traversal import get_first, traverse_obj
int_or_none,
remove_end,
traverse_obj,
)
class GoPlayIE(InfoExtractor): class GoPlayIE(InfoExtractor):
@ -27,10 +22,10 @@ class GoPlayIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '2baa4560-87a0-421b-bffc-359914e3c387', 'id': '2baa4560-87a0-421b-bffc-359914e3c387',
'ext': 'mp4', 'ext': 'mp4',
'title': 'S22 - Aflevering 1', 'title': 'De Slimste Mens ter Wereld - S22 - Aflevering 1',
'description': r're:In aflevering 1 nemen Daan Alferink, Tess Elst en Xander De Rycke .{66}', 'description': r're:In aflevering 1 nemen Daan Alferink, Tess Elst en Xander De Rycke .{66}',
'series': 'De Slimste Mens ter Wereld', 'series': 'De Slimste Mens ter Wereld',
'episode': 'Episode 1', 'episode': 'Wordt aangekondigd',
'season_number': 22, 'season_number': 22,
'episode_number': 1, 'episode_number': 1,
'season': 'Season 22', 'season': 'Season 22',
@ -52,7 +47,7 @@ class GoPlayIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'ecb79672-92b9-4cd9-a0d7-e2f0250681ee', 'id': 'ecb79672-92b9-4cd9-a0d7-e2f0250681ee',
'ext': 'mp4', 'ext': 'mp4',
'title': 'S11 - Aflevering 1', 'title': 'De Mol - S11 - Aflevering 1',
'description': r're:Tien kandidaten beginnen aan hun verovering van Amerika en ontmoeten .{102}', 'description': r're:Tien kandidaten beginnen aan hun verovering van Amerika en ontmoeten .{102}',
'episode': 'Episode 1', 'episode': 'Episode 1',
'series': 'De Mol', 'series': 'De Mol',
@ -75,21 +70,13 @@ class GoPlayIE(InfoExtractor):
if not self._id_token: if not self._id_token:
raise self.raise_login_required(method='password') raise self.raise_login_required(method='password')
# XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.francetv
def _find_json(self, s):
return self._search_json(
r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
nextjs_data = traverse_obj( nextjs_data = self._search_nextjs_v13_data(webpage, display_id)
re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage), meta = get_first(nextjs_data, (
(..., {json.loads}, ..., {self._find_json}, ...)) lambda k, v: k in ('video', 'meta') and v['path'] == urllib.parse.urlparse(url).path))
meta = traverse_obj(nextjs_data, (
..., ..., 'children', ..., ..., 'children',
lambda _, v: v['video']['path'] == urllib.parse.urlparse(url).path, 'video', any))
video_id = meta['uuid'] video_id = meta['uuid']
info_dict = traverse_obj(meta, { info_dict = traverse_obj(meta, {
@ -98,19 +85,18 @@ class GoPlayIE(InfoExtractor):
}) })
if traverse_obj(meta, ('program', 'subtype')) != 'movie': if traverse_obj(meta, ('program', 'subtype')) != 'movie':
for season_data in traverse_obj(nextjs_data, (..., 'children', ..., 'playlists', ...)): for season_data in traverse_obj(nextjs_data, (..., 'playlists', ..., {dict})):
episode_data = traverse_obj( episode_data = traverse_obj(season_data, ('videos', lambda _, v: v['videoId'] == video_id, any))
season_data, ('videos', lambda _, v: v['videoId'] == video_id, any))
if not episode_data: if not episode_data:
continue continue
episode_title = traverse_obj( season_number = traverse_obj(season_data, ('season', {int_or_none}))
episode_data, 'contextualTitle', 'episodeTitle', expected_type=str)
info_dict.update({ info_dict.update({
'title': episode_title or info_dict.get('title'), 'episode': traverse_obj(episode_data, ('episodeTitle', {str})),
'series': remove_end(info_dict.get('title'), f' - {episode_title}'),
'season_number': traverse_obj(season_data, ('season', {int_or_none})),
'episode_number': traverse_obj(episode_data, ('episodeNumber', {int_or_none})), 'episode_number': traverse_obj(episode_data, ('episodeNumber', {int_or_none})),
'season_number': season_number,
'series': self._search_regex(
fr'^(.+)? - S{season_number} - ', info_dict.get('title'), 'series', default=None),
}) })
break break

View File

@ -1,6 +1,3 @@
import json
import re
from .brightcove import BrightcoveNewIE from .brightcove import BrightcoveNewIE
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
@ -11,7 +8,12 @@ from ..utils import (
str_or_none, str_or_none,
url_or_none, url_or_none,
) )
from ..utils.traversal import require, traverse_obj, value from ..utils.traversal import (
get_first,
require,
traverse_obj,
value,
)
class NineNowIE(InfoExtractor): class NineNowIE(InfoExtractor):
@ -101,20 +103,11 @@ class NineNowIE(InfoExtractor):
}] }]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId={}' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId={}'
# XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.francetv and yt_dlp.extractor.goplay
def _find_json(self, s):
return self._search_json(
r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
def _real_extract(self, url): def _real_extract(self, url):
display_id, video_type = self._match_valid_url(url).group('id', 'type') display_id, video_type = self._match_valid_url(url).group('id', 'type')
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
common_data = traverse_obj( common_data = get_first(self._search_nextjs_v13_data(webpage, display_id), ('payload', {dict}))
re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage),
(..., {json.loads}, ..., {self._find_json},
lambda _, v: v['payload'][video_type]['slug'] == display_id,
'payload', any, {require('video data')}))
if traverse_obj(common_data, (video_type, 'video', 'drm', {bool})): if traverse_obj(common_data, (video_type, 'video', 'drm', {bool})):
self.report_drm(display_id) self.report_drm(display_id)