[extractor] Add _search_json

Eventually, all fetching of JSON objects should be done with this function, but only `youtube` is being refactored for now.
2025-09-03 00:25:08 +00:00 · 2022-06-03 21:02:31 +05:30
parent 00bbc5f177
commit b7c47b7438
4 changed files with 42 additions and 39 deletions
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@ -35,6 +35,7 @@ from ..utils import (
    ExtractorError,
    GeoRestrictedError,
    GeoUtils,
+    LenientJSONDecoder,
    RegexNotFoundError,
    UnsupportedError,
    age_restricted,
@ -930,19 +931,10 @@ class InfoExtractor:
            else:
                self.report_warning(errmsg + str(ve))

-    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, lenient=False):
-        if transform_source:
-            json_string = transform_source(json_string)
+    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, **parser_kwargs):
        try:
-            try:
-                return json.loads(json_string, strict=False)
-            except json.JSONDecodeError as e:
-                if not lenient:
-                    raise
-                try:
-                    return json.loads(json_string[:e.pos], strict=False)
-                except ValueError:
-                    raise e
+            return json.loads(
+                json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
        except ValueError as ve:
            errmsg = f'{video_id}: Failed to parse JSON'
            if fatal:
@ -1196,6 +1188,14 @@ class InfoExtractor:
            self.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

+    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', fatal=True, **kwargs):
+        """Searches string for the JSON object specified by start_pattern and parses it; yields {} when the search or the parse returns nothing"""
+        # NB: end_pattern is only used to reduce the size of the initial match. NOTE(review): no regex flags are passed here, so "." presumably cannot span newlines - confirm
+        return self._parse_json(
+            self._search_regex(rf'{start_pattern}\s*(?P<json>{{.+}})\s*{end_pattern}',
+                               string, name, group='json', fatal=fatal) or '{}',
+            video_id, fatal=fatal, ignore_extra=True, **kwargs) or {}
+
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.