1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-09-03 00:25:08 +00:00

[extractor] Add _search_json

All fetching of JSON objects should eventually be done with this function,
but only `youtube` is being refactored for now.
This commit is contained in:
pukkandan
2022-06-03 21:02:31 +05:30
parent 00bbc5f177
commit b7c47b7438
4 changed files with 42 additions and 39 deletions

View File

@ -35,6 +35,7 @@ from ..utils import (
ExtractorError,
GeoRestrictedError,
GeoUtils,
LenientJSONDecoder,
RegexNotFoundError,
UnsupportedError,
age_restricted,
@ -930,19 +931,10 @@ class InfoExtractor:
else:
self.report_warning(errmsg + str(ve))
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, lenient=False):
if transform_source:
json_string = transform_source(json_string)
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, **parser_kwargs):
try:
try:
return json.loads(json_string, strict=False)
except json.JSONDecodeError as e:
if not lenient:
raise
try:
return json.loads(json_string[:e.pos], strict=False)
except ValueError:
raise e
return json.loads(
json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs)
except ValueError as ve:
errmsg = f'{video_id}: Failed to parse JSON'
if fatal:
@ -1196,6 +1188,14 @@ class InfoExtractor:
self.report_warning('unable to extract %s' % _name + bug_reports_message())
return None
def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='',
                 contains_pattern=r'{.+}', fatal=True, **kwargs):
    """
    Searches string for the JSON object specified by start_pattern and parses it.

    start_pattern, contains_pattern and end_pattern are regex fragments that
    are joined (separated by optional whitespace) into a single expression;
    the text matched by contains_pattern is what gets handed to _parse_json.

    start_pattern and end_pattern are each wrapped in a non-capturing group,
    so a top-level alternation ("a|b") stays confined to its own fragment.
    Because of that wrapping they must not carry global inline flags such as
    a leading "(?s)"; use the scoped form "(?s:...)" instead, or pass e.g.
    contains_pattern=r'{(?s:.+)}' to let the object span several lines.

    fatal is forwarded to both _search_regex and _parse_json. Any extra
    keyword arguments are forwarded to _parse_json together with
    ignore_extra=True (presumably so that text trailing the object in the
    greedy match is tolerated - see LenientJSONDecoder).

    Returns the parsed value, or an empty dict when the search finds nothing
    or parsing yields a falsy result.
    """
    # NB: end_pattern is only used to reduce the size of the initial match
    json_string = self._search_regex(
        rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})',
        string, name, group='json', fatal=fatal)
    # A missing match is parsed as an empty object so the result is always usable as a dict
    return self._parse_json(
        json_string or '{}', video_id, fatal=fatal, ignore_extra=True, **kwargs) or {}
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.