1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-06-16 16:34:09 +00:00

[ie/theplatform] Improve metadata extraction (#13131)

Authored by: bashonly
This commit is contained in:
bashonly 2025-05-26 14:06:39 -05:00 committed by bashonly
parent eee90acc47
commit ed108b3ea4

View File

@ -12,11 +12,13 @@ from ..utils import (
float_or_none, float_or_none,
int_or_none, int_or_none,
mimetype2ext, mimetype2ext,
parse_age_limit,
parse_qs, parse_qs,
traverse_obj, traverse_obj,
unsmuggle_url, unsmuggle_url,
update_url, update_url,
update_url_query, update_url_query,
url_or_none,
urlhandle_detect_ext, urlhandle_detect_ext,
xpath_with_ns, xpath_with_ns,
) )
@ -63,62 +65,53 @@ class ThePlatformBaseIE(AdobePassIE):
return formats, subtitles return formats, subtitles
def _download_theplatform_metadata(self, path, video_id): def _download_theplatform_metadata(self, path, video_id, fatal=True):
info_url = f'http://link.theplatform.{self._TP_TLD}/s/{path}?format=preview' return self._download_json(
return self._download_json(info_url, video_id) f'https://link.theplatform.{self._TP_TLD}/s/{path}', video_id,
fatal=fatal, query={'format': 'preview'}) or {}
def _parse_theplatform_metadata(self, info): @staticmethod
subtitles = {} def _parse_theplatform_metadata(tp_metadata):
captions = info.get('captions') def site_specific_filter(*fields):
if isinstance(captions, list): return lambda k, v: v and k.endswith(tuple(f'${f}' for f in fields))
for caption in captions:
lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
subtitles.setdefault(lang, []).append({
'ext': mimetype2ext(mime),
'url': src,
})
duration = info.get('duration') info = traverse_obj(tp_metadata, {
tp_chapters = info.get('chapters', []) 'title': ('title', {str}),
chapters = [] 'episode': ('title', {str}),
if tp_chapters: 'description': ('description', {str}),
def _add_chapter(start_time, end_time): 'thumbnail': ('defaultThumbnailUrl', {url_or_none}),
start_time = float_or_none(start_time, 1000) 'duration': ('duration', {float_or_none(scale=1000)}),
end_time = float_or_none(end_time, 1000) 'timestamp': ('pubDate', {float_or_none(scale=1000)}),
if start_time is None or end_time is None: 'uploader': ('billingCode', {str}),
return 'creators': ('author', {str}, filter, all, filter),
chapters.append({ 'categories': (
'start_time': start_time, 'categories', lambda _, v: v.get('label') in ['category', None],
'end_time': end_time, 'name', {str}, filter, all, filter),
}) 'tags': ('keywords', {str}, filter, {lambda x: re.split(r'[;,]\s?', x)}, filter),
'age_limit': ('ratings', ..., 'rating', {parse_age_limit}, any),
'season_number': (site_specific_filter('seasonNumber'), {int_or_none}, any),
'episode_number': (site_specific_filter('episodeNumber', 'airOrder'), {int_or_none}, any),
'series': (site_specific_filter('show', 'seriesTitle', 'seriesShortTitle'), (None, ...), {str}, any),
'location': (site_specific_filter('region'), {str}, any),
'media_type': (site_specific_filter('programmingType', 'type'), {str}, any),
})
for chapter in tp_chapters[:-1]: chapters = traverse_obj(tp_metadata, ('chapters', ..., {
_add_chapter(chapter.get('startTime'), chapter.get('endTime')) 'start_time': ('startTime', {float_or_none(scale=1000)}),
_add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) 'end_time': ('endTime', {float_or_none(scale=1000)}),
}))
# Ignore pointless single chapters from short videos that span the entire video's duration
if len(chapters) > 1 or traverse_obj(chapters, (0, 'end_time')):
info['chapters'] = chapters
def extract_site_specific_field(field): info['subtitles'] = {}
# A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber' for caption in traverse_obj(tp_metadata, ('captions', lambda _, v: url_or_none(v['src']))):
return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False) info['subtitles'].setdefault(caption.get('lang') or 'en', []).append({
'url': caption['src'],
'ext': mimetype2ext(caption.get('type')),
})
return { return info
'title': info['title'],
'subtitles': subtitles,
'description': info['description'],
'thumbnail': info['defaultThumbnailUrl'],
'duration': float_or_none(duration, 1000),
'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
'uploader': info.get('billingCode'),
'chapters': chapters,
'creator': traverse_obj(info, ('author', {str})) or None,
'categories': traverse_obj(info, (
'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None,
'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})),
'location': extract_site_specific_field('region'),
'series': extract_site_specific_field('show') or extract_site_specific_field('seriesTitle'),
'season_number': int_or_none(extract_site_specific_field('seasonNumber')),
'episode_number': int_or_none(extract_site_specific_field('episodeNumber')),
'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'),
}
def _extract_theplatform_metadata(self, path, video_id): def _extract_theplatform_metadata(self, path, video_id):
info = self._download_theplatform_metadata(path, video_id) info = self._download_theplatform_metadata(path, video_id)