From 3a84be9d1660ef798ea28f929a20391bef6afda4 Mon Sep 17 00:00:00 2001
From: Nikolay Fedorov <40500428+swayll@users.noreply.github.com>
Date: Mon, 14 Jul 2025 22:01:53 +0300
Subject: [PATCH] [ie/TheHighWire] Add extractor (#13505)

Closes #13364
Authored by: swayll
---
Review note (ignored by `git am`): the named group in _VALID_URL was restored
to `(?P<id>...)` and explanatory comments were added to the new file. The hunk
size and diffstat below reflect that; the blob hash on the `index` line of the
new file was not regenerated.

 yt_dlp/extractor/_extractors.py |  1 +
 yt_dlp/extractor/thehighwire.py | 49 +++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 yt_dlp/extractor/thehighwire.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 0a00db437e..c9172fef78 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2092,6 +2092,7 @@ from .theguardian import (
     TheGuardianPodcastIE,
     TheGuardianPodcastPlaylistIE,
 )
+from .thehighwire import TheHighWireIE
 from .theholetv import TheHoleTvIE
 from .theintercept import TheInterceptIE
 from .theplatform import (
diff --git a/yt_dlp/extractor/thehighwire.py b/yt_dlp/extractor/thehighwire.py
new file mode 100644
index 0000000000..8b596143f7
--- /dev/null
+++ b/yt_dlp/extractor/thehighwire.py
@@ -0,0 +1,49 @@
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    extract_attributes,
+    url_or_none,
+)
+from ..utils.traversal import (
+    find_element,
+    require,
+    traverse_obj,
+)
+
+
+class TheHighWireIE(InfoExtractor):
+    # Extractor for thehighwire.com video pages under /ark-videos/.
+    # The named group "id" captures the URL slug, which _real_extract() uses as the video id.
+    _VALID_URL = r'https?://(?:www\.)?thehighwire\.com/ark-videos/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://thehighwire.com/ark-videos/the-deposition-of-stanley-plotkin/',
+        'info_dict': {
+            'id': 'the-deposition-of-stanley-plotkin',
+            'ext': 'mp4',
+            'title': 'THE DEPOSITION OF STANLEY PLOTKIN',
+            'description': 'md5:6d0be4f1181daaa10430fd8b945a5e54',
+            'thumbnail': r're:https?://static\.arkengine\.com/video/.+\.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        # The video is served from a separate embed page whose URL is the "src" attribute
+        # of the element with class "ark-video-embed" on the main page.
+        embed_url = traverse_obj(webpage, (
+            {find_element(cls='ark-video-embed', html=True)},
+            {extract_attributes}, 'src', {url_or_none}, {require('embed URL')}))
+        embed_page = self._download_webpage(embed_url, display_id)
+
+        return {
+            'id': display_id,
+            # Title and description are scraped from the main page
+            **traverse_obj(webpage, {
+                'title': ({find_element(cls='section-header')}, {clean_html}),
+                'description': ({find_element(cls='episode-description__copy')}, {clean_html}),
+            }),
+            # All remaining fields come from the first HTML5 media entry found on the embed page
+            **self._parse_html5_media_entries(embed_url, embed_page, display_id, m3u8_id='hls')[0],
+        }