1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-09-03 00:25:08 +00:00

[extractor/generic:quoted-html] Add extractor (#5213)

Extracts embeds from escaped HTML within the `data-html` attribute.
Related: https://github.com/ytdl-org/youtube-dl/issues/21294, https://github.com/yt-dlp/yt-dlp/pull/5121

Authored by: coletdjnz
Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com>
This commit is contained in:
Matthew
2022-10-14 17:32:52 +13:00
committed by GitHub
parent 6678a4f0b3
commit 6dca2aa66d
4 changed files with 89 additions and 87 deletions

View File

@@ -1980,22 +1980,6 @@ class GenericIE(InfoExtractor):
},
'playlist_count': 6,
},
{
# Squarespace video embed, 2019-08-28
'url': 'http://ootboxford.com',
'info_dict': {
'id': 'Tc7b_JGdZfw',
'title': 'Out of the Blue, at Childish Things 10',
'ext': 'mp4',
'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f',
'uploader_id': 'helendouglashouse',
'uploader': 'Helen & Douglas House',
'upload_date': '20140328',
},
'params': {
'skip_download': True,
},
},
# {
# # Zype embed
# 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
@@ -2784,12 +2768,6 @@ class GenericIE(InfoExtractor):
# There probably should be a second run of generic extractor on unescaped webpage.
# webpage = urllib.parse.unquote(webpage)
# Unescape squarespace embeds to be detected by generic extractor,
# see https://github.com/ytdl-org/youtube-dl/issues/21294
webpage = re.sub(
r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
lambda x: unescapeHTML(x.group(0)), webpage)
# TODO: Move to respective extractors
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls: