1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-09-03 00:25:08 +00:00

[core] Improve HTTP redirect handling (#7094)

Aligns HTTP redirect handling with what browsers commonly do and RFC standards. 

Fixes issues afac4caa7d missed.

Authored by: coletdjnz
This commit is contained in:
coletdjnz
2023-05-27 19:06:13 +12:00
committed by GitHub
parent 66468bbf49
commit 08916a49c7
3 changed files with 281 additions and 72 deletions

View File

@ -1664,61 +1664,44 @@ class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
The code is based on HTTPRedirectHandler implementation from CPython [1].
This redirect handler solves two issues:
- ensures redirect URL is always unicode under python 2
- introduces support for experimental HTTP response status code
308 Permanent Redirect [2] used by some sites [3]
This redirect handler fixes and improves the logic to better align with RFC 7231
and what browsers tend to do [2][3]
1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
3. https://github.com/ytdl-org/youtube-dl/issues/28768
2. https://datatracker.ietf.org/doc/html/rfc7231
3. https://github.com/python/cpython/issues/91306
"""
http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
def redirect_request(self, req, fp, code, msg, headers, newurl):
    """Return a Request in response to a redirect, or raise HTTPError.

    This is called by the http_error_30x methods when a redirection
    response is received.

    Parameters:
        req:     the urllib.request.Request that received the redirect
        fp:      response body file object (passed through to HTTPError)
        code:    HTTP status code of the redirect response
        msg:     HTTP reason phrase (passed through to HTTPError)
        headers: response headers (passed through to HTTPError)
        newurl:  the redirect target URL

    Returns a new urllib.request.Request for ``newurl``.
    Raises urllib.error.HTTPError if ``code`` is not 301, 302, 303, 307 or 308.
    """
    if code not in (301, 302, 303, 307, 308):
        raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)

    # Be conciliant with URIs containing a space.
    newurl = newurl.replace(' ', '%20')

    original_method = req.get_method()
    new_method = original_method
    new_data = req.data
    # Lowercase header names to strip from the redirected request
    remove_headers = set()

    # A 303 must either use GET or HEAD for subsequent request
    # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
    if code == 303 and original_method != 'HEAD':
        new_method = 'GET'
    # 301 and 302 redirects are commonly turned into a GET from a POST
    # for subsequent requests by browsers, so we'll do the same.
    # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
    # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
    elif code in (301, 302) and original_method == 'POST':
        new_method = 'GET'

    # only remove payload if method changed (e.g. POST to GET)
    if new_method != original_method:
        new_data = None
        remove_headers.update(('content-length', 'content-type'))

    # Compare case-insensitively: Request stores header names capitalized
    # (e.g. 'Content-type'), so both sides are normalised to lowercase.
    new_headers = {
        k: v for k, v in req.headers.items()
        if k.lower() not in remove_headers
    }

    return urllib.request.Request(
        newurl, headers=new_headers, origin_req_host=req.origin_req_host,
        unverifiable=True, method=new_method, data=new_data)
def extract_timezone(date_str):