mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-09-03 00:25:08 +00:00
[core] Improve HTTP redirect handling (#7094)
Aligns HTTP redirect handling with what browsers commonly do and RFC standards.
Fixes issues that commit afac4caa7d missed.
Authored by: coletdjnz
This commit is contained in:
@ -1664,61 +1664,44 @@ class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
|
||||
|
||||
The code is based on HTTPRedirectHandler implementation from CPython [1].
|
||||
|
||||
This redirect handler solves two issues:
|
||||
- ensures redirect URL is always unicode under python 2
|
||||
- introduces support for experimental HTTP response status code
|
||||
308 Permanent Redirect [2] used by some sites [3]
|
||||
This redirect handler fixes and improves the logic to better align with RFC7231
|
||||
and what browsers tend to do [2][3]
|
||||
|
||||
1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
|
||||
2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
|
||||
3. https://github.com/ytdl-org/youtube-dl/issues/28768
|
||||
2. https://datatracker.ietf.org/doc/html/rfc7231
|
||||
3. https://github.com/python/cpython/issues/91306
|
||||
"""
|
||||
|
||||
http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
|
||||
|
||||
def redirect_request(self, req, fp, code, msg, headers, newurl):
|
||||
"""Return a Request or None in response to a redirect.
|
||||
|
||||
This is called by the http_error_30x methods when a
|
||||
redirection response is received. If a redirection should
|
||||
take place, return a new Request to allow http_error_30x to
|
||||
perform the redirect. Otherwise, raise HTTPError if no-one
|
||||
else should try to handle this url. Return None if you can't
|
||||
but another Handler might.
|
||||
"""
|
||||
m = req.get_method()
|
||||
if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
|
||||
or code in (301, 302, 303) and m == "POST")):
|
||||
if code not in (301, 302, 303, 307, 308):
|
||||
raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
|
||||
# Strictly (according to RFC 2616), 301 or 302 in response to
|
||||
# a POST MUST NOT cause a redirection without confirmation
|
||||
# from the user (of urllib.request, in this case). In practice,
|
||||
# essentially all clients do redirect in this case, so we do
|
||||
# the same.
|
||||
|
||||
# Be lenient with URIs containing a space. This is mainly
|
||||
# redundant with the more complete encoding done in http_error_302(),
|
||||
# but it is kept for compatibility with other callers.
|
||||
newurl = newurl.replace(' ', '%20')
|
||||
|
||||
CONTENT_HEADERS = ("content-length", "content-type")
|
||||
# NB: don't use dict comprehension for python 2.6 compatibility
|
||||
newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
|
||||
|
||||
new_method = req.get_method()
|
||||
new_data = req.data
|
||||
remove_headers = []
|
||||
# A 303 must either use GET or HEAD for subsequent request
|
||||
# https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
|
||||
if code == 303 and m != 'HEAD':
|
||||
m = 'GET'
|
||||
if code == 303 and req.get_method() != 'HEAD':
|
||||
new_method = 'GET'
|
||||
# 301 and 302 redirects are commonly turned into a GET from a POST
|
||||
# for subsequent requests by browsers, so we'll do the same.
|
||||
# https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
|
||||
# https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
|
||||
if code in (301, 302) and m == 'POST':
|
||||
m = 'GET'
|
||||
elif code in (301, 302) and req.get_method() == 'POST':
|
||||
new_method = 'GET'
|
||||
|
||||
# only remove payload if method changed (e.g. POST to GET)
|
||||
if new_method != req.get_method():
|
||||
new_data = None
|
||||
remove_headers.extend(['Content-Length', 'Content-Type'])
|
||||
|
||||
new_headers = {k: v for k, v in req.headers.items() if k.lower() not in remove_headers}
|
||||
|
||||
return urllib.request.Request(
|
||||
newurl, headers=newheaders, origin_req_host=req.origin_req_host,
|
||||
unverifiable=True, method=m)
|
||||
newurl, headers=new_headers, origin_req_host=req.origin_req_host,
|
||||
unverifiable=True, method=new_method, data=new_data)
|
||||
|
||||
|
||||
def extract_timezone(date_str):
|
||||
|
Reference in New Issue
Block a user