"""A Python module for parsing absolute and relative URLs.
RFC 3986 is considered the current standard, and any future changes to the urlparse module
should conform to it. The urlparse module is currently not entirely compliant with this
RFC because of de facto parsing scenarios, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained. The test cases in test_urlparse.py provide a
good indicator of parsing behavior.
"""
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment),
    built as a ParseResult.
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    The url, scheme and allow_fragments arguments are handed to urlsplit()
    unchanged. The params component is split off the path only when the
    resulting scheme is listed in uses_params and the path contains a ';';
    otherwise params is the empty string.
    """
    # Unpack urlsplit()'s 5-tuple directly rather than binding it to a local
    # named ``tuple``, which would shadow the builtin inside this function.
    scheme, netloc, path, query, fragment = urlsplit(url, scheme, allow_fragments)
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    else:
        params = ''
    return ParseResult(scheme, netloc, path, params, query, fragment)
def _splitparams(url):
if '/' in url:
i = url.find(';', url.rfind('/'))
if i < 0:
return url, ''
else:
i = url.find(';')
return url[:i], url[i+1:]
def _splitnetloc(url, start=0):
delim = len(url) position of end of domain part of url, default is end
for c in '/?#': look for delimiters; the order is NOT important
wdelim = url.find(c, start) find first of this delim
if wdelim >= 0: if found
delim = min(delim, wdelim) use earliest delim position
return url[start:delim], url[delim:] return (domain, rest)
def urlsplit(url, scheme='', allow_fragments=True):
Parse a URL into 5 components:
<scheme>://<netloc>/<path>?<query>#<fragment>
Return a 5-tuple: (scheme, netloc, path, query, fragment).
Note that we don't break the components up in smaller bits
(e.g. netloc is a single string) and we don't expand % escapes.
allow_fragments = bool(allow_fragments)
key = url, scheme, allow_fragments, type(url), type(scheme)
cached = _parse_cache.get(key, None)
if cached:
return cached
if len(_parse_cache) >= MAX_CACHE_SIZE: avoid runaway growth
clear_cache()
netloc = query = fragment = ''
i = url.find(':')