urlparse

A Python module for parsing absolute and relative URLs.

RFC 3986 is considered the current standard, and any future changes to the urlparse module
should conform to it. The urlparse module is currently not entirely compliant with this
RFC because of de facto parsing scenarios, and, for backward compatibility, some
parsing quirks from older RFCs are retained. The test cases in test_urlparse.py provide a
good indicator of parsing behavior.

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    The url, scheme and allow_fragments arguments are forwarded unchanged
    to urlsplit(); this function only adds the params component on top of
    the 5-tuple that urlsplit() returns.
    """
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    # Params are split off only for schemes listed in uses_params, and only
    # when the path actually contains a ';'.
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    else:
        params = ''
    return ParseResult(scheme, netloc, path, params, query, fragment)

def _splitparams(url):
 if '/'  in url:
  i = url.find(';', url.rfind('/'))
  if i < 0:
   return url, ''
 else:
  i = url.find(';')
 return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
 delim = len(url)   position of end of domain part of url, default is end
 for c in '/?#':    look for delimiters; the order is NOT important
  wdelim = url.find(c, start)        find first of this delim
  if wdelim >= 0:                    if found
   delim = min(delim, wdelim)     use earliest delim position
 return url[start:delim], url[delim:]   return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
 Parse a URL into 5 components:
 <scheme>://<netloc>/<path>?<query>#<fragment>
 Return a 5-tuple: (scheme, netloc, path, query, fragment).
 Note that we don't break the components up in smaller bits
 (e.g. netloc is a single string) and we don't expand % escapes.

 allow_fragments = bool(allow_fragments)
 key = url, scheme, allow_fragments, type(url), type(scheme)
 cached = _parse_cache.get(key, None)
 if cached:
  return cached
 if len(_parse_cache) >= MAX_CACHE_SIZE: avoid runaway growth
  clear_cache()
 netloc = query = fragment = ''
 i = url.find(':')