Source code for domain_utils.domain_utils

from functools import wraps
from ipaddress import ip_address
from tldextract import TLDExtract
from urllib.parse import urlparse

# Default value returned by ``get_scheme`` when the url carries no scheme.
NO_SCHEME = 'no_scheme'
# Schemes that ``stem_url`` always parses; HTTP is also its default
# ``scheme_default``.
HTTP = 'http'
HTTPS = 'https'
# Websocket schemes, parsed by ``stem_url`` only when ``parse_ws`` is True.
WS = 'ws'
WSS = 'wss'


def _load_and_update_extractor(function):
    @wraps(function)
    def wrapper(*args, **kwargs):
        if 'extractor' not in kwargs:
            if wrapper.extractor is None:
                _extractor = TLDExtract(include_psl_private_domains=True)
                _extractor.update()
                wrapper.extractor = _extractor
            return function(*args, extractor=wrapper.extractor, **kwargs)
        else:
            return function(*args, **kwargs)
    wrapper.extractor = None
    return wrapper


def is_ip_address(hostname):
    """
    Check if the given value is a valid IP address.

    Parameters
    ----------
    hostname : any
        The candidate value. It is converted with ``str`` before being
        handed to ``ipaddress.ip_address``.

    Returns
    -------
    bool
        ``True`` if ``ipaddress.ip_address`` accepts the string form of
        ``hostname``, ``False`` if it raises ``ValueError``.
    """
    try:
        ip_address(str(hostname))
    except ValueError:
        return False
    return True


def _adapt_url_for_port_and_scheme(url, extractor):
    # To handle the case where we have no scheme, but we have a port
    # we have the following heuristic. Does scheme have a . in it
    # which is stdlib behavior when not recognizing a netloc due to
    # lack of //. If TLDExtract, can find a suffix in the _scheme
    # then it's probably a domain without an http.

    purl = urlparse(url)
    _scheme = purl.scheme

    if '.' in str(_scheme):
        # From the docs: "urlparse recognizes a netloc only
        # if it is properly introduced by ‘//’". So we
        # prepend to get results we expect.
        if extractor(_scheme).suffix != '' or is_ip_address(_scheme):
            url = '//{url}'.format(url=url)
    elif url == purl.path:
        # this is the case where the url has no scheme
        # and we are trying to access the root. Ex: localhost:5000
        url = '//{url}/'.format(url=url)
    return url


@_load_and_update_extractor
def _get_tld_extract(url, **kwargs):
    """
    Stem ``url`` with ``stem_url`` and run the extractor on the result.

    Parameters
    ----------
    url : string
        The url to analyse.
    kwargs :
        ``extractor`` must be a ``TLDExtract`` instance. The remaining
        recognised keys are forwarded to ``stem_url``; when omitted they
        default to ``scheme=True``, ``path=True``, ``return_unparsed=False``,
        ``use_netloc=True``, ``parse_ws=True`` and ``scheme_default=HTTP``.

    Returns
    -------
    The value returned by calling ``extractor`` on the stemmed url.

    Raises
    ------
    ValueError
        If ``extractor`` is not a ``TLDExtract`` instance.
    """
    extractor = kwargs.get('extractor')
    if not isinstance(extractor, TLDExtract):
        raise ValueError(
            "A tldextract::TLDExtract instance must be passed using the "
            "`extractor` keyword argument.")

    stemmed = stem_url(
        url,
        return_unparsed=kwargs.get('return_unparsed', False),
        scheme_default=kwargs.get('scheme_default', HTTP),
        # Forward parse_ws as well so that every stem_url option a caller
        # passes is honoured instead of being silently dropped.
        parse_ws=kwargs.get('parse_ws', True),
        scheme=kwargs.get('scheme', True),
        path=kwargs.get('path', True),
        use_netloc=kwargs.get('use_netloc', True),
        extractor=extractor,
    )
    return extractor(stemmed)


def get_etld1(url, **kwargs):
    """
    Returns the eTLD+1 (aka PS+1) of the url.

    Parameters
    ----------
    url : string
        The url from which to extract the eTLD+1 / PS+1
    extractor : tldextract::TLDExtract, optional
        An (optional) tldextract::TLDExtract instance can be passed with
        keyword `extractor`, otherwise we create and update one automatically.
    kwargs:
        The method preprocesses the url with ``stem_url`` before
        extracting the domain. You can pass in ``stem_url`` parameters
        if you wish to change the behavior in some specific way.

    Returns
    -------
    string
        The eTLD+1 / PS+1 of the url passed in. If the extractor finds no
        public suffix, only the extracted domain is returned: an empty
        string when nothing is detectable, or the IP address when the
        hostname of the url is a valid IP address.
    """
    parsed = _get_tld_extract(url, **kwargs)
    if parsed.suffix == '':
        # No public suffix: fall back to the bare domain part.
        return parsed.domain
    return f'{parsed.domain}.{parsed.suffix}'


def get_ps_plus_1(url, **kwargs):
    """An alias for ``get_etld1``; all arguments are forwarded unchanged."""
    return get_etld1(url, **kwargs)


@_load_and_update_extractor
def hostname_subparts(url, include_ps=False, **kwargs):
    """
    Returns a list of slices of a url's hostname down to the eTLD+1 / PS+1.

    Parameters
    ----------
    url : string
        The url from which to extract the hostname parts
    include_ps : boolean, optional
        If ``include_ps`` is set, the hostname slices will include the public suffix
        For example: ``http://a.b.c.d.com/path?query#frag`` would yield:

        * ``["a.b.c.d.com", "b.c.d.com", "c.d.com", "d.com"]`` if ``include_ps == False``
        * ``["a.b.c.d.com", "b.c.d.com", "c.d.com", "d.com", "com"]`` if ``include_ps == True``
    kwargs:
        Additionally all kwargs for get_etld1, can be passed to this method.

    Returns
    -------
    list (string)
        List of slices of a url's hostname down to the eTLD+1 / PS+1.
        A single-item list holding the IP address if the hostname is an
        IP address, and an empty list if the eTLD+1 contains no ``.``.
    """
    ext = _get_tld_extract(url, **kwargs)

    # If an IP address, just return a single item list with the IP
    if is_ip_address(ext.domain):
        return [ext.domain]

    # Derive the eTLD+1 from the extraction we already have (same rule as
    # ``get_etld1``) instead of stemming and extracting the url twice.
    etld1 = ext.domain if ext.suffix == '' else f'{ext.domain}.{ext.suffix}'

    # We expect all eTLD+1s to have at least one '.'
    # If they don't, the url was likely malformed, so we'll just
    # return an empty list
    if '.' not in etld1:
        return []

    # Hostname without the public suffix, split into its labels
    domain_less_ps = '.'.join(
        url_part for url_part in (ext.subdomain, ext.domain)
        if url_part != ''
    )
    labels = domain_less_ps.split('.') if domain_less_ps != '' else []

    # One slice per label, each dropping one more leading label
    subparts = [
        '.'.join(labels[start:] + [ext.suffix])
        for start in range(len(labels))
    ]

    if include_ps:
        subparts.append(ext.suffix)

    return subparts


@_load_and_update_extractor
def stem_url(
        url,
        return_unparsed=True,
        scheme_default=HTTP,
        parse_ws=True,
        scheme=False,
        path=True,
        use_netloc=True,
        extractor=None):
    """
    Returns a url stripped to just the beginning and end.

    More formally it returns ``(scheme)?+(netloc|hostname)+(path)?``.

    For example ``https://my.domain.net/a/path/to/a/file.html#anchor?a=1``
    becomes ``my.domain.net/a/path/to/a/file.html``
    URL parsing is done using std lib
    `urllib.parse.urlparse
    <https://docs.python.org/3.8/library/urllib.parse.html>`_.

    A url is parsed if it has a qualifying scheme. The qualifying schemes are
    ``http``, ``https``, ``ws`` and ``wss``. Websocket schemes can be omitted using
    the ``parse_ws`` parameter. Additionally, the ``scheme_default`` parameter
    provides a scheme where the url doesn't contain one. The default is ``http``
    and so urls without a scheme will, by default, be considered as http and
    therefore parsed.

    What is returned for unparsed urls is determined by the ``return_unparsed``
    parameter.

    Parameters
    ----------
    url : string
        The URL to be parsed
    return_unparsed : boolean, optional
        Action to take if scheme is not parsed e.g. ``file:`` or ``about:blank``.
        If ``False``, the result for non parsed urls will be an empty string
        If ``True``, the result will be the url itself, e.g.
        ``about:blank`` -> ``about:blank`` even if ``scheme=False``.
        See method description to understand whether a URL is parsed or not.
        Default is ``True``.
    scheme_default : string, optional
        This parameter is passed to scheme parameter of `urllib.parse.urlparse`. This
        causes urls without a scheme to return the scheme default.
        Default is ``http``.
    parse_ws : boolean, optional
        If ``True``, then ``ws`` and ``wss`` urls are parsed.
        Default is ``True``.
    scheme : boolean, optional
        If ``True``, scheme will be prepended in parsed result.
        Default is ``False``.
    path : boolean, optional
        If ``True``, path will be included in parsed result.
        Default is ``True``.
    use_netloc : boolean, optional
        If ``True`` urlparse's netloc will be used.
        If ``False`` urlparse's hostname will be used; when the url has no
        hostname an empty string is used in its place. Using netloc means
        that a port is included, for example, if it was in the url.
        Default is ``True``.
    extractor : tldextract::TLDExtract, optional
        An (optional) tldextract::TLDExtract instance can be passed with
        keyword `extractor`, otherwise we create and update one automatically.

    Returns
    -------
    string
        Returns a url stripped to (scheme)?+(netloc|hostname)+(path)?.
        Returns empty string if appropriate.
    """
    # Make scheme-less "host:port" style urls parseable by urlparse.
    url = _adapt_url_for_port_and_scheme(url, extractor)

    purl = urlparse(url, scheme=scheme_default)
    _scheme = purl.scheme

    # Will we parse
    schemes_to_parse = [HTTP, HTTPS]
    if parse_ws is True:
        schemes_to_parse += [WS, WSS]
    if _scheme not in schemes_to_parse:
        return url if return_unparsed is True else ''

    # From here on the scheme is known to be one of schemes_to_parse.
    scheme_out = f'{_scheme}://' if scheme is True else ''
    path_out = purl.path if path is True else ''

    if use_netloc is True:
        loc_out = purl.netloc
    else:
        # ``hostname`` is None when the url has no host; fall back to ''
        # so the literal text "None" never leaks into the result.
        loc_out = purl.hostname or ''

    return f'{scheme_out}{loc_out}{path_out}'


def get_stripped_url(url, **kwargs):
    """Alias for ``stem_url``; all arguments are forwarded unchanged."""
    return stem_url(url, **kwargs)


def get_scheme(url, no_scheme=NO_SCHEME):
    """
    Given an url, extract from it the scheme.

    Parameters
    ----------
    url: string
        The URL from where we want to get the scheme
    no_scheme: any
        The value to use if no scheme is detected.
        Default is ``no_scheme``

    Returns
    -------
    string
        Returns the scheme found by ``urllib.parse.urlparse``, or the value
        of ``no_scheme`` if the url has no scheme.
    """
    scheme = urlparse(url).scheme
    # An empty scheme means urlparse found none.
    return scheme if scheme else no_scheme


@_load_and_update_extractor
def get_port(url, extractor=None):
    """
    Given an url, extract from it port if present.

    Parameters
    ----------
    url: string
        The URL from where we want to get the port
    extractor : tldextract::TLDExtract, optional
        An (optional) tldextract::TLDExtract instance can be passed with
        keyword `extractor`, otherwise we create and update one automatically.

    Returns
    -------
    int
        Returns port in the url. If port not found, returns ``None``.

    Raises
    ------
    ValueError
        Propagated from ``urllib.parse`` when the url specifies an
        invalid port.
    """
    # Make scheme-less "host:port" style urls parseable by urlparse.
    url = _adapt_url_for_port_and_scheme(url, extractor)
    return urlparse(url).port
Source code for domain_utils.domain_utils

Table of Contents

Search