Source code for domain_utils.domain_utils

from __future__ import absolute_import
from __future__ import print_function
import tempfile
import codecs
import os
import six

from ipaddress import ip_address
from functools import wraps
from publicsuffix import PublicSuffixList, fetch
from six.moves import range
from six.moves.urllib.parse import urlparse


# Default on-disk location of the cached Public Suffix List: a file named
# `public_suffix_list.dat` inside the system temp directory. `get_psl` uses
# this path unless another location is given.
PSL_CACHE_LOC = os.path.join(tempfile.gettempdir(), 'public_suffix_list.dat')


def get_psl(location=PSL_CACHE_LOC):
    """
    Load the Public Suffix List, downloading it first if it is not cached.

    If no file exists at `location`, the list is fetched and written there
    (utf8 encoded). The file at `location` is then parsed into a
    `PublicSuffixList`. An existing cache file is used as-is and is never
    refreshed by this function.

    Parameters
    ----------
    location : str
        Path of the cache file. Defaults to `PSL_CACHE_LOC`.

    Returns
    -------
    PublicSuffixList
        The list parsed from the cache file.
    """
    if not os.path.isfile(location):
        psl_file = fetch()
        try:
            # Read the whole download before creating the cache file, so a
            # failed download does not leave an empty or truncated cache
            # behind that later calls would treat as valid.
            psl_data = psl_file.read()
        finally:
            psl_file.close()
        with codecs.open(location, 'w', encoding='utf8') as f:
            f.write(psl_data)

    # PublicSuffixList consumes the file while it is being constructed, so
    # the handle can be closed as soon as the object has been built.
    with codecs.open(location, encoding='utf8') as psl_cache:
        return PublicSuffixList(psl_cache)
def load_psl(function):
    """
    Decorator that supplies a `psl` keyword argument to `function`.

    When the caller passes `psl` explicitly, the call is forwarded
    unchanged. Otherwise the list is loaded with `get_psl()` the first time
    it is needed, stored on the wrapper as `wrapper.psl`, and passed to
    `function` as `psl` on this and every later call.
    """
    @wraps(function)
    def wrapper(*args, **kwargs):
        # The caller brought their own list: forward the call untouched.
        if 'psl' in kwargs:
            return function(*args, **kwargs)

        # Load the list on first use only, then reuse the stored copy.
        if wrapper.psl is None:
            wrapper.psl = get_psl()
        return function(*args, psl=wrapper.psl, **kwargs)

    wrapper.psl = None
    return wrapper
def is_ip_address(hostname):
    """
    Report whether `hostname` is a valid IP address.

    `hostname` is converted to text and handed to `ip_address`; a
    `ValueError` from either step means it is not an IP address.

    Returns True for a valid IP address, False otherwise.
    """
    try:
        # The text conversion stays inside the `try`: a ValueError raised
        # while converting must also yield False.
        ip_address(six.text_type(hostname))
    except ValueError:
        return False
    else:
        return True
@load_psl
def get_ps_plus_1(url, **kwargs):
    """
    Return the PS+1 of `url`, as reported by the PublicSuffixList.

    If the hostname of `url` is a valid IP address, the IP address is
    returned instead. If `url` has no hostname, `None` is returned.

    A PublicSuffixList object may be supplied through the keyword argument
    'psl'; when it is omitted, a version cached in the system temp directory
    is used.
    """
    if 'psl' not in kwargs:
        raise ValueError(
            "A PublicSuffixList must be passed as a keyword argument.")
    public_suffix_list = kwargs['psl']

    host = urlparse(url).hostname
    if host is None:
        # There are many reasons a url can lack a hostname: it may be
        # malformed, relative, a `javascript:` or `data:` url, and so on.
        return None
    if is_ip_address(host):
        return host
    return public_suffix_list.get_public_suffix(host)
@load_psl
def hostname_subparts(url, include_ps=False, **kwargs):
    """
    Returns a list of slices of a url's hostname down to the PS+1

    If `include_ps` is set, the hostname slices will include the public suffix

    For example: http://a.b.c.d.com/path?query#frag would yield:
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com] if include_ps == False
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com, com] if include_ps == True

    Special cases:
        * if the hostname is a valid IP address, a single item list holding
          the IP address is returned
        * if the url has no hostname, or its PS+1 contains no '.', an empty
          list is returned

    An (optional) PublicSuffixList object can be passed with keyword arg
    'psl', otherwise a version cached in the system temp directory is used.
    """
    if 'psl' not in kwargs:
        raise ValueError(
            "A PublicSuffixList must be passed as a keyword argument.")
    hostname = urlparse(url).hostname

    # Urls without a hostname (relative, malformed, `javascript:`, `data:`,
    # ...) have no subparts. Bail out before the PSL lookup, which cannot
    # handle None.
    if hostname is None:
        return []

    # If an IP address, just return a single item list with the IP
    if is_ip_address(hostname):
        return [hostname]

    ps_plus_1 = kwargs['psl'].get_public_suffix(hostname)

    # We expect all ps_plus_1s to have at least one '.'
    # If they don't, the url was likely malformed, so we'll just return an
    # empty list
    if '.' not in ps_plus_1:
        return []

    # Everything in front of ".<ps_plus_1>" is the subdomain portion. When
    # the hostname is exactly the PS+1 the slice is empty and splitting it
    # yields [''], which means "no subdomains".
    subdomains = hostname[:-(len(ps_plus_1) + 1)].split('.')
    if subdomains == ['']:
        subdomains = []

    # Longest slice first: drop one leading label at a time.
    subparts = ['.'.join(subdomains[i:]) + '.' + ps_plus_1
                for i in range(len(subdomains))]
    subparts.append(ps_plus_1)

    if include_ps:
        # The public suffix is what follows the first '.' of the PS+1; the
        # '.' is guaranteed to be present by the check above.
        subparts.append(ps_plus_1.partition('.')[2])
    return subparts
def get_stripped_url(url, scheme=False):
    """
    Returns a url stripped to (scheme)?+hostname+path

    The query string and fragment are dropped. Because the parsed
    `hostname` is used, any port or user info in the url is dropped too.

    Parameters
    ----------
    url : str
        The url to strip.
    scheme : bool
        If True, the result is prefixed with the url's scheme and '://'.

    Returns
    -------
    str
        The stripped url. A url without a hostname (e.g. a relative url)
        contributes an empty hostname, so only the (optional) scheme prefix
        and the path remain.
    """
    purl = urlparse(url)
    surl = ''
    if scheme:
        surl += purl.scheme + '://'
    # `hostname` is None when the url has no network location; treat that
    # as an empty hostname rather than failing on `None + str`.
    surl += (purl.hostname or '') + purl.path
    return surl