{"cells":[{"cell_type":"markdown","source":["# Usage examples\n\nThe following notebook demonstrates running ``domain_utils`` with [pandas](https://pandas.pydata.org) and [pyspark](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html). \n\nIt was created on Databricks, and covers installing ``domain_utils`` into a Databricks notebook and working with custom extractors on Databricks."],"metadata":{}},{"cell_type":"code","source":["dbutils.library.installPyPI('domain_utils')\n\ndbutils.library.restartPython()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":2},{"cell_type":"code","source":["import domain_utils as du\nfrom tldextract import TLDExtract\nfrom pathlib import Path\nimport tempfile"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":3},{"cell_type":"code","source":["path = \"path to crawl data\""],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":4},{"cell_type":"markdown","source":["## Make a custom extractor"],"metadata":{}},{"cell_type":"code","source":["tmp_path = Path(tempfile.mkdtemp())\ndbutils.fs.mkdirs(tmp_path.as_uri())\nlocal_list_location = tmp_path / \"list.txt\"\ndbutils.fs.ls(tmp_path.as_uri())"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
Out[3]: []
"]}}],"execution_count":6},{"cell_type":"markdown","source":["I could find no way of making the following cell work with a local Databricks temp file.\n\nSo I created a custom PSL as a gist containing just the single entry shown below. This should be ok for most use cases. The need for a custom PSL should be pretty rare anyway.\n\n googlesyndication.com"],"metadata":{}},{"cell_type":"code","source":["http_loc = 'https://gist.githubusercontent.com/birdsarah/876ecbcaa5510fbcad65639ab7913edd/raw/cce905d186e0623e161af4f6730c2857a181373f/custom_psl_test.txt'\ncustom_extractor = TLDExtract(\n suffix_list_urls=[http_loc, ],\n cache_file=local_list_location.as_posix(),\n fallback_to_snapshot=False\n)\ncustom_extractor('foo.bar.googlesyndication.com')"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
Out[4]: ExtractResult(subdomain='foo', domain='bar', suffix='googlesyndication.com')
"]}}],"execution_count":8},{"cell_type":"markdown","source":["## Pandas"],"metadata":{}},{"cell_type":"code","source":["# Make a pandas dataframe to apply methods on\ndf = spark.read.parquet('%s/visits/%s' % (path, 'javascript')).select('script_url', 'document_url')\ndf_p = df.drop_duplicates().limit(100_000).toPandas()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":10},{"cell_type":"code","source":["df_p['script_url_ps1'] = df_p.script_url.apply(du.get_ps_plus_1)\ndf_p['script_url_stemmed'] = df_p.script_url.apply(du.get_stripped_url)\ndf_p['script_w_scheme'] = df_p.script_url.apply(du.get_stripped_url, scheme=True)\ndf_p['doc_w_scheme'] = df_p.document_url.apply(du.get_stripped_url, scheme=True)\ndf_p.head(10)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
script_urldocument_urlscript_url_ps1script_url_stemmedscript_w_schemedoc_w_scheme
0https://moonliteco.in/js/ion.sound.min.jshttps://moonliteco.in/moonliteco.inmoonliteco.in/js/ion.sound.min.jshttps://moonliteco.in/js/ion.sound.min.jshttps://moonliteco.in/
1https://www.google-analytics.com/analytics.jshttps://moonliteco.in/google-analytics.comwww.google-analytics.com/analytics.jshttps://www.google-analytics.com/analytics.jshttps://moonliteco.in/
2https://www.hotelscombined.com/QUkd4lO9/init.jshttps://www.hotelscombined.com/TrafficInspecti...hotelscombined.comwww.hotelscombined.com/QUkd4lO9/init.jshttps://www.hotelscombined.com/QUkd4lO9/init.jshttps://www.hotelscombined.com/TrafficInspecti...
3https://apis.google.com/_/scs/apps-static/_/js...https://www.i-gamer.net/mobile/site/3293.htmlgoogle.comapis.google.com/_/scs/apps-static/_/js/k=oz.ga...https://apis.google.com/_/scs/apps-static/_/js...https://www.i-gamer.net/mobile/site/3293.html
4https://pagead2.googlesyndication.com/bg/o1Put...https://googleads.g.doubleclick.net/pagead/ads...googlesyndication.compagead2.googlesyndication.com/bg/o1Putv1UN_aI0...https://pagead2.googlesyndication.com/bg/o1Put...https://googleads.g.doubleclick.net/pagead/ads
5http://www.donews.com/static/js/sdk/lib/JSSDK-...http://www.donews.com/donews.comwww.donews.com/static/js/sdk/lib/JSSDK-home_1....http://www.donews.com/static/js/sdk/lib/JSSDK-...http://www.donews.com/
6https://themeforest.net/user/muffingrouphttps://themeforest.net/user/muffingroupthemeforest.netthemeforest.net/user/muffingrouphttps://themeforest.net/user/muffingrouphttps://themeforest.net/user/muffingroup
7https://www.gearbest.com/promotion-Life-Essent...https://www.gearbest.com/promotion-Life-Essent...gearbest.comwww.gearbest.com/promotion-Life-Essentials-Gad...https://www.gearbest.com/promotion-Life-Essent...https://www.gearbest.com/promotion-Life-Essent...
8https://www.googletagmanager.com/gtm.js?id=GTM...https://fivethirtyeight.abcnews.go.com/video/e...googletagmanager.comwww.googletagmanager.com/gtm.jshttps://www.googletagmanager.com/gtm.jshttps://fivethirtyeight.abcnews.go.com/video/e...
9https://connect.facebook.net/ja_JP/sdk.js#xfbm...https://gigazine.net/news/20190729-acecook-sup...facebook.netconnect.facebook.net/ja_JP/sdk.jshttps://connect.facebook.net/ja_JP/sdk.jshttps://gigazine.net/news/20190729-acecook-sup...
\n
"]}}],"execution_count":11},{"cell_type":"code","source":["df_p['custom_ps1'] = df_p.script_url.apply(du.get_ps_plus_1, extractor=custom_extractor) \ndf_p.head(10)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
script_urldocument_urlscript_url_ps1script_url_stemmedscript_w_schemedoc_w_schemecustom_ps1
0https://moonliteco.in/js/ion.sound.min.jshttps://moonliteco.in/moonliteco.inmoonliteco.in/js/ion.sound.min.jshttps://moonliteco.in/js/ion.sound.min.jshttps://moonliteco.in/in
1https://www.google-analytics.com/analytics.jshttps://moonliteco.in/google-analytics.comwww.google-analytics.com/analytics.jshttps://www.google-analytics.com/analytics.jshttps://moonliteco.in/com
2https://www.hotelscombined.com/QUkd4lO9/init.jshttps://www.hotelscombined.com/TrafficInspecti...hotelscombined.comwww.hotelscombined.com/QUkd4lO9/init.jshttps://www.hotelscombined.com/QUkd4lO9/init.jshttps://www.hotelscombined.com/TrafficInspecti...com
3https://apis.google.com/_/scs/apps-static/_/js...https://www.i-gamer.net/mobile/site/3293.htmlgoogle.comapis.google.com/_/scs/apps-static/_/js/k=oz.ga...https://apis.google.com/_/scs/apps-static/_/js...https://www.i-gamer.net/mobile/site/3293.htmlcom
4https://pagead2.googlesyndication.com/bg/o1Put...https://googleads.g.doubleclick.net/pagead/ads...googlesyndication.compagead2.googlesyndication.com/bg/o1Putv1UN_aI0...https://pagead2.googlesyndication.com/bg/o1Put...https://googleads.g.doubleclick.net/pagead/adspagead2.googlesyndication.com
5http://www.donews.com/static/js/sdk/lib/JSSDK-...http://www.donews.com/donews.comwww.donews.com/static/js/sdk/lib/JSSDK-home_1....http://www.donews.com/static/js/sdk/lib/JSSDK-...http://www.donews.com/com
6https://themeforest.net/user/muffingrouphttps://themeforest.net/user/muffingroupthemeforest.netthemeforest.net/user/muffingrouphttps://themeforest.net/user/muffingrouphttps://themeforest.net/user/muffingroupnet
7https://www.gearbest.com/promotion-Life-Essent...https://www.gearbest.com/promotion-Life-Essent...gearbest.comwww.gearbest.com/promotion-Life-Essentials-Gad...https://www.gearbest.com/promotion-Life-Essent...https://www.gearbest.com/promotion-Life-Essent...com
8https://www.googletagmanager.com/gtm.js?id=GTM...https://fivethirtyeight.abcnews.go.com/video/e...googletagmanager.comwww.googletagmanager.com/gtm.jshttps://www.googletagmanager.com/gtm.jshttps://fivethirtyeight.abcnews.go.com/video/e...com
9https://connect.facebook.net/ja_JP/sdk.js#xfbm...https://gigazine.net/news/20190729-acecook-sup...facebook.netconnect.facebook.net/ja_JP/sdk.jshttps://connect.facebook.net/ja_JP/sdk.jshttps://gigazine.net/news/20190729-acecook-sup...net
\n
"]}}],"execution_count":12},{"cell_type":"markdown","source":["## Spark"],"metadata":{}},{"cell_type":"code","source":["from pyspark.sql import functions as F, types as T\n\n# This is the convoluted way I found to pass kwargs to a udf\n\ndef get_stripped_url_udf(**function_kwargs):\n return F.udf(f=lambda x: du.get_stripped_url(x, **function_kwargs), returnType=T.StringType())\n\ndef get_ps_plus_1_udf(**function_kwargs):\n return F.udf(f=lambda x: du.get_ps_plus_1(x, **function_kwargs), returnType=T.StringType())"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":14},{"cell_type":"code","source":["df = spark.read.parquet('%s/visits/%s' % (path, 'javascript')).select('script_url', 'document_url').dropDuplicates()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":15},{"cell_type":"code","source":["# These are equivalent demonstrating with and without col syntax\ndf = df.withColumn('script_url_stripped', get_stripped_url_udf()(F.col('script_url')))\ndf = df.withColumn('script_url_stripped_2', get_stripped_url_udf()('script_url'))\ndf.limit(5).toPandas()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
script_urldocument_urlscript_url_strippedscript_url_stripped_2
0https://stats.wp.com/w.js?60https://heavy.com/entertainment/2019/07/could-...stats.wp.com/w.jsstats.wp.com/w.js
1http://platform-api.sharethis.com/js/sharethis...http://www.chasethetrend.com/category/stories/platform-api.sharethis.com/js/sharethis.jsplatform-api.sharethis.com/js/sharethis.js
2https://cdn.tinypass.com/api/tinypass.min.jshttps://www.thedailybeast.com/category/us-newscdn.tinypass.com/api/tinypass.min.jscdn.tinypass.com/api/tinypass.min.js
3https://vidstat.taboola.com/vpaid/units/23_7_1...https://www.gazetaexpress.com/arbenita-ismajli...vidstat.taboola.com/vpaid/units/23_7_1/infra/c...vidstat.taboola.com/vpaid/units/23_7_1/infra/c...
4https://pixel.yabidos.com/fltiu.js?qid=5373031...https://www.gridoto.com/read/221801860/pengend...pixel.yabidos.com/fltiu.jspixel.yabidos.com/fltiu.js
\n
"]}}],"execution_count":16},{"cell_type":"code","source":["custom_extractor('foo.googlesyndication.com')"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
Out[33]: ExtractResult(subdomain='', domain='foo', suffix='googlesyndication.com')
"]}}],"execution_count":17},{"cell_type":"markdown","source":["Because Spark is non-deterministic, we don't always get back a hit to test the googlesyndication entry, but we can see that it's working anyway: `custom_ps1` differs from `document_url_ps1` because only the custom list is being applied."],"metadata":{}},{"cell_type":"code","source":["df = (\n df\n .withColumn('document_url_ps1', get_ps_plus_1_udf()(F.col('document_url')))\n .withColumn('script_url_stripped_w_scheme', get_stripped_url_udf(scheme=True)(F.col('script_url')))\n .withColumn('custom_ps1', get_ps_plus_1_udf(extractor=custom_extractor)(F.col('document_url')))\n)\ndf.limit(10).toPandas()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
script_urldocument_urlscript_url_strippedscript_url_stripped_2document_url_ps1script_url_stripped_w_schemecustom_ps1
0https://cdn.krxd.net/ctjs/controltag.js.05f9d0...https://as.com/autor/diario_as/a/cdn.krxd.net/ctjs/controltag.js.05f9d0dad02f8a...cdn.krxd.net/ctjs/controltag.js.05f9d0dad02f8a...as.comhttps://cdn.krxd.net/ctjs/controltag.js.05f9d0...com
1https://www.googletagservices.com/tag/js/gpt.jshttps://www.storm.mg/reading-inspirationwww.googletagservices.com/tag/js/gpt.jswww.googletagservices.com/tag/js/gpt.jsstorm.mghttps://www.googletagservices.com/tag/js/gpt.jsmg
2https://bttrack.com/engagement/js?goalId=14072...https://www.schwab.com/bttrack.com/engagement/jsbttrack.com/engagement/jsschwab.comhttps://bttrack.com/engagement/jscom
3https://connect.facebook.net/tr_TR/sdk.js#xfbm...https://www.kizlarsoruyor.com/kisilik-karakterconnect.facebook.net/tr_TR/sdk.jsconnect.facebook.net/tr_TR/sdk.jskizlarsoruyor.comhttps://connect.facebook.net/tr_TR/sdk.jscom
4https://www.drtuber.com/signuphttps://www.drtuber.com/signupwww.drtuber.com/signupwww.drtuber.com/signupdrtuber.comhttps://www.drtuber.com/signupcom
5https://c1.sfdcstatic.com/etc/clientlibs/sfdc-...https://www.salesforce.com/company/legal/sfdc-...c1.sfdcstatic.com/etc/clientlibs/sfdc-aem-mast...c1.sfdcstatic.com/etc/clientlibs/sfdc-aem-mast...salesforce.comhttps://c1.sfdcstatic.com/etc/clientlibs/sfdc-...com
6https://d31qbv1cthcecs.cloudfront.net/atrk.jshttps://www.brilio.net/gadget/last-seen-whatsa...d31qbv1cthcecs.cloudfront.net/atrk.jsd31qbv1cthcecs.cloudfront.net/atrk.jsbrilio.nethttps://d31qbv1cthcecs.cloudfront.net/atrk.jsnet
7https://www.googleadservices.com/pagead/conver...https://ejje.weblio.jp/category/academic/iterywww.googleadservices.com/pagead/conversion_asy...www.googleadservices.com/pagead/conversion_asy...weblio.jphttps://www.googleadservices.com/pagead/conver...jp
8https://g.alicdn.com/alilog/mlog/aplus_v2.jshttps://food.tmall.com/g.alicdn.com/alilog/mlog/aplus_v2.jsg.alicdn.com/alilog/mlog/aplus_v2.jstmall.comhttps://g.alicdn.com/alilog/mlog/aplus_v2.jscom
9https://assets.alicdn.com/g/security/umscript/...https://g.alicdn.com/alilog/oneplus/blk.html#c...assets.alicdn.com/g/security/umscript/2.1.4/um.jsassets.alicdn.com/g/security/umscript/2.1.4/um.jsalicdn.comhttps://assets.alicdn.com/g/security/umscript/...com
\n
"]}}],"execution_count":19},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":20}],"metadata":{"name":"domain_utils usage","notebookId":274964},"nbformat":4,"nbformat_minor":0}