Update 04.2022 #36

Open · wants to merge 2 commits into base: master
62 changes: 21 additions & 41 deletions isp_data_pollution.py
@@ -64,16 +64,6 @@
browserdriver_rss_limit_mb = 1024 # Default maximum memory limit of browserdriver (chromedriver) process (MB)
terminal_width = 80 # tty width, standard is 80 chars; add code to adapt later

blacklist_url = 'http://www.shallalist.de/Downloads/shallalist.tar.gz'
# Usage of the Shalla Blacklists:
# ===============================
#
# The Shalla Blacklists are property of Shalla Secure Services.
#
# This collection of url lists may be used for free for non
# commercial usage. This includes all kinds of private usage.
# The lists must not be given to any third party.

# property value distribution to match household
property_pvals = \
{'DNT': # Do Not Track HTTP header
@@ -209,7 +199,6 @@ def __init__(self,gb_per_month=gb_per_month,
max_links_per_domain=max_links_per_domain,
property_pvals=property_pvals,
user_agent=user_agent,
blacklist_url=blacklist_url,
wordsite_url=wordsite_url,
seed_bias_links=seed_bias_links,
timeout=timeout, diurnal_flag=True,
@@ -221,7 +210,6 @@ def __init__(self,gb_per_month=gb_per_month,
self.max_links_per_domain = max_links_per_domain
self.property_pvals = property_pvals
self.user_agent = user_agent
self.blacklist_url = blacklist_url
self.wordsite_url = wordsite_url
self.seed_bias_links = seed_bias_links
self.blacklist = blacklist; self.verbose = verbose
@@ -393,10 +381,10 @@ def get_blacklist(self,update_flag=False):
if self.verbose: print('Downloading the blacklists… ',end='',flush=True)
else:
raise Exception('Skip downloading the blacklist.')
self.get_shalla_blacklist()
if self.verbose: print('Shallalist done… ', end='', flush=True)
self.get_easylist_blacklist()
if self.verbose: print('EasyList done.', flush=True)
self.get_utcapitole_blacklist()
if self.verbose: print('Ut-Capitole done.', flush=True)
except Exception as e:
if self.verbose: print(e)
# Make sure blacklists are not empty
@@ -416,9 +404,9 @@ def get_blacklist(self,update_flag=False):
# ignore problem urls
self.blacklist_urls |= { 'about:blank' }

def get_shalla_blacklist(self):
def get_utcapitole_blacklist(self):
# http://stackoverflow.com/questions/18623842/read-contents-tarfile-into-python-seeking-backwards-is-not-allowed
tgzstream = urllib.request.urlopen(urllib.request.Request(self.blacklist_url, headers={'User-Agent': self.user_agent}))
tgzstream = urllib.request.urlopen(urllib.request.Request('https://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz', headers={'User-Agent': self.user_agent}))
tmpfile = BytesIO()
while True:
s = tgzstream.read(16384)
@@ -427,32 +415,24 @@ def get_shalla_blacklist(self):
tgzstream.close()
tmpfile.seek(0)
tgz = tarfile.open(fileobj=tmpfile, mode='r:gz')
# bash$ ls BL
# COPYRIGHT education isp recreation updatesites
# adv finance jobsearch redirector urlshortener
# aggressive fortunetelling library religion violence
# alcohol forum military remotecontrol warez
# anonvpn gamble models ringtones weapons
# automobile global_usage movies science webmail
# chat government music searchengines webphone
# costtraps hacking news sex webradio
# dating hobby podcasts shopping webtv
# downloads homestyle politics socialnet
# drugs hospitals porn spyware
# dynamic imagehosting radiotv tracker
for member in [ 'downloads', 'drugs', 'hacking', 'gamble', 'porn', 'spyware', 'updatesites', 'urlshortener', 'violence', 'warez', 'weapons' ]:
self.blacklist_domains |= set(tgz.extractfile(f'BL/{member}/domains').read().decode('utf-8').splitlines())
self.blacklist_urls |= set(tgz.extractfile(f'BL/{member}/urls').read().decode('utf-8').splitlines())
for member in ['adult', 'agressif', 'download', 'drogue', 'hacking', 'malware', 'mixed_adult', 'gambling', 'phishing', 'shortener', 'update', 'violence', 'warez']:
try:
self.blacklist_domains |= set(tgz.extractfile(f'blacklists/{member}/domains').read().decode('utf-8').splitlines())
except:
if self.verbose: print(f'Ut-Capitole: blacklists/{member}/domains does not exist.', flush=True)
try:
self.blacklist_urls |= set(tgz.extractfile(f'blacklists/{member}/urls').read().decode('utf-8').splitlines())
except:
if self.verbose: print(f'Ut-Capitole: blacklists/{member}/urls does not exist.', flush=True)
tgz.close()
tmpfile.close()
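As a side note, here is a minimal standalone sketch (not part of this PR, assuming the same download URL and a generic User-Agent) for listing which category directories the UT-Capitole tarball actually ships, which is handy for verifying the member names in the loop above before relying on the bare `except` fallbacks:

```python
import io
import tarfile
import urllib.request

# Hypothetical check of the UT-Capitole archive layout.
url = 'https://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz'
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
with urllib.request.urlopen(req) as resp:
    buf = io.BytesIO(resp.read())
with tarfile.open(fileobj=buf, mode='r:gz') as tgz:
    # Members look like 'blacklists/<category>/domains' and 'blacklists/<category>/urls'.
    categories = sorted({name.split('/')[1] for name in tgz.getnames()
                         if name.count('/') >= 2})
print(categories)
```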

def get_easylist_blacklist(self):
# Malware lists from open source AdBlock and spam404.com lists
malwaredomains_full = 'https://easylist-downloads.adblockplus.org/malwaredomains_full.txt'
spam404_com_adblock_list = 'https://raw.githubusercontent.com/Dawsey21/Lists/master/adblock-list.txt'
spam404_com_main_blacklist = 'https://raw.githubusercontent.com/Dawsey21/Lists/master/main-blacklist.txt' # not EasyList format
download_list = list(set([malwaredomains_full, spam404_com_adblock_list, spam404_com_main_blacklist]))
download_parse = { malwaredomains_full: True, spam404_com_adblock_list: True, spam404_com_main_blacklist: False }
download_list = list(set([spam404_com_adblock_list, spam404_com_main_blacklist]))
download_parse = {spam404_com_adblock_list: True, spam404_com_main_blacklist: False}

for url in download_list:
resp = urllib.request.urlopen(urllib.request.Request(url, headers={'User-Agent': self.user_agent}))
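For context, a hedged sketch of how the `download_parse` flag could map onto the two list formats: `True` for Adblock/EasyList syntax, where domain-blocking rules look like `||example.com^`, and `False` for a plain one-domain-per-line list such as spam404's main blacklist. The `extract_domains` helper below is hypothetical; the repository's actual parser may differ.

```python
import re

def extract_domains(text, easylist_format):
    """Collect blocked domains from either an Adblock-style or a plain list."""
    domains = set()
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith(('!', '#')):
            continue  # comment lines in both formats
        if easylist_format:
            # Adblock domain rules: ||example.com^ (optionally with options after ^)
            m = re.match(r'^\|\|([a-z0-9.-]+)\^', line, re.IGNORECASE)
            if m:
                domains.add(m.group(1))
        else:
            domains.add(line)  # plain list: one domain per line
    return domains
```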
@@ -517,7 +497,7 @@ def link_count(self):
def domain_entropy(self):
result = 0.
domain_count = np.array([(dmn, len(self.domain_links[dmn])) for dmn in self.domain_links])
p = np.array([np.float(c) for d, c in domain_count])
p = np.array([float(c) for d, c in domain_count])
count_total = p.sum()
if count_total > 0:
p = p / p.sum()
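The `np.float` alias was deprecated in NumPy 1.20 and removed in 1.24, hence the switch to the builtin `float`. A small worked sketch of the same normalization, presumably feeding a Shannon entropy over per-domain link counts (hypothetical data, illustrative only):

```python
import numpy as np

# Illustrative domain -> links mapping.
domain_links = {'example.com': {'/a', '/b', '/c'}, 'example.org': {'/d'}}
p = np.array([float(len(links)) for links in domain_links.values()])
if p.sum() > 0:
    p = p / p.sum()
    entropy = float(-(p * np.log(p)).sum())  # Shannon entropy in nats
    print(entropy)
```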
@@ -696,7 +676,7 @@ def draw_links(self,n=1,log_sampling=False):
urls = []
domain_array = np.array([dmn for dmn in self.domain_links])
domain_count = np.array([len(self.domain_links[domain_array[k]]) for k in range(domain_array.shape[0])])
p = np.array([np.float(c) for c in domain_count])
p = np.array([float(c) for c in domain_count])
count_total = p.sum()
if log_sampling: # log-sampling [log(x+1)] to bias lower count domains
p = np.fromiter((np.log1p(x) for x in p), dtype=p.dtype)
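A small sketch of what the `log1p` weighting does: taking log(x+1) of the counts flattens the distribution, so domains with few links are drawn more often than under straight proportional sampling. The counts below are hypothetical, and `np.random.choice` stands in for whatever sampler the class actually uses:

```python
import numpy as np

domains = np.array(['big.example', 'mid.example', 'small.example'])
domain_count = np.array([1000.0, 10.0, 1.0])

p_linear = domain_count / domain_count.sum()       # ~[0.99, 0.01, 0.001]
p_log = np.log1p(domain_count)
p_log = p_log / p_log.sum()                        # ~[0.69, 0.24, 0.07]
print(np.random.choice(domains, size=5, p=p_log))  # low-count domains appear more often
```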
@@ -721,7 +701,7 @@ def draw_domain(self,log_sampling=False):
domain = None
domain_array = np.array([dmn for dmn in self.domain_links])
domain_count = np.array([len(self.domain_links[domain_array[k]]) for k in range(domain_array.shape[0])])
p = np.array([np.float(c) for c in domain_count])
p = np.array([float(c) for c in domain_count])
count_total = p.sum()
if log_sampling: # log-sampling [log(x+1)] to bias lower count domains
p = np.fromiter((np.log1p(x) for x in p), dtype=p.dtype)
@@ -809,15 +789,15 @@ def websearch_links(self):
# https://github.com/detro/ghostdriver/issues/169
@self.chromedriver_short_timeout
def chromedriver_find_elements_by_css_selector():
return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements_by_css_selector(self.SafeSearch.css_selector))
return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements(by=By.CSS_SELECTOR, value=self.SafeSearch.css_selector))
elements = chromedriver_find_elements_by_css_selector()
# get links in random order until max. per page
k = 0
links = []
try:
for elt in sorted(elements,key=lambda k: random.random()):
@self.chromedriver_short_timeout
def chromedriver_find_element_by_tag_name(): return elt.find_element_by_tag_name('a')
def chromedriver_find_element_by_tag_name(): return elt.find_element(by=By.TAG_NAME, value='a')
a_tag = chromedriver_find_element_by_tag_name()
@self.chromedriver_short_timeout
def chromedriver_get_attribute(): return a_tag.get_attribute('href')
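These changes track the Selenium 4 locator API: the `find_element(s)_by_*` helpers were removed (as of Selenium 4.3), so lookups go through `find_element`/`find_elements` with a `By` locator, which requires `from selenium.webdriver.common.by import By`. A minimal sketch of the new form, assuming a local chromedriver is available:

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
try:
    driver.get('https://www.example.com')
    # Old: driver.find_elements_by_tag_name('a')
    links = WebDriverWait(driver, 10).until(
        lambda d: d.find_elements(by=By.TAG_NAME, value='a'))
    print([a.get_attribute('href') for a in links])
finally:
    driver.quit()
```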
@@ -852,7 +832,7 @@ def url_links(self):
# https://github.com/detro/ghostdriver/issues/169
@self.chromedriver_short_timeout
def chromedriver_find_elements_by_tag_name():
return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements_by_tag_name('a'))
return WebDriverWait(self.driver,short_timeout).until(lambda x: x.find_elements(by=By.TAG_NAME, value='a'))
elements = chromedriver_find_elements_by_tag_name()

# get links in random order until max. per page