Skip to content

Commit f7964fb

Browse files
committed
chg: [crawler] add an option that controls whether the crawler should proceed with crawling onion domains that have not yet been classified as safe or unsafe.
1 parent f01cfe7 commit f7964fb

File tree

4 files changed

+87
-5
lines changed

4 files changed

+87
-5
lines changed

bin/crawlers/Crawler.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def __init__(self):
5858
config_loader = ConfigLoader()
5959

6060
self.filter_unsafe_onion = crawlers.is_onion_filter_enabled(cache=False)
61+
self.filter_unknown_onion = crawlers.is_onion_filter_unknown(cache=False)
6162
self.last_config_check = int(time.time())
6263

6364
self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
@@ -145,6 +146,7 @@ def get_message(self):
145146
# Refresh Config
146147
if int(time.time()) - self.last_config_check > 60:
147148
self.filter_unsafe_onion = crawlers.is_onion_filter_enabled()
149+
self.filter_unknown_onion = crawlers.is_onion_filter_unknown()
148150
self.last_config_check = int(time.time())
149151

150152
# Check if a new Capture can be Launched
@@ -156,7 +158,7 @@ def get_message(self):
156158
if self.filter_unsafe_onion:
157159
if domain.endswith('.onion'):
158160
try:
159-
if not crawlers.check_if_onion_is_safe(domain):
161+
if not crawlers.check_if_onion_is_safe(domain, unknown=self.filter_unknown_onion):
160162
# print('---------------------------------------------------------')
161163
# print('DOMAIN FILTERED')
162164
task.delete()
@@ -388,7 +390,7 @@ def save_capture_response(self, parent_id, entries):
388390
# Filter Domain
389391
if self.filter_unsafe_onion:
390392
if current_domain.endswith('.onion'):
391-
if not crawlers.check_if_onion_is_safe(current_domain):
393+
if not crawlers.check_if_onion_is_safe(current_domain, unknown=self.filter_unknown_onion):
392394
return False
393395

394396
# TODO LAST URL

bin/lib/crawlers.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2296,7 +2296,7 @@ def _onion_lookup(onion_url):
22962296
return {'error': f'Timeout Error'}
22972297

22982298

2299-
def check_if_onion_is_safe(onion_url):
2299+
def check_if_onion_is_safe(onion_url, unknown):
23002300
resp = _onion_lookup(onion_url)
23012301
if resp:
23022302
if isinstance(resp, dict):
@@ -2305,6 +2305,11 @@ def check_if_onion_is_safe(onion_url):
23052305
elif 'error' in resp:
23062306
if resp['error']:
23072307
raise OnionFilteringError(resp['error'])
2308+
elif not unknown:
2309+
if isinstance(resp, list):
2310+
if len(resp) > 1:
2311+
if resp[1] == 404:
2312+
return True
23082313
return False
23092314

23102315

@@ -2351,6 +2356,40 @@ def change_onion_filter_state(new_state):
23512356
return True
23522357
return False
23532358

2359+
# # Crawl Unknown Onion # #
2360+
def _is_onion_filter_unknown():
    """Read the 'unknown' onion-filter flag from the crawler DB and mirror it to the cache.

    The flag lives in the ``unknown`` field of the ``crawler:onion_filter`` hash.
    When the field does not exist yet, it is initialised to ``'False'``.
    In every case the resulting state is written to the
    ``crawler:onion_filter:unknown`` cache key before returning.

    :return: True if the stored value is the string ``'True'``, False otherwise.
    """
    stored = r_crawler.hget('crawler:onion_filter', 'unknown')
    if stored is not None:
        # NOTE(review): compares against a str, so this presumably relies on the
        # client returning decoded responses - confirm against the r_crawler setup.
        state = stored == 'True'
    else:
        # First access: persist the default (filter disabled).
        r_crawler.hset('crawler:onion_filter', 'unknown', str(False))
        state = False
    r_cache.set('crawler:onion_filter:unknown', str(state))
    return state
2369+
2370+
def is_onion_filter_unknown(cache=True):
    """Return whether the 'unknown onion' filter is enabled.

    :param cache: when True, answer from the ``crawler:onion_filter:unknown``
                  cache key if it is present; when False, always read the
                  persistent value through ``_is_onion_filter_unknown()``.
    :return: the filter state as a bool.
    """
    if not cache:
        return _is_onion_filter_unknown()

    cached = r_cache.get('crawler:onion_filter:unknown')
    if cached is not None:
        return cached == 'True'

    # Cache miss: fall back to the persistent value and (re)populate the cache.
    state = _is_onion_filter_unknown()
    r_cache.set('crawler:onion_filter:unknown', str(state))
    return state
2381+
2382+
def change_onion_filter_unknown_state(new_state):
    """Persist a new state for the 'unknown onion' filter if it differs from the current one.

    On change, the new state is written to both the ``crawler:onion_filter``
    hash (field ``unknown``) and the ``crawler:onion_filter:unknown`` cache key,
    and the update timestamp is recorded in the hash (field ``update_time``)
    and in the ``crawler:onion_filter:last_update_time`` cache key.

    :param new_state: desired filter state, compared with ``!=`` against the
                      bool returned by ``is_onion_filter_unknown(cache=False)``.
    :return: True if the state was changed, False if it was already ``new_state``.
    """
    current_state = is_onion_filter_unknown(cache=False)
    if current_state == new_state:
        return False

    state_str = str(new_state)
    r_crawler.hset('crawler:onion_filter', 'unknown', state_str)
    r_cache.set('crawler:onion_filter:unknown', state_str)

    # Record when the filter configuration was last modified.
    update_time = time.time()
    r_crawler.hset('crawler:onion_filter', 'update_time', update_time)
    r_cache.set('crawler:onion_filter:last_update_time', update_time)
    return True
2392+
23542393
#### ---- ####
23552394

23562395

var/www/blueprints/crawler_splash.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -997,6 +997,7 @@ def crawler_settings():
997997
crawler_error_mess = crawlers.get_test_ail_crawlers_message()
998998

999999
is_onion_filter_enabled = crawlers.is_onion_filter_enabled(cache=False)
1000+
is_onion_filter_unknown = crawlers.is_onion_filter_unknown(cache=False)
10001001

10011002
# TODO REGISTER PROXY
10021003
# all_proxies = crawlers.get_all_proxies_metadata()
@@ -1011,6 +1012,7 @@ def crawler_settings():
10111012
is_crawler_working=is_crawler_working,
10121013
crawler_error_mess=crawler_error_mess,
10131014
is_onion_filter_enabled=is_onion_filter_enabled,
1015+
is_onion_filter_unknown=is_onion_filter_unknown
10141016
)
10151017

10161018

@@ -1066,9 +1068,20 @@ def crawler_filter_unsafe_onion():
10661068
filter_unsafe_onion = True
10671069
else:
10681070
filter_unsafe_onion = False
1069-
print(filter_unsafe_onion)
10701071
crawlers.change_onion_filter_state(filter_unsafe_onion)
10711072
return redirect(url_for('crawler_splash.crawler_settings'))
10721073

1074+
@crawler_splash.route('/crawler/settings/crawler/filter_unknown_onion', methods=['GET'])
@login_required
@login_admin
def crawler_filter_unknown_onion():
    """Toggle the 'unknown onion' filter from the crawler settings page.

    Reads the ``state`` query parameter: the exact value ``'enable'`` turns the
    filter on; any other value (or a missing parameter) turns it off.
    Always redirects back to the crawler settings view.
    """
    enable_filter = request.args.get('state') == 'enable'
    crawlers.change_onion_filter_unknown_state(enable_filter)
    return redirect(url_for('crawler_splash.crawler_settings'))
1085+
10731086

10741087
# --- LACUS ---#

var/www/templates/crawler/crawler_splash/settings_crawler.html

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ <h5 class="card-title">
243243
</p>
244244
{% if is_onion_filter_enabled %}
245245
<a href="{{ url_for('crawler_splash.crawler_filter_unsafe_onion') }}?state=disable">
246-
<button class="btn btn-danger mx-4 my-2">
246+
<button class="btn btn-danger my-2">
247247
<i class="fa-solid fa-xmark"></i> Disable Onion Filter
248248
</button>
249249
</a>
@@ -254,6 +254,34 @@ <h5 class="card-title">
254254
</button>
255255
</a>
256256
{% endif %}
257+
258+
<hr class="border-1 my-4">
259+
260+
<h5 class="card-title">
261+
Filter Unknown Onion: &nbsp;&nbsp;<b class="text-primary"><span class="text-{% if is_onion_filter_unknown %}success{% else %}secondary{% endif %}">{% if is_onion_filter_unknown %}Enabled{% else %}Disabled{% endif %}</span></b>
262+
</h5>
263+
<p>This option controls whether the crawler should proceed with crawling onion domains that have <strong>not yet been classified</strong> as safe or unsafe.</p>
264+
265+
<ul>
266+
<li><strong>If disabled:</strong> The crawler will process domains that have never been checked, potentially discovering new useful content but also increasing the risk of encountering unsafe materials.</li>
267+
<li><strong>If enabled:</strong> The crawler will only process domains that have been explicitly identified as safe, reducing risk but potentially missing new, unclassified domains.</li>
268+
</ul>
269+
270+
<p>Disabling this option is useful for users who want to explore uncharted domains while still benefiting from the <code>filter_unsafe_onion</code> protection. However, disabling it increases the likelihood of encountering harmful content, so caution is advised.</p>
271+
{% if is_onion_filter_unknown %}
272+
<a href="{{ url_for('crawler_splash.crawler_filter_unknown_onion') }}?state=disable">
273+
<button class="btn btn-secondary my-2">
274+
<i class="fa-solid fa-xmark"></i> Disable Unknown Onion Filter
275+
</button>
276+
</a>
277+
{% else %}
278+
<a href="{{ url_for('crawler_splash.crawler_filter_unknown_onion') }}?state=enable">
279+
<button class="btn btn-info my-2">
280+
<i class="fa-solid fa-check"></i> Enable Unknown Onion Filter
281+
</button>
282+
</a>
283+
{% endif %}
284+
257285
</div>
258286
</div>
259287

0 commit comments

Comments
 (0)