forked from JMousqueton/ransomware.live
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: scrapegang.py
103 lines (92 loc) · 5.37 KB
/
scrapegang.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# from ast import parse
import os,sys
import json
from datetime import datetime
import os
from bs4 import BeautifulSoup
# local imports
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
from playwright_stealth import stealth_sync
from sharedutils import striptld
from sharedutils import openjson
from sharedutils import getsitetitle
from sharedutils import stdlog, dbglog, errlog
import hashlib
def scraper(querygroup=''):
    '''
    Scrape the leak-site mirrors of a single ransomware group.

    Looks up *querygroup* by name in groups.json and, for every enabled
    location (mirror) of that group with version 0 or 3, drives a headless
    browser through the local Tor SOCKS proxy (127.0.0.1:9050), waits for the
    page to settle, and saves the rendered HTML to
    source/<group>-<md5(slug)>.html. On success the host entry's
    'available', 'title', 'lastscrape' and 'updated' fields are refreshed and
    groups.json is rewritten in place.

    querygroup -- exact group name as stored in groups.json.

    Side effects: writes HTML snapshots and rewrites groups.json; logs via
    stdlog/dbglog/errlog. Returns None. Timeouts and any other exception are
    caught per-host so one dead mirror does not abort the run.
    '''
    groups = openjson("groups.json")
    stdlog('scraper: ' + 'looking for ' + querygroup)
    # iterate each provider
    for group in groups:
        if group['name'] != querygroup:
            continue
        stdlog('scraper: ' + 'working on ' + querygroup)
        # iterate each location/mirror/relay
        for host in group['locations']:
            stdlog('ransomwatch: ' + 'scraping ' + host['slug'])
            if host['enabled'] is False:
                stdlog('ransomwatch: ' + 'skipping, this host has been flagged as disabled')
                continue
            if host['version'] not in (0, 3):
                continue
            try:
                with sync_playwright() as play:
                    # Some sites block chromium or refuse Tor; per-group
                    # engine/proxy exceptions are maintained by hand here.
                    if querygroup in ['blackbasta', 'everest', 'metaencryptor', 'bianlian', 'knight', 'mydata']:
                        stdlog('exception for ' + querygroup)
                        browser = play.firefox.launch(proxy={"server": "socks5://127.0.0.1:9050"},
                            args=['--unsafely-treat-insecure-origin-as-secure=' + host['slug'], "--headless=new"])
                    elif querygroup in ['toufan', 'werewolves']:
                        # clearnet sites: no Tor proxy
                        stdlog('exception not tor for ' + querygroup)
                        browser = play.firefox.launch(args=["--headless=new"])
                    else:
                        browser = play.chromium.launch(proxy={"server": "socks5://127.0.0.1:9050"},
                            args=['--unsafely-treat-insecure-origin-as-secure=' + host['slug']])
                    context = browser.new_context(ignore_https_errors=True)
                    page = context.new_page()
                    # navigation timeout: per-host override (seconds) or 120s default
                    if 'timeout' in host and host['timeout'] is not None:
                        goto_timeout = host['timeout'] * 1000
                    else:
                        goto_timeout = 120000
                    page.goto(host['slug'], wait_until='load', timeout=goto_timeout)
                    page.bring_to_front()
                    # settle delay: per-host override (seconds) or 15s default
                    delay = host['delay'] * 1000 if ('delay' in host and host['delay'] is not None) \
                        else 15000
                    if querygroup == "knight":
                        delay = 60000
                    if delay != 15000:
                        stdlog('New delay : ' + str(delay) + 'ms')
                    page.wait_for_timeout(delay)
                    # simulate a little user activity so lazy/anti-bot
                    # content renders before the snapshot is taken
                    page.mouse.move(x=500, y=400)
                    page.wait_for_load_state('networkidle')
                    page.mouse.wheel(delta_y=2000, delta_x=0)
                    page.wait_for_load_state('networkidle')
                    page.wait_for_timeout(delay)
                    # md5 is used only to derive a stable filename from the
                    # onion URL, not for security
                    hex_digest = hashlib.md5(host['slug'].encode('utf-8')).hexdigest()
                    filename = group['name'] + '-' + hex_digest + '.html'
                    name = os.path.join(os.getcwd(), 'source', filename)
                    with open(name, 'w', encoding='utf-8') as sitesource:
                        sitesource.write(page.content())
                    host['available'] = True
                    host['title'] = getsitetitle(name)
                    host['lastscrape'] = str(datetime.today())
                    host['updated'] = str(datetime.today())
                    dbglog('ransomwatch: ' + 'scrape successful')
                    # persist progress after every successful host so a later
                    # crash does not lose earlier results
                    with open('groups.json', 'w', encoding='utf-8') as groupsfile:
                        json.dump(groups, groupsfile, ensure_ascii=False, indent=4)
                    dbglog('scraper: ' + 'groups.json updated')
                    browser.close()
            except PlaywrightTimeoutError:
                stdlog('Timeout!')
            except Exception as exception:
                errlog(exception)
            stdlog('leaving : ' + host['slug'] + ' --------- ' + group['name'])
def main():
    '''CLI entry point: scrape the group named by the first argument.'''
    # guard against a bare IndexError when no group name is supplied
    if len(sys.argv) < 2:
        errlog('usage: scrapegang.py <groupname>')
        sys.exit(1)
    scraper(sys.argv[1])

if __name__ == '__main__':
    main()