find-cross-slow.py
#!/usr/bin/env python3
import pdb
import redis
import sys
import json
from PIL import Image
import imagehash
import os
from imagehash import ImageMultiHash
from pprint import pprint
from hexhamming import hamming_distance_string
r = redis.Redis(host='localhost', port=6379, db=0, charset="utf-8", decode_responses=True)
hdict = {}
pdict = {}
ulist = {}
ll = {}
USERMAX = 200
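# In-memory structures built below:
#   hdict: checksum -> owning user
#   pdict: checksum -> file name under data/<user>/
#   ulist: user -> count of files seen for that user
#   ll:    hash length -> two-character prefix -> set of checksums (bucket table)
# Redis keys this script reads/writes:
#   'cksum' hash: checksum -> JSON [user, filename]
#   'cr'    hash: image path -> cached crop-resistant hash
#   'crh'   hash: "path1:path2" -> cached crop-resistant hash distance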
# This records the checksum as having a single user
for k, v in r.hgetall('cksum').items():
    v = json.loads(v)
    user = v[0]
    if 'data_' in v[1]:
        continue
    if user not in ulist:
        ulist[user] = 0
    ulist[user] += 1
    hdict[k] = user
    pdict[k] = v[1]
"""
# Now we go through the ignore list
for k,v in r.hgetall('ignore').items():
parts = k.split('/')
if len(parts) > 1:
if parts[0] == 'data':
user = parts[1]
if v not in hdict:
hdict[v] = set()
# if we find the checksum again we add another user to it
hdict[v].add(user)
"""
# Create the lookup table for a hamming distance calculation.
for hx in hdict.keys():
    stub = hx[:2]
    if stub == "na":
        print(hx)
    else:
        mylen = len(hx)
        if mylen not in ll:
            ll[mylen] = {}
        if stub not in ll[mylen]:
            ll[mylen][stub] = set()
        ll[mylen][stub].add(hx)
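# Heuristic prefilter: checksums are bucketed by length and by their first two
# characters, and a bucket is only compared against buckets whose prefix is
# within Hamming distance 1 of its own. This trades a little recall for a large
# reduction in the number of pairwise comparisons.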
# Now here's the sloooow part.
ttl = len(hdict.keys())
ix = 0
hcache = {}

def compare(h1, h2, d):
    global hcache
    path1 = "data/{}/{}".format(hdict[h1], pdict[h1])
    path2 = "data/{}/{}".format(hdict[h2], pdict[h2])
    key = ":".join(sorted([path1, path2]))
    crh = r.hget('crh', key)
    if not os.path.exists(path1) or not os.path.exists(path2):
        return
    o = ""
    if not crh:
        ch1 = r.hget('cr', path1)
        if not ch1:
            ch1 = imagehash.crop_resistant_hash(Image.open(path1))
            r.hset('cr', path1, str(ch1))
            o += "+"
        else:
            ch1 = imagehash.hex_to_multihash(ch1)
            o += "."
        ch2 = r.hget('cr', path2)
        if not ch2:
            ch2 = imagehash.crop_resistant_hash(Image.open(path2))
            r.hset('cr', path2, str(ch2))
            o += "+"
        else:
            ch2 = imagehash.hex_to_multihash(ch2)
            o += "."
        crh = ch1 - ch2
        if o != "..":
            print("{} {} {}:{}".format(o, cix, path1, path2), file=sys.stderr)
        r.hset('crh', key, crh)
    crh = float(crh)
    if crh < 1.2:
        print(' '.join(sorted([path1, path2])), crh, d, ','.join(sorted([hdict[h1], hdict[h2]])), flush=True)
        sys.stdout.flush()
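# Driver loop: walk every (length, prefix) bucket, compare each checksum against
# checksums in nearby buckets, and call compare() for pairs that belong to
# different users and are within Hamming distance 6. Each processed bucket is
# removed from the table so cross-bucket pairs are not revisited.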
cix = 0
for len_ix in ll:
    llm = sorted(ll[len_ix].keys())
    for stub_ix in llm:
        tocompare = []
        for bucket_stub in ll[len_ix].keys():
            if hamming_distance_string(stub_ix, bucket_stub) < 2:
                tocompare.append(bucket_stub)
        for hx in ll[len_ix][stub_ix]:
            ix += 1
            if ix % 50 == 0:
                print(100 * ix / ttl, file=sys.stderr)
            for bucket_stub in tocompare:
                for what in ll[len_ix][bucket_stub]:
                    hd = hamming_distance_string(what, hx)
                    if hd < 6:
                        if hdict[hx] != hdict[what]:
                            cix += 1
                            compare(hx, what, hd)
        del ll[len_ix][stub_ix]
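# Typical use (assuming the 'cksum' hash has been populated in the local Redis,
# db 0): run the script and redirect stdout, e.g.
#   ./find-cross-slow.py > cross-user-matches.txt
# Progress percentages and hash-computation markers go to stderr; each output
# line lists the two matching paths, the crop-resistant hash distance, the
# checksum Hamming distance, and the two users involved.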