-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathintersectcsv.py
86 lines (72 loc) · 2.45 KB
/
intersectcsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python
import csv, sys, getopt
from collections import OrderedDict
# Overview: the first CSV file supplies the set of key values; every later
# file is then read and its fields are added to / overwrite matching records.
# Dialect name handed to csv.reader when the input files are parsed.
CSV_DIALECT = 'excel'
# Merged records: key value (first column of the first file) -> dict of fields.
data = {}
# Cleaned header names collected so far; these become the output columns.
data_fields = set()
# Cleaned header name of the key column; None while no key column is recorded.
data_key_field = None
def printError(message):
    """Write *message* followed by a newline to standard error.

    If the stream's encoding cannot represent the text, the line is written
    as UTF-8 bytes instead. The fallback keeps the trailing newline so that
    both paths produce a complete line.

    Args:
        message: text to report.
    """
    line = message + "\n"
    try:
        sys.stderr.write(line)
    except UnicodeEncodeError:
        encoded = line.encode("utf8")
        # Python 3 text streams reject bytes, so use the underlying binary
        # buffer when there is one; Python 2 file objects take bytes directly.
        raw_stream = getattr(sys.stderr, "buffer", None)
        if raw_stream is not None:
            # Flush pending text first so lines stay in order.
            sys.stderr.flush()
            raw_stream.write(encoded)
        else:
            sys.stderr.write(encoded)
def clean_header(header):
    """Return a normalised form of one CSV column header.

    Underscores, spaces and square brackets are removed and the result is
    lower-cased; the spelling ``emailaddress`` is mapped to ``email``.
    """
    normalized = header
    for unwanted in ('_', ' ', '[', ']'):
        normalized = normalized.replace(unwanted, '')
    normalized = normalized.lower()
    # Treat the long spelling of the e-mail column as the canonical name.
    if normalized == 'emailaddress':
        return 'email'
    return normalized
def clean_headers(headers):
    """Return a list with every header in *headers* passed through clean_header."""
    normalized = []
    for raw_name in headers:
        normalized.append(clean_header(raw_name))
    return normalized
def file_to_dict(fname):
    """Seed the global ``data`` table with the key values of the first CSV file.

    Only the first column is used: its cleaned header is recorded in
    ``data_key_field`` and added to ``data_fields``, and every value found in
    that column becomes a key of ``data`` mapped to an empty dict. The other
    columns of this file are not stored.

    An empty file (no header row) loads nothing, and blank lines are skipped.

    Args:
        fname: path of the CSV file to read.
    """
    # Without this declaration the assignment below would only create a
    # local and the module-level value would stay None.
    global data_key_field

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        open_kwargs = {'mode': 'rb'}
    else:
        open_kwargs = {'mode': 'r', 'newline': ''}

    with open(fname, **open_kwargs) as f:
        reader = csv.reader(f, dialect=CSV_DIALECT)
        header_row = next(reader, None)
        if not header_row:
            # Empty file or blank header line: nothing to load.
            return
        header_row = clean_headers(header_row)
        data_key_field = header_row[0]
        data_fields.add(data_key_field)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; they carry no key.
                continue
            data[row[0]] = {}
def update_dict(fname):
    """Merge the rows of another CSV file into the records already in ``data``.

    Every cleaned header of the file is added to ``data_fields``. Each row
    whose ``email`` value is already a key of ``data`` has its fields copied
    into that record, overwriting fields of the same name. Rows with an
    unknown key, blank lines, and rows too short to reach the ``email``
    column are ignored. An empty file (no header row) changes nothing.

    Args:
        fname: path of the CSV file to merge.

    Raises:
        KeyError: if the file has no ``email`` column after header cleaning.
    """
    key_field = 'email'

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        open_kwargs = {'mode': 'rb'}
    else:
        open_kwargs = {'mode': 'r', 'newline': ''}

    with open(fname, **open_kwargs) as f:
        reader = csv.reader(f, dialect=CSV_DIALECT)
        header_row = next(reader, None)
        if not header_row:
            return
        header_row = clean_headers(header_row)
        if key_field not in header_row:
            # Fail once, naming the file, instead of a bare KeyError per row.
            raise KeyError("%s has no '%s' column" % (fname, key_field))
        data_fields.update(header_row)
        for row in reader:
            # zip() stops at the shorter sequence, so a short or blank row
            # may lack the key column; .get() then yields None, which is
            # never a key of data.
            row_dict = dict(zip(header_row, row))
            key_value = row_dict.get(key_field)
            if key_value in data:
                data[key_value].update(row_dict)
def output_master(fname):
    """Write the merged records in ``data`` to *fname* as CSV.

    The header row lists every name in ``data_fields``: the key field first
    (when one has been recorded in ``data_key_field``), then the remaining
    names alphabetically, so the column layout is the same on every run.

    Records that never received a field from a merged file are skipped; they
    hold no values at all and would come out as rows of empty cells. A field
    missing from a written record becomes an empty cell (csv writes None as
    an empty string).

    Args:
        fname: path of the CSV file to create or overwrite.
    """
    header_row = sorted(data_fields)
    if data_key_field in data_fields:
        header_row.remove(data_key_field)
        header_row.insert(0, data_key_field)

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        open_kwargs = {'mode': 'wb'}
    else:
        open_kwargs = {'mode': 'w', 'newline': ''}

    with open(fname, **open_kwargs) as f:
        # Use the same dialect the input files are read with.
        writer = csv.writer(f, dialect=CSV_DIALECT)
        writer.writerow(header_row)
        for record in data.values():
            if not record:
                continue
            writer.writerow([record.get(name) for name in header_row])
def main(argv):
    """Merge the CSV files named on the command line into ``output.csv``.

    Args:
        argv: list of CSV paths without the program name. The first file
            supplies the key values; each following file is merged into
            the matching records.

    Returns:
        0 on success, 1 when fewer than two paths were given.
    """
    if len(argv) < 2:
        # Usage errors belong on stderr, not mixed into the progress output.
        printError("Needs at least two arguments")
        return 1

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Loading %s" % argv[0])
    file_to_dict(argv[0])
    for path in argv[1:]:
        print("Merging %s" % (path))
        update_dict(path)

    outfilename = "output.csv"
    print("Outputting combined data to %s" % outfilename)
    output_master(outfilename)
    return 0
if __name__ == "__main__":
    # argv[0] is this script's own path; main() only wants the CSV paths.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)