-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathgetCkan.py
165 lines (132 loc) · 5.02 KB
/
getCkan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import csv
import os
import sys
import urllib
import urllib.error
import urllib.request

import ckanapi.errors
import easygui as gui
from ckanapi import RemoteCKAN
# Known CKAN portals, one (name, download base, access url) tuple each:
#   name          - label offered in the selection dialog; also part of the
#                   output directory and CSV file names
#   download base - text a resource url must contain to count as internal
#   access url    - address handed to RemoteCKAN
sources = [
    ("berlin", "berlin.de", "https://datenregister.berlin.de/"),
    ("karlsruhe", "karlsruhe.de", "https://transparenz.karlsruhe.de/"),
    ("bahn", "deutschebahn.com", "https://data.deutschebahn.com/"),
    ("offeneDaten", "offenedaten.de/", "https://offenedaten.de/"),
    ("muenchen", "opengov-muenchen.de", "https://www.opengov-muenchen.de"),
    ("meerbusch", "meerbusch.de", "https://opendata.meerbusch.de/"),
    ("bonn", "bonn.de", "https://opendata.bonn.de/"),
    ("jena", "jena.de", "https://opendata.jena.de/"),
    ("africa", "africaopendata.org", "https://africaopendata.org/"),
]

# File extensions (lower case, including the dot) that get downloaded.
downs = (
    ".csv",
    ".json",
    ".geojson",
    ".gpx",
    ".kmz",
    ".xls",
    ".xlx",
    ".ods",
    ".xlsx",
    ".pdf",
)

# True: fetch matching resource files; False: only collect their metadata.
download = True
# True: skip resources whose url does not contain the download base.
excludext = True
#########
def config():
    """Ask the user which CKAN source to harvest.

    Opens an easygui choice box listing the name (first field) of every
    entry in ``sources``.

    Returns:
        The index into ``sources`` of the selected entry, or ``None`` when
        the choice box returned ``None`` (no selection was made).
    """
    choices = [source[0] for source in sources]
    choice = gui.choicebox("Select CKAN origin", "Source select", choices)
    if choice is None:
        return None
    # Translate the selected name back into its position in ``sources``.
    for index, source in enumerate(sources):
        if source[0] == choice:
            print("Selected: ", index)
            return index
    # No entry matched: hand the raw selection back unchanged.
    return choice
#########
def loadUrl(p, u):
    """Download the file at url *u* into the ``filedir`` directory.

    The local file is named ``<p>_<text after the last "/" in u>``.

    Args:
        p: prefix for the local file name (the caller passes the package
            name).
        u: url of the file to fetch.

    Raises:
        urllib.error.HTTPError: for every HTTP error status except 404,
            which is only reported on stdout.
    """
    target = os.path.join(filedir, p + "_" + u[u.rfind("/") + 1:])
    try:
        urllib.request.urlretrieve(u, target)
    except urllib.error.HTTPError as err:
        if err.code != 404:
            raise
        print("URL not found: ", u)
#########
# prepare: pick the source, create the download directory, open the client
agent = 'ckanapi/3.0 (+http://digital-codes.de)'
sel = config()
if sel is None:
    # Nothing was selected in the dialog, so there is nothing to harvest.
    # (Index 0 is a valid selection, hence the explicit None test.)
    sys.exit(0)

# Downloaded files are stored in a directory named after the source.
filedir = "files-" + sources[sel][0]
os.makedirs(filedir, exist_ok=True)

downloadBase = sources[sel][1]
ckanGet = RemoteCKAN(sources[sel][2], user_agent=agent)

# access: list the groups and packages offered by the portal
grps = ckanGet.action.group_list()
print("Groups:\n", grps)
pkgs = ckanGet.action.package_list()
print("Packages:\n", pkgs)
# Walk every package of the selected portal, record one CSV row per
# resource and, when enabled, download resources with a wanted extension.
fieldnames = ("package", "group", "title", "license_id", "notes", "name",
              "description", "last_modified", "url")
# reset our list of resource rows
items = []
# iterate over packages and resources
for p in pkgs:
    try:
        pk = ckanGet.action.package_show(id=p)
    except ckanapi.errors.NotFound:
        print("Package not found: ", p)
        continue
    print("################")
    print("\n\nPackage ", p)

    # Every group of the package is printed; the last one is the value
    # that ends up in the CSV rows of this package.
    gpname = ""
    for g in pk.get("groups") or ():
        gpname = g.get("name")
        if gpname is None:
            gpname = g.get("title")
        if gpname is None:
            gpname = "no name"
        print("Group: ", gpname)

    u = pk.get("url")
    if u is not None and u != "":
        print("Url:", u)

    r = pk.get("resources")
    if r is None:
        continue
    print("#########")
    for rr in r:
        ru = rr.get("url")
        print("\nResource: ", ru)
        if ru is None:
            # A resource without url can be neither classified nor fetched.
            continue
        # check and skip external urls
        if excludext and downloadBase not in ru:
            print("External url: ", ru)
            continue
        row = (p, gpname, pk.get("title"), pk.get("license_id"),
               pk.get("notes"), rr.get("name"), rr.get("description"),
               rr.get("last_modified"), ru)
        # Anything that is not a string (e.g. None for a missing key) is
        # written as an empty field.
        items.append([x if isinstance(x, str) else "" for x in row])
        ext = os.path.splitext(ru)[1]
        if download and ext.lower() in downs:
            # Only the download itself can raise URLError; a failed
            # download must not abort the remaining harvest.
            try:
                loadUrl(p, ru)
            except urllib.error.URLError:
                print("url error")

# write resource description to csv
# The csv module quotes every field and escapes embedded quote characters;
# the file is written as UTF-8 regardless of the platform default encoding.
with open("items-" + sources[sel][0] + ".csv", "w", newline="",
          encoding="utf-8") as itemfile:
    writer = csv.writer(itemfile, quoting=csv.QUOTE_ALL, lineterminator="\n")
    writer.writerow(fieldnames)
    writer.writerows(items)