-
Notifications
You must be signed in to change notification settings - Fork 77
/
Copy pathdoctor.py
executable file
·337 lines (277 loc) · 10.2 KB
/
doctor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
#!/usr/bin/env python3
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
from multiprocessing.pool import ThreadPool
import queue
import subprocess
import sys
import traceback
from argparse import ArgumentParser
_PROG_HELP = """
Diagnose the environment and print out helpful instructions to fix.
"""
Task = collections.namedtuple('Task',
['name', 'function', 'prerequisites', 'args'])
Task.__new__.__defaults__ = (None,) * len(Task._fields)
# Records the outcome of the execution of a task.
TaskEvent = collections.namedtuple('TaskEvent', ['name', 'success', 'message'])
TaskEvent.__new__.__defaults__ = (None,) * len(Task._fields)
class BadPrerequisitesException(Exception):
pass
def main():
parser = ArgumentParser(description=_PROG_HELP)
args = parser.parse_args()
all_good = run(
args=args,
docker=check_docker,
gcloud=check_gcloud,
gcloud_login=(check_gcloud_login, ['gcloud']),
gcloud_project=(check_gcloud_default_project, ['gcloud_login']),
gsutil=(check_gsutil, ['gcloud_login']),
kubectl=check_kubectl,
kubectl_nodes=(check_kubectl_nodes, ['kubectl']),
crd=(check_crd, ['kubectl_nodes']))
if all_good:
print('\nEverything looks good to go!!')
else:
sys.exit(1)
def check_docker(args):
p = subprocess.run(['docker', 'run', '--rm', 'hello-world'],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL)
if p.returncode != 0:
return TaskEvent(
success=False,
message='docker is not installed. See '
'https://docs.docker.com/install/')
return TaskEvent(success=True)
def check_gcloud(args):
p = subprocess.run(['gcloud', 'version'],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL)
if p.returncode != 0:
return TaskEvent(
success=False,
message='gcloud is not installed. See '
'https://cloud.google.com/sdk/install')
return TaskEvent(success=True)
def check_gcloud_login(args):
p = subprocess.run(['gcloud', 'config', 'get-value', 'account'],
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL)
if p.returncode == 0:
if p.stdout:
account = p.stdout.decode('utf-8')
if '(unset)' not in account:
return TaskEvent(success=True)
return TaskEvent(
success=False,
message='''
You need to be logged in with gcloud. Run:
gcloud auth login
''')
def check_kubectl(args):
p = subprocess.run(['kubectl', 'help'],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL)
if p.returncode != 0:
return make_run_event(
p=p,
success=False,
message='''
kubectl is not installed.
See https://kubernetes.io/docs/tasks/tools/install-kubectl/#download-as-part-of-the-google-cloud-sdk
''')
return TaskEvent(success=True)
def check_gcloud_default_project(args):
p = subprocess.run(['gcloud', 'config', 'get-value', 'project'],
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL)
if p.returncode == 0:
if p.stdout:
return make_run_event(
p=p,
success=True,
message='''
Your gcloud default project: {stdout}
You can set an environment variables to record the GCR URL:
export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/')
''')
return TaskEvent(
success=True,
message='''
You must set a default project for gcloud. This project should
house your GKE cluster. Docker images should also be
published to the GCR repo under this project, so that they can
be accessible to the GKE cluster.
Run:
# Uncomment this to also create a new configuration.
# gcloud config configurations create marketplace
gcloud config set project YOUR_PROJECT
''')
def check_kubectl_nodes(args):
p = subprocess.run(['kubectl', 'get', 'nodes'],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
if p.returncode != 0:
return make_run_event(
p=p,
success=False,
message='''
Unable to query nodes from the cluster.
If you have not created a GKE cluster, you can create one by
running the following command:
CLUSTER=cluster-1
ZONE=us-west1-a
# Create the cluster.
gcloud container clusters create "$CLUSTER" \\
--zone "$ZONE" \\
--machine-type "n1-standard-1" \\
--num-nodes "3"
# Configure kubectl authorization.
gcloud container clusters get-credentials "$CLUSTER" --zone "$ZONE"
# Bootstrap RBAC cluster-admin for your user.
# More info: https://cloud.google.com/kubernetes-engine/docs/how-to/role-based-access-control
kubectl create clusterrolebinding cluster-admin-binding \\
--clusterrole cluster-admin --user $(gcloud config get-value account)
If your kubectl should have been setup already, here's the
output of what went wrong when querying for cluster nodes:
{stdouterr}
''')
return TaskEvent(success=True)
def check_crd(args):
p = subprocess.run(['kubectl', 'get', 'crd/applications.app.k8s.io'],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL)
if p.returncode != 0:
return make_run_event(
p=p,
success=False,
message='''
Application CRD is not installed in your cluster.
Run the following to install it:
kubectl apply -f "https://raw.githubusercontent.com/GoogleCloudPlatform/marketplace-k8s-app-tools/master/crd/app-crd.yaml"
For more details about the application CRD, see
https://github.com/kubernetes-sigs/application
''')
return TaskEvent(success=True)
# TODO(huyhg):
# - Check connected cluster to be GKE of the project.
# - Check gcloud auth configure-docker.
# - Check RBAC cluster-admin for user.
# - Check userinfo scope for GCE VM.
# Also, to use the Google Kubernetes Engine API from a GCE VM
# you need to add the cloud platform scope
# ("https://www.googleapis.com/auth/cloud-platform")
# to your VM when it is created. Scopes can be editted
# when the VM is stopped.
# - gcloud beta compute instances set-scopes debian-workstation --zone=us-west1-c --scopes=userinfo-email,cloud-platform
# - Check for sufficient IAM privilege (GKE admin).
# - Check GCR is enabled for the project
# - If on GCE VM and using the default Compute service account
# make sure it has k8s engine admin role.
def check_gsutil(args):
p = subprocess.run(['gsutil', 'ls'],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL)
if p.returncode != 0:
return TaskEvent(
success=False,
message='Unable to list the GCS buckets. Makes sure you are authenticated via gsutil.'
)
return TaskEvent(success=True)
def make_run_event(p, success, message):
stdout = (p.stdout or b'').decode('utf-8').strip()
stderr = (p.stderr or b'').decode('utf-8').strip()
stdouterr = '\n'.join([x for x in [stdout, stderr] if x])
return TaskEvent(
success=success,
message=message.format(stdout=stdout, stderr=stderr, stdouterr=stdouterr))
def run(args, **kwargs):
def execute_task(task, event_queue):
try:
event = task.function(args, **task.args)
event_queue.put(
TaskEvent(
name=task.name, success=event.success, message=event.message))
except:
event_queue.put(
TaskEvent(
name=task.name,
success=False,
message='Unexpected exception: {}'.format(
traceback.format_exception(*sys.exc_info()))))
tasks = {}
for task_name, v in kwargs.items():
if isinstance(v, tuple):
# Flexibly allowing variable length tuple.
# Tuple is the essentially Task without the name.
v = tuple(list(v) + [None] * (len(Task._fields) - len(v) - 1))
fn, prerequisites, extra_args = v
else:
fn = v
prerequisites = None
extra_args = None
tasks[task_name] = Task(
name=task_name,
function=fn,
prerequisites=set(prerequisites or []),
args=extra_args or {})
# Dry-run to ensure valid DAG.
do_run(
tasks, lambda task, event_queue: event_queue.put(
TaskEvent(name=task.name, success=True)))
with ThreadPool(5) as pool:
return do_run(
tasks, lambda task, event_queue: pool.apply_async(
execute_task, args=(task, event_queue)))
def do_run(tasks, run_fn):
dones = set()
starteds = set()
failed = False
event_queue = queue.Queue()
while True:
if len(dones) >= len(tasks):
break
if failed and len(dones) >= len(starteds):
break
if not failed:
# Identify tasks that have become eligible (0 prerequisites)
# and execute them. Note that identified eligible tasks might be
# already running and need to be filtered out. This is because we
# handle one task event at a time.
prereq_counts = [(task.name, len(task.prerequisites.difference(dones)))
for task in tasks.values()
if task.name not in dones]
prereq_counts.sort(key=lambda name_count: name_count[1], reverse=True)
candidate_names = [name for (name, count) in prereq_counts if count == 0]
if not candidate_names:
raise BadPrerequisitesException('Found a cycle in {}'.format(
[name for (name, _) in prereq_counts]))
for name in [c for c in candidate_names if c not in starteds]:
starteds.add(name)
run_fn(tasks[name], event_queue)
# Wait for and process the next task event.
task_event = event_queue.get()
if task_event.name:
dones.add(task_event.name)
if not task_event.success:
failed = True
if task_event.message:
print('\n{}\n\n===='.format(task_event.message.strip()))
return not failed
if __name__ == "__main__":
main()