#!/bin/bash
if ! type -t os::log::info > /dev/null ; then
source "${OS_O_A_L_DIR:-..}/hack/lib/init.sh"
fi
LOGGING_NS=${LOGGING_NS:-openshift-logging}
function get_es_dcs() {
oc get dc --selector logging-infra=elasticsearch ${1:+-l component=$1} -o name
}
function get_curator_dcs() {
oc get dc --selector logging-infra=curator -o name
}
function get_es_pod() {
local clustertype="$1"
# $1 - cluster name postfix
# for allinone there is just "elasticsearch"
# for split apps/infra there will be separate elasticsearch-app and elasticsearch-infra
if [ -n "$( oc -n $LOGGING_NS get deployment -l cluster-name=elasticsearch,es-node-master=true -o name 2> /dev/null )" ] ; then
oc -n $LOGGING_NS get pod -l cluster-name=elasticsearch,es-node-master=true --no-headers 2> /dev/null | awk '$3 == "Running" {print $1}'
elif [ $clustertype = es-ops ] && \
[ -n "$( oc -n $LOGGING_NS get deployment -l cluster-name=elasticsearch-infra,es-node-master=true -o name 2> /dev/null )" ] ; then
oc -n $LOGGING_NS get pod -l cluster-name=elasticsearch-infra,es-node-master=true --no-headers 2> /dev/null | awk '$3 == "Running" {print $1}'
elif [ $clustertype = es ] && \
[ -n "$( oc -n $LOGGING_NS get deployment -l cluster-name=elasticsearch-app,es-node-master=true -o name 2> /dev/null )" ] ; then
oc -n $LOGGING_NS get pod -l cluster-name=elasticsearch-app,es-node-master=true --no-headers 2> /dev/null | awk '$3 == "Running" {print $1}'
elif [ -z "$(oc -n $LOGGING_NS get dc -l cluster-name=logging-${clustertype},es-node-role=clientdata --no-headers 2> /dev/null | awk '{print $1}')" ] ; then
oc -n $LOGGING_NS get pods -l component=${clustertype} --no-headers 2> /dev/null | awk '$3 == "Running" {print $1}'
else
oc -n $LOGGING_NS get pods -l cluster-name=logging-${clustertype},es-node-role=clientdata --no-headers 2> /dev/null | awk '$3 == "Running" {print $1}'
fi
}
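# Illustrative usage (a sketch; pod names vary by deployment, and the fallback
# mirrors the es/es-ops service handling used further below):
#   espod=$( get_es_pod es )
#   esopspod=$( get_es_pod es-ops )
#   esopspod=${esopspod:-$espod}  # fall back to the single cluster when no ops cluster is deployed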
function get_es_svc() {
local clustertype="$1"
# $1 - cluster name postfix
if oc -n $LOGGING_NS get svc elasticsearch -o jsonpath='{.metadata.name}' 2> /dev/null ; then
return 0
elif [ $clustertype = es-ops ] && oc -n $LOGGING_NS get svc elasticsearch-infra -o jsonpath='{.metadata.name}' 2> /dev/null ; then
return 0
elif [ $clustertype = es ] && oc -n $LOGGING_NS get svc elasticsearch-app -o jsonpath='{.metadata.name}' 2> /dev/null ; then
return 0
else
oc -n $LOGGING_NS get svc logging-$clustertype -o jsonpath='{.metadata.name}' 2> /dev/null || {
if [ "$clustertype" != "es-ops" ] ; then
# ignore missing es-ops - probably not deployed with ops cluster - otherwise, report it
oc -n $LOGGING_NS get svc logging-$clustertype -o jsonpath='{.metadata.name}' 2>&1 | artifact_out || :
fi
}
fi
}
function get_running_pod() {
# $1 is component for selector
oc get pods -l component=$1 --no-headers 2> /dev/null | awk '$3 == "Running" {print $1}'
}
function get_completed_pod() {
# $1 is component for selector
oc get pods -l component=$1 --no-headers 2> /dev/null | awk '$3 == "Completed" {print $1}'
}
function get_error_pod() {
# $1 is component for selector
oc get pods -l component=$1 --no-headers 2> /dev/null | awk '$3 == "Error" {print $1}'
}
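# Illustrative usage of the pod helpers (a sketch; "fluentd" and "kibana" are the
# component labels used elsewhere in this file):
#   fpod=$( get_running_pod fluentd )
#   kibpod=$( get_running_pod kibana )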
function get_es_cert_path() {
if [ ! -d "${OS_O_A_L_DIR}/temp/es_certs" ]; then
mkdir -p ${OS_O_A_L_DIR}/temp/es_certs
if oc get secret/elasticsearch > /dev/null 2>&1 ; then
oc extract -n $LOGGING_NS secret/elasticsearch --to=${OS_O_A_L_DIR}/temp/es_certs
else
oc extract -n $LOGGING_NS secret/logging-elasticsearch --to=${OS_O_A_L_DIR}/temp/es_certs
fi
fi
echo ${OS_O_A_L_DIR}/temp/es_certs
}
function create_users() {
# args are repeated triples: username password isadmin(true|false) - e.g. "user1 pass1 false user2 pass2 true"
local addeduser=""
local username=""
local pwd=""
local isadmin=""
local existingusers=""
for item in "$@" ; do
if [ -z "$username" ] ; then username="$item" ; continue ; fi
if [ -z "$pwd" ] ; then pwd="$item" ; continue ; fi
if [ -z "$isadmin" ] ; then isadmin="$item" ; fi
if oc get user $username > /dev/null 2>&1 ; then
existingusers="$existingusers $username"
else
echo create_users: new user $username
oc get users
if [ ! -d ${OS_O_A_L_DIR}/temp ] ; then
mkdir -p ${OS_O_A_L_DIR}/temp
fi
local htpwargs="-b"
if [ ! -f ${OS_O_A_L_DIR}/temp/htpw.file ] ; then
htpwargs="$htpwargs -c"
fi
htpasswd $htpwargs ${OS_O_A_L_DIR}/temp/htpw.file "$username" "$pwd"
addeduser=true
fi
username=""
pwd=""
isadmin=""
done
if [ "${addeduser:-false}" = true ] ; then
# set management state to managed for auth operator
local mgmtstate=$( oc get authentication.operator cluster -o jsonpath='{.spec.managementState}' )
if [ "$mgmtstate" != Managed ] ; then
oc patch authentication.operator cluster --type=merge -p "{\"spec\":{\"managementState\": \"Managed\"}}"
fi
# kick the console pods because they cache oauth metadata (temporary, should not be required)
oc delete pods -n openshift-console --all --force --grace-period=0
# kick the monitoring pods because they cache oauth metadata (temporary, should not be required)
oc delete pods -n openshift-monitoring --all --force --grace-period=0
# see if there is already an htpass-secret - if so, delete it and recreate it
if oc -n openshift-config get secret htpass-secret > /dev/null 2>&1 ; then
oc -n openshift-config delete secret htpass-secret
os::cmd::try_until_failure "oc -n openshift-config get secret htpass-secret > /dev/null 2>&1"
fi
oc -n openshift-config create secret generic htpass-secret --from-file=htpasswd=${OS_O_A_L_DIR}/temp/htpw.file
os::cmd::try_until_success "oc -n openshift-config get secret htpass-secret > /dev/null 2>&1"
# configure HTPasswd IDP if not already configured
if ! oc get oauth -o jsonpath='{.items[*].spec.identityProviders[*].type}' | grep -q -i '^htpasswd$' ; then
oc apply -f - <<EOF
apiVersion: config.openshift.io/v1
kind: OAuth
metadata:
  name: cluster
spec:
  identityProviders:
  - name: htpassidp
    challenge: true
    login: true
    mappingMethod: claim
    type: HTPasswd
    htpasswd:
      fileData:
        name: htpass-secret
EOF
fi
# fix ca cert in kubeconfig - should not be necessary much longer
if [ ! -d ${OS_O_A_L_DIR}/temp ] ; then
mkdir -p ${OS_O_A_L_DIR}/temp
fi
if [ ! -f ${OS_O_A_L_DIR}/temp/tls.crt ] ; then
oc -n openshift-ingress-operator extract secret/router-ca --to=${OS_O_A_L_DIR}/temp --keys=tls.crt
oc config view --minify -o go-template='{{index .clusters 0 "cluster" "certificate-authority-data" }}' --raw=true | base64 -d > ${OS_O_A_L_DIR}/temp/current-ca.crt
cat ${OS_O_A_L_DIR}/temp/tls.crt >> ${OS_O_A_L_DIR}/temp/current-ca.crt
for cluster in $( oc config get-clusters | grep -v NAME ) ; do
oc config set-cluster "$cluster" --certificate-authority=${OS_O_A_L_DIR}/temp/current-ca.crt --embed-certs=true
done
fi
# iterate over the users again, doing the oc login to ensure they exist
while [ -n "${1:-}" ] ; do
username="$1" ; shift
pwd="$1" ; shift
isadmin="$1"; shift
local existinguser=""
local skip=""
if [ -n "$existingusers" ] ; then
for existinguser in $existingusers ; do
if [ "$existinguser" = "$username" ] ; then
skip=true
fi
done
fi
if [ "${skip:-false}" = true ] ; then
continue
fi
# wait until oc login succeeds
os::cmd::try_until_success "oc login -u '$username' -p '$pwd'" $((3 * minute)) 3
oc login -u system:admin
if [ $isadmin = true ] ; then
echo adding cluster-admin role to user "$username"
oc adm policy add-cluster-role-to-user cluster-admin "$username"
else
echo user "$username" is not an admin user
fi
done
fi
}
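# Illustrative usage (a sketch; the user names and passwords are made up):
#   create_users loguser logpass false admin1 adminpass true
# would create two htpasswd users and grant cluster-admin only to admin1.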
# set the test_token, test_name, and test_ip for token auth
function get_test_user_token() {
local current_project; current_project="$( oc project -q )"
create_users ${1:-${LOG_ADMIN_USER:-admin}} ${2:-${LOG_ADMIN_PW:-admin}} ${3:-true}
oc login --username=${1:-${LOG_ADMIN_USER:-admin}} --password=${2:-${LOG_ADMIN_PW:-admin}} > /dev/null
test_token="$(oc whoami -t)"
test_name="$(oc whoami)"
test_ip="127.0.0.1"
oc login --username=system:admin > /dev/null
oc project "${current_project}" > /dev/null
}
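# Illustrative usage (a sketch; espod is assumed to come from get_es_pod, and
# curl_es_pod_with_token is defined below):
#   get_test_user_token
#   curl_es_pod_with_token "$espod" "/.operations.*/_count" "$test_token" | get_count_from_json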
# $1 - kibana pod name
# $2 - es hostname (e.g. logging-es or logging-es-ops) - should be output of get_es_svc es or es-ops
# $3 - endpoint (e.g. /projects.*/_search)
# $4 - token
# stdout is the JSON output from Elasticsearch
# stderr is curl errors
curl_es_from_kibana() {
local pod="$1"
local eshost="$2"
local endpoint="$3"
local test_token="$4"
shift; shift; shift; shift
local args=( "${@:-}" )
local secret_dir="/etc/kibana/keys/"
oc -n $LOGGING_NS exec "${pod}" -c kibana -- curl --connect-timeout 1 --silent --insecure "${args[@]}" \
--cert "${secret_dir}cert" \
--key "${secret_dir}key" \
-H "Authorization: Bearer $test_token" \
-H "X-Forwarded-For: 127.0.0.1" \
"https://${eshost}:9200${endpoint}"
}
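# Illustrative usage (a sketch; assumes a running kibana pod and that test_token was
# set by get_test_user_token above):
#   kibpod=$( get_running_pod kibana )
#   es_svc=$( get_es_svc es )
#   curl_es_from_kibana "$kibpod" "$es_svc" "/.kibana/_count" "$test_token" | get_count_from_json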
# $1 - es pod name
# $2 - es endpoint
# rest - any args to pass to curl
function curl_es_pod() {
local pod="$1"
local endpoint="$2"
shift; shift
local args=( "${@:-}" )
local secret_dir="/etc/elasticsearch/secret/"
oc -n $LOGGING_NS exec -c elasticsearch "${pod}" -- curl --silent --insecure "${args[@]}" \
--key "${secret_dir}admin-key" \
--cert "${secret_dir}admin-cert" \
"https://localhost:9200${endpoint}"
}
# $1 - es svc name
# $2 - es endpoint
# rest - any args to pass to curl
function curl_es() {
local svc_name="$1"
local endpoint="$2"
shift; shift
local args=( "${@:-}" )
local secret_dir="$(get_es_cert_path)/"
local cert=${CURL_ES_CERT:-${secret_dir}/admin-cert}
local key=${CURL_ES_KEY:-${secret_dir}/admin-key}
curl --silent --insecure "${args[@]}" \
--key "${key}" \
--cert "${cert}" \
"https://${svc_name}.${LOGGING_NS}.svc:9200${endpoint}"
}
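# Illustrative usage (a sketch; the index pattern and query are examples only):
#   es_svc=$( get_es_svc es )
#   curl_es "$es_svc" "/project.test.*/_count?q=message:foo" | get_count_from_json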
# $1 - es svc name
# $2 - es endpoint
# rest - any args to pass to curl
function curl_es_input() {
local svc_name="$1"
local endpoint="$2"
shift; shift
local args=( "${@:-}" )
local secret_dir="$(get_es_cert_path)/"
local cert=${CURL_ES_CERT:-${secret_dir}/admin-cert}
local key=${CURL_ES_KEY:-${secret_dir}/admin-key}
curl --silent --insecure "${args[@]}" \
--key "${key}" \
--cert "${cert}" \
"https://${svc_name}.${LOGGING_NS}.svc:9200${endpoint}"
}
function curl_es_pod_with_token() {
local pod="$1"
local endpoint="$2"
local test_token="$3"
shift; shift; shift;
local args=( "${@:-}" )
oc -n $LOGGING_NS exec -c elasticsearch "${pod}" -- curl --silent --insecure "${args[@]}" \
-H "Authorization: Bearer $test_token" \
-H "X-Forwarded-For: 127.0.0.1" \
"https://localhost:9200${endpoint}"
}
function curl_es_pod_with_username_password() {
local pod="$1"
local endpoint="$2"
local test_name="$3"
local test_password="$4"
shift; shift; shift; shift
local args=( "${@:-}" )
oc -n $LOGGING_NS exec -c elasticsearch "${pod}" -- curl --silent --insecure "${args[@]}" \
-H "Authorization: Basic $( echo -n ${test_name}:${test_password} | base64 -w 0 )" \
"https://localhost:9200${endpoint}"
}
function curl_es_pod_with_username_password_not_silent() {
local pod="$1"
local endpoint="$2"
local test_name="$3"
local test_password="$4"
shift; shift; shift; shift
local args=( "${@:-}" )
oc -n $LOGGING_NS exec -c elasticsearch "${pod}" -- curl --insecure "${args[@]}" \
-H "Authorization: Basic $( echo -n ${test_name}:${test_password} | base64 -w 0 )" \
"https://localhost:9200${endpoint}"
}
function curl_es_with_token() {
local svc_name="$1"
local endpoint="$2"
local test_token="$3"
shift; shift; shift
local args=( "${@:-}" )
curl --silent --insecure "${args[@]}" \
-H "Authorization: Bearer $test_token" \
"https://${svc_name}:9200${endpoint}"
}
function curl_es_pod_with_token_and_input() {
local pod="$1"
local endpoint="$2"
local test_token="$3"
shift; shift; shift
local args=( "${@:-}" )
oc -n $LOGGING_NS exec -c elasticsearch -i "${pod}" -- curl --silent --insecure "${args[@]}" \
-H "Authorization: Bearer $test_token" \
-H "Content-type: application/json" \
"https://localhost:9200${endpoint}"
}
function curl_es_with_token_and_input() {
local svc_name="$1"
local endpoint="$2"
local test_token="$3"
shift; shift; shift
local args=( "${@:-}" )
curl --silent --insecure "${args[@]}" \
-H "Authorization: Bearer $test_token" \
-H "Content-type: application/json" \
"https://${svc_name}.${LOGGING_NS}:9200${endpoint}"
}
# $1 - es pod name
# $2 - index name (e.g. project.logging, project.test, .operations, etc.)
# $3 - _count or _search
# $4 - field to search
# $5 - search string
# stdout is the JSON output from Elasticsearch
# stderr is curl errors
function query_es_from_es() {
curl_es_pod "$1" "/${2}*/${3}?q=${4}:${5}" --connect-timeout 1
}
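# Illustrative usage (a sketch; the index, field, and value are examples only):
#   espod=$( get_es_pod es )
#   query_es_from_es "$espod" project.test _count message foo | get_count_from_json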
# read an Elasticsearch _count response from stdin and print its "count" field
function get_count_from_json() {
python -c 'import json, sys; print json.loads(sys.stdin.read()).get("count", 0)'
}
# read an Elasticsearch _msearch response from stdin and print the total hit count
# https://github.com/ViaQ/integration-tests/issues/8
function get_count_from_json_from_search() {
python -c 'import json, sys; print json.loads(sys.stdin.read()).get("responses", [{}])[0].get("hits", {}).get("total", 0)'
}
# $1 - unique value to search for in es
function add_test_message() {
local kib_pod=`get_running_pod kibana`
oc -n $LOGGING_NS exec $kib_pod -c kibana -- curl --connect-timeout 1 -s \
http://localhost:5601/$1 > /dev/null 2>&1
}
function flush_fluentd_pos_files() {
os::cmd::expect_success "sudo rm -f /var/log/journal.pos /var/log/journal_pos.json"
}
function get_journal_pos_cursor() {
if sudo test -s /var/log/journal.pos ; then
sudo cat /var/log/journal.pos
elif sudo test -s /var/log/journal_pos.json ; then
sudo python -c 'import sys,json; print json.load(file(sys.argv[1]))["journal"]' /var/log/journal_pos.json
else
echo ""
fi
}
# $1 - optional command to call with the app uuid and the path to the JSON file of matching app records
# $2 - optional command to call with the ops uuid and the path to the JSON file of matching ops records
# $3 - expected number of matches (default 1)
# $4 - project/namespace to search for the app records (default $LOGGING_NS)
function wait_for_fluentd_to_catch_up() {
local starttime=$( date +%s.%9N )
local startsecs=$( date --date=@${starttime} +%s )
local startjournal="$( date +'%Y-%m-%d %H:%M:%S' --date=@$starttime )"
os::log::debug START wait_for_fluentd_to_catch_up at $( date --date=@${starttime} -u --rfc-3339=ns )
local es_svc=$( get_es_svc es )
local es_ops_svc=$( get_es_svc es-ops )
es_ops_svc=${es_ops_svc:-$es_svc}
local uuid_es=$( uuidgen | sed 's/[-]//g' )
local uuid_es_ops=$( uuidgen | sed 's/[-]//g' )
local expected=${3:-1}
local timeout=${TIMEOUT:-600}
local appsproject=${4:-$LOGGING_NS}
local priority=${TEST_REC_PRIORITY:-info}
wait_for_fluentd_ready
# look for the messages in the source
local fullmsg="GET /${uuid_es} 404 "
local checkpids
if docker_uses_journal ; then
sudo journalctl -m -f -o export | \
awk -v "es=MESSAGE=.*$fullmsg" -v "es_ops=SYSLOG_IDENTIFIER=$uuid_es_ops" \
-v es_out=$ARTIFACT_DIR/es_out.txt -v es_ops_out=$ARTIFACT_DIR/es_ops_out.txt '
BEGIN{RS="";FS="\n"};
$0 ~ es {print > es_out; app += 1; if (app && op) {exit 0}};
$0 ~ es_ops {print > es_ops_out; op += 1; if (app && op) {exit 0}};
' 2>&1 | artifact_out & checkpids=$!
else
sudo journalctl -m -f -o export | \
awk -v "es_ops=SYSLOG_IDENTIFIER=$uuid_es_ops" -v es_ops_out=$ARTIFACT_DIR/es_ops_out.txt '
BEGIN{RS="";FS="\n"};
$0 ~ es_ops {print > es_ops_out; exit 0}' 2>&1 | artifact_out & checkpids=$!
while ! sudo find /var/log/containers -name \*.log -exec grep -b -n "$fullmsg" {} + > $ARTIFACT_DIR/es_out.txt 2> $ARTIFACT_DIR/es_errs.txt ; do
sleep 1
done & checkpids="$checkpids $!"
fi
add_test_message $uuid_es
artifact_log added es message $uuid_es
logger -i -p local6.${priority} -t $uuid_es_ops $uuid_es_ops
artifact_log added es-ops message $uuid_es_ops
local errqs
local rc=0
local qs='{"query":{"bool":{"filter":{"match_phrase":{"message":"'"${fullmsg}"'"}},"must":{"term":{"kubernetes.container_name":"kibana"}}}}}'
case "${appsproject}" in
default|openshift|openshift-*) logging_index=".operations.*" ; es_svc=$es_ops_svc ;;
*) logging_index="project.${appsproject}.*" ;;
esac
# poll for logs to show up
if os::cmd::try_until_text "curl_es ${es_svc} /${logging_index}/_count -X POST -d '$qs' | get_count_from_json" $expected $(( timeout * second )); then
artifact_log good - $FUNCNAME: found $expected record $logging_index for \'$fullmsg\'
if [ -n "${1:-}" ] ; then
curl_es ${es_svc} /${logging_index}/_count -X POST -d "$qs" 2>&1 | artifact_out
curl_es ${es_svc} /${logging_index}/_count -X POST -d "$qs" 2>&1 | get_count_from_json | artifact_out
curl_es ${es_svc} "/${logging_index}/_search" -X POST -d "$qs" | jq . > $ARTIFACT_DIR/apps.json
$1 $uuid_es $ARTIFACT_DIR/apps.json
fi
else
os::log::error $FUNCNAME: not found $expected record $logging_index for \'$fullmsg\' after $timeout seconds
curl_es ${es_svc} /${logging_index}/_search -X POST -d "$qs" > $ARTIFACT_DIR/apps_search_output.raw 2>&1 || :
if [ -s $ARTIFACT_DIR/es_out.txt ] ; then
os::log::error "$( cat $ARTIFACT_DIR/es_out.txt )"
else
os::log::error apps record for "$fullmsg" not found in source
fi
if docker_uses_journal ; then
os::log::error here is the current fluentd journal cursor
sudo cat /var/log/journal.pos || :
sudo cat /var/log/journal_pos.json || :
echo ""
os::log::error starttime in journald format is $( date --date=@$starttime +%s%6N )
# first and last couple of records in the journal
sudo journalctl -m -S "$startjournal" -n 20 -o export > $ARTIFACT_DIR/apps_err_journal_first.txt
sudo journalctl -m -S "$startjournal" -r -n 20 -o export > $ARTIFACT_DIR/apps_err_journal_last.txt
elif sudo test -f /var/log/es-containers.log.pos ; then
sudo cat /var/log/es-containers.log.pos > $ARTIFACT_DIR/es-containers.log.pos
fi
# records since start of function in ascending @timestamp order - see what records were added around
# the time our record should have been added
errqs='{"query":{"range":{"@timestamp":{"gte":"'"$( date --date=@${starttime} -u -Ins )"'"}}},"sort":[{"@timestamp":{"order":"asc"}}],"size":20}'
curl_es ${es_svc} /${logging_index}/_search -X POST -d "$errqs" | jq . > $ARTIFACT_DIR/apps_err_recs_asc.json 2>&1 || :
# last records in descending @timestamp order - see what records have been added recently
errqs='{"query":{"range":{"@timestamp":{"gte":"'"$( date --date=@${starttime} -u -Ins )"'"}}},"sort":[{"@timestamp":{"order":"desc"}}],"size":20}'
curl_es ${es_svc} /${logging_index}/_search -X POST -d "$errqs" | jq . > $ARTIFACT_DIR/apps_err_recs_desc.json 2>&1 || :
rc=1
fi
qs='{"query":{"term":{"systemd.u.SYSLOG_IDENTIFIER":"'"${uuid_es_ops}"'"}}}'
if os::cmd::try_until_text "curl_es ${es_ops_svc} /.operations.*/_count -X POST -d '$qs' | get_count_from_json" $expected $(( timeout * second )); then
os::log::debug good - $FUNCNAME: found $expected record .operations for $uuid_es_ops
if [ -n "${2:-}" ] ; then
curl_es ${es_ops_svc} "/.operations.*/_search" -X POST -d "$qs" | jq . > $ARTIFACT_DIR/ops.json
$2 $uuid_es_ops $ARTIFACT_DIR/ops.json
fi
else
os::log::error $FUNCNAME: not found $expected record .operations for $uuid_es_ops after $timeout seconds
curl_es ${es_ops_svc} /.operations.*/_search -X POST -d "$qs" > $ARTIFACT_DIR/ops_search_output.raw 2>&1 || :
os::log::error "Checking journal for $uuid_es_ops..."
if [ -s $ARTIFACT_DIR/es_ops_out.txt ] ; then
os::log::error "$( cat $ARTIFACT_DIR/es_ops_out.txt )"
else
os::log::error ops record for "$uuid_es_ops" not found in journal
fi
os::log::error here is the current fluentd journal cursor
sudo cat /var/log/journal.pos || :
sudo cat /var/log/journal_pos.json || :
echo ""
os::log::error starttime in journald format is $( date --date=@$starttime +%s%6N )
# first and last couple of records in the journal
sudo journalctl -m -S "$startjournal" -n 20 -o export > $ARTIFACT_DIR/ops_err_journal_first.txt
sudo journalctl -m -S "$startjournal" -r -n 20 -o export > $ARTIFACT_DIR/ops_err_journal_last.txt
# records since start of function in ascending @timestamp order - see what records were added around
# the time our record should have been added
errqs='{"query":{"range":{"@timestamp":{"gte":"'"$( date --date=@${starttime} -u -Ins )"'"}}},"sort":[{"@timestamp":{"order":"asc"}}],"size":20}'
curl_es ${es_ops_svc} /.operations.*/_search -X POST -d "$errqs" | jq . > $ARTIFACT_DIR/ops_err_recs_asc.json 2>&1 || :
# last records in descending @timestamp order - see what records have been added recently
errqs='{"query":{"range":{"@timestamp":{"gte":"'"$( date --date=@${starttime} -u -Ins )"'"}}},"sort":[{"@timestamp":{"order":"desc"}}],"size":20}'
curl_es ${es_ops_svc} /.operations.*/_search -X POST -d "$errqs" | jq . > $ARTIFACT_DIR/ops_err_recs_desc.json 2>&1 || :
rc=1
fi
kill $checkpids > /dev/null 2>&1 || :
kill -9 $checkpids > /dev/null 2>&1 || :
local endtime=$( date +%s.%9N )
local endsecs=$( date --date=@${endtime} +%s )
os::log::debug END wait_for_fluentd_to_catch_up took $( expr $endsecs - $startsecs ) seconds at $( date --date=@${endtime} -u --rfc-3339=ns )
return $rc
}
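# Illustrative usage (a sketch; my_check_app and my_check_ops are hypothetical callbacks
# that receive the uuid and the path to the JSON file of matching records):
#   wait_for_fluentd_to_catch_up my_check_app my_check_ops 1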
docker_uses_journal() {
# note the unintuitive logic - in this case, a 0 return means true, and a 1
# return means false
# need to be able to handle cases like
# OPTIONS='--log-driver=json-file ....' # or use --log-driver=journald
# if "log-driver" is set in /etc/docker/daemon.json, assume that it is
# authoritative
# otherwise, look for /etc/sysconfig/docker
if type -p docker > /dev/null && sudo docker info 2>&1 | grep -q 'Logging Driver: journald' ; then
return 0
elif sudo grep -q '^[^#].*"log-driver":' /etc/docker/daemon.json 2> /dev/null ; then
if sudo grep -q '^[^#].*"log-driver":.*journald' /etc/docker/daemon.json 2> /dev/null ; then
return 0
fi
elif sudo grep -q "^OPTIONS='[^']*--log-driver=journald" /etc/sysconfig/docker 2> /dev/null ; then
return 0
fi
return 1
}
wait_for_fluentd_ready() {
local timeout=${1:-60}
# wait until fluentd is actively reading from the source (journal or files)
os::cmd::try_until_success "sudo test -s /var/log/journal.pos -o -s /var/log/journal_pos.json" $(( timeout * second ))
if docker_uses_journal ; then
: # done
else
os::cmd::try_until_success "sudo test -f /var/log/es-containers.log.pos" $(( timeout * second ))
fi
}
extra_artifacts_testname=$( basename $0 )
extra_artifacts=$ARTIFACT_DIR/${extra_artifacts_testname}-artifacts.txt
internal_artifact_log() {
local ts=$1 ; shift
echo \[${ts}\] "$@" >> $extra_artifacts
}
artifact_log() {
internal_artifact_log "$( date +%Y-%m-%dT%H:%M:%S.%3N%z )" "$@"
}
artifact_out() {
local ts="$( date +%Y-%m-%dT%H:%M:%S.%3N%z )"
local line
while IFS= read -r line ; do
internal_artifact_log "${ts}" "$line"
done
}
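# Illustrative usage (a sketch): artifact_log records its arguments, artifact_out records
# its stdin, both timestamped in the per-test artifacts file:
#   artifact_log added es message $uuid_es
#   oc get pods 2>&1 | artifact_out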
# print the Elasticsearch major version, e.g. 2, 5, or 6
get_es_major_ver() {
local es_svc=$( get_es_svc es )
curl_es $es_svc "/" | jq -r '.version.number | split(".")[0]'
}
# build a _cat/thread_pool URL - fields are given as separate args, e.g.: c a r s q
get_bulk_thread_pool_url() {
local es_ver=$1
local headers=$2
shift; shift
# remaining args are fields
local url="/_cat/thread_pool"
local comma=""
local pref=""
if [ "${es_ver}" -gt 2 ] ; then
url="${url}/bulk"
else
pref="b"
fi
url="${url}?"
if [ -n "${headers}" ] ; then
url="${url}v&h="
else
url="${url}h="
fi
while [ -n "${1:-}" ] ; do
url="${url}${comma}${pref}$1"
comma=,
shift
done
echo $url
}
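# Illustrative usage (a sketch; c, a, r, s, q are example _cat/thread_pool fields, and
# a non-empty second arg asks for the header row):
#   es_ver=$( get_es_major_ver )
#   curl_es "$es_svc" "$( get_bulk_thread_pool_url $es_ver v c a r s q )"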
# fluentd may have logs in the pod output as well as in a log file on the node
get_fluentd_pod_log() {
local pod=${1:-$( get_running_pod fluentd )}
local logfile=${2:-/var/log/fluentd/fluentd.log}
oc logs $pod 2>&1
if oc exec $pod -- logs 2>&1 ; then
: # done
elif sudo test -f $logfile ; then
# can't read from the pod directly - see if we can get the log file
sudo cat $logfile
fi
}
get_mux_pod_log() {
local pod=${1:-$( get_running_pod mux )}
local logfile=${2:-/var/log/fluentd/fluentd.log}
oc logs $pod 2>&1
oc exec $pod -- cat $logfile 2> /dev/null || :
}
# writes all pod logs to the given outdir or $ARTIFACT_DIR in the form
# pod_name.container_name.log
# it will get both the oc logs output and any log files produced by
# the pod
get_all_logging_pod_logs() {
local outdir=${1:-$ARTIFACT_DIR}
local p
local container
for p in $(oc get pods -n ${LOGGING_NS} -o jsonpath='{.items[*].metadata.name}') ; do
for container in $(oc get po $p -o jsonpath='{.spec.containers[*].name}') ; do
case "$p" in
logging-fluentd-*|fluentd-*) get_fluentd_pod_log $p > $ARTIFACT_DIR/$p.$container.log 2>&1 ;;
logging-mux-*) get_mux_pod_log $p > $ARTIFACT_DIR/$p.$container.log 2>&1 ;;
logging-es-*|elasticsearch-*) oc logs -n ${LOGGING_NS} -c $container $p > $ARTIFACT_DIR/$p.$container.log 2>&1
oc exec -c elasticsearch -n ${LOGGING_NS} $p -- logs >> $ARTIFACT_DIR/$p.$container.log 2>&1
;;
*) oc logs -n ${LOGGING_NS} -c $container $p > $ARTIFACT_DIR/$p.$container.log 2>&1 ;;
esac
done
done
}
stop_fluentd() {
local fpod=${1:-$( get_running_pod fluentd )}
local wait_time=${2:-$(( 2 * minute ))}
oc label node -l logging-infra-fluentd=true --overwrite logging-infra-fluentd=false
os::cmd::try_until_text "oc get $fluentd_ds -o jsonpath='{ .status.numberReady }'" "0" $wait_time
# not sure if it is a bug or a flake, but sometimes, even after .status.numberReady reaches 0, the fluentd pod
# hangs around in the Terminating state for many seconds, which seems to cause problems with subsequent tests -
# so we have to wait for the pod to disappear completely - we cannot rely on .status.numberReady == 0 alone
if [ -n "${fpod:-}" ] ; then
os::cmd::try_until_failure "oc get pod $fpod > /dev/null 2>&1" $wait_time
fi
}
start_fluentd() {
local cleanfirst=${1:-false}
local wait_time=${2:-$(( 2 * minute ))}
if [ "$cleanfirst" != false ] ; then
flush_fluentd_pos_files
sudo rm -rf /var/log/fluentd/fluentd.log
if [ "${CLEANBUFFERS:-true}" = true ] ; then
sudo rm -rf /var/lib/fluentd/*
fi
fi
oc label node -l logging-infra-fluentd=false --overwrite logging-infra-fluentd=true
os::cmd::try_until_text "oc get pods -l component=fluentd" "^(logging-)*fluentd-.* Running " $wait_time
}
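# Illustrative restart sequence (a sketch of how these helpers are typically combined):
#   fpod=$( get_running_pod fluentd )
#   stop_fluentd "$fpod"
#   # ... modify $fluentd_cm or $fluentd_ds as needed ...
#   start_fluentd true   # true flushes the position files (and the buffers, unless CLEANBUFFERS=false)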
get_fluentd_ds_name() {
if oc -n ${LOGGING_NS} get daemonset fluentd -o name > /dev/null 2>&1 ; then
echo daemonset/fluentd
else
echo daemonset/logging-fluentd
fi
}
fluentd_ds=${fluentd_ds:-$(get_fluentd_ds_name)}
get_fluentd_cm_name() {
if oc -n ${LOGGING_NS} get configmap fluentd -o name > /dev/null 2>&1 ; then
echo configmap/fluentd
else
echo configmap/logging-fluentd
fi
}
fluentd_cm=${fluentd_cm:-$(get_fluentd_cm_name)}