Skip to content

Commit 1dcedc4

Browse files
committed
ext4dist
1 parent cd1cad1 commit 1dcedc4

File tree

4 files changed

+477
-0
lines changed

4 files changed

+477
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ Tools:
7474
- tools/[execsnoop](tools/execsnoop.py): Trace new processes via exec() syscalls. [Examples](tools/execsnoop_example.txt).
7575
- tools/[dcsnoop](tools/dcsnoop.py): Trace directory entry cache (dcache) lookups. [Examples](tools/dcsnoop_example.txt).
7676
- tools/[dcstat](tools/dcstat.py): Directory entry cache (dcache) stats. [Examples](tools/dcstat_example.txt).
77+
- tools/[ext4dist](tools/ext4dist.py): Summarize ext4 operation latency. [Examples](tools/ext4dist_example.txt).
7778
- tools/[ext4slower](tools/ext4slower.py): Trace slow ext4 operations. [Examples](tools/ext4slower_example.txt).
7879
- tools/[filelife](tools/filelife.py): Trace the lifespan of short-lived files. [Examples](tools/filelife_example.txt).
7980
- tools/[fileslower](tools/fileslower.py): Trace slow synchronous file reads and writes. [Examples](tools/fileslower_example.txt).

man/man8/ext4dist.8

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
.TH ext4dist 8 "2016-02-12" "USER COMMANDS"
2+
.SH NAME
3+
ext4dist \- Summarize ext4 operation latency. Uses Linux eBPF/bcc.
4+
.SH SYNOPSIS
5+
.B ext4dist [\-h] [\-T] [\-N] [\-d] [interval] [count]
6+
.SH DESCRIPTION
7+
This tool summarizes time (latency) spent in common ext4 file operations: reads,
8+
writes, opens, and syncs, and presents it as a power-of-2 histogram. It uses an
9+
in-kernel eBPF map to store the histogram for efficiency.
10+
11+
Since this works by tracing the ext4_file_operations interface functions, it
12+
will need updating to match any changes to these functions.
13+
14+
Since this uses BPF, only the root user can use this tool.
15+
.SH REQUIREMENTS
16+
CONFIG_BPF and bcc.
17+
.SH OPTIONS
18+
.TP
19+
\-h
20+
Print usage message.
21+
.TP
22+
\-T
23+
Don't include timestamps on interval output.
24+
.TP
25+
\-m
26+
Output in milliseconds.
27+
.TP
28+
\-p PID
29+
Trace this PID only.
30+
.SH EXAMPLES
31+
.TP
32+
Trace ext4 operation time, and print a summary on Ctrl-C:
33+
#
34+
.B ext4dist
35+
.TP
36+
Trace PID 181 only:
37+
#
38+
.B ext4dist -p 181
39+
.TP
40+
Print 1 second summaries, 10 times:
41+
#
42+
.B ext4dist 1 10
43+
.TP
44+
1 second summaries, printed in milliseconds
45+
#
46+
.B ext4dist \-m 1
47+
.SH FIELDS
48+
.TP
49+
msecs
50+
Range of milliseconds for this bucket.
51+
.TP
52+
usecs
53+
Range of microseconds for this bucket.
54+
.TP
55+
count
56+
Number of operations in this time range.
57+
.TP
58+
distribution
59+
ASCII representation of the distribution (the count column).
60+
.SH OVERHEAD
61+
This adds low-overhead instrumentation to these ext4 operations,
62+
including reads and writes from the file system cache. Such reads and writes
63+
can be very frequent (depending on the workload; eg, 1M/sec), at which
64+
point the overhead of this tool may become noticeable.
65+
Measure and quantify before use.
66+
.SH SOURCE
67+
This is from bcc.
68+
.IP
69+
https://github.com/iovisor/bcc
70+
.PP
71+
Also look in the bcc distribution for a companion _examples.txt file containing
72+
example usage, output, and commentary for this tool.
73+
.SH OS
74+
Linux
75+
.SH STABILITY
76+
Unstable - in development.
77+
.SH AUTHOR
78+
Brendan Gregg
79+
.SH SEE ALSO
80+
ext4snoop(8)

tools/ext4dist.py

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
#!/usr/bin/python
2+
# @lint-avoid-python-3-compatibility-imports
3+
#
4+
# ext4dist Summarize ext4 operation latency.
5+
# For Linux, uses BCC, eBPF.
6+
#
7+
# USAGE: ext4dist [-h] [-T] [-m] [-p PID] [interval] [count]
8+
#
9+
# Copyright 2016 Netflix, Inc.
10+
# Licensed under the Apache License, Version 2.0 (the "License")
11+
#
12+
# 12-Feb-2016 Brendan Gregg Created this.
13+
14+
from __future__ import print_function
15+
from bcc import BPF
16+
from time import sleep, strftime
17+
import argparse
18+
19+
# symbols
20+
kallsyms = "/proc/kallsyms"
21+
22+
# arguments
23+
examples = """examples:
24+
./ext4dist # show operation latency as a histogram
25+
./ext4dist -p 181 # trace PID 181 only
26+
./ext4dist 1 10 # print 1 second summaries, 10 times
27+
./ext4dist -m 5 # 5s summaries, milliseconds
28+
"""
29+
parser = argparse.ArgumentParser(
30+
description="Summarize ext4 operation latency",
31+
formatter_class=argparse.RawDescriptionHelpFormatter,
32+
epilog=examples)
33+
parser.add_argument("-T", "--notimestamp", action="store_true",
34+
help="don't include timestamp on interval output")
35+
parser.add_argument("-m", "--milliseconds", action="store_true",
36+
help="output in milliseconds")
37+
parser.add_argument("-p", "--pid",
38+
help="trace this PID only")
39+
parser.add_argument("interval", nargs="?",
40+
help="output interval, in seconds")
41+
parser.add_argument("count", nargs="?", default=99999999,
42+
help="number of outputs")
43+
args = parser.parse_args()
44+
pid = args.pid
45+
countdown = int(args.count)
46+
if args.milliseconds:
47+
factor = 1000000
48+
label = "msecs"
49+
else:
50+
factor = 1000
51+
label = "usecs"
52+
if args.interval and int(args.interval) == 0:
53+
print("ERROR: interval 0. Exiting.")
54+
exit()
55+
debug = 0
56+
57+
# define BPF program
58+
bpf_text = """
59+
#include <uapi/linux/ptrace.h>
60+
#include <linux/fs.h>
61+
#include <linux/sched.h>
62+
63+
#define OP_NAME_LEN 8
64+
typedef struct dist_key {
65+
char op[OP_NAME_LEN];
66+
u64 slot;
67+
} dist_key_t;
68+
BPF_HASH(start, u32);
69+
BPF_HISTOGRAM(dist, dist_key_t);
70+
71+
// time operation
72+
int trace_entry(struct pt_regs *ctx)
73+
{
74+
u32 pid = bpf_get_current_pid_tgid();
75+
if (FILTER_PID)
76+
return 0;
77+
u64 ts = bpf_ktime_get_ns();
78+
start.update(&pid, &ts);
79+
return 0;
80+
}
81+
82+
// The current ext4 (Linux 4.5) uses generic_file_read_iter(), instead of it's
83+
// own function, for reads. So we need to trace that and then filter on ext4,
84+
// which I do by checking file->f_op.
85+
int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb)
86+
{
87+
u32 pid = bpf_get_current_pid_tgid();
88+
if (FILTER_PID)
89+
return 0;
90+
91+
// ext4 filter on file->f_op == ext4_file_operations
92+
struct file *fp = iocb->ki_filp;
93+
if ((u64)fp->f_op != EXT4_FILE_OPERATIONS)
94+
return 0;
95+
96+
u64 ts = bpf_ktime_get_ns();
97+
start.update(&pid, &ts);
98+
return 0;
99+
}
100+
101+
static int trace_return(struct pt_regs *ctx, const char *op)
102+
{
103+
u64 *tsp;
104+
u32 pid = bpf_get_current_pid_tgid();
105+
106+
// fetch timestamp and calculate delta
107+
tsp = start.lookup(&pid);
108+
if (tsp == 0) {
109+
return 0; // missed start or filtered
110+
}
111+
u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR;
112+
113+
// store as histogram
114+
dist_key_t key = {.slot = bpf_log2l(delta)};
115+
__builtin_memcpy(&key.op, op, sizeof(key.op));
116+
dist.increment(key);
117+
118+
start.delete(&pid);
119+
return 0;
120+
}
121+
122+
int trace_read_return(struct pt_regs *ctx)
123+
{
124+
char *op = "read";
125+
return trace_return(ctx, op);
126+
}
127+
128+
int trace_write_return(struct pt_regs *ctx)
129+
{
130+
char *op = "write";
131+
return trace_return(ctx, op);
132+
}
133+
134+
int trace_open_return(struct pt_regs *ctx)
135+
{
136+
char *op = "open";
137+
return trace_return(ctx, op);
138+
}
139+
140+
int trace_fsync_return(struct pt_regs *ctx)
141+
{
142+
char *op = "fsync";
143+
return trace_return(ctx, op);
144+
}
145+
"""
146+
147+
# code replacements
148+
with open(kallsyms) as syms:
149+
ops = ''
150+
for line in syms:
151+
(addr, size, name) = line.rstrip().split(" ", 2)
152+
if name == "ext4_file_operations":
153+
ops = "0x" + addr
154+
break
155+
if ops == '':
156+
print("ERROR: no ext4_file_operations in /proc/kallsyms. Exiting.")
157+
exit()
158+
bpf_text = bpf_text.replace('EXT4_FILE_OPERATIONS', ops)
159+
bpf_text = bpf_text.replace('FACTOR', str(factor))
160+
if args.pid:
161+
bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
162+
else:
163+
bpf_text = bpf_text.replace('FILTER_PID', '0')
164+
if debug:
165+
print(bpf_text)
166+
167+
# load BPF program
168+
b = BPF(text=bpf_text)
169+
170+
# Common file functions. See earlier comment about generic_file_read_iter().
171+
b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry")
172+
b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_entry")
173+
b.attach_kprobe(event="ext4_file_open", fn_name="trace_entry")
174+
b.attach_kprobe(event="ext4_sync_file", fn_name="trace_entry")
175+
b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return")
176+
b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return")
177+
b.attach_kretprobe(event="ext4_file_open", fn_name="trace_open_return")
178+
b.attach_kretprobe(event="ext4_sync_file", fn_name="trace_fsync_return")
179+
180+
print("Tracing ext4 operation latency... Hit Ctrl-C to end.")
181+
182+
# output
183+
exiting = 0
184+
dist = b.get_table("dist")
185+
while (1):
186+
try:
187+
if args.interval:
188+
sleep(int(args.interval))
189+
else:
190+
sleep(99999999)
191+
except KeyboardInterrupt:
192+
exiting = 1
193+
194+
print()
195+
if args.interval and (not args.notimestamp):
196+
print(strftime("%H:%M:%S:"))
197+
198+
dist.print_log2_hist(label, "operation")
199+
dist.clear()
200+
201+
countdown -= 1
202+
if exiting or countdown == 0:
203+
exit()

0 commit comments

Comments
 (0)