From a4d665ab5acd15473cdbd9a120467d9c7d936f7d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 11 Mar 2025 14:44:08 -0700 Subject: [PATCH] examples/c: add hashing and naive substring search algo Also benchmark it a little. Performance obviously will depend on haystack and needle strings and so on, but hashing implementation seems to be on par with naive implementation for short strings, but is getting relatively faster as strings become longer and/or pattern match happens further into the string. E.g., for searching "ra" in "abracadabra" (end of short string): substr-2084331 [012] ..... 2514091.887184: bpf_trace_printk: BENCH HASHED 156 ns/iter substr-2084331 [012] ..... 2514091.891784: bpf_trace_printk: BENCH NAIVE 183 ns/iter For searching "eaba" in "abacabadabacabaeabacabadabacaba" (middle of longer string): substr-2082624 [015] ..... 2514066.577106: bpf_trace_printk: BENCH HASHED 289 ns/iter substr-2082624 [015] ..... 2514066.588243: bpf_trace_printk: BENCH NAIVE 445 ns/iter But searching all occurrences of "a" inside "abracadabra" (almost immediate match in rather short string): substr-2111313 [078] ..... 2514466.822019: bpf_trace_printk: BENCH HASHED 259 ns/iter substr-2111313 [078] ..... 2514466.827745: bpf_trace_printk: BENCH NAIVE 228 ns/iter Overall, hashed variant seems best from a practical point of view. 
Signed-off-by: Andrii Nakryiko --- examples/c/.gitignore | 1 + examples/c/Makefile | 2 +- examples/c/substr.bpf.c | 192 ++++++++++++++++++++++++++++++++++++++++ examples/c/substr.c | 58 ++++++++++++ 4 files changed, 252 insertions(+), 1 deletion(-) create mode 100644 examples/c/substr.bpf.c create mode 100644 examples/c/substr.c diff --git a/examples/c/.gitignore b/examples/c/.gitignore index 2a7351f4..6ce8faa3 100644 --- a/examples/c/.gitignore +++ b/examples/c/.gitignore @@ -15,3 +15,4 @@ /lsm /cmake-build-debug/ /cmake-build-release/ +/substr diff --git a/examples/c/Makefile b/examples/c/Makefile index 6a82f267..0e2aa6c0 100644 --- a/examples/c/Makefile +++ b/examples/c/Makefile @@ -25,7 +25,7 @@ CFLAGS := -g -Wall ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) APPS = minimal minimal_legacy minimal_ns bootstrap uprobe kprobe fentry \ - usdt sockfilter tc ksyscall task_iter lsm + usdt sockfilter tc ksyscall task_iter lsm substr CARGO ?= $(shell which cargo) ifeq ($(strip $(CARGO)),) diff --git a/examples/c/substr.bpf.c b/examples/c/substr.bpf.c new file mode 100644 index 00000000..3d33b3c6 --- /dev/null +++ b/examples/c/substr.bpf.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2020 Facebook */ +#include +#include +#include + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +int my_pid = 0; + +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif + +#define MAX_STR_LEN 128 +struct cstr { + char data[MAX_STR_LEN]; +}; + +static __always_inline u64 cstr_pos(u64 pos) +{ + /* prevent compiler reordering comparison below with array access in cstr_char() */ + barrier_var(pos); + /* `pos >= MAX_STR_LEN` never happens, but we need to make verifier happy */ + pos = likely(pos < MAX_STR_LEN) ? 
pos : 0; + barrier_var(pos); + return pos; +} + +static __always_inline char cstr_char(const struct cstr *s, u64 pos) +{ + return s->data[cstr_pos(pos)]; +} + +unsigned zero = 0, one = 1; /* obfuscate integers for verifier */ + +static bool __substr_match(const struct cstr *haystack __arg_nonnull, + const struct cstr *needle __arg_nonnull, + int pos) +{ + u64 i; + char c; + + bpf_for(i, 0, MAX_STR_LEN) { + c = cstr_char(needle, i); + if (c == '\0') + return true; + if (c != cstr_char(haystack, pos + i)) + return false; + } + + return true; +} + +/* + * Find substring `needle` in a string `haystack`, starting from position + * `start` (zero-indexed). Returns substring start position (>= `start`) if + * match is found; negative result, otherwise. + */ +__noinline int substr_hashed(const struct cstr *haystack __arg_nonnull, + const struct cstr *needle __arg_nonnull, + int start) +{ + u32 i, need_hash = zero, hay_hash = zero, mul = one; + int need_len = zero, hay_len = zero, p; + + bpf_for(i, 0, MAX_STR_LEN) { + if (needle->data[i] == '\0') + break; + + need_len += 1; + need_hash = need_hash * 31 + (u32)needle->data[i]; + mul *= 31; + } + + if (need_len == 0) /* empty substring always matches */ + return start; + + bpf_for(i, start, MAX_STR_LEN) { + if (haystack->data[i] == '\0') + return -1; + + hay_hash = hay_hash * 31 + (u32)haystack->data[i]; + hay_len += 1; + if (hay_len < need_len) { + continue; + } else if (hay_len > need_len) { + hay_len -= 1; + hay_hash -= mul * cstr_char(haystack, i - hay_len); + } + + /* now hay_len == need_len */ + p = i - (hay_len - 1); + if (hay_hash == need_hash && __substr_match(haystack, needle, p)) + return p; + } + + return -1; +} + +__noinline int substr_naive(const struct cstr *haystack __arg_nonnull, + const struct cstr *needle __arg_nonnull, + int start) +{ + int *p; + + bpf_for_each(num, p, start, MAX_STR_LEN) { + if (cstr_char(haystack, *p) == '\0') + break; + + if (__substr_match(haystack, needle, *p)) + return *p; + } + + 
return -1; +} + +#define BENCH 0 +#define BENCH_ITERS 25000 + +#if BENCH +static struct cstr haystack = { "abacabadabacabaeabacabadabacaba" }; +static struct cstr needle = { "eaba" }; +#else +static struct cstr haystack = { "abracadabra" }; +static struct cstr needle = { "a" }; +#endif + +SEC("raw_tp/sys_enter") +int test_substr_hashed(void *ctx) +{ + int pid = bpf_get_current_pid_tgid() >> 32; + int i, p; + + if (pid != my_pid) + return 0; + +#if BENCH + u64 start, end; + start = bpf_ktime_get_ns(); + bpf_repeat(BENCH_ITERS) { +#endif + p = -1; + bpf_repeat(MAX_STR_LEN) { + p = substr_hashed(&haystack, &needle, p + 1); + if (p < 0) + break; +#if !BENCH + bpf_printk("HASHED match at pos #%d!", p); +#endif + } + +#if BENCH + } + end = bpf_ktime_get_ns(); + bpf_printk("BENCH HASHED %lu ns/iter", (end - start) / BENCH_ITERS); +#endif + return 0; +} + +SEC("raw_tp/sys_enter") +int test_substr_naive(void *ctx) +{ + int pid = bpf_get_current_pid_tgid() >> 32; + int i, p; + u64 start, end; + + if (pid != my_pid) + return 0; + +#if BENCH + start = bpf_ktime_get_ns(); + bpf_repeat(BENCH_ITERS) { +#endif + p = -1; + bpf_repeat(MAX_STR_LEN) { + p = substr_naive(&haystack, &needle, p + 1); + if (p < 0) + break; +#if !BENCH + bpf_printk("NAIVE match at pos #%d!", p); +#endif + } + +#if BENCH + } + end = bpf_ktime_get_ns(); + bpf_printk("BENCH NAIVE %lu ns/iter", (end - start) / BENCH_ITERS); +#endif + + return 0; +} diff --git a/examples/c/substr.c b/examples/c/substr.c new file mode 100644 index 00000000..a4243435 --- /dev/null +++ b/examples/c/substr.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include +#include "substr.skel.h" + +static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + return vfprintf(stderr, format, args); +} + +int main(int argc, char **argv) +{ + struct substr_bpf *skel; + int err; + + /* Set up libbpf errors and debug 
info callback */ + libbpf_set_print(libbpf_print_fn); + + /* Open BPF application */ + skel = substr_bpf__open(); + if (!skel) { + fprintf(stderr, "Failed to open BPF skeleton\n"); + return 1; + } + + /* ensure BPF programs only handle syscalls from our process */ + skel->bss->my_pid = getpid(); + + /* Load & verify BPF programs */ + err = substr_bpf__load(skel); + if (err) { + fprintf(stderr, "Failed to load and verify BPF skeleton\n"); + goto cleanup; + } + + /* Attach tracepoint handler */ + err = substr_bpf__attach(skel); + if (err) { + fprintf(stderr, "Failed to attach BPF skeleton\n"); + goto cleanup; + } + + printf("Successfully started! Please run `sudo cat /sys/kernel/debug/tracing/trace_pipe` " + "to see output of the BPF programs.\n"); + + for (;;) { + /* trigger our BPF program */ + fprintf(stderr, "."); + sleep(1); + } + +cleanup: + substr_bpf__destroy(skel); + return -err; +}