diff --git a/headers/vmlinux/vmlinux_net.h b/headers/vmlinux/vmlinux_net.h index 5f16c57b..64b26212 100644 --- a/headers/vmlinux/vmlinux_net.h +++ b/headers/vmlinux/vmlinux_net.h @@ -67,6 +67,7 @@ struct sk_buff { __u8 nf_trace: 1; __u8 ip_summed: 2; __u8 ooo_okay: 1; + __u8 tstamp_type: 2; __u8 l4_hash: 1; __u8 sw_hash: 1; __u8 wifi_acked_valid: 1; @@ -145,4 +146,8 @@ enum ip_conntrack_status { IPS_CONFIRMED = (1 << IPS_CONFIRMED_BIT), }; +struct scm_timestamping_internal { + struct timespec64 ts[3]; +}; + #endif /* __VMLINUX_NET_H__ */ diff --git a/headers/vmlinux/vmlinux_types.h b/headers/vmlinux/vmlinux_types.h index d7b3bed0..96411e12 100644 --- a/headers/vmlinux/vmlinux_types.h +++ b/headers/vmlinux/vmlinux_types.h @@ -11,4 +11,11 @@ typedef __u64 u64; typedef s64 ktime_t; +typedef __s64 time64_t; + +struct timespec64 { + time64_t tv_sec; + long int tv_nsec; +}; + #endif /* __VMLINUX_TYPES_H__ */ diff --git a/netstacklat/.gitignore b/netstacklat/.gitignore new file mode 100644 index 00000000..1d232888 --- /dev/null +++ b/netstacklat/.gitignore @@ -0,0 +1 @@ +netstacklat diff --git a/netstacklat/Makefile b/netstacklat/Makefile new file mode 100644 index 00000000..8efef417 --- /dev/null +++ b/netstacklat/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) + +USER_TARGETS := netstacklat +BPF_TARGETS := netstacklat.bpf +BPF_SKEL_OBJ := netstacklat.bpf.o + +EXTRA_DEPS += netstacklat.h bits.bpf.h +LDLIBS += -lm + +LIB_DIR = ../lib + +include $(LIB_DIR)/common.mk + diff --git a/netstacklat/README.md b/netstacklat/README.md new file mode 100644 index 00000000..403a9df5 --- /dev/null +++ b/netstacklat/README.md @@ -0,0 +1,58 @@ +# Netstacklat - Monitor latency within the network stack +Netstacklat is a simple tool for monitoring latency within the Linux +network stack for ingress traffic. The tool relies on the kernel time +stamping received packets (`SOF_TIMESTAMPING_RX_SOFTWARE`), +specifically setting `sk_buff->tstamp`. 
It then reports when packets +arrive at various hooks relative to this timestamp, i.e. the time +between the packet being timestamped by the kernel and reaching a +specific hook. + +The tool is based on the following bpftrace script from Jesper +Dangaard Brouer: +```console +sudo bpftrace -e ' + kfunc:tcp_v4_do_rcv, + kfunc:tcp_data_queue, + kfunc:udp_queue_rcv_one_skb + { + $tai_offset=37000000000; + $now=nsecs(tai)-$tai_offset; @cnt[probe]=count(); @total[probe]=count(); + $ts=args->skb->tstamp; $delta=$now-(uint64)$ts; + @hist_ns[probe]=hist($delta); + @stats[probe]=stats($delta); + //printf("now:%llu - ts:%llu = delta:%llu\n", $now, $ts, $delta); + } + interval:s:10 {time("\n%H:%M:%S\n"); + print(@cnt); clear(@cnt); + print(@total); + print(@stats); + print(@hist_ns); + }' +``` + +The eBPF part of the tool (`netstacklat.bpf.c`) is designed to be +compatible with +[ebpf_exporter](https://github.com/cloudflare/ebpf_exporter), so that +the data can easily be exported to Prometheus. The easiest way to use +netstacklat together with ebpf-exporter is simply to point it to this +directory, i.e. +```console +$ ebpf_exporter --config.dir=/bpf-examples/netstacklat --config.names=netstacklat +``` + +Alternatively, you can copy over the files to ebpf-exporter's example +repository. +```console +$ cp netstacklat.{bpf.c,h,yaml} -t /ebpf_exporter/examples/ +# Fix up some header includes (e.g. "vmlinux_local.h" -> +$ make -C /ebpf_exporter/examples build +$ ebpf_exporter --config.dir=/ebpf_exporter/examples --config.names +``` + +Note that when using together with ebpf-exporter, some of the +functionality handled by netstacklat's userspace program will not be +available. This includes setting the `TAI_OFFSET` constant in +`netstacklat.bpf.c` to match your system's TAI offset (you can do this +manually instead), and enabling RX timestamping by the kernel (see the +`enable_sw_rx_tstamps()` function in `netstacklat.c` for an example of +how to do this). 
diff --git a/netstacklat/bits.bpf.h b/netstacklat/bits.bpf.h new file mode 100644 index 00000000..2b7e825d --- /dev/null +++ b/netstacklat/bits.bpf.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* From https://github.com/iovisor/bcc/blob/v0.25.0/libbpf-tools/bits.bpf.h*/ + +#ifndef __BITS_BPF_H +#define __BITS_BPF_H + +static __always_inline u64 log2(u32 v) +{ + u32 shift, r; + + r = (v > 0xFFFF) << 4; v >>= r; + shift = (v > 0xFF) << 3; v >>= shift; r |= shift; + shift = (v > 0xF) << 2; v >>= shift; r |= shift; + shift = (v > 0x3) << 1; v >>= shift; r |= shift; + r |= (v >> 1); + + return r; +} + +static __always_inline u64 log2l(u64 v) +{ + u32 hi = v >> 32; + if (hi) + return log2(hi) + 32; + else + return log2(v); +} + +#endif /* __BITS_BPF_H */ diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c new file mode 100644 index 00000000..899a9638 --- /dev/null +++ b/netstacklat/netstacklat.bpf.c @@ -0,0 +1,338 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#include "vmlinux_local.h" +#include + +#include +#include +#include + +#include "netstacklat.h" +#include "bits.bpf.h" + +char LICENSE[] SEC("license") = "GPL"; + + +volatile const __s64 TAI_OFFSET = (37LL * NS_PER_S); +volatile const struct netstacklat_bpf_config user_config = { + .filter_pid = false, +}; + +/* + * Alternative definition of sk_buff to handle renaming of the field + * mono_delivery_time to tstamp_type. 
See + * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes + */ +struct sk_buff___old { + union { + ktime_t tstamp; + u64 skb_mstamp_ns; + }; + __u8 mono_delivery_time: 1; +} __attribute__((preserve_access_index)); + +/* + * To be compatible with ebpf-exporter, all histograms need a key whose final + * member is the histogram bucket index + */ +struct hist_key { + u32 bucket; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, HIST_NBUCKETS); + __type(key, u32); + __type(value, u64); +} netstack_latency_ip_start_seconds SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, HIST_NBUCKETS); + __type(key, u32); + __type(value, u64); +} netstack_latency_tcp_start_seconds SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, HIST_NBUCKETS); + __type(key, u32); + __type(value, u64); +} netstack_latency_udp_start_seconds SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, HIST_NBUCKETS); + __type(key, u32); + __type(value, u64); +} netstack_latency_tcp_sock_enqueued_seconds SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, HIST_NBUCKETS); + __type(key, u32); + __type(value, u64); +} netstack_latency_udp_sock_enqueued_seconds SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, HIST_NBUCKETS); + __type(key, u32); + __type(value, u64); +} netstack_latency_tcp_sock_read_seconds SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, HIST_NBUCKETS); + __type(key, u32); + __type(value, u64); +} netstack_latency_udp_sock_read_seconds SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, PID_MAX_LIMIT); + __type(key, u32); + __type(value, u8); +} netstack_pidfilter SEC(".maps"); + +static u32 get_exp2_histogram_bucket_idx(u64 value, u32 max_bucket) +{ 
+ u32 bucket = log2l(value); + + // Right-inclusive histogram, so "round up" the log value + if (bucket > 0 && 1ULL << bucket < value) + bucket++; + + if (bucket > max_bucket) + bucket = max_bucket; + + return bucket; +} + +/* + * Same call signature as the increment_exp2_histogram_nosync macro from + * https://github.com/cloudflare/ebpf_exporter/blob/master/examples/maps.bpf.h + * but provided as a function. + * + * Unlike the macro, only works with keys of type struct hist_key. The hist_key + * struct must be provided by value (rather than as a pointer) to keep the same + * call signature as the ebpf-exporter macro, although this will get inefficent + * if struct hist_key grows large. + */ +static void increment_exp2_histogram_nosync(void *map, struct hist_key key, + u64 value, u32 max_bucket) +{ + u64 *bucket_count; + + // Increment histogram + key.bucket = get_exp2_histogram_bucket_idx(value, max_bucket); + bucket_count = bpf_map_lookup_elem(map, &key); + if (bucket_count) + (*bucket_count)++; + + // Increment sum at end of histogram + if (value == 0) + return; + + key.bucket = max_bucket + 1; + bucket_count = bpf_map_lookup_elem(map, &key); + if (bucket_count) + *bucket_count += value; +} + +static void *hook_to_histmap(enum netstacklat_hook hook) +{ + switch (hook) { + case NETSTACKLAT_HOOK_IP_RCV: + return &netstack_latency_ip_start_seconds; + case NETSTACKLAT_HOOK_TCP_START: + return &netstack_latency_tcp_start_seconds; + case NETSTACKLAT_HOOK_UDP_START: + return &netstack_latency_udp_start_seconds; + case NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED: + return &netstack_latency_tcp_sock_enqueued_seconds; + case NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED: + return &netstack_latency_udp_sock_enqueued_seconds; + case NETSTACKLAT_HOOK_TCP_SOCK_READ: + return &netstack_latency_tcp_sock_read_seconds; + case NETSTACKLAT_HOOK_UDP_SOCK_READ: + return &netstack_latency_udp_sock_read_seconds; + default: + return NULL; + } +} + +static ktime_t time_since(ktime_t tstamp) +{ + ktime_t 
now; + + if (tstamp <= 0) + return -1; + + now = bpf_ktime_get_tai_ns() - TAI_OFFSET; + if (tstamp > now) + return -1; + + return now - tstamp; +} + +static void record_latency(ktime_t latency, enum netstacklat_hook hook) +{ + struct hist_key key = { 0 }; + increment_exp2_histogram_nosync(hook_to_histmap(hook), key, latency, + HIST_MAX_LATENCY_SLOT); +} + +static void record_latency_since(ktime_t tstamp, enum netstacklat_hook hook) +{ + ktime_t latency = time_since(tstamp); + if (latency >= 0) + record_latency(latency, hook); +} + +static void record_skb_latency(struct sk_buff *skb, enum netstacklat_hook hook) +{ + if (bpf_core_field_exists(skb->tstamp_type)) { + /* + * For kernels >= v6.11 the tstamp_type being non-zero + * (SKB_CLOCK_REALTIME) implies that skb->tstamp holds a + * preserved TX timestamp rather than a RX timestamp. See + * https://lore.kernel.org/all/20240509211834.3235191-2-quic_abchauha@quicinc.com/ + */ + if (BPF_CORE_READ_BITFIELD(skb, tstamp_type) > 0) + return; + + } else { + /* + * For kernels < v6.11, the field was called mono_delivery_time + * instead, see https://lore.kernel.org/all/20220302195525.3480280-1-kafai@fb.com/ + * Kernels < v5.18 do not have the mono_delivery_field either, + * but we do not support those anyways (as they lack the + * bpf_ktime_get_tai_ns helper) + */ + struct sk_buff___old *skb_old = (void *)skb; + if (BPF_CORE_READ_BITFIELD(skb_old, mono_delivery_time) > 0) + return; + } + + record_latency_since(skb->tstamp, hook); +} + +static bool filter_pid(u32 pid) +{ + u8 *pid_ok; + + if (!user_config.filter_pid) + // No PID filter - all PIDs ok + return true; + + pid_ok = bpf_map_lookup_elem(&netstack_pidfilter, &pid); + if (!pid_ok) + return false; + + return *pid_ok > 0; +} + +static bool filter_current_task(void) +{ + __u32 tgid; + + if (!user_config.filter_pid) + return true; + + tgid = bpf_get_current_pid_tgid() >> 32; + return filter_pid(tgid); +} + +static void record_socket_latency(struct sock *sk, ktime_t 
tstamp, + enum netstacklat_hook hook) +{ + if (!filter_current_task()) + return; + + record_latency_since(tstamp, hook); +} + +SEC("fentry/ip_rcv_core") +int BPF_PROG(netstacklat_ip_rcv_core, struct sk_buff *skb, void *block, + void *tp, void *res, bool compat_mode) +{ + record_skb_latency(skb, NETSTACKLAT_HOOK_IP_RCV); + return 0; +} + +SEC("fentry/ip6_rcv_core") +int BPF_PROG(netstacklat_ip6_rcv_core, struct sk_buff *skb, void *block, + void *tp, void *res, bool compat_mode) +{ + record_skb_latency(skb, NETSTACKLAT_HOOK_IP_RCV); + return 0; +} + +SEC("fentry/tcp_v4_rcv") +int BPF_PROG(netstacklat_tcp_v4_rcv, struct sk_buff *skb) +{ + record_skb_latency(skb, NETSTACKLAT_HOOK_TCP_START); + return 0; +} + +SEC("fentry/tcp_v6_rcv") +int BPF_PROG(netstacklat_tcp_v6_rcv, struct sk_buff *skb) +{ + record_skb_latency(skb, NETSTACKLAT_HOOK_TCP_START); + return 0; +} + +SEC("fentry/udp_rcv") +int BPF_PROG(netstacklat_udp_rcv, struct sk_buff *skb) +{ + record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_START); + return 0; +} + +SEC("fentry/udpv6_rcv") +int BPF_PROG(netstacklat_udpv6_rcv, struct sk_buff *skb) +{ + record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_START); + return 0; +} + +SEC("fexit/tcp_data_queue") +int BPF_PROG(netstacklat_tcp_data_queue, struct sock *sk, struct sk_buff *skb) +{ + record_skb_latency(skb, NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED); + return 0; +} + +SEC("fexit/udp_queue_rcv_one_skb") +int BPF_PROG(netstacklat_udp_queue_rcv_one_skb, struct sock *sk, + struct sk_buff *skb) +{ + record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); + return 0; +} + +SEC("fexit/udpv6_queue_rcv_one_skb") +int BPF_PROG(netstacklat_udpv6_queue_rcv_one_skb, struct sock *sk, + struct sk_buff *skb) +{ + record_skb_latency(skb, NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED); + return 0; +} + +SEC("fentry/tcp_recv_timestamp") +int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, + struct scm_timestamping_internal *tss) +{ + struct timespec64 *ts = &tss->ts[0]; + 
record_socket_latency(sk, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, + NETSTACKLAT_HOOK_TCP_SOCK_READ); + return 0; +} + +SEC("fentry/skb_consume_udp") +int BPF_PROG(netstacklat_skb_consume_udp, struct sock *sk, struct sk_buff *skb, + int len) +{ + record_socket_latency(sk, skb->tstamp, NETSTACKLAT_HOOK_UDP_SOCK_READ); + return 0; +} diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c new file mode 100644 index 00000000..f70fa87f --- /dev/null +++ b/netstacklat/netstacklat.c @@ -0,0 +1,1049 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +static const char *__doc__ = + "Netstacklat - Monitor latency to various points in the ingress network stack"; + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "netstacklat.h" +#include "netstacklat.bpf.skel.h" + +#define MAX_EPOLL_EVENTS 8 + +/* + * Used to pack both a "type" and a value into the epoll_event.data.u64 member. + * The topmost bits indicates the type (SIG, TIMER, etc) while the remaining + * bits can be used for the value. The MASK can be used to filter out the + * type/value. 
+ */ +#define NETSTACKLAT_EPOLL_SIG (1ULL << 63) +#define NETSTACKLAT_EPOLL_TIMER (1ULL << 62) +#define NETSTACKLAT_EPOLL_MASK \ + (~(NETSTACKLAT_EPOLL_SIG | NETSTACKLAT_EPOLL_TIMER)) + +// Magical value used to indicate that the program should be aborted +#define NETSTACKLAT_ABORT 424242 + +#define MAX_BUCKETSPAN_STRLEN 16 +#define MAX_BUCKETCOUNT_STRLEN 10 +#define MAX_BAR_STRLEN (80 - 6 - MAX_BUCKETSPAN_STRLEN - MAX_BUCKETCOUNT_STRLEN) + +#define MAX_HOOK_PROGS 4 + +// Maximum number of different pids that can be filtered for +#define MAX_FILTER_PIDS 4096 + +struct hook_prog_collection { + struct bpf_program *progs[MAX_HOOK_PROGS]; + int nprogs; +}; + +struct netstacklat_config { + struct netstacklat_bpf_config bpf_conf; + double report_interval_s; + bool enabled_hooks[NETSTACKLAT_N_HOOKS]; + int npids; + __u32 pids[MAX_FILTER_PIDS]; +}; + +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "report-interval", required_argument, NULL, 'r' }, + { "list-probes", no_argument, NULL, 'l' }, + { "enable-probe", required_argument, NULL, 'e' }, + { "disable-probe", required_argument, NULL, 'd' }, + { "filter-pid", required_argument, NULL, 'p' }, + { 0, 0, 0, 0 } +}; + +static const struct option *optval_to_longopt(int val) +{ + int i; + + for (i = 0; long_options[i].name != 0; i++) { + if (long_options[i].val == val) + return &long_options[i]; + } + + return NULL; +} + +static int generate_optstr(char *buf, size_t size) +{ + int i, optlen, strlen = 0; + char optstr[4]; + + for (i = 0; long_options[i].name != 0; i++) { + if (long_options[i].flag || !isalnum(long_options[i].val)) + continue; + + optlen = snprintf( + optstr, sizeof(optstr), "%c%s", long_options[i].val, + long_options[i].has_arg == optional_argument ? "::" : + long_options[i].has_arg == required_argument ? 
":" : + ""); + if (strlen + optlen < size) { + strncpy(buf + strlen, optstr, optlen + 1); + } + strlen += optlen; + } + + return strlen + 1; +} + +static void print_usage(FILE *stream, const char *prog_name) +{ + int i; + + fprintf(stream, "\nDOCUMENTATION:\n%s\n", __doc__); + fprintf(stream, "\n"); + fprintf(stream, " Usage: %s (options-see-below)\n", prog_name); + fprintf(stream, " Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + if (!long_options[i].flag && isalnum(long_options[i].val)) + fprintf(stream, " -%c, ", long_options[i].val); + else + fprintf(stream, " "); + + printf(" --%s", long_options[i].name); + + if (long_options[i].has_arg == required_argument) + fprintf(stream, " "); + else if (long_options[i].has_arg == optional_argument) + fprintf(stream, "[ARG]"); + + fprintf(stream, "\n"); + } + printf("\n"); +} + +static const char *hook_to_str(enum netstacklat_hook hook) +{ + switch (hook) { + case NETSTACKLAT_HOOK_IP_RCV: + return "ip-start"; + case NETSTACKLAT_HOOK_TCP_START: + return "tcp-start"; + case NETSTACKLAT_HOOK_UDP_START: + return "udp-start"; + case NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED: + return "tcp-socket-enqueued"; + case NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED: + return "udp-socket-enqueued"; + case NETSTACKLAT_HOOK_TCP_SOCK_READ: + return "tcp-socket-read"; + case NETSTACKLAT_HOOK_UDP_SOCK_READ: + return "udp-socket-read"; + default: + return "invalid"; + } +} + +static enum netstacklat_hook str_to_hook(const char *str) +{ + enum netstacklat_hook hook; + + for (hook = 1; hook < NETSTACKLAT_N_HOOKS; hook++) { + if (strcmp(str, hook_to_str(hook)) == 0) + return hook; + } + + return NETSTACKLAT_HOOK_INVALID; +} + +static const char *hook_to_description(enum netstacklat_hook hook) +{ + switch (hook) { + case NETSTACKLAT_HOOK_IP_RCV: + return "packet has reached the IP-stack, i.e. past the traffic control layer"; + case NETSTACKLAT_HOOK_TCP_START: + return "packet has reached the local TCP-stack, i.e. 
past the IP (and routing) stack"; + case NETSTACKLAT_HOOK_UDP_START: + return "packet has reached the local UDP-stack, i.e. past the IP (and routing) stack"; + case NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED: + return "packet has been enqueued to a TCP socket, i.e. end of the kernel receive stack"; + case NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED: + return "packed has been enqueued to a UDP socket, i.e. end of the kernel receive stack"; + case NETSTACKLAT_HOOK_TCP_SOCK_READ: + return "packet payload has been read from TCP socket, i.e. delivered to user space"; + case NETSTACKLAT_HOOK_UDP_SOCK_READ: + return "packet payload has been read from UDP socket, i.e. delivered to user space"; + default: + return "not a valid hook"; + } +} + +static int hook_to_histmap(enum netstacklat_hook hook, + const struct netstacklat_bpf *obj) +{ + switch (hook) { + case NETSTACKLAT_HOOK_IP_RCV: + return bpf_map__fd(obj->maps.netstack_latency_ip_start_seconds); + case NETSTACKLAT_HOOK_TCP_START: + return bpf_map__fd( + obj->maps.netstack_latency_tcp_start_seconds); + case NETSTACKLAT_HOOK_UDP_START: + return bpf_map__fd( + obj->maps.netstack_latency_udp_start_seconds); + case NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED: + return bpf_map__fd( + obj->maps.netstack_latency_tcp_sock_enqueued_seconds); + case NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED: + return bpf_map__fd( + obj->maps.netstack_latency_udp_sock_enqueued_seconds); + case NETSTACKLAT_HOOK_TCP_SOCK_READ: + return bpf_map__fd( + obj->maps.netstack_latency_tcp_sock_read_seconds); + case NETSTACKLAT_HOOK_UDP_SOCK_READ: + return bpf_map__fd( + obj->maps.netstack_latency_udp_sock_read_seconds); + default: + return -EINVAL; + } +} + +static void hook_to_progs(struct hook_prog_collection *progs, + enum netstacklat_hook hook, + const struct netstacklat_bpf *obj) +{ + switch (hook) { + case NETSTACKLAT_HOOK_IP_RCV: + progs->progs[0] = obj->progs.netstacklat_ip_rcv_core; + progs->progs[1] = obj->progs.netstacklat_ip6_rcv_core; + progs->nprogs = 2; + break; + case 
NETSTACKLAT_HOOK_TCP_START: + progs->progs[0] = obj->progs.netstacklat_tcp_v4_rcv; + progs->progs[1] = obj->progs.netstacklat_tcp_v6_rcv; + progs->nprogs = 2; + break; + case NETSTACKLAT_HOOK_UDP_START: + progs->progs[0] = obj->progs.netstacklat_udp_rcv; + progs->progs[1] = obj->progs.netstacklat_udpv6_rcv; + progs->nprogs = 2; + break; + case NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED: + progs->progs[0] = obj->progs.netstacklat_tcp_data_queue; + progs->nprogs = 1; + break; + case NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED: + progs->progs[0] = obj->progs.netstacklat_udp_queue_rcv_one_skb; + progs->progs[1] = + obj->progs.netstacklat_udpv6_queue_rcv_one_skb; + progs->nprogs = 2; + break; + case NETSTACKLAT_HOOK_TCP_SOCK_READ: + progs->progs[0] = obj->progs.netstacklat_tcp_recv_timestamp; + progs->nprogs = 1; + break; + case NETSTACKLAT_HOOK_UDP_SOCK_READ: + progs->progs[0] = obj->progs.netstacklat_skb_consume_udp; + progs->nprogs = 1; + break; + default: + progs->nprogs = 0; + break; + } +} + +static void list_hooks(FILE *stream) +{ + enum netstacklat_hook hook; + + fprintf(stream, "available hooks:\n"); + for (hook = 1; hook < NETSTACKLAT_N_HOOKS; hook++) + fprintf(stream, " %s: %s\n", hook_to_str(hook), + hook_to_description(hook)); +} + +static int parse_bounded_double(double *res, const char *str, double low, + double high, const char *name) +{ + char *endptr; + errno = 0; + + *res = strtod(str, &endptr); + if (endptr == str || strlen(str) != endptr - str) { + fprintf(stderr, "%s %s is not a valid number\n", name, str); + return -EINVAL; + } + + if (errno == ERANGE) { + fprintf(stderr, "%s %s overflowed\n", name, str); + return -ERANGE; + } + + if (*res < low || *res > high) { + fprintf(stderr, "%s must be in range [%g, %g]\n", name, low, high); + return -ERANGE; + } + + return 0; +} + +static int parse_bounded_long(long long *res, const char *str, long long low, + long long high, const char *name) +{ + char *endptr; + errno = 0; + + *res = strtoll(str, &endptr, 10); + if 
(endptr == str || strlen(str) != endptr - str) { + fprintf(stderr, "%s %s is not a valid integer\n", name, str); + return -EINVAL; + } + + if (errno == ERANGE) { + fprintf(stderr, "%s %s overflowed\n", name, str); + return -ERANGE; + } + + if (*res < low || *res > high) { + fprintf(stderr, "%s must be in range [%lld, %lld]\n", name, low, + high); + return -ERANGE; + } + + return 0; +} + +/* + * Parses a comma-delimited string of hook-names, and sets the positions for + * the hooks that appear in the string to true. + */ +static int parse_hooks(bool hooks[NETSTACKLAT_N_HOOKS], const char *_str) +{ + enum netstacklat_hook hook; + char *tokp = NULL; + char str[1024]; + char *hookstr; + int i; + + for (i = 0; i < NETSTACKLAT_N_HOOKS; i++) + hooks[i] = false; + + if (strlen(_str) >= sizeof(str)) + return -E2BIG; + strcpy(str, _str); + + hookstr = strtok_r(str, ",", &tokp); + while (hookstr) { + hook = str_to_hook(hookstr); + if (hook == NETSTACKLAT_HOOK_INVALID) { + fprintf(stderr, "%s is not a valid hook\n", hookstr); + return -EINVAL; + } + + hooks[hook] = true; + + hookstr = strtok_r(NULL, ",", &tokp); + } + + return 0; +} + +static int parse_pids(size_t size, __u32 arr[size], const char *_str, + const char *name) +{ + char *pidstr, *str; + char *tokp = NULL; + int err, i = 0; + long long val; + + str = malloc(strlen(_str) + 1); + if (!str) + return -ENOMEM; + strcpy(str, _str); + + pidstr = strtok_r(str, ",", &tokp); + while (pidstr && i < size) { + err = parse_bounded_long(&val, pidstr, 1, PID_MAX_LIMIT, name); + if (err) + goto exit; + arr[i] = val; + + pidstr = strtok_r(NULL, ",", &tokp); + i++; + } + + if (pidstr) + // Parsed size pids, but more still remain + err = -E2BIG; + +exit: + free(str); + return err ?: i; +} + +int parse_arguments(int argc, char *argv[], struct netstacklat_config *conf) +{ + bool hooks_on = false, hooks_off = false; + bool hooks[NETSTACKLAT_N_HOOKS]; + char optstr[64]; + int opt, err, i; + double fval; + + conf->npids = 0; + 
conf->bpf_conf.filter_pid = false; + + for (i = 0; i < NETSTACKLAT_N_HOOKS; i++) + // All probes enabled by default + conf->enabled_hooks[i] = true; + + if (generate_optstr(optstr, sizeof(optstr)) > sizeof(optstr)) { + fprintf(stderr, + "Internal error: optstr too short to fit all long_options\n"); + return -ENAMETOOLONG; + } + + while ((opt = getopt_long(argc, argv, optstr, long_options, + NULL)) != -1) { + switch (opt) { + case 'r': // report interval + err = parse_bounded_double( + &fval, optarg, 0.01, 3600 * 24, + optval_to_longopt(opt)->name); + if (err) + return err; + + conf->report_interval_s = fval; + break; + case 'l': // list-probes + list_hooks(stdout); + exit(EXIT_SUCCESS); + case 'e': // enable-probes + err = parse_hooks(hooks, optarg); + if (err) + return err; + + for (i = 1; i < NETSTACKLAT_N_HOOKS; i++) + conf->enabled_hooks[i] = hooks[i]; + hooks_on = true; + break; + case 'd': // disable-probes + err = parse_hooks(hooks, optarg); + if (err) + return err; + + for (i = 1; i < NETSTACKLAT_N_HOOKS; i++) + conf->enabled_hooks[i] = !hooks[i]; + hooks_off = true; + break; + case 'p': // filter-pids + err = parse_pids(ARRAY_SIZE(conf->pids) - conf->npids, + conf->pids + conf->npids, optarg, + optval_to_longopt(opt)->name); + if (err < 0) + return err; + + conf->npids += err; + conf->bpf_conf.filter_pid = true; + break; + case 'h': // help + print_usage(stdout, argv[0]); + exit(EXIT_SUCCESS); + default: + // unrecognized option reported by getopt, so just print usage + print_usage(stderr, argv[0]); + return -EINVAL; + } + } + + if (hooks_on && hooks_off) { + fprintf(stderr, + "%s and %s are mutually exclusive, only use one of them\n", + optval_to_longopt('e')->name, + optval_to_longopt('d')->name); + return -EINVAL; + } + + return 0; +} + +static int find_first_nonzero_bucket(size_t n, const __u64 hist[n]) +{ + int i; + + for (i = 0; i < n; i++) { + if (hist[i] > 0) + return i; + } + + return -1; +} + +static int find_last_nonzero_bucket(size_t n, const 
__u64 hist[n]) +{ + int i; + + for (i = n - 1; i >= 0; i--) { + if (hist[i] > 0) + return i; + } + + return -1; +} + +static int find_largest_bucket(size_t n, const __u64 hist[n]) +{ + __u64 max_val = 0; + int i; + + for (i = 0; i < n; i++) { + if (hist[i] > max_val) + max_val = hist[i]; + } + + return max_val; +} + +static double ns_to_siprefix(double ns, char **prefix) +{ + static char *prefixes[] = { "n", "u", "m", "" }; + int psteps = 0; + + while (ns >= 1000 && psteps < ARRAY_SIZE(prefixes) - 1) { + ns /= 1000; + psteps++; + } + + *prefix = prefixes[psteps]; + + return ns; +} + +static void print_nchars(FILE *stream, char c, int n) +{ + while (n-- > 0) + putc(c, stream); +} + +static int print_bucket_interval(FILE *stream, double low_bound_ns, + double high_bound_ns) +{ + char *lprefix, *hprefix; + double low_si, high_si; + + low_si = ns_to_siprefix(low_bound_ns, &lprefix); + + if (isinf(high_bound_ns)) { + high_si = INFINITY; + hprefix = " "; + } else { + high_si = ns_to_siprefix(high_bound_ns, &hprefix); + } + + return fprintf(stream, "%c%.3g%ss, %.3g%ss]", + low_bound_ns == 0 ? 
'[' : '(', low_si, lprefix, high_si, + hprefix); +} + +static void print_histbar(FILE *stream, __u64 count, __u64 max_count) +{ + int barlen = round((double)count / max_count * MAX_BAR_STRLEN); + + fprintf(stream, "|"); + print_nchars(stream, '@', barlen); + print_nchars(stream, ' ', MAX_BAR_STRLEN - barlen); + fprintf(stream, "|"); +} + +static void print_log2hist(FILE *stream, size_t n, const __u64 hist[n], + double multiplier) +{ + int bucket, start_bucket, end_bucket, max_bucket, len; + double low_bound, high_bound, avg; + __u64 count = 0; + char *prefix; + + start_bucket = find_first_nonzero_bucket(n - 1, hist); + end_bucket = find_last_nonzero_bucket(n - 1, hist); + max_bucket = find_largest_bucket(n - 1, hist); + + for (bucket = max(0, start_bucket); bucket <= end_bucket; bucket++) { + low_bound = pow(2, bucket - 1) * multiplier; + high_bound = pow(2, bucket) * multiplier; + + // First bucket includes 0 (i.e. [0, 1] rather than (0.5, 1]) + if (bucket == 0) + low_bound = 0; + // Last bucket includes all values too large for the second-last bucket + if (bucket == n - 2) + high_bound = INFINITY; + + len = print_bucket_interval(stream, low_bound, high_bound); + print_nchars(stream, ' ', + max(0, MAX_BUCKETSPAN_STRLEN - len) + 1); + fprintf(stream, "%*llu ", MAX_BUCKETCOUNT_STRLEN, hist[bucket]); + print_histbar(stream, hist[bucket], max_bucket); + fprintf(stream, "\n"); + + count += hist[bucket]; + } + + // Final "bucket" is the sum of all values in the histogram + if (count > 0) { + avg = ns_to_siprefix((double)hist[n - 1] / count, &prefix); + fprintf(stream, "count: %llu, average: %.2f%ss\n", count, avg, + prefix); + } else { + fprintf(stream, "count: %llu, average: -\n", count); + } +} + +static void merge_percpu_hist(size_t n, int ncpus, + const __u64 percpu_hist[n][ncpus], + __u64 merged_hist[n]) +{ + int idx, cpu; + + memset(merged_hist, 0, sizeof(__u64) * n); + + for (idx = 0; idx < n; idx++) { + for (cpu = 0; cpu < ncpus; cpu++) { + merged_hist[idx] += 
percpu_hist[idx][cpu]; + } + } +} + +static int fetch_hist_map(int map_fd, __u64 hist[HIST_NBUCKETS]) +{ + __u32 in_batch, out_batch, count = HIST_NBUCKETS; + int ncpus = libbpf_num_possible_cpus(); + __u32 idx, buckets_fetched = 0; + __u64 (*percpu_hist)[ncpus]; + __u32 *keys; + int err = 0; + + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, batch_opts, .flags = BPF_EXIST); + + percpu_hist = calloc(HIST_NBUCKETS, sizeof(*percpu_hist)); + keys = calloc(HIST_NBUCKETS, sizeof(*keys)); + if (!percpu_hist || !keys) { + err = -ENOMEM; + goto exit; + } + + while (buckets_fetched < HIST_NBUCKETS) { + err = bpf_map_lookup_batch(map_fd, + buckets_fetched > 0 ? &in_batch : NULL, + &out_batch, keys + buckets_fetched, + percpu_hist + buckets_fetched, &count, + &batch_opts); + if (err == -ENOENT) // All entries fetched + err = 0; + else if (err) + goto exit; + + // Verify keys match expected idx range + for (idx = buckets_fetched; idx < buckets_fetched + count; idx++) { + if (keys[idx] != idx) { + err = -EBADSLT; + goto exit; + } + } + + in_batch = out_batch; + buckets_fetched += count; + count = HIST_NBUCKETS - buckets_fetched; + } + + merge_percpu_hist(HIST_NBUCKETS, ncpus, percpu_hist, hist); + +exit: + free(percpu_hist); + free(keys); + return err; +} + +static int report_stats(const struct netstacklat_config *conf, + const struct netstacklat_bpf *obj) +{ + enum netstacklat_hook hook; + __u64 hist[HIST_NBUCKETS] = { 0 }; + time_t t; + int err; + + time(&t); + printf("%s", ctime(&t)); + + for (hook = 1; hook < NETSTACKLAT_N_HOOKS; hook++) { + if (!conf->enabled_hooks[hook]) + continue; + + printf("%s:\n", hook_to_str(hook)); + + err = fetch_hist_map(hook_to_histmap(hook, obj), hist); + if (err) + return err; + + print_log2hist(stdout, ARRAY_SIZE(hist), hist, 1); + printf("\n"); + } + fflush(stdout); + + return 0; +} + +static int enable_sw_rx_tstamps(void) +{ + int tstamp_opt = SOF_TIMESTAMPING_RX_SOFTWARE; + int sock_fd, err; + + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); + if 
(sock_fd < 0) { + err = -errno; + fprintf(stderr, "Failed opening socket: %s\n", strerror(-err)); + return err; + } + + err = setsockopt(sock_fd, SOL_SOCKET, SO_TIMESTAMPING, &tstamp_opt, + sizeof(tstamp_opt)); + if (err) { + err = -errno; + fprintf(stderr, "Failed setting SO_TIMESTAMPING option: %s\n", + strerror(-err)); + goto err_socket; + } + + return 0; + +err_socket: + close(sock_fd); + return err; +} + +static __s64 get_tai_offset(void) +{ + struct ntptimeval ntpt; + + ntp_gettimex(&ntpt); + return ntpt.tai; +} + +static void set_programs_to_load(const struct netstacklat_config *conf, + struct netstacklat_bpf *obj) +{ + struct hook_prog_collection progs; + enum netstacklat_hook hook; + int i; + + for (hook = 1; hook < NETSTACKLAT_N_HOOKS; hook++) { + hook_to_progs(&progs, hook, obj); + + for (i = 0; i < progs.nprogs; i++) + bpf_program__set_autoload(progs.progs[i], + conf->enabled_hooks[hook]); + } +} + +static int init_signalfd(void) +{ + sigset_t mask; + int fd, err; + + sigemptyset(&mask); + sigaddset(&mask, SIGINT); + sigaddset(&mask, SIGTERM); + + fd = signalfd(-1, &mask, 0); + if (fd < 0) + return -errno; + + err = pthread_sigmask(SIG_BLOCK, &mask, NULL); + if (err) { + err = -errno; + close(fd); + return err; + } + + return fd; +} + +static int handle_signal(int sig_fd) +{ + struct signalfd_siginfo sig_info; + ssize_t size; + + size = read(sig_fd, &sig_info, sizeof(sig_info)); + if (size != sizeof(sig_info)) { + fprintf(stderr, "Failed reading signal fd\n"); + return -EBADFD; + } + + switch (sig_info.ssi_signo) { + case SIGINT: + case SIGTERM: + return NETSTACKLAT_ABORT; + default: + fprintf(stderr, "Unexpected signal: %d\n", sig_info.ssi_signo); + return -EBADR; + } +} + +static int setup_timer(__u64 interval_ns) +{ + struct itimerspec timercfg = { + .it_value = { .tv_sec = interval_ns / NS_PER_S, + .tv_nsec = interval_ns % NS_PER_S }, + .it_interval = { .tv_sec = interval_ns / NS_PER_S, + .tv_nsec = interval_ns % NS_PER_S } + }; + int fd, err; + + 
fd = timerfd_create(CLOCK_MONOTONIC, 0); + if (fd < 0) { + return -errno; + } + + err = timerfd_settime(fd, 0, &timercfg, NULL); + if (err) { + err = -errno; + close(fd); + return err; + } + + return fd; +} + +static int handle_timer(int timer_fd, const struct netstacklat_config *conf, + const struct netstacklat_bpf *obj) +{ + __u64 timer_exps; + ssize_t size; + + size = read(timer_fd, &timer_exps, sizeof(timer_exps)); + if (size != sizeof(timer_exps)) { + fprintf(stderr, "Failed reading timer fd\n"); + return -EBADFD; + } + + if (timer_exps == 0) + return 0; + if (timer_exps > 1) + fprintf(stderr, "Warning: Missed %llu reporting intervals\n", + timer_exps - 1); + + return report_stats(conf, obj); +} + +static int epoll_add_event(int epoll_fd, int fd, __u64 event_type, __u64 value) +{ + struct epoll_event ev = { + .events = EPOLLIN, + .data = { .u64 = event_type | value }, + }; + + if (value & ~NETSTACKLAT_EPOLL_MASK) + return -EINVAL; + + return epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &ev) ? 
-errno : 0; +} + +static int setup_epoll_instance(int sig_fd, int timer_fd) +{ + int epoll_fd, err = 0; + + epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (epoll_fd < 0) + return -errno; + + err = epoll_add_event(epoll_fd, sig_fd, NETSTACKLAT_EPOLL_SIG, sig_fd); + if (err) + goto err; + + err = epoll_add_event(epoll_fd, timer_fd, NETSTACKLAT_EPOLL_TIMER, + timer_fd); + if (err) + goto err; + + return epoll_fd; + +err: + close(epoll_fd); + return err; +} + +static int poll_events(int epoll_fd, const struct netstacklat_config *conf, + const struct netstacklat_bpf *obj) +{ + struct epoll_event events[MAX_EPOLL_EVENTS]; + int i, n, fd, err = 0; + __u64 epoll_type; + + n = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 100); + if (n < 0) + return -errno; + + for (i = 0; i < n; i++) { + epoll_type = events[i].data.u64 & ~NETSTACKLAT_EPOLL_MASK; + fd = events[i].data.u64 & NETSTACKLAT_EPOLL_MASK; + + switch (epoll_type) { + case NETSTACKLAT_EPOLL_SIG: + err = handle_signal(fd); + break; + case NETSTACKLAT_EPOLL_TIMER: + err = handle_timer(fd, conf, obj); + break; + default: + fprintf(stderr, "Warning: unexpected epoll data: %lu\n", + events[i].data.u64); + break; + } + + if (err) + break; + } + + return err; +} + +static int init_pidfilter_map(const struct netstacklat_bpf *obj, + const struct netstacklat_config *conf) +{ + __u8 pid_ok_val = 1; + int map_fd, err; + __u32 i; + + map_fd = bpf_map__fd(obj->maps.netstack_pidfilter); + for (i = 0; i < conf->npids; i++) { + err = bpf_map_update_elem(map_fd, &conf->pids[i], &pid_ok_val, + 0); + if (err) + return err; + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + int sig_fd, timer_fd, epoll_fd, sock_fd, err; + struct netstacklat_config config = { + .report_interval_s = 5, + }; + struct netstacklat_bpf *obj; + char errmsg[128]; + + err = parse_arguments(argc, argv, &config); + if (err) { + fprintf(stderr, "Failed parsing arguments: %s\n", + strerror(-err)); + return EXIT_FAILURE; + } + + sock_fd = 
enable_sw_rx_tstamps(); + if (sock_fd < 0) { + err = sock_fd; + fprintf(stderr, + "Failed enabling software RX timestamping: %s\n", + strerror(-err)); + return EXIT_FAILURE; + } + + obj = netstacklat_bpf__open(); + if (!obj) { + err = libbpf_get_error(obj); + libbpf_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Failed opening eBPF object file: %s\n", errmsg); + goto exit_sockfd; + } + + obj->rodata->TAI_OFFSET = get_tai_offset() * NS_PER_S; + obj->rodata->user_config = config.bpf_conf; + + set_programs_to_load(&config, obj); + + err = netstacklat_bpf__load(obj); + if (err) { + libbpf_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Failed loading eBPF programs: %s\n", errmsg); + goto exit_destroy_bpf; + } + + err = init_pidfilter_map(obj, &config); + if (err) { + libbpf_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Failed filling the pid filter map: %s\n", + errmsg); + goto exit_destroy_bpf; + } + + err = netstacklat_bpf__attach(obj); + if (err) { + libbpf_strerror(err, errmsg, sizeof(errmsg)); + fprintf(stderr, "Failed to attach eBPF programs: %s\n", errmsg); + goto exit_destroy_bpf; + } + + sig_fd = init_signalfd(); + if (sig_fd < 0) { + err = sig_fd; + fprintf(stderr, "Failed setting up signal handling: %s\n", + strerror(-err)); + goto exit_detach_bpf; + } + + timer_fd = setup_timer(config.report_interval_s * NS_PER_S); + if (timer_fd < 0) { + err = timer_fd; + fprintf(stderr, "Failed creating timer: %s\n", strerror(-err)); + goto exit_sigfd; + } + + epoll_fd = setup_epoll_instance(sig_fd, timer_fd); + if (epoll_fd < 0) { + err = epoll_fd; + fprintf(stderr, "Failed setting up epoll: %s\n", + strerror(-err)); + goto exit_timerfd; + } + + // Report stats until user shuts down program + while (true) { + err = poll_events(epoll_fd, &config, obj); + + if (err) { + if (err == NETSTACKLAT_ABORT) { + // Report stats a final time before terminating + err = report_stats(&config, obj); + } else { + libbpf_strerror(err, errmsg, 
sizeof(errmsg)); + fprintf(stderr, "Failed polling fds: %s\n", + errmsg); + } + break; + } + } + + // Cleanup + close(epoll_fd); +exit_timerfd: + close(timer_fd); +exit_sigfd: + close(sig_fd); +exit_detach_bpf: + netstacklat_bpf__detach(obj); +exit_destroy_bpf: + netstacklat_bpf__destroy(obj); +exit_sockfd: + close(sock_fd); + return err ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h new file mode 100644 index 00000000..bb0162a1 --- /dev/null +++ b/netstacklat/netstacklat.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef NETSTACKLAT_H +#define NETSTACKLAT_H + +#define HIST_MAX_LATENCY_SLOT 34 // 2^34 ns -> ~17s +/* + * MAX_LATENCY_SLOT + 1 buckets for hist, + 1 "bucket" for the "sum key" + * (https://github.com/cloudflare/ebpf_exporter?tab=readme-ov-file#sum-keys) + * that ebpf_exporter expects for exp2 hists (see how it's used in the + * increment_exp2_histogram_nosync() function) + */ +#define HIST_NBUCKETS (HIST_MAX_LATENCY_SLOT + 2) + +#define NS_PER_S 1000000000 + +// The highest possible PID on a Linux system (from /include/linux/threads.h) +#define PID_MAX_LIMIT (4 * 1024 * 1024) + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) +#endif + +#ifndef max +#define max(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a > _b ? 
_a : _b; \ + }) +#endif + +enum netstacklat_hook { + NETSTACKLAT_HOOK_INVALID = 0, + NETSTACKLAT_HOOK_IP_RCV, + NETSTACKLAT_HOOK_TCP_START, + NETSTACKLAT_HOOK_UDP_START, + NETSTACKLAT_HOOK_TCP_SOCK_ENQUEUED, + NETSTACKLAT_HOOK_UDP_SOCK_ENQUEUED, + NETSTACKLAT_HOOK_TCP_SOCK_READ, + NETSTACKLAT_HOOK_UDP_SOCK_READ, + NETSTACKLAT_N_HOOKS, +}; + +struct netstacklat_bpf_config +{ + bool filter_pid; +}; + +#endif + diff --git a/netstacklat/netstacklat.yaml b/netstacklat/netstacklat.yaml new file mode 100644 index 00000000..2fb99530 --- /dev/null +++ b/netstacklat/netstacklat.yaml @@ -0,0 +1,79 @@ +metrics: + histograms: + - name: netstack_latency_ip_start_seconds + help: Time for packet to reach the start of the IP-stack + bucket_type: exp2 + bucket_min: 0 + bucket_max: 34 + bucket_multiplier: 0.000000001 # nanoseconds to seconds + labels: + - name: bucket + size: 4 + decoders: + - name: uint + - name: netstack_latency_tcp_start_seconds + help: Time for packet to reach the start of the TCP stack + bucket_type: exp2 + bucket_min: 0 + bucket_max: 34 + bucket_multiplier: 0.000000001 # nanoseconds to seconds + labels: + - name: bucket + size: 4 + decoders: + - name: uint + - name: netstack_latency_udp_start_seconds + help: Time until packet to reach the start of the UDP stack + bucket_type: exp2 + bucket_min: 0 + bucket_max: 34 + bucket_multiplier: 0.000000001 # nanoseconds to seconds + labels: + - name: bucket + size: 4 + decoders: + - name: uint + - name: netstack_latency_tcp_sock_enqueued_seconds + help: Time until packet is queued to TCP socket + bucket_type: exp2 + bucket_min: 0 + bucket_max: 34 + bucket_multiplier: 0.000000001 # nanoseconds to seconds + labels: + - name: bucket + size: 4 + decoders: + - name: uint + - name: netstack_latency_udp_sock_enqueued_seconds + help: Time until packet is queued to UDP socket + bucket_type: exp2 + bucket_min: 0 + bucket_max: 34 + bucket_multiplier: 0.000000001 # nanoseconds to seconds + labels: + - name: bucket + size: 4 + 
          decoders:
            - name: uint
    - name: netstack_latency_tcp_sock_read_seconds
      help: Time until packet data is read from TCP socket
      bucket_type: exp2
      bucket_min: 0
      bucket_max: 34
      bucket_multiplier: 0.000000001 # nanoseconds to seconds
      labels:
        - name: bucket
          size: 4
          decoders:
            - name: uint
    - name: netstack_latency_udp_sock_read_seconds
      help: Time until packet data is read from UDP socket
      bucket_type: exp2
      bucket_min: 0
      bucket_max: 34
      bucket_multiplier: 0.000000001 # nanoseconds to seconds
      labels:
        - name: bucket
          size: 4
          decoders:
            - name: uint