What eBPF actually is
eBPF (extended Berkeley Packet Filter) is a virtual machine embedded in the Linux kernel. You write a small program in a restricted C subset, compile it to eBPF bytecode, and load it into the kernel. The kernel's verifier statically analyses the program — proving it terminates, never accesses out-of-bounds memory, and doesn't call arbitrary kernel functions — then JIT-compiles it to native machine code.
The result: safe, sandboxed programs running at kernel speed, without loading kernel modules, without rebooting, and without risking a kernel panic. This is why so many major observability, networking, and security tools (Cilium, Pixie, Falco, Tetragon) are built on, or have migrated to, eBPF.
The verifier — what it checks
The verifier is the reason eBPF is safe. It performs:
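- Control flow analysis — every path through the program must provably terminate. Before kernel 5.3 the program had to be a DAG (directed acyclic graph) with no back-edges at all; since 5.3 the verifier accepts loops whose bound it can prove. Unbounded loops are always rejected.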
- Register tracking — the verifier tracks the type and range of every register at every instruction. If you dereference a pointer without checking for NULL, the verifier rejects the program.
- Stack depth limiting — maximum 512 bytes of stack per program
- Helper function whitelist — eBPF programs can only call approved `bpf_*` helper functions, not arbitrary kernel symbols.
# See the verifier's output when loading a program
bpftool prog load my_prog.o /sys/fs/bpf/my_prog 2>&1 | head -50
eBPF map types — choosing the right one
Maps are the data structures shared between eBPF programs and userspace. Choosing the wrong map type is the most common performance mistake.
BPF_MAP_TYPE_HASH
Generic hash table. O(1) average lookup. Use for: per-IP counters, connection tracking, policy enforcement. Downside: worst-case O(n) lookups under hash collision attacks.
BPF_MAP_TYPE_PERCPU_HASH
Hash table with per-CPU values. Each CPU updates its own copy of the value, so high-frequency updates need no atomic operations and cause no cross-CPU contention. Use for: counters updated on every packet. Read from userspace with bpftool map dump and sum across CPUs.
BPF_MAP_TYPE_LRU_HASH
Hash table that evicts least-recently-used entries when full. Use for: connection tracking where you can't bound the number of entries.
BPF_MAP_TYPE_RINGBUF
Multi-producer, single-consumer ring buffer, shared by all CPUs, for streaming events to userspace (kernel 5.8+). Prefer this over BPF_MAP_TYPE_PERF_EVENT_ARRAY for new programs — lower overhead and supports variable-length records.
Writing a TC classifier that tracks per-port bytes
Unlike XDP (which runs in the driver), TC (Traffic Control) classifiers run after the sk_buff is allocated, giving access to more metadata. Here's a program that tracks bytes per destination port:
// tc_port_bytes.c
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
/* Per-port traffic totals; the value type stored in the port_bytes map. */
struct port_stats {
/* Running sum of skb->len over the packets counted for this port. */
__u64 bytes;
/* Number of packets counted for this port. */
__u64 packets;
};
/*
 * port_bytes: per-CPU hash map from destination port to struct port_stats.
 * Every CPU keeps its own copy of each value, so a reader has to add the
 * per-CPU copies together to obtain a port's total.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
__type(key, __u16); // destination port, host byte order
__type(value, struct port_stats);
// 65536 entries: one for every possible 16-bit port number
__uint(max_entries, 65536);
} port_bytes SEC(".maps");
static __always_inline int parse_and_count(struct __sk_buff *skb) {
void *data = (void *)(long)skb->data;
void *data_end = (void *)(long)skb->data_end;
struct ethhdr *eth = data;
if ((void *)(eth + 1) > data_end) return TC_ACT_OK;
if (bpf_ntohs(eth->h_proto) != ETH_P_IP) return TC_ACT_OK;
struct iphdr *ip = (void *)(eth + 1);
if ((void *)(ip + 1) > data_end) return TC_ACT_OK;
__u16 dport = 0;
__u8 proto = ip->protocol;
void *l4 = (void *)ip + (ip->ihl * 4);
if (proto == IPPROTO_TCP) {
struct tcphdr *tcp = l4;
if ((void *)(tcp + 1) > data_end) return TC_ACT_OK;
dport = bpf_ntohs(tcp->dest);
} else if (proto == IPPROTO_UDP) {
struct udphdr *udp = l4;
if ((void *)(udp + 1) > data_end) return TC_ACT_OK;
dport = bpf_ntohs(udp->dest);
} else {
return TC_ACT_OK;
}
struct port_stats *stats = bpf_map_lookup_elem(&port_bytes, &dport);
if (stats) {
stats->bytes += skb->len;
stats->packets += 1;
} else {
struct port_stats init = { .bytes = skb->len, .packets = 1 };
bpf_map_update_elem(&port_bytes, &dport, &init, BPF_ANY);
}
return TC_ACT_OK; // always let the packet through
}
/*
 * TC entry point, placed in the "tc" ELF section.
 * Hands the packet to parse_and_count() and returns its verdict unchanged.
 */
SEC("tc")
int tc_ingress(struct __sk_buff *skb)
{
    int verdict = parse_and_count(skb);

    return verdict;
}
/* License string emitted into the "license" ELF section; the kernel checks it at load time to decide whether GPL-only helpers may be called. */
char _license[] SEC("license") = "GPL";
Attach with tc and read results
# Compile. -g makes clang emit BTF, which libbpf requires in order to load
# maps declared in SEC(".maps").
clang -O2 -g -target bpf -c tc_port_bytes.c -o tc_port_bytes.o \
  "-I/usr/include/$(uname -m)-linux-gnu"
# Create a clsact qdisc on the interface
tc qdisc add dev eth0 clsact
# Attach as ingress classifier (da = direct-action: the program's return
# value is used as the TC verdict)
tc filter add dev eth0 ingress bpf da obj tc_port_bytes.o sec tc
# Read the per-CPU map as JSON (-j) and sum each port's bytes across CPUs.
# The BTF-decoded view of every entry is under "formatted"; for a per-CPU
# map it holds one {"cpu": N, "value": {...}} object per CPU under "values".
bpftool -j map dump name port_bytes | \
python3 -c "
import sys, json
from collections import defaultdict
data = json.load(sys.stdin)
totals = defaultdict(int)
for entry in data:
    decoded = entry['formatted']
    port = decoded['key']
    for per_cpu in decoded['values']:
        totals[port] += per_cpu['value']['bytes']
for port, b in sorted(totals.items(), key=lambda x: -x[1])[:20]:
    print(f'port {port:5d}: {b/1e6:.1f} MB')
"
Use BPF_MAP_TYPE_PERCPU_HASH for any counter updated per-packet. The per-CPU variant eliminates atomic operations entirely — each CPU writes to its own slot, and you aggregate in userspace. At 1 Mpps this difference is measurable.