深入了解 ebpf map
探究 ebpf map 在 kernel 中加载和使用的原理,最后给出两个比较常见的 map 使用的示例
ebpf map 原理
map 创建原理
ebpf 程序需要由用户态进程(程序) load 进内核,由于 bpf syscall 的存在,ebpf map 创建可以大致分两种方式:
- 用户态进程(程序)通过 bpf syscall 来创建和管理 ebpf map;
- 内核态在 load ebpf 程序的时候通过解析 ELF 文件的 map section 来创建 ebpf map;
1. 用户态创建 map
用户态直接创建 map 的原理主要是根据 bpf_attr 的定义调用 bpf syscall 实现的,具体如下代码所示:
#include <linux/bpf.h>
union bpf_attr my_map_attr {
.map_type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 1024,
};
int fd = bpf(BPF_MAP_CREATE, &my_map_attr, sizeof(my_map_attr));
2. 内核态创建 map
在 bpf 程序中添加如下结构体声明,即可在 bpf 程序加载流程中创建 ebpf map。关键的原理还是在于 kernel 加载 bpf 程序的时候(load_bpf_file()),如果解析 bpf 程序(object 文件)时发现其中包括 maps section 定义,就会触发 load_maps() 执行,这其中就包括了 bpf_create_map_node() 和 bpf_create_map_in_map_node() 函数。
💡
注意这里所涉及的 API 特指 libbpf,llvm 在将 bpf 程序的 c 文件编译成 object 文件时会链接 libbpf
/*
 * Map definition emitted into the "maps" section of the BPF object:
 * a hash map with int-sized keys, int-sized values and up to 100 entries.
 */
struct bpf_map_def SEC("maps") my_bpf_map = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 100,
};
ebpf map 管理
map 创建完成之后涉及到的就是 CRUD 了,主要还是 map 相关几个操作函数:
helper:(kernel space)
- void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
- long bpf_map_update_elem(struct bpf_map *map, const void *key,
const void *value, u64 flags)
- long bpf_map_delete_elem(struct bpf_map *map, const void *key)
- ...
libbpf:(user space)
// https://elixir.bootlin.com/linux/v4.19.261/source/tools/lib/bpf/bpf.c#L299
- int bpf_map_lookup_elem(int fd, const void *key, void *value)
- int bpf_map_update_elem(int fd, const void *key, const void *value,
__u64 flags)
- int bpf_map_delete_elem(int fd, const void *key)
- ...
ebpf map 使用方法
共享数据
/* example ebpf program from:
* https://github.com/bigwhite/experiments/blob/master/ebpf-examples/execve-counter/execve_counter.bpf.c
*/
#include <bpf/bpf_helpers.h>
/* Short alias for the 64-bit unsigned type used as the counter value. */
typedef __u64 u64;
/* Fixed-size 64-byte character buffer used as the map key. */
typedef char stringkey[64];
/*
 * Hash map with at most 128 entries that maps a stringkey to a u64 counter.
 * The key member is written out by hand as a pointer to stringkey; the
 * equivalent __type(key, stringkey) form is left commented out.
 */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 128);
//__type(key, stringkey);
stringkey* key;
__type(value, u64);
} execve_counter SEC(".maps");
/*
 * Handler attached to the sys_enter_execve tracepoint.
 *
 * Looks up the "execve_counter" key in the execve_counter map and, when the
 * entry exists, increments it by one and logs the new value. If the key is
 * not present in the map the handler does nothing.
 *
 * ctx: tracepoint context (unused).
 * Returns 0 in all cases.
 */
SEC("tracepoint/syscalls/sys_enter_execve")
int bpf_prog(void *ctx) {
	stringkey key = "execve_counter";
	u64 *v = bpf_map_lookup_elem(&execve_counter, &key);

	if (v != NULL) {
		/*
		 * v points at the value stored inside the map, so the increment
		 * is done in place and no bpf_map_update_elem() write-back is
		 * needed. The atomic add avoids lost updates when execve() is
		 * entered on several CPUs at the same time.
		 */
		__sync_fetch_and_add(v, 1);
		/* *v is an unsigned 64-bit value, so it must be printed with %llu. */
		bpf_printk("map value: %llu\n", *v);
	}
	return 0;
}
/* License string placed in the "license" section of the BPF object. */
char LICENSE[] SEC("license") = "Dual BSD/GPL";
/* example userspace program from:
* https://github.com/bigwhite/experiments/blob/master/ebpf-examples/execve-counter/execve_counter.c
*/
#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>
#include <bpf/libbpf.h>
#include <linux/bpf.h>
#include "execve_counter.skel.h"
typedef __u64 u64;
/*
 * Print callback registered with libbpf_set_print().
 *
 * Forwards every message to stderr; the level argument is not consulted.
 * Returns whatever vfprintf() returns.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	int written;

	(void)level;
	written = vfprintf(stderr, format, args);
	return written;
}
/*
 * User-space side of the execve counter example.
 *
 * Opens and loads the BPF skeleton, seeds the "execve_counter" map entry
 * with 0, attaches the tracepoint handler, then prints the counter every
 * 5 seconds until a map lookup fails.
 *
 * Returns 1 when the skeleton cannot be opened; otherwise returns the
 * negated err value of the step that failed.
 */
int main(int argc, char **argv)
{
	struct execve_counter_bpf *skel;
	int err;

	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
	/* Set up libbpf errors and debug info callback */
	libbpf_set_print(libbpf_print_fn);

	/* Open BPF application */
	skel = execve_counter_bpf__open();
	if (!skel) {
		fprintf(stderr, "Failed to open BPF skeleton\n");
		return 1;
	}

	/* Load & verify BPF programs */
	err = execve_counter_bpf__load(skel);
	if (err) {
		fprintf(stderr, "Failed to load and verify BPF skeleton\n");
		goto cleanup;
	}

	/*
	 * Init the counter.
	 *
	 * The key buffer must have exactly the size of the map's key type
	 * (stringkey, i.e. char[64], in the BPF program): libbpf rejects
	 * update/lookup calls whose key_sz differs from the map's key size.
	 * Unused trailing bytes are zero-filled by the initializer, matching
	 * the key built on the BPF side.
	 */
	char key[64] = "execve_counter";
	u64 v = 0;
	err = bpf_map__update_elem(skel->maps.execve_counter, &key, sizeof(key), &v, sizeof(v), BPF_ANY);
	if (err != 0) {
		fprintf(stderr, "Failed to init the counter, %d\n", err);
		goto cleanup;
	}

	/* Attach tracepoint handler */
	err = execve_counter_bpf__attach(skel);
	if (err) {
		fprintf(stderr, "Failed to attach BPF skeleton\n");
		goto cleanup;
	}

	for (;;) {
		/*
		 * Read the counter value from the map. Prototype, from
		 * /usr/local/bpf/include/bpf/libbpf.h:
		 *
		 * LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map,
		 *                 const void *key, size_t key_sz,
		 *                 void *value, size_t value_sz, __u64 flags);
		 */
		err = bpf_map__lookup_elem(skel->maps.execve_counter, &key, sizeof(key), &v, sizeof(v), BPF_ANY);
		if (err != 0) {
			fprintf(stderr, "Lookup key from map error: %d\n", err);
			goto cleanup;
		} else {
			printf("execve_counter is %llu\n", v);
		}
		sleep(5);
	}

cleanup:
	execve_counter_bpf__destroy(skel);
	return -err;
}
共享事件
perf buffer
/* bpf program example from:
* https://github.com/anakryiko/bpf-ringbuf-examples/blob/main/src/perfbuf-output.bpf.c
*/
/*
 * BPF perfbuf map: a perf event array with int-sized keys and values.
 * handle_exec() passes it to bpf_perf_event_output() to emit events.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(int));
} pb SEC(".maps");
/*
 * Single-entry per-CPU array holding one struct event.
 * handle_exec() looks up index 0 and fills that slot in before emitting it.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 1);
__type(key, int);
__type(value, struct event);
} heap SEC(".maps");
/*
 * Handler for the sched_process_exec tracepoint (perf buffer variant).
 *
 * Fills the struct event stored at index 0 of the heap map with the pid,
 * the command name and the file name read from the tracepoint record, then
 * emits it through the pb map. Always returns 0.
 */
SEC("tp/sched/sched_process_exec")
int handle_exec(struct trace_event_raw_sched_process_exec *ctx)
{
	/* The low 16 bits of __data_loc_filename are used as a byte offset from ctx. */
	unsigned name_off = ctx->__data_loc_filename & 0xFFFF;
	int slot = 0;
	struct event *evt = bpf_map_lookup_elem(&heap, &slot);

	if (evt == NULL) /* can't happen */
		return 0;

	/* The upper 32 bits of the pid/tgid pair are stored as the pid. */
	evt->pid = bpf_get_current_pid_tgid() >> 32;
	bpf_get_current_comm(&evt->comm, sizeof(evt->comm));
	bpf_probe_read_str(&evt->filename, sizeof(evt->filename), (void *)ctx + name_off);

	bpf_perf_event_output(ctx, &pb, BPF_F_CURRENT_CPU, evt, sizeof(*evt));
	return 0;
}
/* userspace program example from:
* https://github.com/anakryiko/bpf-ringbuf-examples/blob/main/src/perfbuf-output.c
*/
/*
 * User-space consumer for the perf buffer example.
 *
 * Loads and attaches the BPF skeleton, creates a perf buffer on the pb map
 * with handle_event as the sample callback, and polls it until the exiting
 * flag is set or polling fails.
 *
 * Returns 1 if the skeleton cannot be opened/loaded, -err if a later step
 * left a negative err, and 0 otherwise.
 */
int main(int argc, char **argv)
{
	struct perf_buffer_opts pb_opts = {};
	struct perfbuf_output_bpf *skel;
	struct perf_buffer *pb = NULL;
	int err;

	/* Route libbpf log output through our callback */
	libbpf_set_print(libbpf_print_fn);

	/* Raise RLIMIT_MEMLOCK so BPF maps can be created */
	bump_memlock_rlimit();

	/* Let Ctrl-C / SIGTERM end the poll loop cleanly */
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);

	/* Load and verify the BPF application */
	skel = perfbuf_output_bpf__open_and_load();
	if (skel == NULL) {
		fprintf(stderr, "Failed to open and load BPF skeleton\n");
		return 1;
	}

	/* Attach the tracepoint */
	err = perfbuf_output_bpf__attach(skel);
	if (err != 0) {
		fprintf(stderr, "Failed to attach BPF skeleton\n");
		goto cleanup;
	}

	/* Set up perf buffer polling */
	pb_opts.sample_cb = handle_event;
	pb = perf_buffer__new(bpf_map__fd(skel->maps.pb), 8 /* 32KB per CPU */, &pb_opts);
	if (libbpf_get_error(pb)) {
		fprintf(stderr, "Failed to create perf buffer\n");
		err = -1;
		goto cleanup;
	}

	/* Column header for the event lines */
	printf("%-8s %-5s %-7s %-16s %s\n",
	       "TIME", "EVENT", "PID", "COMM", "FILENAME");

	while (!exiting) {
		err = perf_buffer__poll(pb, 100 /* timeout, ms */);
		if (err >= 0)
			continue;

		/* Ctrl-C will cause -EINTR, which is not reported as an error */
		if (err == -EINTR)
			err = 0;
		else
			printf("Error polling perf buffer: %d\n", err);
		break;
	}

cleanup:
	perf_buffer__free(pb);
	perfbuf_output_bpf__destroy(skel);
	return err < 0 ? -err : 0;
}
ring buffer
/* bpf program example from:
* https://github.com/anakryiko/bpf-ringbuf-examples/blob/main/src/ringbuf-reserve-submit.bpf.c
*/
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "common.h"
/* License string placed in the "license" section of the BPF object. */
char LICENSE[] SEC("license") = "Dual BSD/GPL";
/*
 * BPF ringbuf map with max_entries set to 256 * 1024 (256 KB).
 * handle_exec() reserves and submits struct event records on it.
 */
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 256 * 1024 /* 256 KB */);
} rb SEC(".maps");
/*
 * Handler for the sched_process_exec tracepoint (ring buffer variant).
 *
 * Reserves room for one struct event in the rb map, fills in the pid, the
 * command name and the file name read from the tracepoint record, then
 * submits it. Returns 0 whether or not the reservation succeeded.
 */
SEC("tp/sched/sched_process_exec")
int handle_exec(struct trace_event_raw_sched_process_exec *ctx)
{
	/* The low 16 bits of __data_loc_filename are used as a byte offset from ctx. */
	unsigned name_off = ctx->__data_loc_filename & 0xFFFF;
	struct event *evt = bpf_ringbuf_reserve(&rb, sizeof(*evt), 0);

	if (evt == NULL)
		return 0;

	/* The upper 32 bits of the pid/tgid pair are stored as the pid. */
	evt->pid = bpf_get_current_pid_tgid() >> 32;
	bpf_get_current_comm(&evt->comm, sizeof(evt->comm));
	bpf_probe_read_str(&evt->filename, sizeof(evt->filename), (void *)ctx + name_off);

	bpf_ringbuf_submit(evt, 0);
	return 0;
}
/* userspace example program:
* https://github.com/anakryiko/bpf-ringbuf-examples/blob/main/src/ringbuf-reserve-submit.c
*/
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <sys/resource.h>
#include <bpf/libbpf.h>
#include "common.h"
#include "ringbuf-reserve-submit.skel.h"
/*
 * Print callback registered with libbpf_set_print().
 *
 * Messages whose level is greater than LIBBPF_INFO (debug-level logs) are
 * dropped and 0 is returned; everything else is written to stderr and the
 * vfprintf() result is returned.
 */
int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	return (level > LIBBPF_INFO) ? 0 : vfprintf(stderr, format, args);
}
/*
 * Raise both the soft and hard RLIMIT_MEMLOCK limits to RLIM_INFINITY.
 *
 * If setrlimit() fails, an error is printed to stderr and the process
 * exits with status 1.
 */
void bump_memlock_rlimit(void)
{
	struct rlimit unlimited;

	unlimited.rlim_cur = RLIM_INFINITY;
	unlimited.rlim_max = RLIM_INFINITY;

	if (setrlimit(RLIMIT_MEMLOCK, &unlimited) == 0)
		return;

	fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n");
	exit(1);
}
/*
 * Set to non-zero by sig_handler() to ask the poll loop to stop.
 * Declared volatile sig_atomic_t because that is the object type the C
 * standard allows a signal handler to write; it still works in boolean
 * tests such as `while (!exiting)`.
 */
static volatile sig_atomic_t exiting = 0;

/*
 * Signal handler: records that termination was requested.
 * sig is the delivered signal number and is not used.
 */
static void sig_handler(int sig)
{
	(void)sig;
	exiting = 1;
}
/*
 * Ring buffer sample callback: prints one line per received event.
 *
 * ctx:     callback context (unused).
 * data:    pointer to the sample, read as a struct event.
 * data_sz: size of the sample (unused).
 *
 * The line contains the current local time as HH:MM:SS, the literal tag
 * "EXEC", and the event's pid, comm and filename. Always returns 0.
 */
int handle_event(void *ctx, void *data, size_t data_sz)
{
	const struct event *evt = data;
	char stamp[32];
	time_t now = time(NULL);
	struct tm *local = localtime(&now);

	strftime(stamp, sizeof(stamp), "%H:%M:%S", local);
	printf("%-8s %-5s %-7d %-16s %s\n", stamp, "EXEC", evt->pid, evt->comm, evt->filename);

	return 0;
}
/*
 * User-space consumer for the ring buffer example.
 *
 * Loads and attaches the BPF skeleton, creates a ring buffer on the rb map
 * with handle_event as the callback, and polls it until the exiting flag is
 * set or polling fails.
 *
 * Returns 1 if the skeleton cannot be opened/loaded, -err if a later step
 * left a negative err, and 0 otherwise.
 */
int main(int argc, char **argv)
{
	struct ringbuf_reserve_submit_bpf *skel;
	struct ring_buffer *rb = NULL;
	int err;

	/* Route libbpf log output through our callback */
	libbpf_set_print(libbpf_print_fn);

	/* Raise RLIMIT_MEMLOCK so BPF maps can be created */
	bump_memlock_rlimit();

	/* Let Ctrl-C / SIGTERM end the poll loop cleanly */
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);

	/* Load and verify the BPF application */
	skel = ringbuf_reserve_submit_bpf__open_and_load();
	if (skel == NULL) {
		fprintf(stderr, "Failed to open and load BPF skeleton\n");
		return 1;
	}

	/* Attach the tracepoint */
	err = ringbuf_reserve_submit_bpf__attach(skel);
	if (err != 0) {
		fprintf(stderr, "Failed to attach BPF skeleton\n");
		goto cleanup;
	}

	/* Set up ring buffer polling */
	rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL);
	if (rb == NULL) {
		fprintf(stderr, "Failed to create ring buffer\n");
		err = -1;
		goto cleanup;
	}

	/* Column header for the event lines */
	printf("%-8s %-5s %-7s %-16s %s\n",
	       "TIME", "EVENT", "PID", "COMM", "FILENAME");

	while (!exiting) {
		err = ring_buffer__poll(rb, 100 /* timeout, ms */);
		if (err >= 0)
			continue;

		/* Ctrl-C will cause -EINTR, which is not reported as an error */
		if (err == -EINTR)
			err = 0;
		else
			printf("Error polling ring buffer: %d\n", err);
		break;
	}

cleanup:
	ring_buffer__free(rb);
	ringbuf_reserve_submit_bpf__destroy(skel);
	return err < 0 ? -err : 0;
}