Oct 15, 2022 6 min read ebpf

深入了解 ebpf map

探究 ebpf map 在 kernel 中加载和使用的原理，最后给出两个比较常见的 map 使用的示例

ebpf map 原理

map 创建原理

ebpf 程序需要由用户态进程（程序） load 进内核，由于 bpf syscall 的存在，ebpf map 创建可以大致分两种方式：

用户态进程（程序）通过 bpf syscall 来创建和管理 ebpf map；
内核态在 load ebpf 程序的时候通过解析 ELF 文件的 map section 来创建 ebpf map；

1. 用户态创建 map

用户态直接创建 map 的原理主要是根据 bpf_attr 定义调用 bpf syscall 实现的，具体如下代码所示：

#include <linux/bpf.h>

union bpf_attr my_map_attr {
  .map_type = BPF_MAP_TYPE_ARRAY,
  .key_size = sizeof(int),
  .value_size = sizeof(int),
  .max_entries = 1024,
};

int fd = bpf(BPF_MAP_CREATE, &my_map_attr, sizeof(my_map_attr));

2. 内核态创建 map

在 bpf 程序中添加如下结构体声明即可在 bpf 程序加载流程中创建 ebpf map，关键的原理还是在于 kernel 加载 bpf 程序的时候（load_bpf_file()），如果解析 bpf 程序（object 文件）时发现其中包含 maps section 的定义，就会触发 load_maps() 执行，这其中就包括了 bpf_create_map_node() 和 bpf_create_map_in_map_node() 函数。

💡

注意这里所涉及的 API 特指 libbpf，llvm 在将 bpf 程序的 c 文件编译成 object 文件时会链接 libbpf

/*
 * Map definition emitted into the "maps" ELF section: a hash map with
 * int-sized keys and values and room for 100 entries.
 */
struct bpf_map_def SEC("maps") my_bpf_map = {
  .type       = BPF_MAP_TYPE_HASH, 
  .key_size   = sizeof(int),
  .value_size   = sizeof(int),
  .max_entries = 100,
};

ebpf map 管理

map 创建完成之后涉及到的就是 CRUD 了，主要还是 map 相关几个操作函数：

helper:(kernel space)
- void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
- long bpf_map_update_elem(struct bpf_map *map, const void *key,
       const void *value, u64 flags)
- long bpf_map_delete_elem(struct bpf_map *map, const void *key)
- ...

libbpf:(user space)
// https://elixir.bootlin.com/linux/v4.19.261/source/tools/lib/bpf/bpf.c#L299
- int bpf_map_lookup_elem(int fd, const void *key, void *value)
- int bpf_map_update_elem(int fd, const void *key, const void *value,
			__u64 flags)
- int bpf_map_delete_elem(int fd, const void *key)
- ...

ebpf map 使用方法

共享数据

/* example ebpf program from:
 *   https://github.com/bigwhite/experiments/blob/master/ebpf-examples/execve-counter/execve_counter.bpf.c
 */

#include <bpf/bpf_helpers.h>

/* Short alias for the 64-bit unsigned type __u64. */
typedef __u64 u64;
/* Fixed-size 64-byte character array, used as the map key type below. */
typedef char stringkey[64];

/*
 * Hash map holding the execve counter: at most 128 entries, keyed by a
 * stringkey (64-byte char array) with a u64 value.
 */
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 128);
    //__type(key, stringkey);
    /* Key type spelled out by hand as a pointer member instead of via __type(). */
	stringkey* key;
    __type(value, u64);
} execve_counter SEC(".maps");

/*
 * Tracepoint handler for sys_enter_execve: increments the counter stored
 * under the "execve_counter" key.
 *
 * The entry is never created here; when the key is absent the handler does
 * nothing, so the counter has to be seeded from outside this program.
 *
 * Always returns 0.
 */
SEC("tracepoint/syscalls/sys_enter_execve")
int bpf_prog(void *ctx) {
  stringkey key = "execve_counter";
  u64 *v = bpf_map_lookup_elem(&execve_counter, &key);

  if (v == NULL) {
    return 0;
  }

  /*
   * The lookup hands back a pointer to the value stored in the map, so the
   * increment can be done in place without a follow-up update call. The
   * atomic add keeps concurrent execve() calls on different CPUs from
   * losing increments.
   */
  __sync_fetch_and_add(v, 1);

  /* The value is a u64, so it must be printed with %llu rather than %d. */
  bpf_printk("map value: %llu\n", *v);
  return 0;
}

/* License string placed in the "license" ELF section of the BPF object. */
char LICENSE[] SEC("license") = "Dual BSD/GPL";

/* example userspace program from:
 *   https://github.com/bigwhite/experiments/blob/master/ebpf-examples/execve-counter/execve_counter.c
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>
#include <bpf/libbpf.h>
#include <linux/bpf.h>
#include "execve_counter.skel.h"

/* Short alias for the 64-bit unsigned type __u64. */
typedef __u64 u64;

/*
 * libbpf print callback: writes every message to stderr, whatever its level,
 * and returns what vfprintf() returned.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	int written = vfprintf(stderr, format, args);

	return written;
}

/*
 * Loads the execve_counter BPF skeleton, seeds the counter entry with 0,
 * attaches the tracepoint handler and then prints the counter every
 * 5 seconds until a map lookup fails.
 *
 * Returns 1 when the skeleton cannot be opened, otherwise the negated
 * error code of the step that failed.
 */
int main(int argc, char **argv)
{
	struct execve_counter_bpf *skel;
	/*
	 * The buffer must be exactly as large as the map's key type (a 64-byte
	 * char array on the BPF side): libbpf rejects a key_sz that does not
	 * match the map definition. Unused bytes are zero-filled by the
	 * initializer.
	 */
	char key[64] = "execve_counter";
	u64 v = 0;
	int err;

	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
	/* Set up libbpf errors and debug info callback */
	libbpf_set_print(libbpf_print_fn);

	/* Open BPF application */
	skel = execve_counter_bpf__open();
	if (!skel) {
		fprintf(stderr, "Failed to open BPF skeleton\n");
		return 1;
	}

	/* Load & verify BPF programs */
	err = execve_counter_bpf__load(skel);
	if (err) {
		fprintf(stderr, "Failed to load and verify BPF skeleton\n");
		goto cleanup;
	}

	/* Seed the counter: the BPF program only increments an existing entry */
	err = bpf_map__update_elem(skel->maps.execve_counter, &key, sizeof(key), &v, sizeof(v), BPF_ANY);
	if (err != 0) {
		fprintf(stderr, "Failed to init the counter, %d\n", err);
		goto cleanup;
	}

	/* Attach tracepoint handler */
	err = execve_counter_bpf__attach(skel);
	if (err) {
		fprintf(stderr, "Failed to attach BPF skeleton\n");
		goto cleanup;
	}

	for (;;) {
		/* Read the current counter value back from the map */
		err = bpf_map__lookup_elem(skel->maps.execve_counter, &key, sizeof(key), &v, sizeof(v), BPF_ANY);
		if (err != 0) {
			fprintf(stderr, "Lookup key from map error: %d\n", err);
			goto cleanup;
		}
		printf("execve_counter is %llu\n", v);

		sleep(5);
	}

cleanup:
	execve_counter_bpf__destroy(skel);
	return -err;
}

共享事件

perf buffer

/* bpf program example from: 
 *  https://github.com/anakryiko/bpf-ringbuf-examples/blob/main/src/perfbuf-output.bpf.c
 */
/* BPF perfbuf map */
/* Perf event array map named pb; key and value sizes are both sizeof(int). */
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
} pb SEC(".maps");

/*
 * Single-entry per-CPU array holding one struct event, looked up with key 0
 * by the handler below.
 * NOTE(review): presumably scratch space because struct event is too large
 * for the BPF stack — confirm against the definition of struct event.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct event);
} heap SEC(".maps");

/*
 * sched_process_exec handler: fills the struct event stored in slot 0 of
 * the heap map with the pid, comm and filename of the exec'ing task, then
 * emits it through the pb perf event array. Always returns 0.
 */
SEC("tp/sched/sched_process_exec")
int handle_exec(struct trace_event_raw_sched_process_exec *ctx)
{
	int slot = 0;
	struct event *evt = bpf_map_lookup_elem(&heap, &slot);
	unsigned name_off;

	/* Not expected to happen, but the pointer has to be checked before use */
	if (evt == NULL)
		return 0;

	/* The low 16 bits of __data_loc_filename are used as an offset from ctx */
	name_off = ctx->__data_loc_filename & 0xFFFF;

	/* The upper 32 bits of the pid_tgid value are stored as the pid */
	evt->pid = bpf_get_current_pid_tgid() >> 32;
	bpf_get_current_comm(&evt->comm, sizeof(evt->comm));
	bpf_probe_read_str(&evt->filename, sizeof(evt->filename), (void *)ctx + name_off);

	bpf_perf_event_output(ctx, &pb, BPF_F_CURRENT_CPU, evt, sizeof(*evt));
	return 0;
}

/* userspace program example from:
 *   https://github.com/anakryiko/bpf-ringbuf-examples/blob/main/src/perfbuf-output.c
 */
/*
 * Loads and attaches the perfbuf_output BPF skeleton, creates a perf buffer
 * on its pb map and polls it until the exiting flag is set or polling
 * fails.
 *
 * Returns 1 when the skeleton cannot be opened/loaded, the negated error
 * code when a later step fails, and 0 otherwise.
 */
int main(int argc, char **argv)
{
	struct perf_buffer_opts pb_opts = {};
	struct perfbuf_output_bpf *skel;
	struct perf_buffer *pb = NULL;
	int err;

	/* Route libbpf log output through our callback */
	libbpf_set_print(libbpf_print_fn);

	/* Bump RLIMIT_MEMLOCK to create BPF maps */
	bump_memlock_rlimit();

	/* Let SIGINT (Ctrl-C) and SIGTERM request a clean shutdown */
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);

	/* Load and verify BPF application */
	skel = perfbuf_output_bpf__open_and_load();
	if (skel == NULL) {
		fprintf(stderr, "Failed to open and load BPF skeleton\n");
		return 1;
	}

	/* Attach tracepoint */
	err = perfbuf_output_bpf__attach(skel);
	if (err != 0) {
		fprintf(stderr, "Failed to attach BPF skeleton\n");
		goto cleanup;
	}

	/* Set up perf buffer polling */
	pb_opts.sample_cb = handle_event;
	pb = perf_buffer__new(bpf_map__fd(skel->maps.pb), 8 /* 32KB per CPU */, &pb_opts);
	if (libbpf_get_error(pb)) {
		err = -1;
		fprintf(stderr, "Failed to create perf buffer\n");
		goto cleanup;
	}

	/* Print the table header, then process events */
	printf("%-8s %-5s %-7s %-16s %s\n",
	       "TIME", "EVENT", "PID", "COMM", "FILENAME");
	for (;;) {
		if (exiting)
			break;

		err = perf_buffer__poll(pb, 100 /* timeout, ms */);
		if (err >= 0)
			continue;

		/* Ctrl-C will cause -EINTR, which is not treated as a failure */
		if (err == -EINTR)
			err = 0;
		else
			printf("Error polling perf buffer: %d\n", err);
		break;
	}

cleanup:
	perf_buffer__free(pb);
	perfbuf_output_bpf__destroy(skel);

	return err < 0 ? -err : 0;
}

ring buffer

/* bpf program example from: 
 *   https://github.com/anakryiko/bpf-ringbuf-examples/blob/main/src/ringbuf-reserve-submit.bpf.c
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "common.h"

/* License string placed in the "license" ELF section of the BPF object. */
char LICENSE[] SEC("license") = "Dual BSD/GPL";

/* BPF ringbuf map */
/* Ring buffer map named rb; max_entries is set to 256 * 1024. */
struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	__uint(max_entries, 256 * 1024 /* 256 KB */);
} rb SEC(".maps");

/*
 * sched_process_exec handler: reserves a struct event in the rb ring
 * buffer, fills in the pid, comm and filename of the exec'ing task and
 * submits it. Returns 0 in every case, including when no space could be
 * reserved.
 */
SEC("tp/sched/sched_process_exec")
int handle_exec(struct trace_event_raw_sched_process_exec *ctx)
{
	struct event *evt = bpf_ringbuf_reserve(&rb, sizeof(*evt), 0);
	unsigned name_off;

	/* Nothing reserved: skip this event */
	if (evt == NULL)
		return 0;

	/* The low 16 bits of __data_loc_filename are used as an offset from ctx */
	name_off = ctx->__data_loc_filename & 0xFFFF;

	/* The upper 32 bits of the pid_tgid value are stored as the pid */
	evt->pid = bpf_get_current_pid_tgid() >> 32;
	bpf_get_current_comm(&evt->comm, sizeof(evt->comm));
	bpf_probe_read_str(&evt->filename, sizeof(evt->filename), (void *)ctx + name_off);

	bpf_ringbuf_submit(evt, 0);
	return 0;
}


/* userspace example program:
 *   https://github.com/anakryiko/bpf-ringbuf-examples/blob/main/src/ringbuf-reserve-submit.c
 */
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <sys/resource.h>
#include <bpf/libbpf.h>
#include "common.h"
#include "ringbuf-reserve-submit.skel.h"

/*
 * libbpf print callback: messages whose level is above LIBBPF_INFO
 * (debug-level logs) are dropped and 0 is returned; everything else is
 * written to stderr and vfprintf()'s result is returned.
 */
int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	return level > LIBBPF_INFO ? 0 : vfprintf(stderr, format, args);
}

/*
 * Sets both the soft and hard RLIMIT_MEMLOCK limits to RLIM_INFINITY.
 * If setrlimit() fails, prints a message to stderr and exits with status 1.
 */
void bump_memlock_rlimit(void)
{
	struct rlimit unlimited = { .rlim_cur = RLIM_INFINITY, .rlim_max = RLIM_INFINITY };

	if (setrlimit(RLIMIT_MEMLOCK, &unlimited) == 0)
		return;

	fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n");
	exit(1);
}

/*
 * Shutdown request flag, set from the signal handler and read by the main
 * polling loop. volatile sig_atomic_t is the type the C standard allows a
 * signal handler to write to safely; zero means "keep running".
 */
static volatile sig_atomic_t exiting = 0;

/*
 * Signal handler: records that termination was requested.
 * The signal number itself is not used.
 */
static void sig_handler(int sig)
{
	(void)sig;
	exiting = 1;
}

/*
 * Event callback: treats data as a struct event and prints one table row
 * with the current local time (HH:MM:SS), the fixed tag "EXEC", and the
 * event's pid, comm and filename. ctx and data_sz are not used.
 * Always returns 0.
 */
int handle_event(void *ctx, void *data, size_t data_sz)
{
	const struct event *evt = data;
	time_t now = time(NULL);
	char stamp[32];

	strftime(stamp, sizeof(stamp), "%H:%M:%S", localtime(&now));
	printf("%-8s %-5s %-7d %-16s %s\n", stamp, "EXEC", evt->pid, evt->comm, evt->filename);

	return 0;
}

/*
 * Loads and attaches the ringbuf_reserve_submit BPF skeleton, creates a
 * ring buffer consumer on its rb map with handle_event as the callback and
 * polls it until the exiting flag is set or polling fails.
 *
 * Returns 1 when the skeleton cannot be opened/loaded, the negated error
 * code when a later step fails, and 0 otherwise.
 */
int main(int argc, char **argv)
{
	struct ringbuf_reserve_submit_bpf *skel;
	struct ring_buffer *rb = NULL;
	int err;

	/* Route libbpf log output through our callback */
	libbpf_set_print(libbpf_print_fn);

	/* Bump RLIMIT_MEMLOCK to create BPF maps */
	bump_memlock_rlimit();

	/* Let SIGINT (Ctrl-C) and SIGTERM request a clean shutdown */
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);

	/* Load and verify BPF application */
	skel = ringbuf_reserve_submit_bpf__open_and_load();
	if (skel == NULL) {
		fprintf(stderr, "Failed to open and load BPF skeleton\n");
		return 1;
	}

	/* Attach tracepoint */
	err = ringbuf_reserve_submit_bpf__attach(skel);
	if (err != 0) {
		fprintf(stderr, "Failed to attach BPF skeleton\n");
		goto cleanup;
	}

	/* Set up ring buffer polling */
	rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL);
	if (rb == NULL) {
		err = -1;
		fprintf(stderr, "Failed to create ring buffer\n");
		goto cleanup;
	}

	/* Print the table header, then process events */
	printf("%-8s %-5s %-7s %-16s %s\n",
	       "TIME", "EVENT", "PID", "COMM", "FILENAME");
	for (;;) {
		if (exiting)
			break;

		err = ring_buffer__poll(rb, 100 /* timeout, ms */);
		if (err >= 0)
			continue;

		/* Ctrl-C will cause -EINTR, which is not treated as a failure */
		if (err == -EINTR)
			err = 0;
		else
			printf("Error polling ring buffer: %d\n", err);
		break;
	}

cleanup:
	ring_buffer__free(rb);
	ringbuf_reserve_submit_bpf__destroy(skel);

	return err < 0 ? -err : 0;
}

ebpf map 原理

map 创建原理

1. 用户态创建 map

2. 内核态创建 map

ebpf map 管理

ebpf map 使用方法

共享数据

共享事件

perf buffer

ring buffer

References

You might also like...

Agentic AI

如何在人工智能领域建立自己的职业生涯

我的消费决策模型

关于计算机安全的一些碎碎念

来自 Google 内部的另外一种声音：AI 没有护城河

Popular tags