diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c index 781aa1d128f1..f1670e3014ed 100644 --- a/tools/lib/bpf/usdt.c +++ b/tools/lib/bpf/usdt.c @@ -183,10 +183,56 @@ * gain. */ +#define USDT_BASE_SEC ".stapsdt.base" +#define USDT_SEMA_SEC ".probes" +#define USDT_NOTE_SEC ".note.stapsdt" +#define USDT_NOTE_TYPE 3 +#define USDT_NOTE_NAME "stapsdt" + +/* should match exactly enum __bpf_usdt_arg_type from bpf_usdt.bpf.h */ +enum usdt_arg_type { + USDT_ARG_CONST, + USDT_ARG_REG, + USDT_ARG_REG_DEREF, +}; + +/* should match exactly struct __bpf_usdt_arg_spec from bpf_usdt.bpf.h */ +struct usdt_arg_spec { + __u64 val_off; + enum usdt_arg_type arg_type; + short reg_off; + bool arg_signed; + char arg_bitshift; +}; + +/* should match BPF_USDT_MAX_ARG_CNT in usdt.bpf.h */ +#define USDT_MAX_ARG_CNT 12 + +/* should match struct __bpf_usdt_spec from usdt.bpf.h */ +struct usdt_spec { + struct usdt_arg_spec args[USDT_MAX_ARG_CNT]; + __u64 usdt_cookie; + short arg_cnt; +}; + +struct usdt_note { + const char *provider; + const char *name; + /* USDT args specification string, e.g.: + * "-4@%esi -4@-24(%rbp) -4@%ecx 2@%ax 8@%rdx" + */ + const char *args; + long loc_addr; + long base_addr; + long sema_addr; +}; + struct usdt_target { long abs_ip; long rel_ip; long sema_off; + struct usdt_spec spec; + const char *spec_str; }; struct usdt_manager { @@ -292,11 +338,450 @@ static int sanity_check_usdt_elf(Elf *elf, const char *path) return 0; } +static int find_elf_sec_by_name(Elf *elf, const char *sec_name, GElf_Shdr *shdr, Elf_Scn **scn) +{ + Elf_Scn *sec = NULL; + size_t shstrndx; + + if (elf_getshdrstrndx(elf, &shstrndx)) + return -EINVAL; + + /* check if ELF is corrupted and avoid calling elf_strptr if yes */ + if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) + return -EINVAL; + + while ((sec = elf_nextscn(elf, sec)) != NULL) { + char *name; + + if (!gelf_getshdr(sec, shdr)) + return -EINVAL; + + name = elf_strptr(elf, shstrndx, shdr->sh_name); + if (name && strcmp(sec_name, name) == 0) { + *scn = sec; + return 0; + } + } + + return -ENOENT; +} + +struct elf_seg { + long start; + long end; + long offset; + bool is_exec; +}; + +static int cmp_elf_segs(const void *_a, const void *_b) +{ + const struct elf_seg *a = _a; + const struct elf_seg *b = _b; + + return a->start < b->start ? -1 : 1; +} + +static int parse_elf_segs(Elf *elf, const char *path, struct elf_seg **segs, size_t *seg_cnt) +{ + GElf_Phdr phdr; + size_t n; + int i, err; + struct elf_seg *seg; + void *tmp; + + *seg_cnt = 0; + + if (elf_getphdrnum(elf, &n)) { + err = -errno; + return err; + } + + for (i = 0; i < n; i++) { + if (!gelf_getphdr(elf, i, &phdr)) { + err = -errno; + return err; + } + + pr_debug("usdt: discovered PHDR #%d in '%s': vaddr 0x%lx memsz 0x%lx offset 0x%lx type 0x%lx flags 0x%lx\n", + i, path, (long)phdr.p_vaddr, (long)phdr.p_memsz, (long)phdr.p_offset, + (long)phdr.p_type, (long)phdr.p_flags); + if (phdr.p_type != PT_LOAD) + continue; + + tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); + if (!tmp) + return -ENOMEM; + + *segs = tmp; + seg = *segs + *seg_cnt; + (*seg_cnt)++; + + seg->start = phdr.p_vaddr; + seg->end = phdr.p_vaddr + phdr.p_memsz; + seg->offset = phdr.p_offset; + seg->is_exec = phdr.p_flags & PF_X; + } + + if (*seg_cnt == 0) { + pr_warn("usdt: failed to find PT_LOAD program headers in '%s'\n", path); + return -ESRCH; + } + + qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); + return 0; +} + +static int parse_lib_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt) +{ + char path[PATH_MAX], line[PATH_MAX], mode[16]; + size_t seg_start, seg_end, seg_off; + struct elf_seg *seg; + int tmp_pid, i, err; + FILE *f; + + *seg_cnt = 0; + + /* Handle containerized binaries only accessible from + * /proc//root/. They will be reported as just / in + * /proc//maps. + */ + if (sscanf(lib_path, "/proc/%d/root%s", &tmp_pid, path) == 2 && pid == tmp_pid) + goto proceed; + + if (!realpath(lib_path, path)) { + pr_warn("usdt: failed to get absolute path of '%s' (err %d), using path as is...\n", + lib_path, -errno); + strcpy(path, lib_path); + } + +proceed: + sprintf(line, "/proc/%d/maps", pid); + f = fopen(line, "r"); + if (!f) { + err = -errno; + pr_warn("usdt: failed to open '%s' to get base addr of '%s': %d\n", + line, lib_path, err); + return err; + } + + /* We need to handle lines with no path at the end: + * + * 7f5c6f5d1000-7f5c6f5d3000 rw-p 001c7000 08:04 21238613 /usr/lib64/libc-2.17.so + * 7f5c6f5d3000-7f5c6f5d8000 rw-p 00000000 00:00 0 + * 7f5c6f5d8000-7f5c6f5d9000 r-xp 00000000 103:01 362990598 /data/users/andriin/linux/tools/bpf/usdt/libhello_usdt.so + */ + while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n", + &seg_start, &seg_end, mode, &seg_off, line) == 5) { + void *tmp; + + /* to handle no path case (see above) we need to capture line + * without skipping any whitespaces. So we need to strip + * leading whitespaces manually here + */ + i = 0; + while (isblank(line[i])) + i++; + if (strcmp(line + i, path) != 0) + continue; + + pr_debug("usdt: discovered segment for lib '%s': addrs %zx-%zx mode %s offset %zx\n", + path, seg_start, seg_end, mode, seg_off); + + /* ignore non-executable sections for shared libs */ + if (mode[2] != 'x') + continue; + + tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); + if (!tmp) { + err = -ENOMEM; + goto err_out; + } + + *segs = tmp; + seg = *segs + *seg_cnt; + *seg_cnt += 1; + + seg->start = seg_start; + seg->end = seg_end; + seg->offset = seg_off; + seg->is_exec = true; + } + + if (*seg_cnt == 0) { + pr_warn("usdt: failed to find '%s' (resolved to '%s') within PID %d memory mappings\n", + lib_path, path, pid); + err = -ESRCH; + goto err_out; + } + + qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); + err = 0; +err_out: + fclose(f); + return err; +} + +static struct elf_seg *find_elf_seg(struct elf_seg *segs, size_t seg_cnt, long addr, bool relative) +{ + struct elf_seg *seg; + int i; + + if (relative) { + /* for shared libraries, address is relative offset and thus + * should be fall within logical offset-based range of + * [offset_start, offset_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->offset <= addr && addr < seg->offset + (seg->end - seg->start)) + return seg; + } + } else { + /* for binaries, address is absolute and thus should be within + * absolute address range of [seg_start, seg_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->start <= addr && addr < seg->end) + return seg; + } + } + + return NULL; +} + +static int parse_usdt_note(Elf *elf, const char *path, long base_addr, + GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, + struct usdt_note *usdt_note); + +static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, long usdt_cookie); + static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *path, pid_t pid, const char *usdt_provider, const char *usdt_name, long usdt_cookie, struct usdt_target **out_targets, size_t *out_target_cnt) { - return -ENOTSUP; + size_t off, name_off, desc_off, seg_cnt = 0, lib_seg_cnt = 0, target_cnt = 0; + struct elf_seg *segs = NULL, *lib_segs = NULL; + struct usdt_target *targets = NULL, *target; + long base_addr = 0; + Elf_Scn *notes_scn, *base_scn; + GElf_Shdr base_shdr, notes_shdr; + GElf_Ehdr ehdr; + GElf_Nhdr nhdr; + Elf_Data *data; + int err; + + *out_targets = NULL; + *out_target_cnt = 0; + + err = find_elf_sec_by_name(elf, USDT_NOTE_SEC, ¬es_shdr, ¬es_scn); + if (err) { + pr_warn("usdt: no USDT notes section (%s) found in '%s'\n", USDT_NOTE_SEC, path); + return err; + } + + if (notes_shdr.sh_type != SHT_NOTE || !gelf_getehdr(elf, &ehdr)) { + pr_warn("usdt: invalid USDT notes section (%s) in '%s'\n", USDT_NOTE_SEC, path); + return -EINVAL; + } + + err = parse_elf_segs(elf, path, &segs, &seg_cnt); + if (err) { + pr_warn("usdt: failed to process ELF program segments for '%s': %d\n", path, err); + goto err_out; + } + + /* .stapsdt.base ELF section is optional, but is used for prelink + * offset compensation (see a big comment further below) + */ + if (find_elf_sec_by_name(elf, USDT_BASE_SEC, &base_shdr, &base_scn) == 0) + base_addr = base_shdr.sh_addr; + + data = elf_getdata(notes_scn, 0); + off = 0; + while ((off = gelf_getnote(data, off, &nhdr, &name_off, &desc_off)) > 0) { + long usdt_abs_ip, usdt_rel_ip, usdt_sema_off = 0; + struct usdt_note note; + struct elf_seg *seg = NULL; + void *tmp; + + err = parse_usdt_note(elf, path, base_addr, &nhdr, + data->d_buf, name_off, desc_off, ¬e); + if (err) + goto err_out; + + if (strcmp(note.provider, usdt_provider) != 0 || strcmp(note.name, usdt_name) != 0) + continue; + + /* We need to compensate "prelink effect". See [0] for details, + * relevant parts quoted here: + * + * Each SDT probe also expands into a non-allocated ELF note. You can + * find this by looking at SHT_NOTE sections and decoding the format; + * see below for details. Because the note is non-allocated, it means + * there is no runtime cost, and also preserved in both stripped files + * and .debug files. + * + * However, this means that prelink won't adjust the note's contents + * for address offsets. Instead, this is done via the .stapsdt.base + * section. This is a special section that is added to the text. We + * will only ever have one of these sections in a final link and it + * will only ever be one byte long. Nothing about this section itself + * matters, we just use it as a marker to detect prelink address + * adjustments. + * + * Each probe note records the link-time address of the .stapsdt.base + * section alongside the probe PC address. The decoder compares the + * base address stored in the note with the .stapsdt.base section's + * sh_addr. Initially these are the same, but the section header will + * be adjusted by prelink. So the decoder applies the difference to + * the probe PC address to get the correct prelinked PC address; the + * same adjustment is applied to the semaphore address, if any. + * + * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + */ + usdt_rel_ip = usdt_abs_ip = note.loc_addr; + if (base_addr) { + usdt_abs_ip += base_addr - note.base_addr; + usdt_rel_ip += base_addr - note.base_addr; + } + + if (ehdr.e_type == ET_EXEC) { + /* When attaching uprobes (which what USDTs basically + * are) kernel expects a relative IP to be specified, + * so if we are attaching to an executable ELF binary + * (i.e., not a shared library), we need to calculate + * proper relative IP based on ELF's load address + */ + seg = find_elf_seg(segs, seg_cnt, usdt_abs_ip, false /* relative */); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_abs_ip); + goto err_out; + } + if (!seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + usdt_abs_ip); + goto err_out; + } + + usdt_rel_ip = usdt_abs_ip - (seg->start - seg->offset); + } else if (!man->has_bpf_cookie) { /* ehdr.e_type == ET_DYN */ + /* If we don't have BPF cookie support but need to + * attach to a shared library, we'll need to know and + * record absolute addresses of attach points due to + * the need to lookup USDT spec by absolute IP of + * triggered uprobe. Doing this resolution is only + * possible when we have a specific PID of the process + * that's using specified shared library. BPF cookie + * removes the absolute address limitation as we don't + * need to do this lookup (we just use BPF cookie as + * an index of USDT spec), so for newer kernels with + * BPF cookie support libbpf supports USDT attachment + * to shared libraries with no PID filter. + */ + if (pid < 0) { + pr_warn("usdt: attaching to shared libaries without specific PID is not supported on current kernel\n"); + err = -ENOTSUP; + goto err_out; + } + + /* lib_segs are lazily initialized only if necessary */ + if (lib_seg_cnt == 0) { + err = parse_lib_segs(pid, path, &lib_segs, &lib_seg_cnt); + if (err) { + pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %d\n", + pid, path, err); + goto err_out; + } + } + + seg = find_elf_seg(lib_segs, lib_seg_cnt, usdt_rel_ip, true /* relative */); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find shared lib memory segment for '%s:%s' in '%s' at relative IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_rel_ip); + goto err_out; + } + + usdt_abs_ip = seg->start + (usdt_rel_ip - seg->offset); + } + + pr_debug("usdt: probe for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved abs_ip 0x%lx rel_ip 0x%lx) args '%s' in segment [0x%lx, 0x%lx) at offset 0x%lx\n", + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", path, + note.loc_addr, note.base_addr, usdt_abs_ip, usdt_rel_ip, note.args, + seg ? seg->start : 0, seg ? seg->end : 0, seg ? seg->offset : 0); + + /* Adjust semaphore address to be a relative offset */ + if (note.sema_addr) { + if (!man->has_sema_refcnt) { + pr_warn("usdt: kernel doesn't support USDT semaphore refcounting for '%s:%s' in '%s'\n", + usdt_provider, usdt_name, path); + err = -ENOTSUP; + goto err_out; + } + + seg = find_elf_seg(segs, seg_cnt, note.sema_addr, false /* relative */); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF loadable segment with semaphore of '%s:%s' in '%s' at 0x%lx\n", + usdt_provider, usdt_name, path, note.sema_addr); + goto err_out; + } + if (seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx] for semaphore of '%s:%s' at 0x%lx is executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + note.sema_addr); + goto err_out; + } + + usdt_sema_off = note.sema_addr - (seg->start - seg->offset); + + pr_debug("usdt: sema for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved 0x%lx) in segment [0x%lx, 0x%lx] at offset 0x%lx\n", + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", + path, note.sema_addr, note.base_addr, usdt_sema_off, + seg->start, seg->end, seg->offset); + } + + /* Record adjusted addresses and offsets and parse USDT spec */ + tmp = libbpf_reallocarray(targets, target_cnt + 1, sizeof(*targets)); + if (!tmp) { + err = -ENOMEM; + goto err_out; + } + targets = tmp; + + target = &targets[target_cnt]; + memset(target, 0, sizeof(*target)); + + target->abs_ip = usdt_abs_ip; + target->rel_ip = usdt_rel_ip; + target->sema_off = usdt_sema_off; + + /* notes->args references strings from Elf itself, so they can + * be referenced safely until elf_end() call + */ + target->spec_str = note.args; + + err = parse_usdt_spec(&target->spec, ¬e, usdt_cookie); + if (err) + goto err_out; + + target_cnt++; + } + + *out_targets = targets; + *out_target_cnt = target_cnt; + err = target_cnt; + +err_out: + free(segs); + free(lib_segs); + if (err < 0) + free(targets); + return err; } struct bpf_link_usdt { @@ -414,6 +899,7 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct link->uprobe_cnt++; } + free(targets); elf_end(elf); close(fd); @@ -422,8 +908,102 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct err_out: bpf_link__destroy(&link->link); + free(targets); if (elf) elf_end(elf); close(fd); return libbpf_err_ptr(err); } + +/* Parse out USDT ELF note from '.note.stapsdt' section. + * Logic inspired by perf's code. + */ +static int parse_usdt_note(Elf *elf, const char *path, long base_addr, + GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, + struct usdt_note *note) +{ + const char *provider, *name, *args; + long addrs[3]; + size_t len; + + /* sanity check USDT note name and type first */ + if (strncmp(data + name_off, USDT_NOTE_NAME, nhdr->n_namesz) != 0) + return -EINVAL; + if (nhdr->n_type != USDT_NOTE_TYPE) + return -EINVAL; + + /* sanity check USDT note contents ("description" in ELF terminology) */ + len = nhdr->n_descsz; + data = data + desc_off; + + /* +3 is the very minimum required to store three empty strings */ + if (len < sizeof(addrs) + 3) + return -EINVAL; + + /* get location, base, and semaphore addrs */ + memcpy(&addrs, data, sizeof(addrs)); + + /* parse string fields: provider, name, args */ + provider = data + sizeof(addrs); + + name = (const char *)memchr(provider, '\0', data + len - provider); + if (!name) /* non-zero-terminated provider */ + return -EINVAL; + name++; + if (name >= data + len || *name == '\0') /* missing or empty name */ + return -EINVAL; + + args = memchr(name, '\0', data + len - name); + if (!args) /* non-zero-terminated name */ + return -EINVAL; + ++args; + if (args >= data + len) /* missing arguments spec */ + return -EINVAL; + + note->provider = provider; + note->name = name; + if (*args == '\0' || *args == ':') + note->args = ""; + else + note->args = args; + note->loc_addr = addrs[0]; + note->base_addr = addrs[1]; + note->sema_addr = addrs[2]; + + return 0; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg); + +static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, long usdt_cookie) +{ + const char *s; + int len; + + spec->usdt_cookie = usdt_cookie; + spec->arg_cnt = 0; + + s = note->args; + while (s[0]) { + if (spec->arg_cnt >= USDT_MAX_ARG_CNT) { + pr_warn("usdt: too many USDT arguments (> %d) for '%s:%s' with args spec '%s'\n", + USDT_MAX_ARG_CNT, note->provider, note->name, note->args); + return -E2BIG; + } + + len = parse_usdt_arg(s, spec->arg_cnt, &spec->args[spec->arg_cnt]); + if (len < 0) + return len; + + s += len; + spec->arg_cnt++; + } + + return 0; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +{ + pr_warn("usdt: libbpf doesn't support USDTs on current architecture\n"); + return -ENOTSUP; +}