diff --git a/examples/cpp/pyperf/CMakeLists.txt b/examples/cpp/pyperf/CMakeLists.txt index 271ff80a6e99..8bdc7374a101 100644 --- a/examples/cpp/pyperf/CMakeLists.txt +++ b/examples/cpp/pyperf/CMakeLists.txt @@ -19,7 +19,19 @@ add_executable(PyPerf PyOffsets.cc PyPerfNativeStackTrace.cc ) -target_link_libraries(PyPerf pthread libunwind-ptrace.a libunwind-x86_64.a libunwind.a lzma) +target_link_libraries(PyPerf pthread libunwind-ptrace.a) + +execute_process(COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE) + +if(${ARCHITECTURE} STREQUAL "x86_64") + target_link_libraries(PyPerf libunwind-x86_64.a) +elseif(${ARCHITECTURE} STREQUAL "aarch64") + target_link_libraries(PyPerf libunwind-aarch64.a) +endif() +target_link_libraries(PyPerf libunwind.a) # this one is needed after the x86_64/aarch64 link + +target_link_libraries(PyPerf lzma) + if(NOT CMAKE_USE_LIBBPF_PACKAGE) target_link_libraries(PyPerf bcc-static) else() diff --git a/examples/cpp/pyperf/PyOffsets.cc b/examples/cpp/pyperf/PyOffsets.cc index bb969f862658..64deb3ca81f4 100644 --- a/examples/cpp/pyperf/PyOffsets.cc +++ b/examples/cpp/pyperf/PyOffsets.cc @@ -29,6 +29,7 @@ There are a couple of exceptions: 3. PyThreadState.thread - this field's name is "thread_id" in some Python versions. */ +#if defined(__x86_64__) extern const struct struct_offsets kPy27OffsetConfig = { .PyObject = { .ob_type = 8 @@ -229,13 +230,219 @@ extern const struct struct_offsets kPy310OffsetConfig = { }, }; +#elif defined(__aarch64__) + +extern const struct struct_offsets kPy27OffsetConfig = { + .PyObject = { + .ob_type = 8 + }, + .String = { + .data = 36, // offsetof(PyStringObject, ob_sval) + .size = 16, // offsetof(PyVarObject, ob_size) + }, + .PyTypeObject = { + .tp_name = 24 + }, + .PyThreadState = { + .next = 0, + .interp = 8, + .frame = 16, + .thread = 144, + }, + .PyInterpreterState = { + .tstate_head = 8, + }, + .PyRuntimeState = { + .interp_main = -1, // N/A + }, + .PyFrameObject = { + .f_back = 24, + .f_code = 32, + .f_lineno = 124, + .f_localsplus = 376, + }, + .PyCodeObject = { + .co_filename = 80, + .co_name = 88, + .co_varnames = 56, + .co_firstlineno = 96, + }, + .PyTupleObject = { + .ob_item = 24 + }, +}; + +extern const struct struct_offsets kPy36OffsetConfig = { + .PyObject = { + .ob_type = 8 + }, + .String = { + .data = 48, // offsetof(PyStringObject, ob_sval) + .size = 16, // offsetof(PyVarObject, ob_size) + }, + .PyTypeObject = { + .tp_name = 24 + }, + .PyThreadState = { + .next = 8, + .interp = 16, + .frame = 24, + .thread = 152, + }, + .PyInterpreterState = { + .tstate_head = 8, + }, + .PyRuntimeState = { + .interp_main = 32, + }, + .PyFrameObject = { + .f_back = 24, + .f_code = 32, + .f_lineno = 124, + .f_localsplus = 376, + }, + .PyCodeObject = { + .co_filename = 96, + .co_name = 104, + .co_varnames = 64, + .co_firstlineno = 36, + }, + .PyTupleObject = { + .ob_item = 24 + }, +}; + +extern const struct struct_offsets kPy37OffsetConfig = { + .PyObject = { + .ob_type = 8 + }, + .String = { + .data = 48, // offsetof(PyStringObject, ob_sval) + .size = 16, // offsetof(PyVarObject, ob_size) + }, + .PyTypeObject = { + .tp_name = 24 + }, + .PyThreadState = { + .next = 8, + .interp = 16, + .frame = 24, + .thread = 176, + }, + .PyInterpreterState = { + .tstate_head = 8, + }, + .PyRuntimeState = { + .interp_main = 32, + }, + .PyFrameObject = { + .f_back = 24, + .f_code = 32, + .f_lineno = 108, + .f_localsplus = 360, + }, + .PyCodeObject = { + .co_filename = 96, + .co_name = 104, + .co_varnames = 64, + .co_firstlineno = 36, + }, + .PyTupleObject = { + .ob_item = 24 + }, +}; + +extern const struct struct_offsets kPy38OffsetConfig = { + .PyObject = { + .ob_type = 8 + }, + .String = { + .data = 48, // offsetof(PyStringObject, ob_sval) + .size = 16, // offsetof(PyVarObject, ob_size) + }, + .PyTypeObject = { + .tp_name = 24 + }, + .PyThreadState = { + .next = 8, + .interp = 16, + .frame = 24, + .thread = 176, + }, + .PyInterpreterState = { + .tstate_head = 8, + }, + .PyRuntimeState = { + .interp_main = 40, // N/A + }, + .PyFrameObject = { + .f_back = 24, + .f_code = 32, + .f_lineno = 108, + .f_localsplus = 360, + }, + .PyCodeObject = { + .co_filename = 104, + .co_name = 112, + .co_varnames = 72, + .co_firstlineno = 40, + }, + .PyTupleObject = { + .ob_item = 24 + }, +}; + +extern const struct struct_offsets kPy310OffsetConfig = { + .PyObject = { + .ob_type = 8 + }, + .String = { + .data = 48, // offsetof(PyStringObject, ob_sval) + .size = 16, // offsetof(PyVarObject, ob_size) + }, + .PyTypeObject = { + .tp_name = 24 + }, + .PyThreadState = { + .next = 8, + .interp = 16, + .frame = 24, + .thread = 176, + }, + .PyInterpreterState = { + .tstate_head = 8, + }, + .PyRuntimeState = { + .interp_main = 40, + }, + .PyFrameObject = { + .f_back = 24, + .f_code = 32, + .f_lineno = 100, + .f_localsplus = 352, + }, + .PyCodeObject = { + .co_filename = 104, + .co_name = 112, + .co_varnames = 72, + .co_firstlineno = 40, + }, + .PyTupleObject = { + .ob_item = 24 + }, +}; + +#else +#error unknown arch +#endif + // List of mappings from Python 3 minor versions to offsets. `get_offsets` depends on this list // being sorted in ascending order when it searches through it. const std::vector> python3Versions = { {{3,6,0}, kPy36OffsetConfig}, {{3,7,0}, kPy37OffsetConfig}, {{3,8,0}, kPy38OffsetConfig}, - // 3.9 is same as 3.8 + // 3.9 is same as 3.8 (on both x86_64 and Aarch64) {{3,10,0}, kPy310OffsetConfig}, }; diff --git a/examples/cpp/pyperf/PyPerfBPFProgram.cc b/examples/cpp/pyperf/PyPerfBPFProgram.cc index a9be3f88e424..66902b4cf762 100644 --- a/examples/cpp/pyperf/PyPerfBPFProgram.cc +++ b/examples/cpp/pyperf/PyPerfBPFProgram.cc @@ -172,6 +172,7 @@ struct sample_state { uintptr_t constant_buffer_addr; uintptr_t interp_head; uintptr_t thread_state; + enum pthreads_impl pthreads_impl; struct struct_offsets offsets; uint32_t cur_cpu; uint32_t symbol_counter; @@ -215,13 +216,13 @@ get_task_thread_id(struct task_struct const *task, enum pthreads_impl pthreads_i // For glibc, corresponds to THREAD_SELF in "tls.h" in glibc source. // For musl, see definition of `__pthread_self`. -#ifdef __x86_64__ int ret; uint64_t fsbase; // HACK: Usually BCC would translate a deref of the field into `read_kernel` for us, but it // doesn't detect it due to the macro (because it transforms before preprocessing). bpf_probe_read_kernel(&fsbase, sizeof(fsbase), (u8*)task + FS_OFS); +#ifdef __x86_64__ switch (pthreads_impl) { case PTI_GLIBC: // 0x10 = offsetof(tcbhead_t, self) @@ -238,16 +239,76 @@ get_task_thread_id(struct task_struct const *task, enum pthreads_impl pthreads_i // driver passed bad value return ERROR_INVALID_PTHREADS_IMPL; } +#elif defined(__aarch64__) + switch (pthreads_impl) { + case PTI_GLIBC: + // TODO const bad + *thread_id = fsbase - 0x6f0; + ret = 0; + break; + + case PTI_MUSL: + // TODO ensure really same as x86 + // __pthread_self / __get_tp reads %fs:0x0 + // which corresponds to the field "self" in struct pthread + ret = bpf_probe_read_user(thread_id, sizeof(*thread_id), (void *)fsbase); + break; + + default: + // driver passed bad value + return ERROR_INVALID_PTHREADS_IMPL; + } +#else +#error "Unsupported platform" +#endif // __x86_64__ + + bpf_trace_printk("fs: %llx libc: %llx: ret %llx\n", fsbase, pthreads_impl, ret); if (ret < 0) { return ERROR_BAD_FSBASE; } return ERROR_NONE; +} -#else // __x86_64__ -#error "Unsupported platform" -#endif // __x86_64__ +static __always_inline int compare_task_thread_id(uint64_t a, uint64_t b, enum pthreads_impl pthreads_impl) { +#if defined(__x86_64__) + (void)pthreads_impl; + return a == b; +#elif defined(__aarch64__) + switch (pthreads_impl) { + case PTI_GLIBC: + return (int64_t)(a - b) < 0x500; + + case PTI_MUSL: + return a == b; + } +#endif +} + +static __always_inline int user_mode(struct pt_regs *regs) { + // ebpf doesn't allow direct access to regs (the ctx), so we need to copy it +#if defined(__x86_64__) + int cs; + bpf_probe_read_kernel(&cs, sizeof(cs), &(regs->cs)); + return cs & 3; +#elif defined(__aarch64__) + u64 pstate; + bpf_probe_read_kernel(&pstate, sizeof(pstate), &(regs->pstate)); + return (pstate & PSR_MODE_MASK) == PSR_MODE_EL0t; +#endif +} + +static __always_inline struct pt_regs *task_pt_regs_ptr(struct task_struct const *const task) { + unsigned long stack; + bpf_probe_read_kernel(&stack, sizeof(stack), (void*)((unsigned long)task + STACK_OFS)); +#if defined(__x86_64__) + // This is equivalent to `task_pt_regs(task)` for x86. Macros doesn't + // work properly on bcc, so we need to re-implement. + return (struct pt_regs *)(stack + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING) - 1; +#elif defined(__aarch64__) + return (struct pt_regs *)(stack + THREAD_SIZE) - 1; +#endif } // this function is trivial, but we need to do map lookup in separate function, @@ -309,37 +370,30 @@ on_event(struct pt_regs* ctx) { // Get raw native user stack struct pt_regs user_regs; - // ebpf doesn't allow direct access to ctx->cs, so we need to copy it - int cs; - bpf_probe_read_kernel(&cs, sizeof(cs), &(ctx->cs)); - // Are we in user mode? - if (cs & 3) { + if (user_mode(ctx)) { // Yes - use the registers context given to the BPF program - user_regs = *ctx; + bpf_probe_read_kernel(&user_regs, sizeof(user_regs), ctx); + // user_regs = *ctx; } else { // No - use the registers context of usermode, that is stored on the stack. - - // The third argument is equivalent to `task_pt_regs(task)` for x86. Macros doesn't - // work properly on bcc, so we need to re-implement. bpf_probe_read_kernel( &user_regs, sizeof(user_regs), - // Note - BCC emits an implicit bpf_probe_read_kernel() here (for the deref of 'task'). - // I don't like the implicitness (and it will be something we'll need to fix if we're ever - // to move from BCC). Meanwhile, I tried to change it to be explicit but the BPF assembly - // varies too much so I prefer to avoid this change now ;( - (struct pt_regs *)(*(unsigned long*)((unsigned long)task + STACK_OFS) + THREAD_SIZE - - TOP_OF_KERNEL_STACK_PADDING) - 1); + task_pt_regs_ptr(task)); } + event->user_stack_len = 0; +#if defined(__x86_64__) event->user_sp = user_regs.sp; event->user_ip = user_regs.ip; - event->user_stack_len = 0; - // Subtract 128 from sp for x86-ABI red zone uintptr_t top_of_stack = user_regs.sp - 128; - +#elif defined(__aarch64__) + event->user_sp = user_regs.sp; + event->user_ip = user_regs.pc; + uintptr_t top_of_stack = user_regs.sp; +#endif // Copy one page at the time - if one fails we don't want to lose the others int i; #pragma unroll @@ -368,9 +422,10 @@ on_event(struct pt_regs* ctx) { // Get PyThreadState of the thread that currently holds the GIL uintptr_t _PyThreadState_Current = 0; - bpf_probe_read_user( + int x = bpf_probe_read_user( &_PyThreadState_Current, sizeof(_PyThreadState_Current), (void*)pid_data->globals._PyThreadState_Current); + bpf_trace_printk("read addr %llx ret %d\n", (unsigned long)pid_data->globals._PyThreadState_Current, x); if (_PyThreadState_Current == 0) { // The GIL is released, we can only get native stacks // until it is held again. @@ -400,6 +455,7 @@ on_event(struct pt_regs* ctx) { state->offsets = pid_data->offsets; state->interp_head = pid_data->interp; state->constant_buffer_addr = pid_data->globals.constant_buffer; + state->pthreads_impl = pid_data->pthreads_impl; // Read pointer to first PyThreadState in thread states list: bpf_probe_read_user( @@ -437,7 +493,7 @@ get_thread_state(struct pt_regs *ctx) { for (int i = 0; i < THREAD_STATES_PER_PROG; ++i) { // Read the PyThreadState::thread_id to which this PyThreadState belongs: thread_id = read_tstate_thread_id(state->thread_state, &state->offsets); - if (thread_id == state->current_thread_id) { + if (compare_task_thread_id(thread_id, state->current_thread_id, state->pthreads_impl)) { goto found; } else if (unlikely(thread_id == BAD_THREAD_ID)) { diff --git a/examples/cpp/pyperf/PyPerfNativeStackTrace.cc b/examples/cpp/pyperf/PyPerfNativeStackTrace.cc index 834df113f4c2..5d30df773b35 100644 --- a/examples/cpp/pyperf/PyPerfNativeStackTrace.cc +++ b/examples/cpp/pyperf/PyPerfNativeStackTrace.cc @@ -53,9 +53,17 @@ NativeStackTrace::NativeStackTrace(uint32_t pid, const unsigned char *raw_stack, return; } - unw_accessors_t my_accessors = _UPT_accessors; - my_accessors.access_mem = NativeStackTrace::access_mem; - my_accessors.access_reg = NativeStackTrace::access_reg; + // We hook some of the accessors to control the level of access libunwind gets of the target processes. + unw_accessors_t my_accessors = { + .find_proc_info = _UPT_find_proc_info, + .put_unwind_info = _UPT_put_unwind_info, + .get_dyn_info_list_addr = _UPT_get_dyn_info_list_addr, + .access_mem = NativeStackTrace::UPT_access_mem, + .access_reg = NativeStackTrace::UPT_access_reg, + .access_fpreg = NativeStackTrace::UPT_access_fpreg, + .resume = NativeStackTrace::UPT_resume, + .get_proc_name = _UPT_get_proc_name, + }; ProcSyms* procSymbols = nullptr; // reserve memory for platform-defined path limit AND the symbol const size_t buf_size = SymbolMaxSize + PATH_MAX + sizeof("() "); @@ -147,8 +155,8 @@ NativeStackTrace::NativeStackTrace(uint32_t pid, const unsigned char *raw_stack, } } -int NativeStackTrace::access_reg(unw_addr_space_t as, unw_regnum_t regnum, - unw_word_t *valp, int write, void *arg) { +int NativeStackTrace::UPT_access_reg(unw_addr_space_t as, unw_regnum_t regnum, + unw_word_t *valp, int write, void *arg) { if (regnum == UNW_REG_SP) { if (write) { logInfo(2, "Libunwind attempts to write to SP\n"); @@ -173,15 +181,22 @@ int NativeStackTrace::access_reg(unw_addr_space_t as, unw_regnum_t regnum, } } -int NativeStackTrace::access_mem(unw_addr_space_t as, unw_word_t addr, - unw_word_t *valp, int write, void *arg) { +int NativeStackTrace::UPT_access_mem(unw_addr_space_t as, unw_word_t addr, + unw_word_t *valp, int write, void *arg) { if (write) { logInfo(3, "Libunwind unexpected mem write attempt\n"); return -UNW_EINVAL; } +#if defined(__x86_64__) // Subtract 128 for x86-ABI red zone - const uintptr_t top_of_stack = NativeStackTrace::sp - 128; + const unsigned redzone = 128; +#elif defined(__aarch64__) + const unsigned redzone = 0; +#else +#error unknown arch +#endif + const uintptr_t top_of_stack = NativeStackTrace::sp - redzone; const uintptr_t stack_start = top_of_stack & ~(getpagesize() - 1); const uintptr_t stack_end = stack_start + NativeStackTrace::stack_len; @@ -223,6 +238,17 @@ int NativeStackTrace::access_mem(unw_addr_space_t as, unw_word_t addr, return -UNW_EINVAL; } +int NativeStackTrace::UPT_access_fpreg(unw_addr_space_t as, unw_regnum_t reg, unw_fpreg_t *val, + int write, void *arg) { + logInfo(3, "Libunwind unexpected UPT_access_fpreg() attempt\n"); + return -UNW_EINVAL; +} + +int NativeStackTrace::UPT_resume(unw_addr_space_t as, unw_cursor_t *c, void *arg) { + logInfo(3, "Libunwind unexpected UPT_resume() attempt\n"); + return -UNW_EINVAL; +} + std::vector NativeStackTrace::get_stack_symbol() const { return symbols; } diff --git a/examples/cpp/pyperf/PyPerfNativeStackTrace.h b/examples/cpp/pyperf/PyPerfNativeStackTrace.h index cc8dcd0641bc..d7ddccdfd4e8 100644 --- a/examples/cpp/pyperf/PyPerfNativeStackTrace.h +++ b/examples/cpp/pyperf/PyPerfNativeStackTrace.h @@ -43,11 +43,14 @@ class NativeStackTrace { static uintptr_t sp; static ProcSymbolsCache procSymbolsCache; - static int access_reg(unw_addr_space_t as, unw_regnum_t regnum, - unw_word_t *valp, int write, void *arg); + static int UPT_access_reg(unw_addr_space_t as, unw_regnum_t regnum, + unw_word_t *valp, int write, void *arg); + static int UPT_access_mem(unw_addr_space_t as, unw_word_t addr, + unw_word_t *valp, int write, void *arg); + static int UPT_access_fpreg(unw_addr_space_t as, unw_regnum_t reg, unw_fpreg_t *val, + int write, void *arg); + static int UPT_resume(unw_addr_space_t as, unw_cursor_t *c, void *arg); - static int access_mem(unw_addr_space_t as, unw_word_t addr, unw_word_t *valp, - int write, void *arg); static ProcSyms* get_proc_symbols(uint32_t pid); }; diff --git a/src/cc/compat/linux/types.h b/src/cc/compat/linux/types.h index 44bccb936042..ae372dc1234b 100644 --- a/src/cc/compat/linux/types.h +++ b/src/cc/compat/linux/types.h @@ -51,10 +51,14 @@ typedef int bool; #define NULL ((void*)0) #define ENOSPC 28 + +#if defined(__x86_64__) + #define PAGE_SIZE 4096 #define PAGE_MASK (~(PAGE_SIZE-1)) #define THREAD_SIZE_ORDER 2 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) +#define TOP_OF_KERNEL_STACK_PADDING 0 struct pt_regs { /* @@ -91,6 +95,59 @@ struct pt_regs { /* top of stack page */ }; +#elif defined(__aarch64__) + +#define PSR_MODE_EL0t 0x00000000 +#define PSR_MODE_MASK 0x0000000f + +#define PAGE_SIZE 4096 // on all systems I saw, CONFIG_ARM64_PAGE_SHIFT=12 +#define PAGE_MASK (~(PAGE_SIZE-1)) + +#define KASAN_THREAD_SHIFT 0 +#define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) +// if CONFIG_VMAP_STACK is enabled & PAGE_SHIFT is 12, then this path gets selected: +#define THREAD_SHIFT MIN_THREAD_SHIFT +#define THREAD_SIZE (1UL << THREAD_SHIFT) + +struct user_pt_regs { + __u64 regs[31]; + __u64 sp; + __u64 pc; + __u64 pstate; +}; + +// this changes! +// this copy is from 5.16.0 +struct pt_regs { + union { + struct user_pt_regs user_regs; + struct { + u64 regs[31]; + u64 sp; + u64 pc; + u64 pstate; + }; + }; + u64 orig_x0; +#ifdef __AARCH64EB__ + u32 unused2; + s32 syscallno; +#else + s32 syscallno; + u32 unused2; +#endif + u64 sdei_ttbr1; + /* Only valid when ARM64_HAS_IRQ_PRIO_MASKING is enabled. */ + u64 pmr_save; + u64 stackframe[2]; + + /* Only valid for some EL1 exceptions. */ + u64 lockdep_hardirqs; + u64 exit_rcu; +}; + +#endif + # ifndef likely # define likely(x) __builtin_expect(x, 1) # endif @@ -312,8 +369,6 @@ unsigned long __rounddown_pow_of_two(unsigned long n) __roundup_pow_of_two(n) \ ) -#define TOP_OF_KERNEL_STACK_PADDING 0 - // END COPIED FROM LINUX #endif // _UAPI__LINUX_LINUX_H__ )********"