diff --git a/agent/Cargo.lock b/agent/Cargo.lock index 8ebcf72435c..bc4b0a96b81 100644 --- a/agent/Cargo.lock +++ b/agent/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "addr2line" @@ -2098,6 +2098,15 @@ dependencies = [ "cc", ] +[[package]] +name = "iced-x86" +version = "1.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c447cff8c7f384a7d4f741cfcff32f75f3ad02b406432e8d6c878d56b1edf6b" +dependencies = [ + "lazy_static", +] + [[package]] name = "id-arena" version = "2.2.1" @@ -4689,6 +4698,7 @@ dependencies = [ "cfg-if", "env_logger 0.11.5", "gimli 0.31.0", + "iced-x86", "libc", "log 0.4.22", "nix 0.29.0", diff --git a/agent/crates/trace-utils/Cargo.toml b/agent/crates/trace-utils/Cargo.toml index c69d5c40f76..e13e750dd1a 100644 --- a/agent/crates/trace-utils/Cargo.toml +++ b/agent/crates/trace-utils/Cargo.toml @@ -20,6 +20,7 @@ regex.workspace = true rustc-demangle = "0.1" semver = "1.0" thiserror = "1.0" +iced-x86 = { version = "1.21", default-features = false, features = ["std", "decoder"] } [build-dependencies] cc = "1.0" diff --git a/agent/crates/trace-utils/cbindgen.toml b/agent/crates/trace-utils/cbindgen.toml index 997612f2872..94628b8224c 100644 --- a/agent/crates/trace-utils/cbindgen.toml +++ b/agent/crates/trace-utils/cbindgen.toml @@ -60,7 +60,8 @@ include = [ "LuaUnwindInfo", "LuaUnwindTable", "LuaOfs", - "LjOfs" + "LjOfs", + "TSDInfo" ] exclude = [ "bpf_update_elem", @@ -80,6 +81,7 @@ ProcessShardList = "process_shard_list_t" UnwindEntry = "unwind_entry_t" UnwindEntryShard = "unwind_entry_shard_t" UnwindTable = "unwind_table_t" +TSDInfo = "tsd_info_t" PyCframe = "py_cframe_t" PyCodeObject = "py_code_object_t" PyFrameObject = "py_frame_object_t" diff --git a/agent/crates/trace-utils/src/error.rs b/agent/crates/trace-utils/src/error.rs index 11a0a52630f..4300fbad34d 100644 --- 
a/agent/crates/trace-utils/src/error.rs +++ b/agent/crates/trace-utils/src/error.rs @@ -37,6 +37,8 @@ pub enum Error { InvalidPointer(u64), #[error("Invalid or corrupted data")] InvalidData, + #[error("{0}")] + Msg(String), } pub type Result = std::result::Result; diff --git a/agent/crates/trace-utils/src/maps.rs b/agent/crates/trace-utils/src/maps.rs index ec65c19706d..94dd315017d 100644 --- a/agent/crates/trace-utils/src/maps.rs +++ b/agent/crates/trace-utils/src/maps.rs @@ -21,7 +21,7 @@ use std::path::PathBuf; use log::trace; -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct MemoryArea { pub m_start: u64, pub mx_start: u64, // start address of executable section diff --git a/agent/crates/trace-utils/src/trace_utils.h b/agent/crates/trace-utils/src/trace_utils.h index e35934fa992..51a40a91cf5 100644 --- a/agent/crates/trace-utils/src/trace_utils.h +++ b/agent/crates/trace-utils/src/trace_utils.h @@ -91,8 +91,41 @@ typedef struct { } unwind_entry_shard_t; typedef struct { - uint64_t thread_state_address; + /** + * Offset from thread pointer base (TPBASE) to TSD storage + */ + int16_t offset; + /** + * TSD key multiplier (glibc=16, musl=8) + */ + uint8_t multiplier; + /** + * Whether indirect addressing is needed (musl=1, glibc=0) + */ + uint8_t indirect; +} tsd_info_t; + +typedef struct { + /** + * Address of autoTLSkey variable in Python runtime + */ + uint64_t auto_tls_key_addr; + /** + * Python version encoded as 0xMMmm (e.g., 0x030A for 3.10) + */ + uint16_t version; + /** + * Thread Specific Data info for multi-threading support + */ + tsd_info_t tsd_info; + /** + * ID for looking up python_offsets in the offsets map + */ uint8_t offsets_id; + /** + * Padding for alignment + */ + uint8_t _padding[5]; } python_unwind_info_t; typedef struct { @@ -396,6 +429,12 @@ void v8_unwind_table_unload(v8_unwind_table_t *table, uint32_t pid); int32_t read_offset_of_stack_in_task_struct(void); +/** + * Read TPBASE offset from kernel functions + * Returns the offset of 
fsbase/tpidr in task_struct, or -1 on failure + */ +int64_t read_tpbase_offset(void); + int rustc_demangle(const char *mangled, char *out, size_t out_size); unwind_table_t *unwind_table_create(int32_t process_shard_list_map_fd, diff --git a/agent/crates/trace-utils/src/unwind.rs b/agent/crates/trace-utils/src/unwind.rs index c3067b6e090..2a83284f96a 100644 --- a/agent/crates/trace-utils/src/unwind.rs +++ b/agent/crates/trace-utils/src/unwind.rs @@ -19,6 +19,8 @@ pub mod elf_utils; pub mod lua; pub mod php; pub mod python; +pub mod tpbase; +pub mod tsd; pub mod v8; use std::alloc::{alloc, dealloc, handle_alloc_error, Layout}; diff --git a/agent/crates/trace-utils/src/unwind/python.rs b/agent/crates/trace-utils/src/unwind/python.rs index 8cdeb8718f4..06f1ca12474 100644 --- a/agent/crates/trace-utils/src/unwind/python.rs +++ b/agent/crates/trace-utils/src/unwind/python.rs @@ -18,8 +18,11 @@ use std::{ cell::OnceCell, collections::HashMap, ffi::CStr, fs, io::Write, mem, path::PathBuf, slice, }; +#[cfg(target_arch = "x86_64")] +use ahash::AHashMap; use libc::c_void; use log::{debug, trace, warn}; +use object::{Object, ObjectSection, ObjectSymbol}; use regex::Regex; use semver::{Version, VersionReq}; @@ -29,8 +32,73 @@ use crate::{ utils::{bpf_delete_elem, bpf_update_elem, get_errno, IdGenerator, BPF_ANY}, }; +/// Maximum distance from _PyRuntime for a valid autoTLSkey address +/// autoTLSkey should be within a few KB of _PyRuntime +/// This is a validation threshold, not a version-specific offset +#[cfg(target_arch = "x86_64")] +const PYRUNTIME_MAX_DISTANCE: u64 = 0x10000; + use super::elf_utils::MappedFile; +#[cfg(target_arch = "x86_64")] +use iced_x86::{Decoder, DecoderOptions, Mnemonic, OpKind, Register}; + +#[cfg(target_arch = "x86_64")] +#[derive(Clone, Copy, Debug)] +enum Value { + Known(u64), + /// Address expressed as _PyRuntime + offset (can be negative) + RuntimeBase(i64), + Unknown, +} + +#[cfg(target_arch = "x86_64")] +impl Value { + fn add(self, rhs: i64) 
-> Self { + match self { + Value::Known(v) => Value::Known(v.wrapping_add(rhs as u64)), + Value::RuntimeBase(off) => Value::RuntimeBase(off.saturating_add(rhs)), + Value::Unknown => Value::Unknown, + } + } + + fn sub(self, rhs: i64) -> Self { + match self { + Value::Known(v) => Value::Known(v.wrapping_sub(rhs as u64)), + Value::RuntimeBase(off) => Value::RuntimeBase(off.saturating_sub(rhs)), + Value::Unknown => Value::Unknown, + } + } + + fn shl(self, shift: u8) -> Self { + if shift >= 63 { + return Value::Unknown; + } + match self { + Value::Known(v) => Value::Known(v << shift), + Value::RuntimeBase(off) => { + Value::RuntimeBase(off.checked_shl(shift as u32).unwrap_or(0)) + } + Value::Unknown => Value::Unknown, + } + } + + fn or(self, rhs: u64) -> Self { + match self { + Value::Known(v) => Value::Known(v | rhs), + _ => Value::Unknown, + } + } + + fn to_runtime_addr(self, runtime_addr: u64) -> Option { + match self { + Value::Known(v) => Some(v), + Value::RuntimeBase(off) => Some(runtime_addr.wrapping_add(off as u64)), + Value::Unknown => None, + } + } +} + fn error_not_python(pid: u32) -> Error { Error::BadInterpreterType(pid, "python") } @@ -175,11 +243,393 @@ impl Interpreter { } Err(error_not_supported_version(self.pid, self.version.clone())) } + + /// Find the autoTLSkey address from PyGILState_GetThisThreadState function. + /// + /// Python stores each thread's PyThreadState in thread-local storage using + /// pthread_setspecific/getspecific. The key is stored in autoTLSkey variable + /// within _PyRuntime structure. + /// + /// This function analyzes the PyGILState_GetThisThreadState function to find + /// the address of the autoTLSkey variable. 
+ const GIL_STATE_SYMBOL: &'static str = "PyGILState_GetThisThreadState"; + + fn find_auto_tls_key_address(&mut self) -> Result { + if !VersionReq::parse(">=3.7.0").unwrap().matches(&self.version) { + return Err(error_not_supported_version(self.pid, self.version.clone())); + } + + // Get _PyRuntime address as reference point + let runtime_addr = match self.find_symbol_address(Self::RUNTIME_SYMBOL)? { + Some(addr) => addr, + None => return Err(Error::Msg("_PyRuntime symbol not found".to_string())), + }; + + // Read PyGILState_GetThisThreadState function code + let (sym_addr, code) = self.read_symbol_code(Self::GIL_STATE_SYMBOL, 512)?; + + // Analyze the function to find the autoTLSkey address + #[cfg(target_arch = "x86_64")] + { + self.decode_auto_tls_key_x86(&code, sym_addr, runtime_addr) + } + + #[cfg(target_arch = "aarch64")] + { + self.decode_auto_tls_key_arm64(&code, sym_addr, runtime_addr) + } + + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + { + Err(Error::Msg( + "autoTLSkey extraction not supported on this architecture".to_string(), + )) + } + } + + fn read_symbol_code(&mut self, name: &str, size: usize) -> Result<(u64, Vec)> { + for file in [Some(&mut self.exe), self.lib.as_mut()] { + let Some(file) = file else { + continue; + }; + file.load()?; + let obj = object::File::parse(&*file.contents)?; + + if let Some(sym) = obj + .symbols() + .chain(obj.dynamic_symbols()) + .find(|s| s.name().map(|n| n == name).unwrap_or(false)) + { + let addr = sym.address(); + + // Find the section containing this symbol + for section in obj.sections() { + let sec_addr = section.address(); + let sec_size = section.size(); + if addr >= sec_addr && addr < sec_addr + sec_size { + let section_data = section.data()?; + let offset = (addr - sec_addr) as usize; + let read_size = size.min(section_data.len() - offset); + let code = section_data[offset..offset + read_size].to_vec(); + + // Calculate the actual address in memory + let ba = file.base_address()?; + let 
mem_addr = addr + ba; + + return Ok((mem_addr, code)); + } + } + } + } + Err(Error::Msg(format!("Symbol {} not found", name))) + } + + /// Decode autoTLSkey address from x86_64 assembly + #[cfg(target_arch = "x86_64")] + fn decode_auto_tls_key_x86( + &self, + code: &[u8], + code_addr: u64, + runtime_addr: u64, + ) -> Result { + let decoder = Decoder::with_ip(64, code, code_addr, DecoderOptions::NONE); + let mut regs = AHashMap::new(); + regs.insert(Register::RAX, Value::RuntimeBase(0)); + for instr in decoder { + Self::propagate_known(&mut regs, &instr); + + let op0 = Self::canon_reg(instr.op0_register()); + if op0 == Register::RDI { + // Fast path: mov/lea from [_PyRuntime + disp] into EDI/RDI + if instr.op1_kind() == OpKind::Memory && instr.memory_base() == Register::RAX { + let disp = instr.memory_displacement64() as u64; + let target_addr = runtime_addr.wrapping_add(disp); + if let Some(addr) = self.validate_auto_tls_candidate(target_addr, runtime_addr) + { + debug!( + "Found autoTLSkey address {:#x} (mov/lea from RAX base)", + addr + ); + return Ok(addr); + } + } + + if instr.mnemonic() == Mnemonic::Lea && instr.op1_kind() == OpKind::Memory { + if let Some(val) = Self::compute_mem_addr(&instr, ®s).or_else(|| { + Self::assume_runtime_base(&instr, runtime_addr).map(Value::Known) + }) { + if let Some(addr) = val.to_runtime_addr(runtime_addr) { + if let Some(valid) = + self.validate_auto_tls_candidate(addr, runtime_addr) + { + debug!("Found autoTLSkey address {:#x} (LEA)", valid); + return Ok(valid); + } + } + } + } + + if instr.mnemonic() == Mnemonic::Mov && instr.op1_kind() == OpKind::Memory { + if let Some(val) = Self::compute_mem_addr(&instr, ®s).or_else(|| { + Self::assume_runtime_base(&instr, runtime_addr).map(Value::Known) + }) { + if let Some(addr) = val.to_runtime_addr(runtime_addr) { + if let Some(valid) = + self.validate_auto_tls_candidate(addr, runtime_addr) + { + debug!("Found autoTLSkey address {:#x} (MOV)", valid); + return Ok(valid); + } + } + } + 
} + + if let Some(val) = regs + .get(&Register::RDI) + .and_then(|v| v.to_runtime_addr(runtime_addr)) + { + if let Some(addr) = self.validate_auto_tls_candidate(val, runtime_addr) { + debug!("Found autoTLSkey address {:#x}", addr); + return Ok(addr); + } + } + } + } + + // Fallback: calculate from known _PyRuntime offsets for Python 3.10 + // autoTLSkey is at _PyRuntime.gilstate.autoTSSkey._key + let fallback_addr = runtime_addr + PY310_INITIAL_STATE.auto_tls_key_offset; + debug!( + "Could not find autoTLSkey from disassembly, using fallback offset {:#x}", + fallback_addr + ); + Ok(fallback_addr) + } + + #[cfg(target_arch = "x86_64")] + fn validate_auto_tls_candidate(&self, target_addr: u64, runtime_addr: u64) -> Option { + let distance = if target_addr > runtime_addr { + target_addr - runtime_addr + } else { + runtime_addr - target_addr + }; + + if distance < PYRUNTIME_MAX_DISTANCE { + let auto_tls_key_addr = if self.version >= Version::new(3, 7, 0) && target_addr % 8 == 0 + { + target_addr + 4 + } else { + target_addr + }; + Some(auto_tls_key_addr) + } else { + None + } + } + + #[cfg(target_arch = "x86_64")] + fn canon_reg(reg: Register) -> Register { + match reg { + Register::RAX | Register::EAX => Register::RAX, + Register::RBX | Register::EBX => Register::RBX, + Register::RCX | Register::ECX => Register::RCX, + Register::RDX | Register::EDX => Register::RDX, + Register::RSI | Register::ESI => Register::RSI, + Register::RDI | Register::EDI => Register::RDI, + Register::R8 | Register::R8D => Register::R8, + Register::R9 | Register::R9D => Register::R9, + Register::R10 | Register::R10D => Register::R10, + Register::R11 | Register::R11D => Register::R11, + Register::R12 | Register::R12D => Register::R12, + Register::R13 | Register::R13D => Register::R13, + Register::R14 | Register::R14D => Register::R14, + Register::R15 | Register::R15D => Register::R15, + _ => reg, + } + } + + #[cfg(target_arch = "x86_64")] + fn propagate_known(map: &mut AHashMap, instr: 
&iced_x86::Instruction) { + match instr.mnemonic() { + Mnemonic::Mov => { + if instr.op0_kind() != OpKind::Register { + return; + } + let dst = Self::canon_reg(instr.op0_register()); + let val = match instr.op1_kind() { + OpKind::Immediate32 => Some(Value::Known(instr.immediate32() as u64)), + OpKind::Immediate64 => Some(Value::Known(instr.immediate64())), + OpKind::Register => map.get(&Self::canon_reg(instr.op1_register())).copied(), + OpKind::Memory => Self::compute_mem_addr(instr, map), + _ => None, + }; + if let Some(v) = val { + map.insert(dst, v); + } + } + Mnemonic::Lea => { + if instr.op0_kind() != OpKind::Register || instr.op1_kind() != OpKind::Memory { + return; + } + if let Some(addr) = Self::compute_mem_addr(instr, map) { + map.insert(Self::canon_reg(instr.op0_register()), addr); + } + } + Mnemonic::Add => { + if instr.op0_kind() != OpKind::Register { + return; + } + let dst = Self::canon_reg(instr.op0_register()); + if let Some(base) = map.get(&dst).copied() { + let delta = match instr.op1_kind() { + OpKind::Immediate32 => instr.immediate32() as i64, + OpKind::Immediate8 => instr.immediate8() as i64, + OpKind::Register => { + match map.get(&Self::canon_reg(instr.op1_register())).copied() { + Some(Value::Known(v)) => v as i64, + Some(Value::RuntimeBase(off)) => off, + _ => 0, + } + } + _ => 0, + }; + map.insert(dst, base.add(delta)); + } + } + Mnemonic::Sub => { + if instr.op0_kind() != OpKind::Register { + return; + } + let dst = Self::canon_reg(instr.op0_register()); + if let Some(base) = map.get(&dst).copied() { + let delta = match instr.op1_kind() { + OpKind::Immediate32 => instr.immediate32() as i64, + OpKind::Immediate8 => instr.immediate8() as i64, + OpKind::Register => { + match map.get(&Self::canon_reg(instr.op1_register())).copied() { + Some(Value::Known(v)) => v as i64, + Some(Value::RuntimeBase(off)) => off, + _ => 0, + } + } + _ => 0, + }; + map.insert(dst, base.sub(delta)); + } + } + Mnemonic::Shl => { + if instr.op0_kind() != 
OpKind::Register || instr.op1_kind() != OpKind::Immediate8 { + return; + } + let dst = Self::canon_reg(instr.op0_register()); + if let Some(base) = map.get(&dst).copied() { + let shift = instr.immediate8(); + map.insert(dst, base.shl(shift)); + } + } + Mnemonic::And => { + if instr.op0_kind() != OpKind::Register { + return; + } + let dst = Self::canon_reg(instr.op0_register()); + if let Some(base) = map.get(&dst).copied() { + let mask = match instr.op1_kind() { + OpKind::Immediate32 => instr.immediate32() as u64, + OpKind::Immediate8 => instr.immediate8() as u64, + _ => return, + }; + map.insert(dst, base.or(mask)); + } + } + Mnemonic::Or => { + if instr.op0_kind() != OpKind::Register { + return; + } + let dst = Self::canon_reg(instr.op0_register()); + if let Some(base) = map.get(&dst).copied() { + let val = match instr.op1_kind() { + OpKind::Immediate32 => instr.immediate32() as u64, + OpKind::Immediate8 => instr.immediate8() as u64, + _ => return, + }; + map.insert(dst, base.or(val)); + } + } + _ => {} + } + } + + #[cfg(target_arch = "x86_64")] + fn compute_mem_addr( + instr: &iced_x86::Instruction, + map: &AHashMap, + ) -> Option { + let disp = instr.memory_displacement64() as i64; + let base_val = match instr.memory_base() { + Register::RIP => Value::Known(instr.next_ip()), + Register::None => Value::Known(0), + base => map + .get(&Self::canon_reg(base)) + .copied() + .unwrap_or(Value::Unknown), + }; + + let with_disp = match base_val { + Value::Known(v) => Value::Known(v.wrapping_add(disp as u64)), + Value::RuntimeBase(off) => Value::RuntimeBase(off.saturating_add(disp)), + Value::Unknown => Value::Unknown, + }; + + let result = if instr.memory_index() != Register::None { + let idx_val = map.get(&Self::canon_reg(instr.memory_index())).copied()?; + let scale = instr.memory_index_scale() as i64; + match (with_disp, idx_val) { + (Value::Known(a), Value::Known(b)) => { + Value::Known(a.wrapping_add((b as i64 * scale) as u64)) + } + (Value::RuntimeBase(off), 
Value::Known(b)) => { + Value::RuntimeBase(off.saturating_add((b as i64 * scale) as i64)) + } + _ => Value::Unknown, + } + } else { + with_disp + }; + + Some(result) + } + + #[cfg(target_arch = "x86_64")] + fn assume_runtime_base(instr: &iced_x86::Instruction, runtime_addr: u64) -> Option { + if instr.memory_base() == Register::RAX { + return Some(runtime_addr.wrapping_add(instr.memory_displacement64() as u64)); + } + None + } + + /// Decode autoTLSkey address from ARM64 assembly + #[cfg(target_arch = "aarch64")] + fn decode_auto_tls_key_arm64( + &self, + _code: &[u8], + _code_addr: u64, + runtime_addr: u64, + ) -> Result { + // TODO: Implement ARM64 pattern analysis + // For now, use fallback from Python 3.10 known offset + let fallback_addr = runtime_addr + PY310_INITIAL_STATE.auto_tls_key_offset; + warn!( + "ARM64 autoTLSkey extraction not implemented, using fallback {:#x}", + fallback_addr + ); + Ok(fallback_addr) + } } pub struct InterpreterInfo { pub version: Version, pub thread_address: u64, + pub auto_tls_key_addr: u64, } impl InterpreterInfo { @@ -202,9 +652,24 @@ impl InterpreterInfo { ); let mut intp = Interpreter::new(pid, exe_area, lib_area)?; + + // Get auto_tls_key_addr for multi-threading support + let auto_tls_key_addr = match intp.find_auto_tls_key_address() { + Ok(addr) => addr, + Err(e) => { + warn!("Failed to find autoTLSkey address for process#{pid}: {e}, using fallback"); + // Fallback: calculate from known Python 3.10 _PyRuntime offset + let runtime_addr = intp + .find_symbol_address(Interpreter::RUNTIME_SYMBOL)? 
+ .ok_or_else(|| Error::Msg("_PyRuntime not found".to_string()))?; + runtime_addr + PY310_INITIAL_STATE.auto_tls_key_offset + } + }; + Ok(Self { version: intp.version.clone(), thread_address: intp.thread_state_address()?, + auto_tls_key_addr, }) } @@ -220,18 +685,63 @@ impl InterpreterInfo { } } +/// Python runtime state offsets +/// These are offsets within the _PyRuntime global structure for a specific Python version pub struct InitialState { + /// Offset of _PyRuntime.gilstate.tstate_current (points to current thread's PyThreadState) + /// For single-threaded or main thread access tstate_current: u64, + + /// Offset of _PyRuntime.gilstate.autoTSSkey._key (pthread_key_t value) on x86_64 + /// Used for multi-threaded PyThreadState lookup via TSD + /// Note: This is the offset to the pthread_key_t value, not the Py_tss_t struct itself + #[cfg(target_arch = "x86_64")] + auto_tls_key_offset: u64, + + /// Offset of _PyRuntime.gilstate.autoTSSkey._key (pthread_key_t value) on ARM64 + /// Used for multi-threaded PyThreadState lookup via TSD + /// Note: This offset needs verification on actual ARM64 Python builds + #[cfg(target_arch = "aarch64")] + auto_tls_key_offset: u64, } +#[cfg(target_arch = "x86_64")] +const PY310_INITIAL_STATE: &InitialState = &InitialState { + tstate_current: 568, + auto_tls_key_offset: 0x24c, // _PyRuntime.gilstate.autoTSSkey._key +}; + +#[cfg(target_arch = "aarch64")] const PY310_INITIAL_STATE: &InitialState = &InitialState { tstate_current: 568, + auto_tls_key_offset: 0x57c, // _PyRuntime.gilstate.autoTSSkey._key (0x578 + 4) }; +/// Thread Specific Data information for accessing per-thread PyThreadState +/// This is used to correctly unwind Python stacks in multi-threaded applications +#[repr(C)] +#[derive(Debug, Clone, Copy, Default)] +pub struct TSDInfo { + /// Offset from thread pointer base (TPBASE) to TSD storage + pub offset: i16, + /// TSD key multiplier (glibc=16, musl=8) + pub multiplier: u8, + /// Whether indirect addressing is 
needed (musl=1, glibc=0) + pub indirect: u8, +} + #[repr(C)] pub struct PythonUnwindInfo { - pub thread_state_address: u64, + /// Address of autoTLSkey variable in Python runtime + pub auto_tls_key_addr: u64, + /// Python version encoded as 0xMMmm (e.g., 0x030A for 3.10) + pub version: u16, + /// Thread Specific Data info for multi-threading support + pub tsd_info: TSDInfo, + /// ID for looking up python_offsets in the offsets map pub offsets_id: u8, + /// Padding for alignment + pub _padding: [u8; 5], } #[repr(C)] @@ -394,11 +904,39 @@ impl PythonUnwindTable { id as u8 } }; - let info = PythonUnwindInfo { - thread_state_address: info.thread_address, + + // Extract TSD info for multi-threading support + let tsd_info = match super::tsd::extract_tsd_info(pid) { + Ok(tsd) => { + debug!( + "Extracted TSD info for process#{pid}: offset={}, multiplier={}, indirect={}", + tsd.offset, tsd.multiplier, tsd.indirect + ); + tsd + } + Err(e) => { + debug!("Failed to extract TSD info for process#{pid}: {e}, using defaults"); + super::tsd::get_default_tsd_info(pid) + } + }; + + // Encode version as 0xMMmm (e.g., 3.10 -> 0x030A) + let version = ((info.version.major as u16) << 8) | (info.version.minor as u16); + + let unwind_info = PythonUnwindInfo { + auto_tls_key_addr: info.auto_tls_key_addr, + version, + tsd_info, offsets_id, + _padding: [0; 5], }; - self.update_unwind_info_map(pid, &info); + + debug!( + "Loading Python unwind info for process#{pid}: autoTLSkey={:#x}, version={:#x}", + unwind_info.auto_tls_key_addr, unwind_info.version + ); + + self.update_unwind_info_map(pid, &unwind_info); } pub unsafe fn unload(&mut self, pid: u32) { diff --git a/agent/crates/trace-utils/src/unwind/tpbase.rs b/agent/crates/trace-utils/src/unwind/tpbase.rs new file mode 100644 index 00000000000..28283b5b704 --- /dev/null +++ b/agent/crates/trace-utils/src/unwind/tpbase.rs @@ -0,0 +1,810 @@ +/* + * Copyright (c) 2024 Yunshan Networks + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Thread Pointer Base (TPBASE) offset extraction +//! +//! This module extracts the offset of fsbase/tpidr in task_struct by analyzing +//! kernel functions. This is needed for accessing Thread Local Storage (TLS) +//! from eBPF programs. +//! +//! On x86_64, the thread pointer base is stored in task_struct.thread.fsbase +//! On arm64, it's stored in task_struct.thread.uw.tp_value + +use std::fs::File; +use std::io::{BufRead, BufReader, Read, Seek, SeekFrom}; + +use log::{debug, trace, warn}; + +use crate::error::{Error, Result}; + +// Constants for TPBASE offset validation +/// Minimum reasonable TPBASE offset (task_struct.thread.fsbase should be after basic fields) +const TPBASE_MIN_OFFSET: u32 = 500; +/// Maximum reasonable TPBASE offset (thread struct shouldn't be too deep in task_struct) +const TPBASE_MAX_OFFSET: u32 = 20000; + +// Default TPBASE offsets for different architectures +/// Default x86_64 TPBASE offset (common for Ubuntu/Debian kernels 5.x/6.x) +#[cfg(target_arch = "x86_64")] +const DEFAULT_X86_64_TPBASE_OFFSET: u64 = 0x1978; // 6520 bytes +/// Default ARM64 TPBASE offset (placeholder, needs verification) +#[cfg(target_arch = "aarch64")] +const DEFAULT_AARCH64_TPBASE_OFFSET: u64 = 0x1000; + +/// Kernel function analyzers for extracting TPBASE offset +struct Analyzer { + function_name: &'static str, + analyze: fn(&[u8]) -> Option, +} + +/// Get the list of analyzers for the current architecture +fn get_analyzers() -> Vec { + #[cfg(target_arch 
= "x86_64")] + { + vec![ + Analyzer { + function_name: "x86_fsbase_write_task", + analyze: analyze_fsbase_write_task_x86, + }, + Analyzer { + function_name: "aout_dump_debugregs", + analyze: analyze_aout_dump_debugregs_x86, + }, + ] + } + #[cfg(target_arch = "aarch64")] + { + vec![Analyzer { + function_name: "tls_thread_switch", + analyze: analyze_tls_thread_switch_arm64, + }] + } + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + { + vec![] + } +} + +/// Analyze x86_fsbase_write_task function to extract fsbase offset. +/// +/// This function simply writes the second argument (fsbase value) to +/// task_struct at the fsbase offset. Available since kernel version 4.20. +/// +/// Expected pattern: +/// 48 89 b7 XX XX XX XX mov %rsi, 0xXXXXXXXX(%rdi) +/// +/// Where %rdi is task_struct pointer and %rsi is fsbase value +#[cfg(target_arch = "x86_64")] +fn analyze_fsbase_write_task_x86(code: &[u8]) -> Option { + // Pattern: REX.W MOV r/m64, r64 with RDI base and RSI source + // 48 89 b7 = mov %rsi, offset(%rdi) + let pattern = [0x48, 0x89, 0xb7]; + + if let Some(idx) = code.windows(3).position(|w| w == pattern) { + if idx + 7 <= code.len() { + let offset = + u32::from_le_bytes([code[idx + 3], code[idx + 4], code[idx + 5], code[idx + 6]]); + trace!( + "Found fsbase offset {:#x} from x86_fsbase_write_task", + offset + ); + return Some(offset); + } + } + None +} + +/// Analyze aout_dump_debugregs function to extract fsbase offset. +/// This is a fallback for older kernels that don't have x86_fsbase_write_task. +/// +/// This function reads task->thread.fsbase, so we look for memory loads +/// from task_struct with a specific pattern. 
+#[cfg(target_arch = "x86_64")] +fn analyze_aout_dump_debugregs_x86(code: &[u8]) -> Option { + // This is more complex - would need full disassembly + // For simplicity, look for common patterns + // Pattern: mov XX(%rdi), %rXX or mov XX(%rsi), %rXX + + // Look for 48 8b XX XX XX XX XX patterns (mov r64, m64) + for i in 0..code.len().saturating_sub(7) { + // REX.W MOV r64, [rdi + disp32] + if code[i] == 0x48 && code[i + 1] == 0x8b { + let modrm = code[i + 2]; + let mod_field = modrm >> 6; + let rm_field = modrm & 0x7; + + // mod=10 (disp32), rm=111 (rdi) + if mod_field == 2 && rm_field == 7 { + let offset = + u32::from_le_bytes([code[i + 3], code[i + 4], code[i + 5], code[i + 6]]); + // fsbase offset is typically in range TPBASE_MIN_OFFSET-TPBASE_MAX_OFFSET + // and should be related to thread.fsbase + // The actual fsbase is at offset-16 from debugreg storage + if offset > TPBASE_MIN_OFFSET && offset < TPBASE_MAX_OFFSET { + // Adjust for the debugreg offset (fsbase is typically 16 bytes before) + let fsbase_offset = offset.saturating_sub(16); + trace!( + "Found potential fsbase offset {:#x} (adjusted from {:#x}) from aout_dump_debugregs", + fsbase_offset, offset + ); + return Some(fsbase_offset); + } + } + } + } + None +} + +/// Analyze tls_thread_switch function on ARM64 to extract tp_value offset. 
+#[cfg(target_arch = "aarch64")] +fn analyze_tls_thread_switch_arm64(_code: &[u8]) -> Option { + // ARM64 pattern analysis would go here + // For now, return None as this needs more investigation + None +} + +/// Read kernel function code at the given address +fn read_kernel_code(addr: u64, size: usize) -> Result> { + let mut file = File::open("/proc/kcore")?; + + // /proc/kcore is an ELF core dump format + // We need to parse the ELF headers to find the correct offset + // For simplicity, try to read from /dev/kmem first if available + + // Try /dev/kmem (may not be available on all systems) + if let Ok(mut kmem) = File::open("/dev/kmem") { + kmem.seek(SeekFrom::Start(addr))?; + let mut buf = vec![0u8; size]; + kmem.read_exact(&mut buf)?; + return Ok(buf); + } + + // Fallback: try to use /proc/kcore with ELF parsing + read_kernel_code_from_kcore(&mut file, addr, size) +} + +/// Read kernel code from /proc/kcore (ELF core dump format) +fn read_kernel_code_from_kcore(file: &mut File, addr: u64, size: usize) -> Result> { + use object::{elf, read::elf::FileHeader, Endianness}; + + // Read the entire header section first to parse program headers + file.seek(SeekFrom::Start(0))?; + let mut header_data = vec![0u8; 4096]; // Should be enough for headers + file.read_exact(&mut header_data)?; + + // Parse header to get program header info + let (endian, phoff, phnum, phentsize) = { + let header = elf::FileHeader64::::parse(&header_data[..]) + .map_err(|e| Error::Msg(format!("Failed to parse kcore ELF header: {}", e)))?; + let endian = header + .endian() + .map_err(|e| Error::Msg(format!("Failed to get endianness: {}", e)))?; + let phoff = header.e_phoff(endian) as usize; + let phnum = header.e_phnum(endian) as usize; + let phentsize = header.e_phentsize(endian) as usize; + (endian, phoff, phnum, phentsize) + }; + + // Ensure we have enough data + let needed_size = phoff + phnum * phentsize; + if needed_size > header_data.len() { + header_data.resize(needed_size, 0); + 
file.seek(SeekFrom::Start(0))?; + file.read_exact(&mut header_data)?; + } + + // Re-parse the header with complete data + let header = elf::FileHeader64::::parse(&header_data[..]) + .map_err(|e| Error::Msg(format!("Failed to parse kcore ELF header: {}", e)))?; + + let program_headers = header + .program_headers(endian, &header_data[..]) + .map_err(|e| Error::Msg(format!("Failed to parse program headers: {}", e)))?; + + find_and_read_segment(file, program_headers, endian, addr, size) +} + +/// Helper function to find and read segment from program headers +fn find_and_read_segment( + file: &mut File, + program_headers: &[object::elf::ProgramHeader64], + endian: E, + addr: u64, + size: usize, +) -> Result> { + use object::{elf, read::elf::ProgramHeader}; + + // Find the segment containing our address + for phdr in program_headers { + if phdr.p_type(endian) != elf::PT_LOAD { + continue; + } + + let p_vaddr = phdr.p_vaddr(endian); + let p_memsz = phdr.p_memsz(endian); + let p_offset = phdr.p_offset(endian); + + if addr >= p_vaddr && addr < p_vaddr + p_memsz { + let file_offset = p_offset + (addr - p_vaddr); + file.seek(SeekFrom::Start(file_offset))?; + let mut buf = vec![0u8; size]; + file.read_exact(&mut buf)?; + return Ok(buf); + } + } + + Err(Error::Msg(format!( + "Address {:#x} not found in kcore segments", + addr + ))) +} + +/// Look up a kernel symbol address from /proc/kallsyms +fn lookup_kernel_symbol(name: &str) -> Result { + let file = File::open("/proc/kallsyms")?; + let reader = BufReader::new(file); + + for line in reader.lines() { + let line = line?; + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 3 && parts[2] == name { + let addr = u64::from_str_radix(parts[0], 16) + .map_err(|e| Error::Msg(format!("Failed to parse address: {}", e)))?; + if addr == 0 { + // kallsyms may show 0 address when kptr_restrict is enabled + return Err(Error::Msg( + "kallsyms shows 0 address, try running as root or set kptr_restrict=0" + 
.to_string(), + )); + } + return Ok(addr); + } + } + + Err(Error::Msg(format!("Symbol {} not found in kallsyms", name))) +} + +/// Extract TPBASE offset from kernel functions +/// +/// This function tries multiple kernel functions to extract the offset +/// of the thread pointer base (fsbase on x86_64, tp_value on arm64) +/// within task_struct. +pub fn extract_tpbase_offset() -> Result { + let analyzers = get_analyzers(); + if analyzers.is_empty() { + return Err(Error::Msg( + "No TPBASE analyzers for this architecture".to_string(), + )); + } + + for analyzer in analyzers { + match lookup_kernel_symbol(analyzer.function_name) { + Ok(addr) => { + trace!( + "Found kernel symbol {} at {:#x}", + analyzer.function_name, + addr + ); + + // Read function code (256 bytes should be enough for analysis) + match read_kernel_code(addr, 256) { + Ok(code) => { + if let Some(offset) = (analyzer.analyze)(&code) { + // Sanity check: offset should be in reasonable range + if offset >= TPBASE_MIN_OFFSET && offset <= TPBASE_MAX_OFFSET { + debug!( + "Extracted TPBASE offset {} ({:#x}) from {}", + offset, offset, analyzer.function_name + ); + return Ok(offset as u64); + } else { + warn!( + "TPBASE offset {} from {} seems invalid (expected {}-{})", + offset, + analyzer.function_name, + TPBASE_MIN_OFFSET, + TPBASE_MAX_OFFSET + ); + } + } + } + Err(e) => { + debug!( + "Failed to read kernel code for {}: {}", + analyzer.function_name, e + ); + } + } + } + Err(e) => { + trace!("Symbol {} not found: {}", analyzer.function_name, e); + } + } + } + + // If we can't extract from kernel functions, try BTF as fallback + extract_tpbase_offset_from_btf() +} + +/// Extract TPBASE offset from kernel BTF information +fn extract_tpbase_offset_from_btf() -> Result { + // Try to use BTF to get the offset of thread.fsbase in task_struct + // This is a fallback when kernel function analysis fails + + // For x86_64: task_struct.thread.fsbase + // For arm64: task_struct.thread.uw.tp_value + + #[cfg(target_arch 
= "x86_64")] + { + // Try parsing /sys/kernel/btf/vmlinux + match parse_btf_for_tpbase() { + Ok(offset) => { + debug!( + "Extracted TPBASE offset {} ({:#x}) from BTF", + offset, offset + ); + return Ok(offset); + } + Err(e) => { + debug!("BTF parsing failed: {}", e); + } + } + + // Fallback to hardcoded defaults based on common kernel configurations + // These values are extracted from various kernel versions + get_default_tpbase_offset() + } + + #[cfg(target_arch = "aarch64")] + { + // For ARM64: task_struct.thread.uw.tp_value + // Default offset for common kernel configurations + get_default_tpbase_offset() + } + + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + { + Err(Error::Msg( + "TPBASE extraction not supported for this architecture".to_string(), + )) + } +} + +/// Parse BTF information from /sys/kernel/btf/vmlinux to get TPBASE offset +#[cfg(target_arch = "x86_64")] +fn parse_btf_for_tpbase() -> Result { + use std::fs::File; + use std::io::Read; + + // Read BTF data from sysfs + let btf_path = "/sys/kernel/btf/vmlinux"; + let mut file = + File::open(btf_path).map_err(|e| Error::Msg(format!("Cannot open {}: {}", btf_path, e)))?; + + let mut btf_data = Vec::new(); + file.read_to_end(&mut btf_data) + .map_err(|e| Error::Msg(format!("Cannot read {}: {}", btf_path, e)))?; + + // Parse BTF header to find task_struct and thread.fsbase offset + // BTF format: https://www.kernel.org/doc/html/latest/bpf/btf.html + parse_btf_task_struct_fsbase(&btf_data) +} + +/// Parse BTF data to find task_struct.thread.fsbase offset +#[cfg(target_arch = "x86_64")] +fn parse_btf_task_struct_fsbase(btf_data: &[u8]) -> Result { + // BTF header structure (from include/uapi/linux/btf.h) + // struct btf_header { + // __u16 magic; // 0xEB9F + // __u8 version; // 1 + // __u8 flags; + // __u32 hdr_len; + // __u32 type_off; // offset of type section + // __u32 type_len; // length of type section + // __u32 str_off; // offset of string section + // __u32 str_len; // length 
of string section + // }; + + if btf_data.len() < 24 { + return Err(Error::Msg("BTF data too small".to_string())); + } + + let magic = u16::from_le_bytes([btf_data[0], btf_data[1]]); + if magic != 0xEB9F { + return Err(Error::Msg(format!("Invalid BTF magic: {:#x}", magic))); + } + + let hdr_len = u32::from_le_bytes([btf_data[4], btf_data[5], btf_data[6], btf_data[7]]) as usize; + let type_off = + u32::from_le_bytes([btf_data[8], btf_data[9], btf_data[10], btf_data[11]]) as usize; + let type_len = + u32::from_le_bytes([btf_data[12], btf_data[13], btf_data[14], btf_data[15]]) as usize; + let str_off = + u32::from_le_bytes([btf_data[16], btf_data[17], btf_data[18], btf_data[19]]) as usize; + let str_len = + u32::from_le_bytes([btf_data[20], btf_data[21], btf_data[22], btf_data[23]]) as usize; + + let type_section_start = hdr_len + type_off; + let str_section_start = hdr_len + str_off; + + if type_section_start + type_len > btf_data.len() + || str_section_start + str_len > btf_data.len() + { + return Err(Error::Msg("BTF sections out of bounds".to_string())); + } + + let type_section = &btf_data[type_section_start..type_section_start + type_len]; + let str_section = &btf_data[str_section_start..str_section_start + str_len]; + + // Find task_struct type ID first + let task_struct_id = find_btf_struct_by_name(type_section, str_section, "task_struct")?; + + // Find thread member in task_struct + let thread_offset = + find_btf_member_offset(type_section, str_section, task_struct_id, "thread")?; + + // Find thread_struct type and fsbase offset within it + let thread_type_id = find_btf_member_type(type_section, str_section, task_struct_id, "thread")?; + let fsbase_offset = + find_btf_member_offset(type_section, str_section, thread_type_id, "fsbase")?; + + let total_offset = thread_offset + fsbase_offset; + + // Sanity check + if total_offset < TPBASE_MIN_OFFSET as u64 || total_offset > TPBASE_MAX_OFFSET as u64 { + return Err(Error::Msg(format!( + "BTF offset {} seems invalid 
(expected {}-{})", + total_offset, TPBASE_MIN_OFFSET, TPBASE_MAX_OFFSET + ))); + } + + Ok(total_offset) +} + +/// BTF type kinds +#[cfg(target_arch = "x86_64")] +const BTF_KIND_STRUCT: u32 = 4; +#[cfg(target_arch = "x86_64")] +const BTF_KIND_UNION: u32 = 5; + +/// Calculate extra size for a BTF type based on its kind and vlen +#[cfg(target_arch = "x86_64")] +fn btf_type_extra_size(kind: u32, vlen: u32) -> usize { + match kind { + 1 => 4, // BTF_KIND_INT + 2 => 0, // PTR + 3 => 12, // ARRAY + 4 | 5 => (vlen * 12) as usize, // STRUCT, UNION: each member is 12 bytes + 6 => (vlen * 8) as usize, // ENUM (64-bit: each enumerator is 8 bytes) + 7 => 0, // FWD + 8 => 0, // TYPEDEF + 9 => 0, // VOLATILE + 10 => 0, // CONST + 11 => 0, // RESTRICT + 12 => 0, // FUNC + 13 => (vlen * 8) as usize, // FUNC_PROTO: each param is 8 bytes + 14 => 12, // VAR + 15 => (vlen * 12) as usize, // DATASEC + 16 => 0, // FLOAT + 17 => 4, // DECL_TAG + 18 => 0, // TYPE_TAG + 19 => (vlen * 8) as usize, // ENUM64 + _ => 0, + } +} + +/// Find BTF struct type ID by name +#[cfg(target_arch = "x86_64")] +fn find_btf_struct_by_name(type_section: &[u8], str_section: &[u8], name: &str) -> Result { + // BTF type format: + // struct btf_type { + // __u32 name_off; + // __u32 info; // kind in bits 24-28 + // union { __u32 size; __u32 type; }; + // }; + // For struct, followed by btf_member array + + let mut offset = 0; + let mut type_id = 1u32; // BTF type IDs start at 1 + + while offset + 12 <= type_section.len() { + let name_off = u32::from_le_bytes([ + type_section[offset], + type_section[offset + 1], + type_section[offset + 2], + type_section[offset + 3], + ]) as usize; + + let info = u32::from_le_bytes([ + type_section[offset + 4], + type_section[offset + 5], + type_section[offset + 6], + type_section[offset + 7], + ]); + + let kind = (info >> 24) & 0x1f; + let vlen = info & 0xffff; // number of members for struct + + // Skip the base btf_type (12 bytes) + let extra_size = btf_type_extra_size(kind, 
vlen); + + if kind == BTF_KIND_STRUCT { + // Check if this is the struct we're looking for + if name_off < str_section.len() { + let type_name = get_btf_string(str_section, name_off); + if type_name == name { + return Ok(type_id); + } + } + } + + offset += 12 + extra_size; // 12 = type_base_size + type_id += 1; + } + + Err(Error::Msg(format!("BTF struct '{}' not found", name))) +} + +/// Find offset of a member in a BTF struct +#[cfg(target_arch = "x86_64")] +fn find_btf_member_offset( + type_section: &[u8], + str_section: &[u8], + struct_id: u32, + member_name: &str, +) -> Result { + // Navigate to the struct type entry + let mut offset = 0; + let mut current_id = 1u32; + + while offset + 12 <= type_section.len() && current_id <= struct_id { + let info = u32::from_le_bytes([ + type_section[offset + 4], + type_section[offset + 5], + type_section[offset + 6], + type_section[offset + 7], + ]); + + let kind = (info >> 24) & 0x1f; + let vlen = info & 0xffff; + + let extra_size = btf_type_extra_size(kind, vlen); + + if current_id == struct_id && (kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION) { + // Found the struct, now search its members + let members_start = offset + 12; // btf_type base size + for i in 0..vlen as usize { + let member_offset = members_start + i * 12; + if member_offset + 12 > type_section.len() { + break; + } + + let mem_name_off = u32::from_le_bytes([ + type_section[member_offset], + type_section[member_offset + 1], + type_section[member_offset + 2], + type_section[member_offset + 3], + ]) as usize; + + let mem_offset_bits = u32::from_le_bytes([ + type_section[member_offset + 8], + type_section[member_offset + 9], + type_section[member_offset + 10], + type_section[member_offset + 11], + ]); + + if mem_name_off < str_section.len() { + let mem_name = get_btf_string(str_section, mem_name_off); + if mem_name == member_name { + // Convert bits to bytes (normal struct members are byte-aligned) + return Ok((mem_offset_bits / 8) as u64); + } + } + } + 
return Err(Error::Msg(format!( + "Member '{}' not found in struct", + member_name + ))); + } + + offset += 12 + extra_size; // 12 = btf_type base size + current_id += 1; + } + + Err(Error::Msg(format!("Struct ID {} not found", struct_id))) +} + +/// Find type ID of a member in a BTF struct +#[cfg(target_arch = "x86_64")] +fn find_btf_member_type( + type_section: &[u8], + str_section: &[u8], + struct_id: u32, + member_name: &str, +) -> Result { + let mut offset = 0; + let mut current_id = 1u32; + + while offset + 12 <= type_section.len() && current_id <= struct_id { + let info = u32::from_le_bytes([ + type_section[offset + 4], + type_section[offset + 5], + type_section[offset + 6], + type_section[offset + 7], + ]); + + let kind = (info >> 24) & 0x1f; + let vlen = info & 0xffff; + + let extra_size = btf_type_extra_size(kind, vlen); + + if current_id == struct_id && (kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION) { + let members_start = offset + 12; // btf_type base size + for i in 0..vlen as usize { + let member_offset = members_start + i * 12; + if member_offset + 12 > type_section.len() { + break; + } + + let mem_name_off = u32::from_le_bytes([ + type_section[member_offset], + type_section[member_offset + 1], + type_section[member_offset + 2], + type_section[member_offset + 3], + ]) as usize; + + let mem_type = u32::from_le_bytes([ + type_section[member_offset + 4], + type_section[member_offset + 5], + type_section[member_offset + 6], + type_section[member_offset + 7], + ]); + + if mem_name_off < str_section.len() { + let mem_name = get_btf_string(str_section, mem_name_off); + if mem_name == member_name { + return Ok(mem_type); + } + } + } + return Err(Error::Msg(format!( + "Member '{}' not found in struct", + member_name + ))); + } + + offset += 12 + extra_size; // 12 = btf_type base size + current_id += 1; + } + + Err(Error::Msg(format!("Struct ID {} not found", struct_id))) +} + +/// Get null-terminated string from BTF string section +#[cfg(target_arch = 
"x86_64")] +fn get_btf_string(str_section: &[u8], offset: usize) -> &str { + if offset >= str_section.len() { + return ""; + } + let bytes = &str_section[offset..]; + let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len()); + std::str::from_utf8(&bytes[..end]).unwrap_or("") +} + +/// Get default TPBASE offset based on common kernel configurations +/// +/// These values are derived from analysis of various kernel versions. +/// The offset of task_struct.thread.fsbase varies based on kernel configuration +/// but typically falls within a predictable range. +fn get_default_tpbase_offset() -> Result { + #[cfg(target_arch = "x86_64")] + { + // Common offsets for x86_64 kernels: + // - kernel 5.x: typically around 0x1940-0x1A00 (6464-6656) + // - kernel 6.x: typically around 0x1940-0x1A80 (6464-6784) + // + // The exact offset depends on CONFIG_* options, especially: + // - CONFIG_KASAN + // - CONFIG_MEMCG + // - CONFIG_CGROUPS + // - CONFIG_BPF_SYSCALL + // + // We use a common value that works for most distributions + let default_offset = DEFAULT_X86_64_TPBASE_OFFSET; + + warn!( + "Using default TPBASE offset {:#x} ({}). This may not be accurate for all kernels.", + default_offset, default_offset + ); + Ok(default_offset) + } + + #[cfg(target_arch = "aarch64")] + { + // For ARM64: task_struct.thread.uw.tp_value + // Common offset for arm64 kernels + let default_offset = DEFAULT_AARCH64_TPBASE_OFFSET; + + warn!( + "Using default TPBASE offset {:#x} for arm64. 
This may not be accurate.", + default_offset + ); + Ok(default_offset) + } + + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + { + Err(Error::Msg( + "Default TPBASE offset not available for this architecture".to_string(), + )) + } +} + +/// C-callable function to read TPBASE offset +#[no_mangle] +pub extern "C" fn read_tpbase_offset() -> i64 { + match extract_tpbase_offset() { + Ok(offset) => offset as i64, + Err(e) => { + warn!("Failed to extract TPBASE offset: {}", e); + // Return default offset instead of -1 to allow Python profiling to work + #[cfg(target_arch = "x86_64")] + { + let default = DEFAULT_X86_64_TPBASE_OFFSET as i64; + warn!("Using fallback TPBASE offset: {:#x}", default); + return default; + } + #[cfg(target_arch = "aarch64")] + { + let default = DEFAULT_AARCH64_TPBASE_OFFSET as i64; + warn!("Using fallback TPBASE offset: {:#x}", default); + return default; + } + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + { + -1 + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_analyze_fsbase_write_task_x86() { + // Test pattern: mov %rsi, 0x1234(%rdi) + // 48 89 b7 34 12 00 00 + let code = [0x48, 0x89, 0xb7, 0x34, 0x12, 0x00, 0x00]; + let result = analyze_fsbase_write_task_x86(&code); + assert_eq!(result, Some(0x1234)); + } + + #[test] + fn test_analyze_fsbase_write_task_x86_with_prefix() { + // Test with some prefix instructions + let code = [ + 0x55, // push rbp + 0x48, 0x89, 0xe5, // mov rbp, rsp + 0x48, 0x89, 0xb7, 0x78, 0x19, 0x00, 0x00, // mov %rsi, 0x1978(%rdi) + 0x5d, // pop rbp + 0xc3, // ret + ]; + let result = analyze_fsbase_write_task_x86(&code); + assert_eq!(result, Some(0x1978)); + } +} diff --git a/agent/crates/trace-utils/src/unwind/tsd.rs b/agent/crates/trace-utils/src/unwind/tsd.rs new file mode 100644 index 00000000000..7c88269074f --- /dev/null +++ b/agent/crates/trace-utils/src/unwind/tsd.rs @@ -0,0 +1,864 @@ +/* + * Copyright (c) 2024 Yunshan Networks + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Thread Specific Data (TSD) extraction module +//! +//! This module analyzes the C library's pthread_getspecific function to extract +//! the parameters needed to access thread-specific data from eBPF. +//! +//! The TSD info includes: +//! - offset: Offset from thread pointer base to TSD storage +//! - multiplier: Size of each TSD entry (8 for musl, 16 for glibc) +//! - indirect: Whether indirect addressing is needed (1 for musl, 0 for glibc) +//! +//! C library implementations: +//! - musl: pthread->tsd[key] (indirect, multiplier=8) +//! 
- glibc: pthread->specific_1stblock[key].data (direct, multiplier=16) + +use std::cell::OnceCell; +use std::fs; +use std::path::PathBuf; + +#[cfg(target_arch = "x86_64")] +use ahash::AHashMap; +use log::trace; +use object::{Object, ObjectSection, ObjectSymbol}; +use regex::Regex; + +use crate::error::{Error, Result}; +use crate::maps::{get_memory_mappings, MemoryArea}; + +use super::python::TSDInfo; + +#[cfg(target_arch = "x86_64")] +use iced_x86::{Decoder, DecoderOptions, Instruction, Mnemonic, OpKind, Register}; + +// TSD constants for different C library implementations +/// glibc: each TSD entry is struct pthread_key_data { uintptr_t seq; void *data; } = 16 bytes +const GLIBC_TSD_MULTIPLIER: u8 = 16; +/// musl: each TSD entry is a pointer = 8 bytes +const MUSL_TSD_MULTIPLIER: u8 = 8; + +/// glibc: TSD is inline in pthread struct (no indirection) +const GLIBC_TSD_INDIRECT: u8 = 0; +/// musl: pthread->tsd is a pointer that needs dereferencing +const MUSL_TSD_INDIRECT: u8 = 1; + +/// Default glibc TSD offset (pthread->specific_1stblock + 8, for data field) +const GLIBC_TSD_DEFAULT_OFFSET: i16 = 0x318; +/// Default musl TSD offset (pthread->tsd) +const MUSL_TSD_DEFAULT_OFFSET: i16 = 128; + +thread_local! 
{ + static LIBC_REGEX: OnceCell = OnceCell::new(); +} + +/// Check if a DSO path potentially contains pthread code +pub fn is_potential_tsd_dso(path: &str) -> bool { + LIBC_REGEX.with(|r| { + r.get_or_init(|| Regex::new(r".*/(ld-musl|libc|libpthread)([-.].*)?\.so").unwrap()) + .is_match(path) + }) +} + +/// Find the libc DSO for a given process +fn find_libc_dso(pid: u32) -> Result { + let mm = get_memory_mappings(pid)?; + + // Priority: libc.so.6 > libpthread.so.0 > ld-musl-*.so.1 + // Note: Modern glibc (2.34+) has pthread implementation in libc.so.6, + // so we prefer libc.so.6 over libpthread.so.0 + let candidates: Vec<&MemoryArea> = mm + .iter() + .filter(|m| is_potential_tsd_dso(&m.path)) + .collect(); + + if candidates.is_empty() { + return Err(Error::Msg(format!( + "No libc/pthread DSO found for process {}", + pid + ))); + } + + // Prefer libc.so.6 for modern glibc (pthread is now in libc) + for c in &candidates { + if c.path.contains("libc.so") { + return Ok((*c).clone()); + } + } + // Fallback to libpthread.so.0 for older glibc + for c in &candidates { + if c.path.contains("libpthread") { + return Ok((*c).clone()); + } + } + // musl libc variants + for c in &candidates { + if c.path.contains("libc.musl") || c.path.contains("ld-musl") { + return Ok((*c).clone()); + } + } + + Ok(candidates[0].clone()) +} + +/// Read pthread_getspecific function code from a DSO +fn read_pthread_getspecific_code(pid: u32, dso: &MemoryArea) -> Result> { + // Try reading from the file system first (works for host processes) + if let Ok(code) = read_pthread_getspecific_from_file(pid, dso) { + return Ok(code); + } + + // Fallback: read from process memory (works for container processes) + read_pthread_getspecific_from_memory(pid, dso) +} + +/// Read pthread_getspecific function code from file system +fn read_pthread_getspecific_from_file(pid: u32, dso: &MemoryArea) -> Result> { + let base: PathBuf = ["/proc", &pid.to_string(), "root"].iter().collect(); + let path = 
base.join(&dso.path[1..]); + + let data = fs::read(&path) + .map_err(|e| Error::Msg(format!("Cannot read DSO file {}: {}", path.display(), e)))?; + let obj = object::File::parse(&*data)?; + + read_symbol_code_from_elf(&obj, dso) +} + +/// Read pthread_getspecific function code from process memory +fn read_pthread_getspecific_from_memory(pid: u32, dso: &MemoryArea) -> Result> { + use std::fs::File; + use std::io::{Read, Seek, SeekFrom}; + + // First, get symbol offset from the DSO file (try multiple paths) + let symbol_offset = get_pthread_getspecific_offset(pid, dso)?; + + // Calculate actual memory address + let mem_addr = dso.m_start + symbol_offset; + + // Read from /proc/{pid}/mem + let mem_path = format!("/proc/{}/mem", pid); + let mut mem_file = File::open(&mem_path) + .map_err(|e| Error::Msg(format!("Cannot open {}: {}", mem_path, e)))?; + + mem_file + .seek(SeekFrom::Start(mem_addr)) + .map_err(|e| Error::Msg(format!("Cannot seek in {}: {}", mem_path, e)))?; + + let mut code = vec![0u8; 256]; + mem_file + .read_exact(&mut code) + .map_err(|e| Error::Msg(format!("Cannot read from {}: {}", mem_path, e)))?; + + trace!( + "Read pthread_getspecific from process memory at {:#x}", + mem_addr + ); + + Ok(code) +} + +/// Get the offset of pthread_getspecific within the DSO +fn get_pthread_getspecific_offset(pid: u32, dso: &MemoryArea) -> Result { + // Handle " (deleted)" suffix which may be present in memory mappings + let clean_path_str = if let Some(stripped) = dso.path.strip_suffix(" (deleted)") { + stripped + } else { + &dso.path + }; + + // 1. Try /proc/{pid}/root/{path} + let base: PathBuf = ["/proc", &pid.to_string(), "root"].iter().collect(); + // dso.path usually starts with / + let relative_path = if clean_path_str.starts_with('/') { + &clean_path_str[1..] 
+ } else { + clean_path_str + }; + let path1 = base.join(relative_path); + + if let Ok(data) = fs::read(&path1) { + if let Ok(obj) = object::File::parse(&*data) { + if let Some(offset) = find_pthread_getspecific_offset(&obj) { + return Ok(offset); + } + } + } + + // 2. Try the path directly (for container overlay fs) + if let Ok(data) = fs::read(clean_path_str) { + if let Ok(obj) = object::File::parse(&*data) { + if let Some(offset) = find_pthread_getspecific_offset(&obj) { + return Ok(offset); + } + } + } + + // Note: Do not fallback to host libc paths here. + // If we are in a container, the host libc likely has different offsets. + // Reading incorrect offsets causes us to read garbage from process memory, + // leading to failures in extract_tsd_info. + + Err(Error::Msg(format!( + "Cannot find pthread_getspecific offset in DSO: {}", + dso.path + ))) +} + +/// Find pthread_getspecific symbol offset in an ELF file +fn find_pthread_getspecific_offset(obj: &object::File) -> Option { + let symbol_names = ["__pthread_getspecific", "pthread_getspecific"]; + + for name in symbol_names { + if let Some(sym) = obj + .symbols() + .chain(obj.dynamic_symbols()) + .find(|s| s.name().map(|n| n == name).unwrap_or(false)) + { + return Some(sym.address()); + } + } + None +} + +/// Read symbol code from an ELF file object +fn read_symbol_code_from_elf(obj: &object::File, _dso: &MemoryArea) -> Result> { + // Try both glibc and musl symbol names + let symbol_names = ["__pthread_getspecific", "pthread_getspecific"]; + + for name in symbol_names { + if let Some(sym) = obj + .symbols() + .chain(obj.dynamic_symbols()) + .find(|s| s.name().map(|n| n == name).unwrap_or(false)) + { + let addr = sym.address(); + let size = sym.size().max(256) as usize; // Read at least 256 bytes + + // Read code from the DSO file + if let Some(section) = obj.sections().find(|s| { + let (_start, len) = s.file_range().unwrap_or((0, 0)); + addr >= s.address() && addr < s.address() + len + }) { + let 
section_data = section.data()?; + let offset = (addr - section.address()) as usize; + let end = (offset + size).min(section_data.len()); + + trace!( + "Found {} at {:#x}, reading {} bytes", + name, + addr, + end - offset + ); + return Ok(section_data[offset..end].to_vec()); + } + } + } + + Err(Error::Msg( + "pthread_getspecific symbol not found in DSO".to_string(), + )) +} + +/// Extract TSD info from pthread_getspecific code (x86_64) +#[cfg(target_arch = "x86_64")] +fn extract_tsd_info_x86(code: &[u8]) -> Result { + if let Some(info) = decode_tsd_info_with_disasm(code) { + return Ok(info); + } + legacy_extract_tsd_info_x86(code) +} + +/// Legacy pattern-based extractor kept as a fallback for unknown sequences. +#[cfg(target_arch = "x86_64")] +fn legacy_extract_tsd_info_x86(code: &[u8]) -> Result { + // musl pattern (indirect): + // mov %fs:0x0, %rax ; get pthread struct pointer + // mov offset(%rax), %rax ; load tsd pointer (indirect) + // mov (%rax,%rdi,8), %rax ; return tsd[key] + + // Check for musl pattern first + let musl_fs_pattern = [0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00]; + if code.starts_with(&musl_fs_pattern) { + for i in 9..code.len().saturating_sub(6) { + if code[i] == 0x48 && code[i + 1] == 0x8b && code[i + 2] == 0x80 { + let offset = + i32::from_le_bytes([code[i + 3], code[i + 4], code[i + 5], code[i + 6]]); + trace!("Found musl TSD pattern: offset={}", offset); + return Ok(TSDInfo { + offset: offset as i16, + multiplier: MUSL_TSD_MULTIPLIER, + indirect: MUSL_TSD_INDIRECT, + }); + } + } + } + + // Check for glibc pattern (fs:0x10) + let glibc_fs_pattern = [0x64, 0x48, 0x8b, 0x04, 0x25, 0x10, 0x00, 0x00, 0x00]; + if code.starts_with(&glibc_fs_pattern) || code.windows(9).any(|w| w == glibc_fs_pattern) { + // Strategy 1: Look for SIB addressing (older glibc) + // mov offset(%rax,%rdi,8), %rax + for i in 0..code.len().saturating_sub(8) { + if code[i] == 0x48 && code[i + 1] == 0x8b && code[i + 2] == 0x84 { + let sib = code[i + 3]; + let 
scale = 1 << (sib >> 6); + let offset = + i32::from_le_bytes([code[i + 4], code[i + 5], code[i + 6], code[i + 7]]); + + if scale == 8 || scale == 16 { + trace!( + "Found glibc SIB pattern: offset={}, scale={}", + offset, + scale + ); + return Ok(TSDInfo { + offset: (offset + 8) as i16, + multiplier: GLIBC_TSD_MULTIPLIER, + indirect: GLIBC_TSD_INDIRECT, + }); + } + } + } + + // Strategy 2: Look for split instruction pattern (modern glibc) + // 1. mov %fs:0x10, %rax + // 2. ... (shl/add instructions) + // 3. mov offset(%rax), %rax + for i in 0..code.len().saturating_sub(7) { + // mov offset(%rax), %rax -> 48 8b 80 XX XX XX XX + if code[i] == 0x48 && code[i + 1] == 0x8b && code[i + 2] == 0x80 { + let offset = + i32::from_le_bytes([code[i + 3], code[i + 4], code[i + 5], code[i + 6]]); + + // Heuristic check for reasonable TSD offset + if offset > 0x100 && offset < 0x2000 { + trace!("Found glibc split pattern: offset={}", offset); + return Ok(TSDInfo { + offset: offset as i16, + multiplier: GLIBC_TSD_MULTIPLIER, + indirect: GLIBC_TSD_INDIRECT, + }); + } + } + } + } + + Err(Error::Msg(format!( + "Could not extract TSD info from x86_64 code (len={}). 
Dump: {:02x?}", + code.len(), + code + ))) +} + +/// Extract TSD info from pthread_getspecific code (ARM64) +#[cfg(target_arch = "aarch64")] +fn extract_tsd_info_arm64(_code: &[u8]) -> Result { + // TODO: Implement ARM64 TSD extraction + // For now, use default musl-like parameters + Ok(TSDInfo { + offset: 0, + multiplier: MUSL_TSD_MULTIPLIER, + indirect: MUSL_TSD_INDIRECT, + }) +} + +/// Extract TSD info for a given process +pub fn extract_tsd_info(pid: u32) -> Result { + let dso = find_libc_dso(pid)?; + trace!("Found libc DSO for process {}: {}", pid, dso.path); + + let code = read_pthread_getspecific_code(pid, &dso)?; + trace!("Read {} bytes of pthread_getspecific code", code.len()); + + #[cfg(target_arch = "x86_64")] + { + extract_tsd_info_x86(&code) + } + + #[cfg(target_arch = "aarch64")] + { + extract_tsd_info_arm64(&code) + } + + #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + { + Err(Error::Msg( + "TSD extraction not supported on this architecture".to_string(), + )) + } +} + +#[cfg(target_arch = "x86_64")] +#[derive(Clone, Debug)] +enum Expr { + Unknown, + Const(i64), + Key, + Fs(u64), + Add(Box, Box), + Mul(Box, i64), + Deref(Box), +} + +#[cfg(target_arch = "x86_64")] +#[derive(Clone, Copy, Debug)] +enum BaseComponent { + Fs(u64), + FsDeref { disp: u64, offset: i64 }, +} + +#[cfg(target_arch = "x86_64")] +fn decode_tsd_info_with_disasm(code: &[u8]) -> Option { + let decoder = Decoder::with_ip(64, code, 0, DecoderOptions::NONE); + let mut regs = vec![ + (Register::RAX, Expr::Unknown), + (Register::RBX, Expr::Unknown), + (Register::RCX, Expr::Unknown), + (Register::RDX, Expr::Unknown), + (Register::RSI, Expr::Unknown), + (Register::RDI, Expr::Key), + ] + .into_iter() + .collect::>(); + + let mut result_expr = None; + for instr in decoder { + match instr.mnemonic() { + Mnemonic::Mov => handle_mov(&instr, &mut regs), + Mnemonic::Lea => handle_lea(&instr, &mut regs), + Mnemonic::Add => handle_add(&instr, &mut regs), + Mnemonic::Shl => 
handle_shl(&instr, &mut regs), + Mnemonic::Imul => handle_imul(&instr, &mut regs), + Mnemonic::Ret => { + result_expr = regs.get(&Register::RAX).cloned(); + break; + } + _ => {} + } + } + + let Some(expr) = result_expr else { + return None; + }; + match_tsd_expr(&expr) +} + +#[cfg(target_arch = "x86_64")] +fn match_tsd_expr(expr: &Expr) -> Option { + match_glibc(expr).or_else(|| match_musl(expr)) +} + +#[cfg(target_arch = "x86_64")] +fn match_glibc(expr: &Expr) -> Option { + let Expr::Deref(inner) = expr else { + return None; + }; + let (base, key_coeff, offset) = linear_components(inner)?; + let (fs_disp, base_offset) = match base { + BaseComponent::Fs(fs_disp) => (fs_disp, 0), + BaseComponent::FsDeref { disp, offset } => (disp, offset), + }; + if key_coeff == 0 || fs_disp != 0x10 { + return None; + } + let total_offset = offset + base_offset; + if total_offset < i16::MIN as i64 || total_offset > i16::MAX as i64 { + return None; + } + Some(TSDInfo { + offset: total_offset as i16, + multiplier: key_coeff as u8, + indirect: GLIBC_TSD_INDIRECT, + }) +} + +#[cfg(target_arch = "x86_64")] +fn match_musl(expr: &Expr) -> Option { + let Expr::Deref(inner) = expr else { + return None; + }; + let mut parts = Vec::new(); + flatten_add(inner, &mut parts); + + let mut base = None; + let mut key_coeff = 0i64; + let mut offset = 0i64; + + for part in parts { + match part { + Expr::Deref(addr) => { + if let Some((base_comp, 0, extra)) = linear_components(addr) { + match base_comp { + BaseComponent::Fs(disp) if disp == 0 => base = Some(extra), + BaseComponent::FsDeref { disp, offset: off } if disp == 0 => { + base = Some(off + extra) + } + _ => {} + } + } + } + _ => { + if let Some((None, kc, off)) = linear_components_optional_base(part) { + key_coeff += kc; + offset += off; + } + } + } + } + + let Some(base_off) = base else { + return None; + }; + if key_coeff == 0 { + return None; + } + let total_offset = base_off + offset; + if total_offset < i16::MIN as i64 || total_offset > 
i16::MAX as i64 { + return None; + } + Some(TSDInfo { + offset: total_offset as i16, + multiplier: key_coeff as u8, + indirect: MUSL_TSD_INDIRECT, + }) +} + +#[cfg(target_arch = "x86_64")] +fn flatten_add<'a>(expr: &'a Expr, parts: &mut Vec<&'a Expr>) { + match expr { + Expr::Add(a, b) => { + flatten_add(a, parts); + flatten_add(b, parts); + } + _ => parts.push(expr), + } +} + +#[cfg(target_arch = "x86_64")] +fn linear_components(expr: &Expr) -> Option<(BaseComponent, i64, i64)> { + let (base_opt, key, offset) = linear_components_optional_base(expr)?; + let base = base_opt?; + Some((base, key, offset)) +} + +#[cfg(target_arch = "x86_64")] +fn linear_components_optional_base(expr: &Expr) -> Option<(Option, i64, i64)> { + let mut base = None; + let mut key_coeff = 0i64; + let mut offset = 0i64; + + fn walk( + expr: &Expr, + base: &mut Option, + key_coeff: &mut i64, + offset: &mut i64, + ) -> bool { + match expr { + Expr::Add(a, b) => walk(a, base, key_coeff, offset) && walk(b, base, key_coeff, offset), + Expr::Const(c) => { + *offset += *c; + true + } + Expr::Key => { + *key_coeff += 1; + true + } + Expr::Mul(inner, factor) => { + let mut inner_base = None; + let mut inner_key = 0; + let mut inner_off = 0; + if !walk(inner, &mut inner_base, &mut inner_key, &mut inner_off) { + return false; + } + if inner_base.is_some() { + // Do not try to scale base pointer; treat as unsupported + return false; + } + *key_coeff += inner_key * *factor; + *offset += inner_off * *factor; + true + } + Expr::Fs(disp) => { + if base.is_some() { + return false; + } + *base = Some(BaseComponent::Fs(*disp)); + true + } + Expr::Deref(inner) => { + // Allow one level of deref for musl base + let (inner_base, inner_key, inner_off) = + match linear_components_optional_base(inner) { + Some(v) => v, + None => return false, + }; + if inner_key != 0 || inner_base.is_none() { + return false; + } + if base.is_some() { + return false; + } + let Some(BaseComponent::Fs(disp)) = inner_base else { + return 
false; + }; + *base = Some(BaseComponent::FsDeref { + disp, + offset: inner_off, + }); + true + } + Expr::Unknown => true, + } + } + + if walk(expr, &mut base, &mut key_coeff, &mut offset) { + Some((base, key_coeff, offset)) + } else { + None + } +} + +#[cfg(target_arch = "x86_64")] +fn handle_mov(instr: &Instruction, regs: &mut AHashMap) { + if instr.op0_kind() != OpKind::Register { + return; + } + let dst = canonical_reg(instr.op0_register()); + let new_expr = match instr.op1_kind() { + OpKind::Register => regs + .get(&canonical_reg(instr.op1_register())) + .cloned() + .unwrap_or(Expr::Unknown), + OpKind::Immediate64 => Expr::Const(instr.immediate64() as i64), + OpKind::Immediate32 => Expr::Const(instr.immediate32() as i64), + OpKind::Memory => Expr::Deref(Box::new(build_address_expr(instr, regs))), + _ => Expr::Unknown, + }; + regs.insert(dst, new_expr); +} + +#[cfg(target_arch = "x86_64")] +fn handle_lea(instr: &Instruction, regs: &mut AHashMap) { + if instr.op0_kind() != OpKind::Register || instr.op1_kind() != OpKind::Memory { + return; + } + let dst = canonical_reg(instr.op0_register()); + let expr = build_address_expr(instr, regs); + regs.insert(dst, expr); +} + +#[cfg(target_arch = "x86_64")] +fn handle_add(instr: &Instruction, regs: &mut AHashMap) { + if instr.op0_kind() != OpKind::Register { + return; + } + let dst = canonical_reg(instr.op0_register()); + let left = regs.get(&dst).cloned().unwrap_or(Expr::Unknown); + let right = match instr.op1_kind() { + OpKind::Immediate32 => Expr::Const(instr.immediate32() as i64), + OpKind::Immediate8 => Expr::Const(instr.immediate8() as i64), + OpKind::Register => regs + .get(&canonical_reg(instr.op1_register())) + .cloned() + .unwrap_or(Expr::Unknown), + OpKind::Memory => Expr::Deref(Box::new(build_address_expr(instr, regs))), + _ => Expr::Unknown, + }; + regs.insert(dst, Expr::Add(Box::new(left), Box::new(right))); +} + +#[cfg(target_arch = "x86_64")] +fn handle_shl(instr: &Instruction, regs: &mut AHashMap) { + if 
instr.op0_kind() != OpKind::Register || instr.op1_kind() != OpKind::Immediate8 { + return; + } + let dst = canonical_reg(instr.op0_register()); + let shift = instr.immediate8(); + if shift >= 63 { + return; + } + let factor = 1i64 << shift; + let expr = regs.get(&dst).cloned().unwrap_or(Expr::Unknown); + regs.insert(dst, Expr::Mul(Box::new(expr), factor)); +} + +#[cfg(target_arch = "x86_64")] +fn handle_imul(instr: &Instruction, regs: &mut AHashMap) { + // Only handle the form: imul reg, reg, imm8/imm32 + if instr.op_count() != 3 { + return; + } + if instr.op0_kind() != OpKind::Register || instr.op1_kind() != OpKind::Register { + return; + } + let dst = canonical_reg(instr.op0_register()); + let src = canonical_reg(instr.op1_register()); + if dst != src { + return; + } + let factor = match instr.op2_kind() { + OpKind::Immediate8 => instr.immediate8() as i64, + OpKind::Immediate32 => instr.immediate32() as i64, + _ => return, + }; + let expr = regs.get(&dst).cloned().unwrap_or(Expr::Unknown); + regs.insert(dst, Expr::Mul(Box::new(expr), factor)); +} + +#[cfg(target_arch = "x86_64")] +fn canonical_reg(reg: Register) -> Register { + match reg { + Register::RAX | Register::EAX | Register::AX | Register::AL => Register::RAX, + Register::RBX | Register::EBX | Register::BX | Register::BL => Register::RBX, + Register::RCX | Register::ECX | Register::CX | Register::CL => Register::RCX, + Register::RDX | Register::EDX | Register::DX | Register::DL => Register::RDX, + Register::RSI | Register::ESI | Register::SI | Register::SIL => Register::RSI, + Register::RDI | Register::EDI | Register::DI | Register::DIL => Register::RDI, + Register::R8 | Register::R8D | Register::R8W | Register::R8L => Register::R8, + Register::R9 | Register::R9D | Register::R9W | Register::R9L => Register::R9, + Register::R10 | Register::R10D | Register::R10W | Register::R10L => Register::R10, + Register::R11 | Register::R11D | Register::R11W | Register::R11L => Register::R11, + Register::R12 | 
Register::R12D | Register::R12W | Register::R12L => Register::R12, + Register::R13 | Register::R13D | Register::R13W | Register::R13L => Register::R13, + Register::R14 | Register::R14D | Register::R14W | Register::R14L => Register::R14, + Register::R15 | Register::R15D | Register::R15W | Register::R15L => Register::R15, + _ => reg, + } +} + +#[cfg(target_arch = "x86_64")] +fn build_address_expr(instr: &Instruction, regs: &AHashMap) -> Expr { + let mut expr = + if instr.segment_prefix() == Register::FS && instr.memory_base() == Register::None { + Expr::Fs(instr.memory_displacement64()) + } else { + Expr::Const(instr.memory_displacement64() as i64) + }; + + // Base register: add the symbolic value tracked for it (Unknown when untracked) + if instr.memory_base() != Register::None { + let base = canonical_reg(instr.memory_base()); + let base_expr = regs.get(&base).cloned().unwrap_or(Expr::Unknown); + expr = Expr::Add(Box::new(expr), Box::new(base_expr)); + } else if instr.segment_prefix() == Register::FS && instr.memory_base() == Register::None { + // FS-relative absolute address: already captured as Expr::Fs above + } + + // RIP-relative: iced-x86 already resolves memory_displacement64() to the absolute target address, so use it as-is instead of adding next_ip() a second time + if instr.memory_base() == Register::RIP { + expr = Expr::Const(instr.memory_displacement64() as i64); + } + + // Index register: contributes index * scale to the address + if instr.memory_index() != Register::None { + let idx = canonical_reg(instr.memory_index()); + let idx_expr = regs.get(&idx).cloned().unwrap_or(Expr::Unknown); + let scale = instr.memory_index_scale() as i64; + expr = Expr::Add( + Box::new(expr), + Box::new(Expr::Mul(Box::new(idx_expr), scale)), + ); + } + + expr +} + +/// Get default TSD info based on detected libc type +pub fn get_default_tsd_info(pid: u32) -> TSDInfo { + // Try to detect libc type from memory mappings + if let Ok(mm) = get_memory_mappings(pid) { + for m in &mm { + if m.path.contains("musl") || m.path.contains("ld-musl") { + // musl libc + return TSDInfo { + offset: MUSL_TSD_DEFAULT_OFFSET, + multiplier: MUSL_TSD_MULTIPLIER, + indirect: MUSL_TSD_INDIRECT, + }; + } + if m.path.contains("libc.so.6") || 
m.path.contains("libpthread.so.0") { + // glibc + return TSDInfo { + offset: GLIBC_TSD_DEFAULT_OFFSET, + multiplier: GLIBC_TSD_MULTIPLIER, + indirect: GLIBC_TSD_INDIRECT, + }; + } + } + } + + // Default to glibc-like parameters (most common) + TSDInfo { + offset: GLIBC_TSD_DEFAULT_OFFSET, + multiplier: GLIBC_TSD_MULTIPLIER, + indirect: GLIBC_TSD_INDIRECT, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_is_potential_tsd_dso() { + assert!(is_potential_tsd_dso("/lib/x86_64-linux-gnu/libc.so.6")); + assert!(is_potential_tsd_dso( + "/lib/x86_64-linux-gnu/libpthread.so.0" + )); + assert!(is_potential_tsd_dso("/lib/ld-musl-x86_64.so.1")); + assert!(is_potential_tsd_dso("/usr/lib/libc.musl-x86_64.so.1")); + assert!(!is_potential_tsd_dso("/usr/lib/libpython3.10.so")); + assert!(!is_potential_tsd_dso("/lib/libm.so.6")); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_extract_tsd_info_musl() { + // Simplified musl pthread_getspecific pattern + let code = [ + 0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:0x0, %rax + 0x48, 0x8b, 0x80, 0x80, 0x00, 0x00, 0x00, // mov 0x80(%rax), %rax + 0x48, 0x8b, 0x04, 0xf8, // mov (%rax,%rdi,8), %rax + 0xc3, // ret + ]; + let result = extract_tsd_info_x86(&code); + assert!(result.is_ok()); + let info = result.unwrap(); + assert_eq!(info.offset, 0x80); + assert_eq!(info.multiplier, 8); + assert_eq!(info.indirect, 1); + } + + #[test] + #[cfg(target_arch = "x86_64")] + fn test_extract_tsd_info_glibc_split() { + // pattern: fs:0x10 -> shl -> add -> mov offset + let code = [ + 0x64, 0x48, 0x8b, 0x04, 0x25, 0x10, 0x00, 0x00, 0x00, // mov %fs:0x10, %rax + 0x48, 0xc1, 0xe7, 0x04, // shl $0x4, %rdi + 0x48, 0x01, 0xf8, // add %rdi, %rax + 0x48, 0x8b, 0x80, 0x18, 0x03, 0x00, 0x00, // mov 0x318(%rax), %rax + 0xc3, + ]; + let result = extract_tsd_info_x86(&code); + assert!(result.is_ok()); + let info = result.unwrap(); + assert_eq!(info.offset, 0x318); + assert_eq!(info.multiplier, 16); + 
assert_eq!(info.indirect, 0); + } +} diff --git a/agent/src/ebpf/kernel/include/perf_profiler.h b/agent/src/ebpf/kernel/include/perf_profiler.h index 2a8acb9f8c7..5ca6794eeb2 100644 --- a/agent/src/ebpf/kernel/include/perf_profiler.h +++ b/agent/src/ebpf/kernel/include/perf_profiler.h @@ -102,6 +102,7 @@ struct stack_trace_key_t { typedef struct { __u32 task_struct_stack_offset; + __u64 tpbase_offset; /* Offset of fsbase/tpidr in task_struct for TLS access */ } unwind_sysinfo_t; #define CLASS_NAME_LEN 32 diff --git a/agent/src/ebpf/kernel/perf_profiler.bpf.c b/agent/src/ebpf/kernel/perf_profiler.bpf.c index 803cdafed43..b2b97b103b6 100644 --- a/agent/src/ebpf/kernel/perf_profiler.bpf.c +++ b/agent/src/ebpf/kernel/perf_profiler.bpf.c @@ -165,15 +165,13 @@ MAP_ARRAY(unwind_sysinfo, __u32, unwind_sysinfo_t, 1, FEATURE_FLAG_DWARF_UNWINDI * - To increase capacity, modify max_entries below and rebuild */ -// Python: stores thread state address for stack unwinding -// - python_tstate_addr_map: key=PID, value=thread_state_address (per-thread) -// Pre-allocated: 65536 * (4 + 8 + 32) ≈ 2.8 MB (htab_elem overhead included) +// Python: stores per-process Python unwinding information // - python_unwind_info_map: key=PID, value=python_unwind_info_t (per-process) -// Pre-allocated: 65536 * (4 + 16 + 32) ≈ 3.3 MB (htab_elem overhead included) +// Contains auto_tls_key_addr, version, TSD info for multi-threaded PyThreadState lookup +// Pre-allocated: 65536 * (4 + 24 + 32) ≈ 3.8 MB (htab_elem overhead included) // - python_offsets_map: key=offsets_id, value=python_offsets_t (per-version, reference counted) // Supports 1 Python version at a time (upgrades replace entry) // Pre-allocated: 1 * (1 + 216 + 32) ≈ 249 bytes -MAP_HASH(python_tstate_addr_map, __u32, __u64, 65536, FEATURE_FLAG_PROFILE_PYTHON) MAP_HASH(python_unwind_info_map, __u32, python_unwind_info_t, 65536, FEATURE_FLAG_PROFILE_PYTHON) MAP_HASH(python_offsets_map, __u8, python_offsets_t, 1, FEATURE_FLAG_PROFILE_PYTHON) @@ 
-1278,6 +1276,116 @@ __u32 get_symbol_id(symbol_t * symbol) return id; } +/* + * TSD (Thread Specific Data) helper functions for multi-threaded Python support + * + * These functions read thread-local storage to get per-thread PyThreadState, + * enabling correct Python stack unwinding in multi-threaded applications. + */ + +/* + * Get the thread pointer base (TPBASE) from the current task's task_struct. + * On x86_64, this is the fsbase value; on arm64, it's tp_value. + * + * The TPBASE points to the C library's per-thread data structure (struct pthread) + * which contains thread-local storage including Python's PyThreadState pointer. + */ +static inline __attribute__ ((always_inline)) +int tsd_get_base(void **tsd_base) +{ + __u32 zero = 0; + unwind_sysinfo_t *sysinfo = unwind_sysinfo__lookup(&zero); + if (sysinfo == NULL || sysinfo->tpbase_offset == 0) { + bpf_debug("[TSD] sysinfo or tpbase_offset not available"); + return -1; + } + + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + + /* + * Read task->thread.fsbase (x86_64) or equivalent from task_struct. + * We use the dynamically calculated tpbase_offset since the struct layout + * varies by kernel version. + */ + void *tpbase_ptr = ((char *)task) + sysinfo->tpbase_offset; + if (bpf_probe_read_kernel(tsd_base, sizeof(void *), tpbase_ptr)) { + bpf_debug("[TSD] Failed to read tpbase value"); + return -1; + } + + return 0; +} + +/* + * Read from Thread Specific Data location associated with the provided key. 
+ * + * For musl libc (indirect): + * pthread->tsd is a pointer to an array, so we need an extra dereference + * tsd_addr = *(tsd_base + offset) + key * 8 + * + * For glibc (direct): + * pthread->specific_1stblock is an inline array + * tsd_addr = tsd_base + offset + key * 16 + * (each entry is struct pthread_key_data { uintptr_t seq; void *data; }) + */ +static inline __attribute__ ((always_inline)) +int tsd_read(const tsd_info_t *tsi, const void *tsd_base, int key, void **out) +{ + const void *tsd_addr = tsd_base + tsi->offset; + + if (tsi->indirect) { + /* musl: read the pointer that points to the TSD array */ + if (bpf_probe_read_user(&tsd_addr, sizeof(tsd_addr), tsd_addr)) { + bpf_debug("[TSD] Failed to read indirect TSD pointer"); + return -1; + } + } + + /* Calculate final address using key and multiplier */ + tsd_addr += key * tsi->multiplier; + + bpf_debug("[TSD] Reading from tsd_addr=%lx (key=%d)", (unsigned long)tsd_addr, key); + if (bpf_probe_read_user(out, sizeof(*out), tsd_addr)) { + bpf_debug("[TSD] Failed to read TSD value"); + return -1; + } + + return 0; +} + +/* + * Get PyThreadState for the current thread using Thread Specific Data (TSD). + * + * Python stores each thread's PyThreadState in thread-local storage using + * pthread_setspecific(autoTLSkey, tstate). We read this value using TSD functions. 
+ */ +static inline __attribute__ ((always_inline)) +void *get_py_thread_state(python_unwind_info_t *py_info) +{ + void *tsd_base; + if (tsd_get_base(&tsd_base) != 0) { + return NULL; + } + + /* Read the autoTLSkey value from Python runtime */ + int auto_tls_key; + if (bpf_probe_read_user(&auto_tls_key, sizeof(auto_tls_key), + (void *)py_info->auto_tls_key_addr)) { + bpf_debug("[PYTHON] Failed to read autoTLSkey"); + return NULL; + } + + bpf_debug("[PYTHON] autoTLSkey=%d, tsd_base=%lx", auto_tls_key, (unsigned long)tsd_base); + + /* Read PyThreadState from TSD */ + void *thread_state = NULL; + if (tsd_read(&py_info->tsd_info, tsd_base, auto_tls_key, &thread_state) != 0) { + return NULL; + } + + return thread_state; +} + static inline __attribute__ ((always_inline)) int pre_python_unwind(void *ctx, unwind_state_t * state, map_group_t *maps, int jmp_idx) { @@ -1294,20 +1402,11 @@ int pre_python_unwind(void *ctx, unwind_state_t * state, return 0; } - void *thread_state; - if (bpf_probe_read_user - (&thread_state, sizeof(thread_state), - (void *)py_unwind_info->thread_state_address) != 0) { - return 0; - } - + /* Get per-thread PyThreadState using TSD mechanism */ + void *thread_state = get_py_thread_state(py_unwind_info); if (thread_state == NULL) { - __u64 *addr = python_tstate_addr_map__lookup(&state->key.tgid); - if (addr && *addr != 0) { - thread_state = (void *)*addr; - } else { - return 0; - } + bpf_debug("[PYTHON] Failed to get thread state via TSD"); + return 0; } if (bpf_probe_read_user @@ -2067,18 +2166,11 @@ PROGPE(v8_unwind) (struct bpf_perf_event_data *ctx) { return 0; } -URETPROG(python_save_tstate_addr) (struct pt_regs * ctx) { - __u64 ret = PT_REGS_RC(ctx); - __u32 tgid = bpf_get_current_pid_tgid() >> 32; - - __u64 *addr = python_tstate_addr_map__lookup(&tgid); - if (addr) { - *addr = ret; - } else { - python_tstate_addr_map__update(&tgid, &ret); - } - return 0; -} +/* + * NOTE: python_save_tstate_addr uprobe has been removed. 
+ * The TSD mechanism now handles per-thread PyThreadState lookup directly + * without needing to intercept PyEval_SaveThread calls. + */ PROGPE(oncpu_output) (struct bpf_perf_event_data * ctx) { __u32 zero = 0; diff --git a/agent/src/ebpf/user/unwind_tracer.c b/agent/src/ebpf/user/unwind_tracer.c index b7bab0c1d4e..82f9379dc4b 100644 --- a/agent/src/ebpf/user/unwind_tracer.c +++ b/agent/src/ebpf/user/unwind_tracer.c @@ -241,11 +241,33 @@ static bool requires_dwarf_unwind_table(int pid) { } int unwind_tracer_init(struct bpf_tracer *tracer) { - int32_t offset = read_offset_of_stack_in_task_struct(); - if (offset < 0) { + /* Initialize unwind_sysinfo with task_struct offsets */ + int32_t stack_offset = read_offset_of_stack_in_task_struct(); + if (stack_offset < 0) { ebpf_warning("unwind tracer init: failed to get field stack offset in task struct from btf"); ebpf_warning("unwinder may not handle in kernel perf events correctly"); - } else if (!bpf_table_set_value(tracer, MAP_UNWIND_SYSINFO_NAME, 0, &offset)) { + } + + /* Get tpbase_offset for TLS access (needed for Python multi-threading support) */ + int64_t tpbase_offset = read_tpbase_offset(); + if (tpbase_offset < 0) { + ebpf_warning("unwind tracer init: failed to get tpbase offset from kernel"); + ebpf_warning("Python multi-threaded profiling may not work correctly"); + tpbase_offset = 0; + } else { + ebpf_info("unwind tracer init: tpbase_offset=%ld", tpbase_offset); + } + + /* Update unwind_sysinfo map with both offsets */ + struct { + uint32_t task_struct_stack_offset; + uint64_t tpbase_offset; + } sysinfo = { + .task_struct_stack_offset = stack_offset > 0 ? 
(uint32_t)stack_offset : 0, + .tpbase_offset = (uint64_t)tpbase_offset, + }; + + if (!bpf_table_set_value(tracer, MAP_UNWIND_SYSINFO_NAME, 0, &sysinfo)) { ebpf_warning("unwind tracer init: update %s error", MAP_UNWIND_SYSINFO_NAME); ebpf_warning("unwinder may not handle in kernel perf events correctly"); } @@ -337,48 +359,7 @@ int unwind_tracer_init(struct bpf_tracer *tracer) { return 0; } -static struct symbol python_symbols[] = { { .type = PYTHON_UPROBE, - .symbol = "PyEval_SaveThread", - .probe_func = URETPROBE_FUNC_NAME(python_save_tstate_addr), - .is_probe_ret = true, }, }; -static void python_parse_and_register(int pid, struct tracer_probes_conf *conf) { - char *path = NULL; - int n = 0; - - if (pid <= 1) - goto out; - - if (!is_user_process(pid)) - goto out; - - // Python symbols may reside in the main executable or libpython.so - // Check both - path = get_elf_path_by_pid(pid); - if (path) { - n = add_probe_sym_to_tracer_probes(pid, path, conf, python_symbols, NELEMS(python_symbols)); - if (n > 0) { - ebpf_info("python uprobe, pid:%d, path:%s\n", pid, path); - free(path); - return; - } - } - - path = get_so_path_by_pid_and_name(pid, "python3"); - if (!path) { - path = get_so_path_by_pid_and_name(pid, "python2"); - if (!path) { - goto out; - } - } - - ebpf_info("python uprobe, pid:%d, path:%s\n", pid, path); - add_probe_sym_to_tracer_probes(pid, path, conf, python_symbols, NELEMS(python_symbols)); - -out: - free(path); - return; -} static void lua_parse_and_register(int pid, struct tracer_probes_conf *conf) { lua_runtime_info_t info = {0}; @@ -567,11 +548,7 @@ void unwind_events_handle(void) { tracer = event->tracer; if (tracer && python_profiler_enabled() && is_python_process(event->pid)) { python_unwind_table_load(g_python_unwind_table, event->pid); - pthread_mutex_lock(&tracer->mutex_probes_lock); - python_parse_and_register(event->pid, tracer->tps); - tracer_uprobes_update(tracer); - tracer_hooks_process(tracer, HOOK_ATTACH, &count); - 
pthread_mutex_unlock(&tracer->mutex_probes_lock); + // Note: Python profiling uses TSD mechanism, no uprobes needed } if (tracer && php_profiler_enabled() && is_php_process(event->pid)) {