diff --git a/agent/Cargo.lock b/agent/Cargo.lock
index 8ebcf72435c..bc4b0a96b81 100644
--- a/agent/Cargo.lock
+++ b/agent/Cargo.lock
@@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4
 
 [[package]]
 name = "addr2line"
@@ -2098,6 +2098,15 @@ dependencies = [
  "cc",
 ]
 
+[[package]]
+name = "iced-x86"
+version = "1.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c447cff8c7f384a7d4f741cfcff32f75f3ad02b406432e8d6c878d56b1edf6b"
+dependencies = [
+ "lazy_static",
+]
+
 [[package]]
 name = "id-arena"
 version = "2.2.1"
@@ -4689,6 +4698,7 @@ dependencies = [
  "cfg-if",
  "env_logger 0.11.5",
  "gimli 0.31.0",
+ "iced-x86",
  "libc",
  "log 0.4.22",
  "nix 0.29.0",
diff --git a/agent/crates/trace-utils/Cargo.toml b/agent/crates/trace-utils/Cargo.toml
index c69d5c40f76..e13e750dd1a 100644
--- a/agent/crates/trace-utils/Cargo.toml
+++ b/agent/crates/trace-utils/Cargo.toml
@@ -20,6 +20,7 @@ regex.workspace = true
 rustc-demangle = "0.1"
 semver = "1.0"
 thiserror = "1.0"
+iced-x86 = { version = "1.21", default-features = false, features = ["std", "decoder"] }
 
 [build-dependencies]
 cc = "1.0"
diff --git a/agent/crates/trace-utils/cbindgen.toml b/agent/crates/trace-utils/cbindgen.toml
index 997612f2872..94628b8224c 100644
--- a/agent/crates/trace-utils/cbindgen.toml
+++ b/agent/crates/trace-utils/cbindgen.toml
@@ -60,7 +60,8 @@ include = [
   "LuaUnwindInfo",
   "LuaUnwindTable",
   "LuaOfs",
-  "LjOfs"
+  "LjOfs",
+  "TSDInfo"
 ]
 exclude = [
   "bpf_update_elem",
@@ -80,6 +81,7 @@ ProcessShardList = "process_shard_list_t"
 UnwindEntry = "unwind_entry_t"
 UnwindEntryShard = "unwind_entry_shard_t"
 UnwindTable = "unwind_table_t"
+TSDInfo = "tsd_info_t"
 PyCframe = "py_cframe_t"
 PyCodeObject = "py_code_object_t"
 PyFrameObject = "py_frame_object_t"
diff --git a/agent/crates/trace-utils/src/error.rs b/agent/crates/trace-utils/src/error.rs
index 11a0a52630f..4300fbad34d 100644
--- a/agent/crates/trace-utils/src/error.rs
+++ b/agent/crates/trace-utils/src/error.rs
@@ -37,6 +37,8 @@ pub enum Error {
     InvalidPointer(u64),
     #[error("Invalid or corrupted data")]
     InvalidData,
+    #[error("{0}")]
+    Msg(String),
 }
 
 pub type Result<T> = std::result::Result<T, Error>;
diff --git a/agent/crates/trace-utils/src/maps.rs b/agent/crates/trace-utils/src/maps.rs
index ec65c19706d..94dd315017d 100644
--- a/agent/crates/trace-utils/src/maps.rs
+++ b/agent/crates/trace-utils/src/maps.rs
@@ -21,7 +21,7 @@ use std::path::PathBuf;
 
 use log::trace;
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct MemoryArea {
     pub m_start: u64,
     pub mx_start: u64, // start address of executable section
diff --git a/agent/crates/trace-utils/src/trace_utils.h b/agent/crates/trace-utils/src/trace_utils.h
index e35934fa992..51a40a91cf5 100644
--- a/agent/crates/trace-utils/src/trace_utils.h
+++ b/agent/crates/trace-utils/src/trace_utils.h
@@ -91,8 +91,41 @@ typedef struct {
 } unwind_entry_shard_t;
 
 typedef struct {
-    uint64_t thread_state_address;
+    /**
+     * Offset from thread pointer base (TPBASE) to TSD storage
+     */
+    int16_t offset;
+    /**
+     * TSD key multiplier (glibc=16, musl=8)
+     */
+    uint8_t multiplier;
+    /**
+     * Whether indirect addressing is needed (musl=1, glibc=0)
+     */
+    uint8_t indirect;
+} tsd_info_t;
+
+typedef struct {
+    /**
+     * Address of autoTLSkey variable in Python runtime
+     */
+    uint64_t auto_tls_key_addr;
+    /**
+     * Python version encoded as 0xMMmm (e.g., 0x030A for 3.10)
+     */
+    uint16_t version;
+    /**
+     * Thread Specific Data info for multi-threading support
+     */
+    tsd_info_t tsd_info;
+    /**
+     * ID for looking up python_offsets in the offsets map
+     */
     uint8_t offsets_id;
+    /**
+     * Padding for alignment
+     */
+    uint8_t _padding[5];
 } python_unwind_info_t;
 
 typedef struct {
@@ -396,6 +429,12 @@ void v8_unwind_table_unload(v8_unwind_table_t *table, uint32_t pid);
 
 int32_t read_offset_of_stack_in_task_struct(void);
 
+/**
+ * Read TPBASE offset from kernel functions
+ * Returns the offset of fsbase/tpidr in task_struct, or -1 on failure
+ */
+int64_t read_tpbase_offset(void);
+
 int rustc_demangle(const char *mangled, char *out, size_t out_size);
 
 unwind_table_t *unwind_table_create(int32_t process_shard_list_map_fd,
diff --git a/agent/crates/trace-utils/src/unwind.rs b/agent/crates/trace-utils/src/unwind.rs
index c3067b6e090..2a83284f96a 100644
--- a/agent/crates/trace-utils/src/unwind.rs
+++ b/agent/crates/trace-utils/src/unwind.rs
@@ -19,6 +19,8 @@ pub mod elf_utils;
 pub mod lua;
 pub mod php;
 pub mod python;
+pub mod tpbase;
+pub mod tsd;
 pub mod v8;
 
 use std::alloc::{alloc, dealloc, handle_alloc_error, Layout};
diff --git a/agent/crates/trace-utils/src/unwind/python.rs b/agent/crates/trace-utils/src/unwind/python.rs
index 8cdeb8718f4..06f1ca12474 100644
--- a/agent/crates/trace-utils/src/unwind/python.rs
+++ b/agent/crates/trace-utils/src/unwind/python.rs
@@ -18,8 +18,11 @@ use std::{
     cell::OnceCell, collections::HashMap, ffi::CStr, fs, io::Write, mem, path::PathBuf, slice,
 };
 
+#[cfg(target_arch = "x86_64")]
+use ahash::AHashMap;
 use libc::c_void;
 use log::{debug, trace, warn};
+use object::{Object, ObjectSection, ObjectSymbol};
 use regex::Regex;
 use semver::{Version, VersionReq};
 
@@ -29,8 +32,73 @@ use crate::{
     utils::{bpf_delete_elem, bpf_update_elem, get_errno, IdGenerator, BPF_ANY},
 };
 
+/// Maximum distance (0x10000 bytes = 64 KiB), on either side of _PyRuntime, for a valid
+/// autoTLSkey address; candidates at or beyond this distance are rejected.
+/// This is a validation threshold, not a version-specific offset
+#[cfg(target_arch = "x86_64")]
+const PYRUNTIME_MAX_DISTANCE: u64 = 0x10000;
+
 use super::elf_utils::MappedFile;
 
+#[cfg(target_arch = "x86_64")]
+use iced_x86::{Decoder, DecoderOptions, Mnemonic, OpKind, Register};
+
+#[cfg(target_arch = "x86_64")]
+#[derive(Clone, Copy, Debug)]
+enum Value {
+    Known(u64),
+    /// Address expressed as _PyRuntime + offset (can be negative)
+    RuntimeBase(i64),
+    Unknown,
+}
+
+#[cfg(target_arch = "x86_64")]
+impl Value {
+    /// Adds a signed delta: wrapping for `Known`, saturating for `RuntimeBase` offsets.
+    fn add(self, rhs: i64) -> Self {
+        match self {
+            Value::Known(v) => Value::Known(v.wrapping_add(rhs as u64)),
+            Value::RuntimeBase(off) => Value::RuntimeBase(off.saturating_add(rhs)),
+            Value::Unknown => Value::Unknown,
+        }
+    }
+
+    /// Subtracts a signed delta: wrapping for `Known`, saturating for `RuntimeBase` offsets.
+    fn sub(self, rhs: i64) -> Self {
+        match self {
+            Value::Known(v) => Value::Known(v.wrapping_sub(rhs as u64)),
+            Value::RuntimeBase(off) => Value::RuntimeBase(off.saturating_sub(rhs)),
+            Value::Unknown => Value::Unknown,
+        }
+    }
+
+    /// Left shift (< 63 bits). `(_PyRuntime + off) << n` is not base-relative for n > 0.
+    fn shl(self, shift: u8) -> Self {
+        match self {
+            Value::Known(v) if shift < 63 => Value::Known(v << shift),
+            Value::RuntimeBase(off) if shift == 0 => Value::RuntimeBase(off),
+            _ => Value::Unknown,
+        }
+    }
+
+    /// Bitwise OR with `rhs`; only defined for `Known` values.
+    fn or(self, rhs: u64) -> Self {
+        match self {
+            Value::Known(v) => Value::Known(v | rhs),
+            _ => Value::Unknown,
+        }
+    }
+
+    /// Resolves to an absolute address, using `runtime_addr` as the `_PyRuntime` address.
+    fn to_runtime_addr(self, runtime_addr: u64) -> Option<u64> {
+        match self {
+            Value::Known(v) => Some(v),
+            Value::RuntimeBase(off) => Some(runtime_addr.wrapping_add(off as u64)),
+            Value::Unknown => None,
+        }
+    }
+}
+
 fn error_not_python(pid: u32) -> Error {
     Error::BadInterpreterType(pid, "python")
 }
@@ -175,11 +243,393 @@ impl Interpreter {
         }
         Err(error_not_supported_version(self.pid, self.version.clone()))
     }
+
+    /// Name of the function whose machine code is analyzed to locate autoTLSkey.
+    const GIL_STATE_SYMBOL: &'static str = "PyGILState_GetThisThreadState";
+
+    /// Find the autoTLSkey address from PyGILState_GetThisThreadState function.
+    ///
+    /// Python stores each thread's PyThreadState in thread-local storage using
+    /// pthread_setspecific/getspecific. The key is stored in autoTLSkey variable
+    /// within _PyRuntime structure.
+    ///
+    /// Fails for versions not matching `>=3.7.0`, when `_PyRuntime` cannot be found, or
+    /// when the function code cannot be read.
+    fn find_auto_tls_key_address(&mut self) -> Result<u64> {
+        if !VersionReq::parse(">=3.7.0").unwrap().matches(&self.version) {
+            return Err(error_not_supported_version(self.pid, self.version.clone()));
+        }
+
+        // Get _PyRuntime address as reference point
+        let runtime_addr = self
+            .find_symbol_address(Self::RUNTIME_SYMBOL)?
+            .ok_or_else(|| Error::Msg("_PyRuntime symbol not found".to_string()))?;
+
+        // Read PyGILState_GetThisThreadState function code
+        let (sym_addr, code) = self.read_symbol_code(Self::GIL_STATE_SYMBOL, 512)?;
+
+        // Analyze the function to find the autoTLSkey address
+        #[cfg(target_arch = "x86_64")]
+        {
+            self.decode_auto_tls_key_x86(&code, sym_addr, runtime_addr)
+        }
+
+        #[cfg(target_arch = "aarch64")]
+        {
+            self.decode_auto_tls_key_arm64(&code, sym_addr, runtime_addr)
+        }
+
+        #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+        {
+            Err(Error::Msg(
+                "autoTLSkey extraction not supported on this architecture".to_string(),
+            ))
+        }
+    }
+
+    /// Read up to `size` bytes of the code of symbol `name`, searching the executable
+    /// first and then the library image (if any). Returns the symbol address plus the
+    /// image base address, together with the bytes copied from the containing section.
+    fn read_symbol_code(&mut self, name: &str, size: usize) -> Result<(u64, Vec<u8>)> {
+        for file in [Some(&mut self.exe), self.lib.as_mut()] {
+            let Some(file) = file else { continue };
+            file.load()?;
+            let obj = object::File::parse(&*file.contents)?;
+
+            // Undefined (imported) entries have address 0, which would match unloaded
+            // sections that also start at address 0, so only definitions are considered.
+            let Some(sym) = obj
+                .symbols()
+                .chain(obj.dynamic_symbols())
+                .find(|s| !s.is_undefined() && s.name().map(|n| n == name).unwrap_or(false))
+            else {
+                continue;
+            };
+            let addr = sym.address();
+
+            // Find the section containing this symbol
+            for section in obj.sections() {
+                let offset = addr.checked_sub(section.address());
+                let Some(offset) = offset.filter(|&o| o < section.size()) else { continue };
+                let section_data = section.data()?;
+                // Sections with fewer file bytes than `offset` (e.g. no contents) are skipped
+                let Some(tail) = section_data.get(offset as usize..) else { continue };
+                let code = tail[..size.min(tail.len())].to_vec();
+
+                // Calculate the actual address in memory
+                let ba = file.base_address()?;
+                return Ok((addr + ba, code));
+            }
+        }
+        Err(Error::Msg(format!("Symbol {} not found", name)))
+    }
+
+    /// Decode autoTLSkey address from x86_64 assembly
+    ///
+    /// Linearly decodes `code` (mapped at `code_addr`) while tracking register contents,
+    /// with RAX seeded as `_PyRuntime + 0`. For each instruction whose first operand is
+    /// EDI/RDI, candidate addresses are tried in this order and the first one accepted
+    /// by `validate_auto_tls_candidate` is returned:
+    ///   1. `runtime_addr` + displacement of a RAX-based memory source,
+    ///   2. the resolved memory operand of a LEA/MOV,
+    ///   3. the value tracked for RDI.
+    ///
+    /// When no candidate is accepted, `runtime_addr` plus the Python 3.10 offset is
+    /// returned, so this function never yields `Err`.
+    #[cfg(target_arch = "x86_64")]
+    fn decode_auto_tls_key_x86(
+        &self,
+        code: &[u8],
+        code_addr: u64,
+        runtime_addr: u64,
+    ) -> Result<u64> {
+        let mut regs = AHashMap::new();
+        regs.insert(Register::RAX, Value::RuntimeBase(0));
+        for instr in Decoder::with_ip(64, code, code_addr, DecoderOptions::NONE) {
+            Self::propagate_known(&mut regs, &instr);
+
+            if Self::canon_reg(instr.op0_register()) != Register::RDI {
+                continue;
+            }
+            let from_memory = instr.op1_kind() == OpKind::Memory;
+
+            // Fast path: mov/lea from [_PyRuntime + disp] into EDI/RDI
+            if from_memory {
+                if let Some(addr) = Self::assume_runtime_base(&instr, runtime_addr)
+                    .and_then(|target| self.validate_auto_tls_candidate(target, runtime_addr))
+                {
+                    debug!(
+                        "Found autoTLSkey address {:#x} (mov/lea from RAX base)",
+                        addr
+                    );
+                    return Ok(addr);
+                }
+            }
+
+            // LEA/MOV: resolve the memory operand through the tracked registers
+            let mnemonic = instr.mnemonic();
+            if from_memory && matches!(mnemonic, Mnemonic::Lea | Mnemonic::Mov) {
+                let resolved = Self::compute_mem_addr(&instr, &regs)
+                    .or_else(|| {
+                        Self::assume_runtime_base(&instr, runtime_addr).map(Value::Known)
+                    })
+                    .and_then(|val| val.to_runtime_addr(runtime_addr))
+                    .and_then(|addr| self.validate_auto_tls_candidate(addr, runtime_addr));
+                if let Some(valid) = resolved {
+                    if mnemonic == Mnemonic::Lea {
+                        debug!("Found autoTLSkey address {:#x} (LEA)", valid);
+                    } else {
+                        debug!("Found autoTLSkey address {:#x} (MOV)", valid);
+                    }
+                    return Ok(valid);
+                }
+            }
+
+            // Last resort for this instruction: the value propagated into RDI
+            let tracked = regs
+                .get(&Register::RDI)
+                .and_then(|v| v.to_runtime_addr(runtime_addr))
+                .and_then(|val| self.validate_auto_tls_candidate(val, runtime_addr));
+            if let Some(addr) = tracked {
+                debug!("Found autoTLSkey address {:#x}", addr);
+                return Ok(addr);
+            }
+        }
+
+        // Fallback: calculate from known _PyRuntime offsets for Python 3.10
+        // autoTLSkey is at _PyRuntime.gilstate.autoTSSkey._key
+        let fallback_addr = runtime_addr + PY310_INITIAL_STATE.auto_tls_key_offset;
+        debug!(
+            "Could not find autoTLSkey from disassembly, using fallback offset {:#x}",
+            fallback_addr
+        );
+        Ok(fallback_addr)
+    }
+
+    /// Accept `target_addr` as an autoTLSkey candidate only if it lies strictly within
+    /// `PYRUNTIME_MAX_DISTANCE` bytes of `runtime_addr` (on either side).
+    ///
+    /// For Python >= 3.7 an 8-byte-aligned candidate is advanced by 4 bytes before being
+    /// returned; any other accepted candidate is returned unchanged.
+    #[cfg(target_arch = "x86_64")]
+    fn validate_auto_tls_candidate(&self, target_addr: u64, runtime_addr: u64) -> Option<u64> {
+        if target_addr.abs_diff(runtime_addr) >= PYRUNTIME_MAX_DISTANCE {
+            return None;
+        }
+
+        // NOTE(review): the +4 presumably skips a leading field to reach the key value
+        // itself (cf. "autoTSSkey._key" mentioned elsewhere in this file) — confirm.
+        let needs_adjust = self.version >= Version::new(3, 7, 0) && target_addr % 8 == 0;
+        if needs_adjust {
+            Some(target_addr + 4)
+        } else {
+            Some(target_addr)
+        }
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    fn canon_reg(reg: Register) -> Register {
+        match reg {
+            Register::RAX | Register::EAX => Register::RAX,
+            Register::RBX | Register::EBX => Register::RBX,
+            Register::RCX | Register::ECX => Register::RCX,
+            Register::RDX | Register::EDX => Register::RDX,
+            Register::RSI | Register::ESI => Register::RSI,
+            Register::RDI | Register::EDI => Register::RDI,
+            Register::R8 | Register::R8D => Register::R8,
+            Register::R9 | Register::R9D => Register::R9,
+            Register::R10 | Register::R10D => Register::R10,
+            Register::R11 | Register::R11D => Register::R11,
+            Register::R12 | Register::R12D => Register::R12,
+            Register::R13 | Register::R13D => Register::R13,
+            Register::R14 | Register::R14D => Register::R14,
+            Register::R15 | Register::R15D => Register::R15,
+            _ => reg,
+        }
+    }
+
+    /// Update the tracked register state in `map` with the effect of `instr`.
+    ///
+    /// Only MOV, LEA, ADD, SUB, SHL, AND and OR with a register destination are modelled;
+    /// any other instruction leaves `map` untouched. A MOV from memory records the
+    /// operand's effective address rather than the loaded value.
+    ///
+    /// ADD/SUB/SHL/AND/OR are ignored while the destination register is not tracked.
+    ///
+    /// Immediates are read through `imm_operand`, which also accepts the sign-extended
+    /// kinds (`Immediate8to32`, `Immediate8to64`, `Immediate32to64`) that iced-x86 reports
+    /// for 32/64-bit instructions encoded with a shorter immediate.
+    #[cfg(target_arch = "x86_64")]
+    fn propagate_known(map: &mut AHashMap<Register, Value>, instr: &iced_x86::Instruction) {
+        /// Immediate carried by operand 1, or `None` if operand 1 is not an immediate of
+        /// a supported kind. Sign-extended kinds keep their sign.
+        fn imm_operand(instr: &iced_x86::Instruction) -> Option<i64> {
+            match instr.op1_kind() {
+                OpKind::Immediate8 => Some(instr.immediate8() as i64),
+                OpKind::Immediate8to32 => Some(instr.immediate8to32() as i64),
+                OpKind::Immediate8to64 => Some(instr.immediate8to64()),
+                OpKind::Immediate32 => Some(instr.immediate32() as i64),
+                OpKind::Immediate32to64 => Some(instr.immediate32to64()),
+                OpKind::Immediate64 => Some(instr.immediate64() as i64),
+                _ => None,
+            }
+        }
+
+        // Every modelled instruction writes its result to a register operand.
+        if instr.op0_kind() != OpKind::Register {
+            return;
+        }
+        let dst = Self::canon_reg(instr.op0_register());
+
+        match instr.mnemonic() {
+            // dst = source operand
+            Mnemonic::Mov => {
+                let val = match instr.op1_kind() {
+                    OpKind::Register => map.get(&Self::canon_reg(instr.op1_register())).copied(),
+                    OpKind::Memory => Self::compute_mem_addr(instr, map),
+                    _ => imm_operand(instr).map(|imm| Value::Known(imm as u64)),
+                };
+                // NOTE(review): an unresolved source keeps the previous (possibly stale)
+                // value of `dst` instead of invalidating it.
+                if let Some(v) = val {
+                    map.insert(dst, v);
+                }
+            }
+            // dst = effective address of the memory operand
+            Mnemonic::Lea => {
+                if instr.op1_kind() != OpKind::Memory {
+                    return;
+                }
+                if let Some(addr) = Self::compute_mem_addr(instr, map) {
+                    map.insert(dst, addr);
+                }
+            }
+            // dst = dst +/- operand
+            Mnemonic::Add | Mnemonic::Sub => {
+                let Some(base) = map.get(&dst).copied() else {
+                    return;
+                };
+                let delta = match instr.op1_kind() {
+                    OpKind::Register => {
+                        match map.get(&Self::canon_reg(instr.op1_register())).copied() {
+                            Some(Value::Known(v)) => v as i64,
+                            Some(Value::RuntimeBase(off)) => off,
+                            // Untracked or unknown source register: value is left as is.
+                            _ => 0,
+                        }
+                    }
+                    // Operands that are neither register nor immediate count as zero.
+                    _ => imm_operand(instr).unwrap_or(0),
+                };
+                let updated = if instr.mnemonic() == Mnemonic::Add {
+                    base.add(delta)
+                } else {
+                    base.sub(delta)
+                };
+                map.insert(dst, updated);
+            }
+            // dst = dst << imm8
+            Mnemonic::Shl => {
+                if instr.op1_kind() != OpKind::Immediate8 {
+                    return;
+                }
+                if let Some(base) = map.get(&dst).copied() {
+                    map.insert(dst, base.shl(instr.immediate8()));
+                }
+            }
+            // dst = dst & imm  /  dst = dst | imm
+            Mnemonic::And | Mnemonic::Or => {
+                let Some(base) = map.get(&dst).copied() else {
+                    return;
+                };
+                let Some(imm) = imm_operand(instr) else {
+                    return;
+                };
+                let bits = imm as u64;
+                // AND clears bits and OR sets them; neither can be expressed on a
+                // `_PyRuntime`-relative value, which therefore becomes `Unknown`.
+                let updated = match (instr.mnemonic(), base) {
+                    (Mnemonic::And, Value::Known(v)) => Value::Known(v & bits),
+                    (Mnemonic::Or, Value::Known(v)) => Value::Known(v | bits),
+                    _ => Value::Unknown,
+                };
+                map.insert(dst, updated);
+            }
+            _ => {}
+        }
+    }
+
+    /// Evaluate the address of `instr`'s memory operand from the tracked registers.
+    ///
+    /// The result is `Some(Value::Unknown)` when the base register is not resolvable
+    /// and `None` only when an index register is present but absent from `map`.
+    #[cfg(target_arch = "x86_64")]
+    fn compute_mem_addr(
+        instr: &iced_x86::Instruction,
+        map: &AHashMap<Register, Value>,
+    ) -> Option<Value> {
+        let disp = instr.memory_displacement64();
+        let with_disp = match instr.memory_base() {
+            // For RIP-relative operands iced-x86 already reports the absolute target
+            // address as the displacement, so the next IP must not be added again; with
+            // no base register the displacement is the address itself.
+            Register::RIP | Register::None => Value::Known(disp),
+            base => map
+                .get(&Self::canon_reg(base))
+                .copied()
+                .unwrap_or(Value::Unknown)
+                .add(disp as i64),
+        };
+
+        if instr.memory_index() == Register::None {
+            return Some(with_disp);
+        }
+
+        // An index register that was never tracked makes the operand unresolvable.
+        let idx_val = map
+            .get(&Self::canon_reg(instr.memory_index()))
+            .copied()?;
+        let scale = instr.memory_index_scale() as i64;
+        let result = match idx_val {
+            // Wrapping multiply: tracked values are arbitrary and must not panic on
+            // overflow in builds with overflow checks enabled.
+            Value::Known(idx) => with_disp.add((idx as i64).wrapping_mul(scale)),
+            _ => Value::Unknown,
+        };
+        Some(result)
+    }
+
+    /// Treat a RAX-based memory operand as `_PyRuntime + displacement`; `None` otherwise.
+    #[cfg(target_arch = "x86_64")]
+    fn assume_runtime_base(instr: &iced_x86::Instruction, runtime_addr: u64) -> Option<u64> {
+        let rax_based = instr.memory_base() == Register::RAX;
+        let displacement = instr.memory_displacement64();
+        rax_based.then(|| runtime_addr.wrapping_add(displacement))
+    }
+
+    /// ARM64 placeholder for autoTLSkey decoding: ignores the code, returns a fixed offset.
+    #[cfg(target_arch = "aarch64")]
+    fn decode_auto_tls_key_arm64(
+        &self,
+        _code: &[u8],
+        _code_addr: u64,
+        runtime_addr: u64,
+    ) -> Result<u64> {
+        // TODO: Implement ARM64 pattern analysis
+        // Until then every version gets _PyRuntime + the Python 3.10 offset (unverified).
+        let fallback_addr = runtime_addr + PY310_INITIAL_STATE.auto_tls_key_offset;
+        warn!(
+            "ARM64 autoTLSkey extraction not implemented, using fallback {:#x}",
+            fallback_addr
+        );
+        Ok(fallback_addr)
+    }
 }
 
 pub struct InterpreterInfo {
     pub version: Version,
     pub thread_address: u64,
+    pub auto_tls_key_addr: u64,
 }
 
 impl InterpreterInfo {
@@ -202,9 +652,24 @@ impl InterpreterInfo {
         );
 
         let mut intp = Interpreter::new(pid, exe_area, lib_area)?;
+
+        // Get auto_tls_key_addr for multi-threading support
+        let auto_tls_key_addr = match intp.find_auto_tls_key_address() {
+            Ok(addr) => addr,
+            Err(e) => {
+                warn!("Failed to find autoTLSkey address for process#{pid}: {e}, using fallback");
+                // Fallback: calculate from known Python 3.10 _PyRuntime offset
+                let runtime_addr = intp
+                    .find_symbol_address(Interpreter::RUNTIME_SYMBOL)?
+                    .ok_or_else(|| Error::Msg("_PyRuntime not found".to_string()))?;
+                runtime_addr + PY310_INITIAL_STATE.auto_tls_key_offset
+            }
+        };
+
         Ok(Self {
             version: intp.version.clone(),
             thread_address: intp.thread_state_address()?,
+            auto_tls_key_addr,
         })
     }
 
@@ -220,18 +685,63 @@ impl InterpreterInfo {
     }
 }
 
+/// Python runtime state offsets
+/// These are offsets within the _PyRuntime global structure for a specific Python version
 pub struct InitialState {
+    /// Offset of _PyRuntime.gilstate.tstate_current (points to current thread's PyThreadState)
+    /// For single-threaded or main thread access
     tstate_current: u64,
+
+    /// Offset of _PyRuntime.gilstate.autoTSSkey._key (pthread_key_t value) on x86_64
+    /// Used for multi-threaded PyThreadState lookup via TSD
+    /// Note: This is the offset to the pthread_key_t value, not the Py_tss_t struct itself
+    #[cfg(target_arch = "x86_64")]
+    auto_tls_key_offset: u64,
+
+    /// Offset of _PyRuntime.gilstate.autoTSSkey._key (pthread_key_t value) on ARM64
+    /// Used for multi-threaded PyThreadState lookup via TSD
+    /// Note: This offset needs verification on actual ARM64 Python builds
+    #[cfg(target_arch = "aarch64")]
+    auto_tls_key_offset: u64,
 }
 
+#[cfg(target_arch = "x86_64")]
+const PY310_INITIAL_STATE: &InitialState = &InitialState {
+    tstate_current: 568,
+    auto_tls_key_offset: 0x24c, // _PyRuntime.gilstate.autoTSSkey._key
+};
+
+#[cfg(target_arch = "aarch64")]
 const PY310_INITIAL_STATE: &InitialState = &InitialState {
     tstate_current: 568,
+    auto_tls_key_offset: 0x57c, // _PyRuntime.gilstate.autoTSSkey._key (0x578 + 4)
 };
 
+/// Thread Specific Data information for accessing per-thread PyThreadState
+/// This is used to correctly unwind Python stacks in multi-threaded applications
+#[repr(C)]
+#[derive(Debug, Clone, Copy, Default)]
+pub struct TSDInfo {
+    /// Offset from thread pointer base (TPBASE) to TSD storage
+    pub offset: i16,
+    /// TSD key multiplier (glibc=16, musl=8)
+    pub multiplier: u8,
+    /// Whether indirect addressing is needed (musl=1, glibc=0)
+    pub indirect: u8,
+}
+
 #[repr(C)]
 pub struct PythonUnwindInfo {
-    pub thread_state_address: u64,
+    /// Address of autoTLSkey variable in Python runtime
+    pub auto_tls_key_addr: u64,
+    /// Python version encoded as 0xMMmm (e.g., 0x030A for 3.10)
+    pub version: u16,
+    /// Thread Specific Data info for multi-threading support
+    pub tsd_info: TSDInfo,
+    /// ID for looking up python_offsets in the offsets map
     pub offsets_id: u8,
+    /// Padding for alignment
+    pub _padding: [u8; 5],
 }
 
 #[repr(C)]
@@ -394,11 +904,39 @@ impl PythonUnwindTable {
                 id as u8
             }
         };
-        let info = PythonUnwindInfo {
-            thread_state_address: info.thread_address,
+
+        // Extract TSD info for multi-threading support
+        let tsd_info = match super::tsd::extract_tsd_info(pid) {
+            Ok(tsd) => {
+                debug!(
+                    "Extracted TSD info for process#{pid}: offset={}, multiplier={}, indirect={}",
+                    tsd.offset, tsd.multiplier, tsd.indirect
+                );
+                tsd
+            }
+            Err(e) => {
+                debug!("Failed to extract TSD info for process#{pid}: {e}, using defaults");
+                super::tsd::get_default_tsd_info(pid)
+            }
+        };
+
+        // Encode version as 0xMMmm (e.g., 3.10 -> 0x030A)
+        let version = ((info.version.major as u16) << 8) | (info.version.minor as u16);
+
+        let unwind_info = PythonUnwindInfo {
+            auto_tls_key_addr: info.auto_tls_key_addr,
+            version,
+            tsd_info,
             offsets_id,
+            _padding: [0; 5],
         };
-        self.update_unwind_info_map(pid, &info);
+
+        debug!(
+            "Loading Python unwind info for process#{pid}: autoTLSkey={:#x}, version={:#x}",
+            unwind_info.auto_tls_key_addr, unwind_info.version
+        );
+
+        self.update_unwind_info_map(pid, &unwind_info);
     }
 
     pub unsafe fn unload(&mut self, pid: u32) {
diff --git a/agent/crates/trace-utils/src/unwind/tpbase.rs b/agent/crates/trace-utils/src/unwind/tpbase.rs
new file mode 100644
index 00000000000..28283b5b704
--- /dev/null
+++ b/agent/crates/trace-utils/src/unwind/tpbase.rs
@@ -0,0 +1,810 @@
+/*
+ * Copyright (c) 2024 Yunshan Networks
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//! Thread Pointer Base (TPBASE) offset extraction
+//!
+//! This module extracts the offset of fsbase/tpidr in task_struct by analyzing
+//! kernel functions. This is needed for accessing Thread Local Storage (TLS)
+//! from eBPF programs.
+//!
+//! On x86_64, the thread pointer base is stored in task_struct.thread.fsbase
+//! On arm64, it's stored in task_struct.thread.uw.tp_value
+
+use std::fs::File;
+use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
+
+use log::{debug, trace, warn};
+
+use crate::error::{Error, Result};
+
+// Bounds used to sanity-check extracted TPBASE offsets
+/// Minimum reasonable TPBASE offset (task_struct.thread.fsbase should be after basic fields)
+const TPBASE_MIN_OFFSET: u32 = 500;
+/// Maximum reasonable TPBASE offset (thread struct shouldn't be too deep in task_struct)
+const TPBASE_MAX_OFFSET: u32 = 20000;
+
+// Last-resort TPBASE offsets, used when nothing could be extracted from the running kernel
+/// Default x86_64 TPBASE offset (common for Ubuntu/Debian kernels 5.x/6.x)
+#[cfg(target_arch = "x86_64")]
+const DEFAULT_X86_64_TPBASE_OFFSET: u64 = 0x1978; // 6520 bytes
+/// Default ARM64 TPBASE offset (placeholder, needs verification)
+#[cfg(target_arch = "aarch64")]
+const DEFAULT_AARCH64_TPBASE_OFFSET: u64 = 0x1000; // 4096 bytes
+
+/// Pairs a kernel symbol with the routine that scans its machine code for the TPBASE offset.
+struct Analyzer {
+    function_name: &'static str, // kernel symbol resolved through /proc/kallsyms
+    analyze: fn(&[u8]) -> Option<u32>, // scans the symbol's code bytes; `None` when nothing matches
+}
+
+/// Returns the analyzers compiled in for the target architecture, in the order they are tried.
+fn get_analyzers() -> Vec<Analyzer> {
+    #[cfg(target_arch = "x86_64")]
+    {
+        vec![
+            Analyzer {
+                function_name: "x86_fsbase_write_task",
+                analyze: analyze_fsbase_write_task_x86,
+            },
+            Analyzer {
+                function_name: "aout_dump_debugregs", // fallback for kernels lacking the symbol above
+                analyze: analyze_aout_dump_debugregs_x86,
+            },
+        ]
+    }
+    #[cfg(target_arch = "aarch64")]
+    {
+        vec![Analyzer {
+            function_name: "tls_thread_switch",
+            analyze: analyze_tls_thread_switch_arm64,
+        }]
+    }
+    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+    {
+        vec![] // no analyzers exist for other architectures
+    }
+}
+
+/// Recover the fsbase offset from the machine code of `x86_fsbase_write_task`.
+///
+/// That kernel function (available since 4.20) stores its second argument,
+/// the new fsbase value, into the task_struct passed as its first argument:
+///
+///   48 89 b7 XX XX XX XX    mov %rsi, 0xXXXXXXXX(%rdi)
+///
+/// The little-endian disp32 of the first such store is returned; `None` when
+/// the opcode bytes are absent or the displacement is truncated.
+#[cfg(target_arch = "x86_64")]
+fn analyze_fsbase_write_task_x86(code: &[u8]) -> Option<u32> {
+    // REX.W (48) + MOV r/m64, r64 (89) + ModRM b7 (disp32, reg=rsi, rm=rdi)
+    const STORE_OPCODE: [u8; 3] = [0x48, 0x89, 0xb7];
+
+    // Only the first occurrence of the opcode bytes is considered.
+    let start = code
+        .windows(STORE_OPCODE.len())
+        .position(|window| window == STORE_OPCODE)?;
+    // The four displacement bytes directly follow the three opcode bytes.
+    let disp = code.get(start + 3..start + 7)?;
+    let fsbase = u32::from_le_bytes([disp[0], disp[1], disp[2], disp[3]]);
+
+    trace!(
+        "Found fsbase offset {:#x} from x86_fsbase_write_task",
+        fsbase
+    );
+    Some(fsbase)
+}
+
+/// Analyze aout_dump_debugregs function to extract fsbase offset.
+/// This is a fallback for older kernels that don't have x86_fsbase_write_task.
+///
+/// Heuristic, not a real disassembly: the first `48 8b /r` load from
+/// `disp32(%rdi)` whose displacement lies strictly between TPBASE_MIN_OFFSET and
+/// TPBASE_MAX_OFFSET is taken as the debugreg access, and fsbase is assumed to sit
+/// 16 bytes before it. Returns `None` when no such load is present.
+#[cfg(target_arch = "x86_64")]
+fn analyze_aout_dump_debugregs_x86(code: &[u8]) -> Option<u32> {
+    // REX.W prefix + opcode + ModRM + disp32
+    const INSN_LEN: usize = 7;
+
+    // `windows` yields every complete 7-byte candidate, including the one that ends
+    // exactly at the end of `code` (an index loop bounded by `len - 7` skips it).
+    for insn in code.windows(INSN_LEN) {
+        // REX.W MOV r64, r/m64
+        if insn[0] != 0x48 || insn[1] != 0x8b {
+            continue;
+        }
+
+        // ModRM: mod=10 selects a disp32 operand, rm=111 selects %rdi as the base
+        let modrm = insn[2];
+        if modrm >> 6 != 2 || modrm & 0x7 != 7 {
+            continue;
+        }
+
+        let offset = u32::from_le_bytes([insn[3], insn[4], insn[5], insn[6]]);
+        // Displacements outside the plausible task_struct range belong to other loads
+        if offset > TPBASE_MIN_OFFSET && offset < TPBASE_MAX_OFFSET {
+            // Adjust for the debugreg offset (fsbase is typically 16 bytes before)
+            let fsbase_offset = offset.saturating_sub(16);
+            trace!(
+                "Found potential fsbase offset {:#x} (adjusted from {:#x}) from aout_dump_debugregs",
+                fsbase_offset, offset
+            );
+            return Some(fsbase_offset);
+        }
+    }
+    None
+}
+
+/// Placeholder for extracting the tp_value offset from `tls_thread_switch` on ARM64.
+#[cfg(target_arch = "aarch64")]
+fn analyze_tls_thread_switch_arm64(_code: &[u8]) -> Option<u32> {
+    // Not implemented: no instruction pattern is matched yet, so `_code` is ignored
+    // and the caller always receives `None` from this analyzer.
+    None
+}
+
+/// Read `size` bytes of kernel memory starting at virtual address `addr`.
+///
+/// /dev/kmem is tried first because it needs no ELF parsing, but it may not be
+/// available on all systems, so any failure there (open, seek or read) falls
+/// back to /proc/kcore; only a /proc/kcore failure is reported to the caller.
+fn read_kernel_code(addr: u64, size: usize) -> Result<Vec<u8>> {
+    // Try /dev/kmem (may not be available on all systems)
+    if let Ok(mut kmem) = File::open("/dev/kmem") {
+        let mut buf = vec![0u8; size];
+        let attempt = kmem
+            .seek(SeekFrom::Start(addr))
+            .and_then(|_| kmem.read_exact(&mut buf));
+        match attempt {
+            Ok(()) => return Ok(buf),
+            Err(e) => trace!("Reading {:#x} from /dev/kmem failed: {}", addr, e),
+        }
+    }
+    read_kernel_code_from_kcore(&mut File::open("/proc/kcore")?, addr, size)
+}
+
+/// Read `size` bytes at kernel virtual address `addr` through /proc/kcore.
+fn read_kernel_code_from_kcore(file: &mut File, addr: u64, size: usize) -> Result<Vec<u8>> {
+    use object::{elf, read::elf::FileHeader, Endianness};
+
+    // /proc/kcore is an ELF core image: start with the first 4 KiB, which should
+    // be enough to hold the ELF header plus the program header table.
+    let mut image = vec![0u8; 4096];
+    file.seek(SeekFrom::Start(0))?;
+    file.read_exact(&mut image)?;
+
+    // First pass: only learn the byte order and where the program headers end.
+    // The parsed header borrows `image`, so it is dropped before a possible resize.
+    let (endian, table_end) = {
+        let ehdr = elf::FileHeader64::<Endianness>::parse(&image[..])
+            .map_err(|e| Error::Msg(format!("Failed to parse kcore ELF header: {}", e)))?;
+        let endian = ehdr
+            .endian()
+            .map_err(|e| Error::Msg(format!("Failed to get endianness: {}", e)))?;
+        let table_start = ehdr.e_phoff(endian) as usize;
+        let entry_count = ehdr.e_phnum(endian) as usize;
+        let entry_size = ehdr.e_phentsize(endian) as usize;
+        (endian, table_start + entry_count * entry_size)
+    };
+
+    // Re-read from the start of the file when the table extends past the first read.
+    if table_end > image.len() {
+        image.resize(table_end, 0);
+        file.seek(SeekFrom::Start(0))?;
+        file.read_exact(&mut image)?;
+    }
+
+    // Second pass over the (possibly larger) buffer to obtain the segment table.
+    let ehdr = elf::FileHeader64::<Endianness>::parse(&image[..])
+        .map_err(|e| Error::Msg(format!("Failed to parse kcore ELF header: {}", e)))?;
+    let segments = ehdr
+        .program_headers(endian, &image[..])
+        .map_err(|e| Error::Msg(format!("Failed to parse program headers: {}", e)))?;
+
+    find_and_read_segment(file, segments, endian, addr, size)
+}
+
+/// Locate the PT_LOAD segment that maps `addr` and read `size` bytes from its file image.
+fn find_and_read_segment<E: object::endian::Endian>(
+    file: &mut File,
+    program_headers: &[object::elf::ProgramHeader64<E>],
+    endian: E,
+    addr: u64,
+    size: usize,
+) -> Result<Vec<u8>> {
+    use object::{elf, read::elf::ProgramHeader};
+
+    // Find the segment containing our address
+    for phdr in program_headers {
+        if phdr.p_type(endian) != elf::PT_LOAD {
+            continue;
+        }
+
+        // `addr - p_vaddr < p_memsz` instead of `addr < p_vaddr + p_memsz`: the sum can
+        // overflow u64 for a segment mapped at the very top of the address space.
+        let delta = match addr.checked_sub(phdr.p_vaddr(endian)) {
+            Some(delta) if delta < phdr.p_memsz(endian) => delta,
+            _ => continue,
+        };
+
+        file.seek(SeekFrom::Start(phdr.p_offset(endian) + delta))?;
+        let mut buf = vec![0u8; size];
+        file.read_exact(&mut buf)?;
+        return Ok(buf);
+    }
+
+    Err(Error::Msg(format!(
+        "Address {:#x} not found in kcore segments",
+        addr
+    )))
+}
+
+/// Resolve `name` to its address using the first matching /proc/kallsyms entry.
+fn lookup_kernel_symbol(name: &str) -> Result<u64> {
+    let kallsyms = BufReader::new(File::open("/proc/kallsyms")?);
+    for entry in kallsyms.lines() {
+        let entry = entry?;
+        // Field 0 is the hex address and field 2 the symbol name; shorter lines are skipped.
+        let mut fields = entry.split_whitespace();
+        let hex = match (fields.next(), fields.nth(1)) {
+            (Some(hex), Some(symbol)) if symbol == name => hex,
+            _ => continue,
+        };
+        return match u64::from_str_radix(hex, 16) {
+            Err(e) => Err(Error::Msg(format!("Failed to parse address: {}", e))),
+            // kallsyms may show 0 address when kptr_restrict is enabled
+            Ok(0) => Err(Error::Msg(
+                "kallsyms shows 0 address, try running as root or set kptr_restrict=0"
+                    .to_string(),
+            )),
+            Ok(address) => Ok(address),
+        };
+    }
+
+    Err(Error::Msg(format!("Symbol {} not found in kallsyms", name)))
+}
+
+/// Determine the byte offset of the thread pointer base inside task_struct.
+///
+/// Each analyzer of the current architecture is tried in order: its kernel
+/// symbol is resolved, 256 bytes of its code are read and scanned, and the
+/// first offset within TPBASE_MIN_OFFSET..=TPBASE_MAX_OFFSET is returned.
+/// When every analyzer fails, the BTF-based fallback decides the result.
+/// Errors immediately when the architecture has no analyzers at all.
+pub fn extract_tpbase_offset() -> Result<u64> {
+    let analyzers = get_analyzers();
+    if analyzers.is_empty() {
+        return Err(Error::Msg(
+            "No TPBASE analyzers for this architecture".to_string(),
+        ));
+    }
+
+    for Analyzer { function_name, analyze } in analyzers {
+        // A symbol that cannot be resolved just moves on to the next analyzer.
+        let addr = match lookup_kernel_symbol(function_name) {
+            Ok(addr) => addr,
+            Err(e) => {
+                trace!("Symbol {} not found: {}", function_name, e);
+                continue;
+            }
+        };
+        trace!("Found kernel symbol {} at {:#x}", function_name, addr);
+
+        // Read function code (256 bytes should be enough for analysis)
+        let code = match read_kernel_code(addr, 256) {
+            Ok(code) => code,
+            Err(e) => {
+                debug!("Failed to read kernel code for {}: {}", function_name, e);
+                continue;
+            }
+        };
+
+        // An analyzer that matches nothing is skipped without logging.
+        let offset = match analyze(&code) {
+            Some(offset) => offset,
+            None => continue,
+        };
+
+        // Sanity check: offset should be in reasonable range
+        if (TPBASE_MIN_OFFSET..=TPBASE_MAX_OFFSET).contains(&offset) {
+            debug!(
+                "Extracted TPBASE offset {} ({:#x}) from {}",
+                offset, offset, function_name
+            );
+            return Ok(offset as u64);
+        }
+        warn!(
+            "TPBASE offset {} from {} seems invalid (expected {}-{})",
+            offset,
+            function_name,
+            TPBASE_MIN_OFFSET,
+            TPBASE_MAX_OFFSET
+        );
+    }
+
+    // If we can't extract from kernel functions, try BTF as fallback
+    extract_tpbase_offset_from_btf()
+}
+
+/// Fallback TPBASE lookup: kernel BTF on x86_64, then the hardcoded per-architecture default.
+fn extract_tpbase_offset_from_btf() -> Result<u64> {
+    // Called once every kernel-function analyzer has failed. Only the x86_64 branch
+    // actually reads BTF; the arm64 branch goes straight to the hardcoded default.
+
+    // Field being located on x86_64: task_struct.thread.fsbase
+    // Field being located on arm64: task_struct.thread.uw.tp_value
+
+    #[cfg(target_arch = "x86_64")]
+    {
+        // BTF is read from /sys/kernel/btf/vmlinux; a failure is only logged at debug level
+        match parse_btf_for_tpbase() {
+            Ok(offset) => {
+                debug!(
+                    "Extracted TPBASE offset {} ({:#x}) from BTF",
+                    offset, offset
+                );
+                return Ok(offset);
+            }
+            Err(e) => {
+                debug!("BTF parsing failed: {}", e);
+            }
+        }
+
+        // BTF failed: fall back to the hardcoded default for common kernel configurations
+        // (get_default_tpbase_offset logs a warning whenever that default is used)
+        get_default_tpbase_offset()
+    }
+
+    #[cfg(target_arch = "aarch64")]
+    {
+        // No BTF lookup is implemented for task_struct.thread.uw.tp_value here,
+        // so the default offset is returned unconditionally
+        get_default_tpbase_offset()
+    }
+
+    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+    {
+        Err(Error::Msg(
+            "TPBASE extraction not supported for this architecture".to_string(),
+        ))
+    }
+}
+
+/// Load the raw kernel BTF blob from sysfs and extract the fsbase offset from it.
+#[cfg(target_arch = "x86_64")]
+fn parse_btf_for_tpbase() -> Result<u64> {
+    use std::fs::File;
+    use std::io::Read;
+
+    const BTF_PATH: &str = "/sys/kernel/btf/vmlinux";
+
+    // Opening and reading are reported separately so the failing step is visible.
+    let mut raw = Vec::new();
+    File::open(BTF_PATH)
+        .map_err(|e| Error::Msg(format!("Cannot open {}: {}", BTF_PATH, e)))?
+        .read_to_end(&mut raw)
+        .map_err(|e| Error::Msg(format!("Cannot read {}: {}", BTF_PATH, e)))?;
+
+    // The blob layout is described at
+    // https://www.kernel.org/doc/html/latest/bpf/btf.html
+    parse_btf_task_struct_fsbase(&raw)
+}
+
+/// Walk a raw BTF blob and compute the byte offset of task_struct.thread.fsbase.
+///
+/// Fails when the header is truncated or carries the wrong magic, when a section
+/// lies outside the blob, when a lookup fails, or when the sum is implausible.
+#[cfg(target_arch = "x86_64")]
+fn parse_btf_task_struct_fsbase(btf_data: &[u8]) -> Result<u64> {
+    // Layout of struct btf_header (include/uapi/linux/btf.h), decoded little-endian:
+    //   bytes  0..2   magic      (0xEB9F)
+    //   byte   2      version
+    //   byte   3      flags
+    //   bytes  4..8   hdr_len
+    //   bytes  8..12  type_off   (offset of type section)
+    //   bytes 12..16  type_len   (length of type section)
+    //   bytes 16..20  str_off    (offset of string section)
+    //   bytes 20..24  str_len    (length of string section)
+    const HEADER_SIZE: usize = 24;
+    const BTF_MAGIC: u16 = 0xEB9F;
+
+    if btf_data.len() < HEADER_SIZE {
+        return Err(Error::Msg("BTF data too small".to_string()));
+    }
+
+    let magic = u16::from_le_bytes([btf_data[0], btf_data[1]]);
+    if magic != BTF_MAGIC {
+        return Err(Error::Msg(format!("Invalid BTF magic: {:#x}", magic)));
+    }
+
+    // Little-endian u32 header field starting at byte `at`, widened to usize.
+    let field = |at: usize| {
+        let raw = [btf_data[at], btf_data[at + 1], btf_data[at + 2], btf_data[at + 3]];
+        u32::from_le_bytes(raw) as usize
+    };
+    let hdr_len = field(4);
+    let (type_off, type_len) = (field(8), field(12));
+    let (str_off, str_len) = (field(16), field(20));
+
+    // Both section offsets are applied on top of hdr_len.
+    let types_begin = hdr_len + type_off;
+    let strings_begin = hdr_len + str_off;
+    let types_end = types_begin + type_len;
+    let strings_end = strings_begin + str_len;
+    if types_end > btf_data.len() || strings_end > btf_data.len() {
+        return Err(Error::Msg("BTF sections out of bounds".to_string()));
+    }
+
+    let type_section = &btf_data[types_begin..types_end];
+    let str_section = &btf_data[strings_begin..strings_end];
+
+    // Lookup chain: task_struct, then its `thread` member (offset and type),
+    // then `fsbase` inside the type of that member.
+    let task_struct_id = find_btf_struct_by_name(type_section, str_section, "task_struct")?;
+    let thread_offset =
+        find_btf_member_offset(type_section, str_section, task_struct_id, "thread")?;
+    let thread_type_id = find_btf_member_type(type_section, str_section, task_struct_id, "thread")?;
+    let fsbase_offset =
+        find_btf_member_offset(type_section, str_section, thread_type_id, "fsbase")?;
+
+    let total_offset = thread_offset + fsbase_offset;
+
+    // Sanity check against the range accepted for TPBASE offsets
+    let plausible = (TPBASE_MIN_OFFSET as u64)..=(TPBASE_MAX_OFFSET as u64);
+    if !plausible.contains(&total_offset) {
+        return Err(Error::Msg(format!(
+            "BTF offset {} seems invalid (expected {}-{})",
+            total_offset, TPBASE_MIN_OFFSET, TPBASE_MAX_OFFSET
+        )));
+    }
+
+    Ok(total_offset)
+}
+
+/// BTF kind code (read from bits 24-28 of `btf_type.info`) identifying a struct
+#[cfg(target_arch = "x86_64")]
+const BTF_KIND_STRUCT: u32 = 4;
+#[cfg(target_arch = "x86_64")]
+const BTF_KIND_UNION: u32 = 5; // kind code identifying a union; searched like a struct
+
+/// Size in bytes of the kind-specific data that follows the 12-byte `btf_type` record.
+#[cfg(target_arch = "x86_64")]
+fn btf_type_extra_size(kind: u32, vlen: u32) -> usize {
+    match kind {
+        1 => 4,                        // INT: one u32 describing encoding/offset/bits
+        2 => 0,                        // PTR
+        3 => 12,                       // ARRAY: struct btf_array
+        4 | 5 => (vlen * 12) as usize, // STRUCT, UNION: each btf_member is 12 bytes
+        6 => (vlen * 8) as usize,      // ENUM: each btf_enum is 8 bytes
+        7 => 0,                        // FWD
+        8 => 0,                        // TYPEDEF
+        9 => 0,                        // VOLATILE
+        10 => 0,                       // CONST
+        11 => 0,                       // RESTRICT
+        12 => 0,                       // FUNC
+        13 => (vlen * 8) as usize,     // FUNC_PROTO: each btf_param is 8 bytes
+        14 => 4,                       // VAR: struct btf_var is a single u32 (linkage)
+        15 => (vlen * 12) as usize,    // DATASEC: each btf_var_secinfo is 12 bytes
+        16 => 0,                       // FLOAT
+        17 => 4,                       // DECL_TAG: struct btf_decl_tag (component_idx)
+        18 => 0,                       // TYPE_TAG
+        19 => (vlen * 12) as usize,    // ENUM64: each btf_enum64 is 12 bytes
+        _ => 0,                        // unknown kind: assume no trailing data
+    }
+}
+
+/// Scan the BTF type section for the first STRUCT named `name` and return its type ID.
+///
+/// Records are visited in order starting at ID 1; every record advances the ID
+/// by one and the cursor by 12 bytes plus its kind-specific payload. An error is
+/// returned when the section is exhausted without a match.
+#[cfg(target_arch = "x86_64")]
+fn find_btf_struct_by_name(type_section: &[u8], str_section: &[u8], name: &str) -> Result<u32> {
+    // Fixed part of every record:
+    // struct btf_type {
+    //     __u32 name_off;
+    //     __u32 info;       // kind in bits 24-28, vlen in bits 0-15
+    //     union { __u32 size; __u32 type; };
+    // };
+    const RECORD_SIZE: usize = 12;
+
+    // Little-endian u32 at byte position `at` of the type section
+    let word = |at: usize| {
+        u32::from_le_bytes([
+            type_section[at],
+            type_section[at + 1],
+            type_section[at + 2],
+            type_section[at + 3],
+        ])
+    };
+
+    // Byte cursor into the type section and the ID of the record under it
+    let mut cursor = 0;
+    let mut type_id = 1u32; // BTF type IDs start at 1
+
+    while cursor + RECORD_SIZE <= type_section.len() {
+        let name_off = word(cursor) as usize;
+        let info = word(cursor + 4);
+
+        let kind = (info >> 24) & 0x1f;
+        let vlen = info & 0xffff; // number of members for struct
+
+        // Names pointing outside the string section never match.
+        let is_match = kind == BTF_KIND_STRUCT
+            && name_off < str_section.len()
+            && get_btf_string(str_section, name_off) == name;
+        if is_match {
+            return Ok(type_id);
+        }
+
+        // Step over the fixed record and its kind-specific payload.
+        cursor += RECORD_SIZE + btf_type_extra_size(kind, vlen);
+        type_id += 1;
+    }
+
+    Err(Error::Msg(format!("BTF struct '{}' not found", name)))
+}
+
+/// Find the byte offset of `member_name` inside the struct/union with ID `struct_id`.
+///
+/// Type records are walked from ID 1 until `struct_id` is reached. When that
+/// record's kind_flag (bit 31 of `info`) is set, each member offset word packs a
+/// bitfield size into its top 8 bits, so only the low 24 bits (the bit offset)
+/// are used. The bit offset is then truncated to whole bytes.
+/// Errors when the ID is not a struct/union or has no member of that name.
+#[cfg(target_arch = "x86_64")]
+fn find_btf_member_offset(
+    type_section: &[u8],
+    str_section: &[u8],
+    struct_id: u32,
+    member_name: &str,
+) -> Result<u64> {
+    // Navigate to the struct type entry
+    let mut offset = 0;
+    let mut current_id = 1u32;
+
+    while offset + 12 <= type_section.len() && current_id <= struct_id {
+        let info = u32::from_le_bytes([
+            type_section[offset + 4],
+            type_section[offset + 5],
+            type_section[offset + 6],
+            type_section[offset + 7],
+        ]);
+
+        let kind = (info >> 24) & 0x1f;
+        let vlen = info & 0xffff;
+        // With kind_flag (bit 31) set, a member offset word is
+        // `bitfield_size << 24 | bit_offset`; the mask keeps the bit offset only.
+        let offset_mask = if info >> 31 != 0 { 0x00ff_ffff } else { u32::MAX };
+
+        if current_id == struct_id && (kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION) {
+            // Found the struct, now search its members
+            let members_start = offset + 12; // btf_type base size
+            for i in 0..vlen as usize {
+                let member_offset = members_start + i * 12;
+                // A member record running past the section means truncated data
+                let member = match type_section.get(member_offset..member_offset + 12) {
+                    Some(member) => member,
+                    None => break,
+                };
+
+                let mem_name_off =
+                    u32::from_le_bytes([member[0], member[1], member[2], member[3]]) as usize;
+                let raw_offset =
+                    u32::from_le_bytes([member[8], member[9], member[10], member[11]]);
+                let mem_offset_bits = raw_offset & offset_mask;
+
+                if mem_name_off < str_section.len()
+                    && get_btf_string(str_section, mem_name_off) == member_name
+                {
+                    // Convert bits to bytes (normal struct members are byte-aligned)
+                    return Ok((mem_offset_bits / 8) as u64);
+                }
+            }
+            return Err(Error::Msg(format!(
+                "Member '{}' not found in struct",
+                member_name
+            )));
+        }
+
+        offset += 12 + btf_type_extra_size(kind, vlen); // 12 = btf_type base size
+        current_id += 1;
+    }
+
+    Err(Error::Msg(format!("Struct ID {} not found", struct_id)))
+}
+
+/// Return the BTF type ID recorded for `member_name` in the struct/union `struct_id`.
+///
+/// Type records are walked from ID 1 up to `struct_id`; the walk fails when the
+/// section ends first or the record at that ID is neither a STRUCT nor a UNION.
+#[cfg(target_arch = "x86_64")]
+fn find_btf_member_type(
+    type_section: &[u8],
+    str_section: &[u8],
+    struct_id: u32,
+    member_name: &str,
+) -> Result<u32> {
+    const RECORD_SIZE: usize = 12; // size used for both type records and member records
+
+    // Little-endian u32 at byte position `at` of the type section
+    let word = |at: usize| {
+        u32::from_le_bytes([
+            type_section[at],
+            type_section[at + 1],
+            type_section[at + 2],
+            type_section[at + 3],
+        ])
+    };
+
+    // Byte cursor into the type section and the ID of the record under it
+    let mut cursor = 0;
+    let mut current_id = 1u32;
+
+    while cursor + RECORD_SIZE <= type_section.len() && current_id <= struct_id {
+        let info = word(cursor + 4);
+        let kind = (info >> 24) & 0x1f;
+        let vlen = info & 0xffff;
+
+        // Every record that is not the requested struct/union is stepped over.
+        let is_target =
+            current_id == struct_id && (kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION);
+        if !is_target {
+            cursor += RECORD_SIZE + btf_type_extra_size(kind, vlen);
+            current_id += 1;
+            continue;
+        }
+
+        // Member records follow the fixed part; word 0 is the name, word 1 the type.
+        let members_start = cursor + RECORD_SIZE;
+        for index in 0..vlen as usize {
+            let member = members_start + index * RECORD_SIZE;
+            if member + RECORD_SIZE > type_section.len() {
+                break;
+            }
+
+            let name_off = word(member) as usize;
+            let member_type = word(member + 4);
+            if name_off < str_section.len()
+                && get_btf_string(str_section, name_off) == member_name
+            {
+                return Ok(member_type);
+            }
+        }
+        return Err(Error::Msg(format!(
+            "Member '{}' not found in struct",
+            member_name
+        )));
+    }
+
+    Err(Error::Msg(format!("Struct ID {} not found", struct_id)))
+}
+
+/// Return the NUL-terminated string that starts at `offset` in the BTF string section.
+///
+/// An `offset` at or past the end of the section, or bytes that are not valid UTF-8, yield "".
+#[cfg(target_arch = "x86_64")]
+fn get_btf_string(str_section: &[u8], offset: usize) -> &str {
+    let tail = str_section.get(offset..).unwrap_or(&[]);
+    // Without a terminator the string runs to the end of the section.
+    let text = tail.split(|&byte| byte == 0).next().unwrap_or(tail);
+    std::str::from_utf8(text).unwrap_or("")
+}
+
+/// Return the hardcoded TPBASE offset for the target architecture, logging a warning.
+///
+/// This is a guess, not a measurement: it is the compile-time default constant
+/// for the architecture. Each supported branch emits a `warn!` before returning;
+/// architectures other than x86_64 and aarch64 get an error instead.
+fn get_default_tpbase_offset() -> Result<u64> {
+    #[cfg(target_arch = "x86_64")]
+    {
+        // Common offsets for x86_64 kernels:
+        // - kernel 5.x: typically around 0x1940-0x1A00 (6464-6656)
+        // - kernel 6.x: typically around 0x1940-0x1A80 (6464-6784)
+        //
+        // The exact offset depends on CONFIG_* options, especially:
+        // - CONFIG_KASAN
+        // - CONFIG_MEMCG
+        // - CONFIG_CGROUPS
+        // - CONFIG_BPF_SYSCALL
+        //
+        // A single common value is used; it can be wrong for a given kernel build
+        let default_offset = DEFAULT_X86_64_TPBASE_OFFSET;
+
+        warn!(
+            "Using default TPBASE offset {:#x} ({}). This may not be accurate for all kernels.",
+            default_offset, default_offset
+        );
+        Ok(default_offset)
+    }
+
+    #[cfg(target_arch = "aarch64")]
+    {
+        // For ARM64: task_struct.thread.uw.tp_value
+        // NOTE: the arm64 constant is documented as a placeholder that still needs verification
+        let default_offset = DEFAULT_AARCH64_TPBASE_OFFSET;
+
+        warn!(
+            "Using default TPBASE offset {:#x} for arm64. This may not be accurate.",
+            default_offset
+        );
+        Ok(default_offset)
+    }
+
+    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+    {
+        Err(Error::Msg(
+            "Default TPBASE offset not available for this architecture".to_string(),
+        ))
+    }
+}
+
+/// C-callable entry point: returns the TPBASE offset, or a fallback value when extraction fails.
+#[no_mangle]
+pub extern "C" fn read_tpbase_offset() -> i64 {
+    match extract_tpbase_offset() {
+        Ok(offset) => offset as i64,
+        Err(e) => {
+            warn!("Failed to extract TPBASE offset: {}", e);
+            // On x86_64/aarch64 the architecture default is returned instead of -1 (per the original author, so Python profiling keeps working)
+            #[cfg(target_arch = "x86_64")]
+            {
+                let default = DEFAULT_X86_64_TPBASE_OFFSET as i64;
+                warn!("Using fallback TPBASE offset: {:#x}", default);
+                return default;
+            }
+            #[cfg(target_arch = "aarch64")]
+            {
+                let default = DEFAULT_AARCH64_TPBASE_OFFSET as i64;
+                warn!("Using fallback TPBASE offset: {:#x}", default);
+                return default;
+            }
+            #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+            {
+                -1 // unsupported architecture: no default offset exists
+            }
+        }
+    }
+}
+
+// Also gated on x86_64: the analyzer under test is only compiled for that target.
+#[cfg(all(test, target_arch = "x86_64"))]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_analyze_fsbase_write_task_x86() {
+        // Test pattern: mov %rsi, 0x1234(%rdi) = 48 89 b7 34 12 00 00
+        let code = [0x48, 0x89, 0xb7, 0x34, 0x12, 0x00, 0x00];
+        let result = analyze_fsbase_write_task_x86(&code);
+        assert_eq!(result, Some(0x1234));
+    }
+
+    #[test]
+    fn test_analyze_fsbase_write_task_x86_with_prefix() {
+        // Test with some prefix instructions
+        let code = [
+            0x55, // push rbp
+            0x48, 0x89, 0xe5, // mov rbp, rsp
+            0x48, 0x89, 0xb7, 0x78, 0x19, 0x00, 0x00, // mov %rsi, 0x1978(%rdi)
+            0x5d, // pop rbp
+            0xc3, // ret
+        ];
+        let result = analyze_fsbase_write_task_x86(&code);
+        assert_eq!(result, Some(0x1978));
+    }
+}
diff --git a/agent/crates/trace-utils/src/unwind/tsd.rs b/agent/crates/trace-utils/src/unwind/tsd.rs
new file mode 100644
index 00000000000..7c88269074f
--- /dev/null
+++ b/agent/crates/trace-utils/src/unwind/tsd.rs
@@ -0,0 +1,864 @@
+/*
+ * Copyright (c) 2024 Yunshan Networks
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//! Thread Specific Data (TSD) extraction module
+//!
+//! This module analyzes the C library's pthread_getspecific function to extract
+//! the parameters needed to access thread-specific data from eBPF.
+//!
+//! The TSD info includes:
+//! - offset: Offset from thread pointer base to TSD storage
+//! - multiplier: Size of each TSD entry (8 for musl, 16 for glibc)
+//! - indirect: Whether indirect addressing is needed (1 for musl, 0 for glibc)
+//!
+//! C library implementations:
+//! - musl: pthread->tsd[key] (indirect, multiplier=8)
+//! - glibc: pthread->specific_1stblock[key].data (direct, multiplier=16)
+
+use std::cell::OnceCell;
+use std::fs;
+use std::path::PathBuf;
+
+#[cfg(target_arch = "x86_64")]
+use ahash::AHashMap;
+use log::trace;
+use object::{Object, ObjectSection, ObjectSymbol};
+use regex::Regex;
+
+use crate::error::{Error, Result};
+use crate::maps::{get_memory_mappings, MemoryArea};
+
+use super::python::TSDInfo;
+
+#[cfg(target_arch = "x86_64")]
+use iced_x86::{Decoder, DecoderOptions, Instruction, Mnemonic, OpKind, Register};
+
+// TSD constants for different C library implementations.
+// These are used by the pattern-based x86_64 extractor and by
+// `get_default_tsd_info` when the code analysis cannot be performed.
+/// glibc: each TSD entry is struct pthread_key_data { uintptr_t seq; void *data; } = 16 bytes
+const GLIBC_TSD_MULTIPLIER: u8 = 16;
+/// musl: each TSD entry is a pointer = 8 bytes
+const MUSL_TSD_MULTIPLIER: u8 = 8;
+
+/// glibc: TSD is inline in pthread struct (no indirection)
+const GLIBC_TSD_INDIRECT: u8 = 0;
+/// musl: pthread->tsd is a pointer that needs dereferencing
+const MUSL_TSD_INDIRECT: u8 = 1;
+
+/// Default glibc TSD offset (pthread->specific_1stblock + 8, for data field)
+/// NOTE(review): 0x318 presumably matches common x86_64 glibc builds — confirm per target.
+const GLIBC_TSD_DEFAULT_OFFSET: i16 = 0x318;
+/// Default musl TSD offset (pthread->tsd)
+/// NOTE(review): 128 presumably matches common musl builds — confirm per target.
+const MUSL_TSD_DEFAULT_OFFSET: i16 = 128;
+
+thread_local! {
+    // Per-thread, lazily compiled matcher for libc / libpthread / musl loader file names.
+    static LIBC_REGEX: OnceCell<Regex> = OnceCell::new();
+}
+
+/// Returns `true` when `path` looks like a shared object that may contain the
+/// pthread implementation (`libc`, `libpthread` or the musl `ld-musl` loader).
+pub fn is_potential_tsd_dso(path: &str) -> bool {
+    LIBC_REGEX.with(|cell| {
+        // The pattern is compiled on first use in each thread and reused afterwards.
+        let matcher = cell.get_or_init(|| {
+            Regex::new(r".*/(ld-musl|libc|libpthread)([-.].*)?\.so").unwrap()
+        });
+        matcher.is_match(path)
+    })
+}
+
+/// Locate the mapping of the C library that provides `pthread_getspecific` for `pid`.
+///
+/// All mappings accepted by `is_potential_tsd_dso` are candidates; among them the
+/// selection order is `libc.so*`, then `libpthread*`, then the musl names, and
+/// finally the first candidate. Fails when the process maps no candidate at all.
+fn find_libc_dso(pid: u32) -> Result<MemoryArea> {
+    let mappings = get_memory_mappings(pid)?;
+
+    let candidates: Vec<&MemoryArea> = mappings
+        .iter()
+        .filter(|area| is_potential_tsd_dso(&area.path))
+        .collect();
+
+    let Some(&first) = candidates.first() else {
+        return Err(Error::Msg(format!(
+            "No libc/pthread DSO found for process {}",
+            pid
+        )));
+    };
+
+    // Preference order, highest first:
+    //  1. libc.so.6       - modern glibc (2.34+) keeps the pthread code in libc itself
+    //  2. libpthread.so.0 - older glibc
+    //  3. musl libc variants
+    let preferences: [fn(&str) -> bool; 3] = [
+        |p: &str| p.contains("libc.so"),
+        |p: &str| p.contains("libpthread"),
+        |p: &str| p.contains("libc.musl") || p.contains("ld-musl"),
+    ];
+
+    let chosen = preferences
+        .iter()
+        .find_map(|accepts| candidates.iter().copied().find(|c| accepts(&c.path)))
+        .unwrap_or(first);
+
+    Ok(chosen.clone())
+}
+
+/// Fetch the machine code of `pthread_getspecific` for the given DSO.
+///
+/// The on-disk file is tried first (works for host processes); if that fails for
+/// any reason, the bytes are read from the process memory instead (works for
+/// container processes). Only the error of the memory fallback is reported.
+fn read_pthread_getspecific_code(pid: u32, dso: &MemoryArea) -> Result<Vec<u8>> {
+    read_pthread_getspecific_from_file(pid, dso)
+        .or_else(|_| read_pthread_getspecific_from_memory(pid, dso))
+}
+
+/// Read pthread_getspecific function code from file system
+///
+/// The DSO is opened through `/proc/{pid}/root` so that the path is resolved
+/// inside the mount namespace of the target process.
+///
+/// # Errors
+/// Fails when the file cannot be read or parsed, or when the symbol cannot be
+/// located in it.
+fn read_pthread_getspecific_from_file(pid: u32, dso: &MemoryArea) -> Result<Vec<u8>> {
+    let base: PathBuf = ["/proc", &pid.to_string(), "root"].iter().collect();
+    // Strip only a real leading '/', so the path stays below `base` when joined.
+    // Slicing with `[1..]` would panic on an empty path and would drop a
+    // significant character from a path that is not absolute.
+    let relative = dso.path.strip_prefix('/').unwrap_or(&dso.path);
+    let path = base.join(relative);
+
+    let data = fs::read(&path)
+        .map_err(|e| Error::Msg(format!("Cannot read DSO file {}: {}", path.display(), e)))?;
+    let obj = object::File::parse(&*data)?;
+
+    read_symbol_code_from_elf(&obj, dso)
+}
+
+/// Read pthread_getspecific function code from process memory
+///
+/// The symbol offset is looked up in the DSO file, added to the start of the
+/// mapping and up to 256 bytes are read from `/proc/{pid}/mem` at that address.
+/// Fewer bytes are returned when the readable region ends earlier.
+///
+/// # Errors
+/// Fails when the symbol offset is unknown, the address computation overflows,
+/// `/proc/{pid}/mem` cannot be opened or positioned, or not a single byte can be
+/// read.
+fn read_pthread_getspecific_from_memory(pid: u32, dso: &MemoryArea) -> Result<Vec<u8>> {
+    use std::fs::File;
+    use std::io::{ErrorKind, Read, Seek, SeekFrom};
+
+    /// Upper bound of code bytes handed to the instruction analysis.
+    const MAX_CODE_LEN: usize = 256;
+
+    // First, get symbol offset from the DSO file (try multiple paths)
+    let symbol_offset = get_pthread_getspecific_offset(pid, dso)?;
+
+    // Calculate actual memory address.
+    // NOTE(review): this assumes the symbol address is relative to `m_start`,
+    // i.e. that this mapping starts at the DSO's load base - confirm against
+    // how `MemoryArea` is populated.
+    let mem_addr = dso.m_start.checked_add(symbol_offset).ok_or_else(|| {
+        Error::Msg(format!(
+            "Address overflow computing pthread_getspecific location in {}",
+            dso.path
+        ))
+    })?;
+
+    // Read from /proc/{pid}/mem
+    let mem_path = format!("/proc/{}/mem", pid);
+    let mut mem_file = File::open(&mem_path)
+        .map_err(|e| Error::Msg(format!("Cannot open {}: {}", mem_path, e)))?;
+
+    mem_file
+        .seek(SeekFrom::Start(mem_addr))
+        .map_err(|e| Error::Msg(format!("Cannot seek in {}: {}", mem_path, e)))?;
+
+    // Tolerate short reads: the function may sit close to the end of a mapping,
+    // in which case a strict `read_exact` would fail even though enough bytes
+    // for the analysis are available.
+    let mut code = vec![0u8; MAX_CODE_LEN];
+    let mut filled = 0;
+    while filled < code.len() {
+        match mem_file.read(&mut code[filled..]) {
+            Ok(0) => break,
+            Ok(n) => filled += n,
+            Err(e) if e.kind() == ErrorKind::Interrupted => continue,
+            Err(e) if filled == 0 => {
+                return Err(Error::Msg(format!(
+                    "Cannot read from {}: {}",
+                    mem_path, e
+                )));
+            }
+            // Keep what was read before hitting the end of the readable region.
+            Err(_) => break,
+        }
+    }
+    if filled == 0 {
+        return Err(Error::Msg(format!(
+            "Cannot read from {}: no data at {:#x}",
+            mem_path, mem_addr
+        )));
+    }
+    code.truncate(filled);
+
+    trace!(
+        "Read pthread_getspecific from process memory at {:#x}",
+        mem_addr
+    );
+
+    Ok(code)
+}
+
+/// Resolve the address of `pthread_getspecific` inside the DSO file.
+///
+/// Two locations are probed in order: the DSO path below `/proc/{pid}/root`,
+/// then the same path taken as-is. The first file that can be read, parsed and
+/// contains the symbol decides the result.
+fn get_pthread_getspecific_offset(pid: u32, dso: &MemoryArea) -> Result<u64> {
+    // Memory mappings may carry a " (deleted)" suffix; drop it before touching the file system.
+    let clean_path = match dso.path.strip_suffix(" (deleted)") {
+        Some(stripped) => stripped,
+        None => &dso.path,
+    };
+
+    // dso.path usually starts with '/'; remove it so the join stays below the proc root.
+    let relative = clean_path.strip_prefix('/').unwrap_or(clean_path);
+    let proc_root = PathBuf::from("/proc").join(pid.to_string()).join("root");
+
+    let probes = [
+        // 1. /proc/{pid}/root/{path}
+        proc_root.join(relative),
+        // 2. The path directly (for container overlay fs)
+        PathBuf::from(clean_path),
+    ];
+
+    for probe in &probes {
+        let Ok(data) = fs::read(probe) else {
+            continue;
+        };
+        let Ok(obj) = object::File::parse(&*data) else {
+            continue;
+        };
+        if let Some(offset) = find_pthread_getspecific_offset(&obj) {
+            return Ok(offset);
+        }
+    }
+
+    // Note: Do not fallback to host libc paths here.
+    // If we are in a container, the host libc likely has different offsets.
+    // Reading incorrect offsets causes us to read garbage from process memory,
+    // leading to failures in extract_tsd_info.
+
+    Err(Error::Msg(format!(
+        "Cannot find pthread_getspecific offset in DSO: {}",
+        dso.path
+    )))
+}
+
+/// Find pthread_getspecific symbol offset in an ELF file
+///
+/// Both the regular and the dynamic symbol table are searched, first for
+/// `__pthread_getspecific` and then for `pthread_getspecific`. Undefined
+/// (imported) symbols are skipped: their address is not a location inside this
+/// object, so using it would point the caller at unrelated bytes.
+///
+/// Returns the symbol address as recorded in the ELF file, or `None` when no
+/// defined symbol with either name exists.
+fn find_pthread_getspecific_offset(obj: &object::File) -> Option<u64> {
+    let symbol_names = ["__pthread_getspecific", "pthread_getspecific"];
+
+    symbol_names.iter().find_map(|&name| {
+        obj.symbols()
+            .chain(obj.dynamic_symbols())
+            .find(|sym| !sym.is_undefined() && sym.name().map_or(false, |n| n == name))
+            .map(|sym| sym.address())
+    })
+}
+
+/// Read symbol code from an ELF file object
+///
+/// Looks up `__pthread_getspecific` / `pthread_getspecific` (in that order) in the
+/// regular and dynamic symbol tables and returns the bytes of the containing
+/// section starting at the symbol, at least 256 bytes unless the section ends
+/// earlier.
+///
+/// Undefined (imported) symbols are ignored: they have no code in this object
+/// and their zero address could otherwise match an unrelated section.
+///
+/// # Errors
+/// Fails when no defined symbol is backed by section data, or when the section
+/// data cannot be read.
+fn read_symbol_code_from_elf(obj: &object::File, _dso: &MemoryArea) -> Result<Vec<u8>> {
+    // Try both glibc and musl symbol names
+    let symbol_names = ["__pthread_getspecific", "pthread_getspecific"];
+
+    for name in symbol_names {
+        let Some(sym) = obj
+            .symbols()
+            .chain(obj.dynamic_symbols())
+            .find(|s| !s.is_undefined() && s.name().map(|n| n == name).unwrap_or(false))
+        else {
+            continue;
+        };
+
+        let addr = sym.address();
+        let size = sym.size().max(256) as usize; // Read at least 256 bytes
+
+        // Find the section whose file-backed range contains the symbol address.
+        let Some(section) = obj.sections().find(|s| {
+            let (_start, len) = s.file_range().unwrap_or((0, 0));
+            addr >= s.address() && addr - s.address() < len
+        }) else {
+            continue;
+        };
+
+        let section_data = section.data()?;
+        let offset = (addr - section.address()) as usize;
+        let end = offset.saturating_add(size).min(section_data.len());
+
+        // Checked slicing: skip the symbol instead of panicking if the section
+        // provides fewer bytes than its reported file range.
+        let Some(code) = section_data.get(offset..end) else {
+            continue;
+        };
+
+        trace!(
+            "Found {} at {:#x}, reading {} bytes",
+            name,
+            addr,
+            code.len()
+        );
+        return Ok(code.to_vec());
+    }
+
+    Err(Error::Msg(
+        "pthread_getspecific symbol not found in DSO".to_string(),
+    ))
+}
+
+/// Derive the TSD parameters from x86_64 `pthread_getspecific` machine code.
+///
+/// The disassembly-based analysis is preferred; the byte-pattern matcher is only
+/// consulted when that analysis yields nothing.
+#[cfg(target_arch = "x86_64")]
+fn extract_tsd_info_x86(code: &[u8]) -> Result<TSDInfo> {
+    match decode_tsd_info_with_disasm(code) {
+        Some(info) => Ok(info),
+        None => legacy_extract_tsd_info_x86(code),
+    }
+}
+
+/// Legacy pattern-based extractor kept as a fallback for unknown sequences.
+///
+/// Scans the raw bytes for the instruction encodings used by known
+/// `pthread_getspecific` implementations:
+///
+/// * musl (indirect): code starts with `mov %fs:0x0, %rax`, followed by
+///   `mov disp32(%rax), %rax`; `disp32` is the offset of the `tsd` pointer.
+/// * glibc: code contains `mov %fs:0x10, %rax` and either a scaled-index load
+///   `mov disp32(base,index,8), %rax` (offset is `disp32 + 8`) or a plain
+///   `mov disp32(%rax), %rax` with a displacement in `(0x100, 0x2000)`.
+///
+/// Displacements that do not fit the `i16` offset of `TSDInfo` are skipped
+/// rather than silently truncated.
+///
+/// # Errors
+/// Returns an error containing a dump of `code` when no pattern matches.
+#[cfg(target_arch = "x86_64")]
+fn legacy_extract_tsd_info_x86(code: &[u8]) -> Result<TSDInfo> {
+    /// `mov %fs:0x0, %rax`
+    const MOV_FS_0_RAX: [u8; 9] = [0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00];
+    /// `mov %fs:0x10, %rax`
+    const MOV_FS_10_RAX: [u8; 9] = [0x64, 0x48, 0x8b, 0x04, 0x25, 0x10, 0x00, 0x00, 0x00];
+    /// Opcode bytes of `mov disp32(%rax), %rax`
+    const MOV_DISP32_RAX: [u8; 3] = [0x48, 0x8b, 0x80];
+    /// Opcode bytes of `mov disp32(base,index,scale), %rax` (a SIB byte follows)
+    const MOV_SIB_DISP32_RAX: [u8; 3] = [0x48, 0x8b, 0x84];
+
+    /// Decodes the little-endian 32-bit displacement stored in `bytes[0..4]`.
+    fn disp32(bytes: &[u8]) -> i32 {
+        i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])
+    }
+
+    /// Narrows a displacement to `i16`, rejecting values that do not fit.
+    fn narrow(offset: i32) -> Option<i16> {
+        if offset >= i32::from(i16::MIN) && offset <= i32::from(i16::MAX) {
+            Some(offset as i16)
+        } else {
+            None
+        }
+    }
+
+    // musl pattern (indirect):
+    //   mov %fs:0x0, %rax       ; get pthread struct pointer
+    //   mov offset(%rax), %rax  ; load tsd pointer (indirect)
+    //   mov (%rax,%rdi,8), %rax ; return tsd[key]
+    if code.starts_with(&MOV_FS_0_RAX) {
+        // Start looking right after the 9-byte %fs load.
+        for window in code.windows(7).skip(MOV_FS_0_RAX.len()) {
+            if !window.starts_with(&MOV_DISP32_RAX) {
+                continue;
+            }
+            let offset = disp32(&window[3..]);
+            let Some(narrowed) = narrow(offset) else {
+                continue;
+            };
+            trace!("Found musl TSD pattern: offset={}", offset);
+            return Ok(TSDInfo {
+                offset: narrowed,
+                multiplier: MUSL_TSD_MULTIPLIER,
+                indirect: MUSL_TSD_INDIRECT,
+            });
+        }
+    }
+
+    // Check for glibc pattern (fs:0x10) anywhere in the code
+    if code.windows(9).any(|w| w == MOV_FS_10_RAX) {
+        // Strategy 1: Look for SIB addressing (older glibc)
+        // mov offset(%rax,%rdi,8), %rax
+        for window in code.windows(8) {
+            if !window.starts_with(&MOV_SIB_DISP32_RAX) {
+                continue;
+            }
+            // The two top bits of the SIB byte encode the scale as a power of
+            // two (1, 2, 4 or 8); only the 8-byte scale is relevant here.
+            let scale = 1 << (window[3] >> 6);
+            if scale != 8 {
+                continue;
+            }
+            let offset = disp32(&window[4..]);
+            // +8 selects the `data` member of the entry; guard the addition
+            // because the displacement comes from unvalidated bytes.
+            let Some(data_offset) = offset.checked_add(8).and_then(narrow) else {
+                continue;
+            };
+            trace!(
+                "Found glibc SIB pattern: offset={}, scale={}",
+                offset,
+                scale
+            );
+            return Ok(TSDInfo {
+                offset: data_offset,
+                multiplier: GLIBC_TSD_MULTIPLIER,
+                indirect: GLIBC_TSD_INDIRECT,
+            });
+        }
+
+        // Strategy 2: Look for split instruction pattern (modern glibc)
+        // 1. mov %fs:0x10, %rax
+        // 2. ... (shl/add instructions)
+        // 3. mov offset(%rax), %rax
+        for window in code.windows(7) {
+            // mov offset(%rax), %rax -> 48 8b 80 XX XX XX XX
+            if !window.starts_with(&MOV_DISP32_RAX) {
+                continue;
+            }
+            let offset = disp32(&window[3..]);
+
+            // Heuristic check for reasonable TSD offset (always fits in i16)
+            if offset > 0x100 && offset < 0x2000 {
+                trace!("Found glibc split pattern: offset={}", offset);
+                return Ok(TSDInfo {
+                    offset: offset as i16,
+                    multiplier: GLIBC_TSD_MULTIPLIER,
+                    indirect: GLIBC_TSD_INDIRECT,
+                });
+            }
+        }
+    }
+
+    Err(Error::Msg(format!(
+        "Could not extract TSD info from x86_64 code (len={}). Dump: {:02x?}",
+        code.len(),
+        code
+    )))
+}
+
+/// ARM64 counterpart of the TSD extraction; currently a placeholder.
+///
+/// The code bytes are not inspected: a fixed musl-like description with a zero
+/// offset is always returned.
+#[cfg(target_arch = "aarch64")]
+fn extract_tsd_info_arm64(_code: &[u8]) -> Result<TSDInfo> {
+    // TODO: Implement ARM64 TSD extraction
+    // Until then, report default musl-like parameters.
+    let placeholder = TSDInfo {
+        offset: 0,
+        multiplier: MUSL_TSD_MULTIPLIER,
+        indirect: MUSL_TSD_INDIRECT,
+    };
+    Ok(placeholder)
+}
+
+/// Determine the TSD access parameters for the process `pid`.
+///
+/// Locates the C library mapped by the process, fetches the code of its
+/// `pthread_getspecific` and passes it to the architecture-specific analysis.
+/// On architectures other than x86_64 and aarch64 an error is returned.
+pub fn extract_tsd_info(pid: u32) -> Result<TSDInfo> {
+    let libc = find_libc_dso(pid)?;
+    trace!("Found libc DSO for process {}: {}", pid, libc.path);
+
+    let machine_code = read_pthread_getspecific_code(pid, &libc)?;
+    trace!(
+        "Read {} bytes of pthread_getspecific code",
+        machine_code.len()
+    );
+
+    // Exactly one of the following statements is compiled in.
+    #[cfg(target_arch = "x86_64")]
+    return extract_tsd_info_x86(&machine_code);
+
+    #[cfg(target_arch = "aarch64")]
+    return extract_tsd_info_arm64(&machine_code);
+
+    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
+    return Err(Error::Msg(
+        "TSD extraction not supported on this architecture".to_string(),
+    ));
+}
+
+/// Symbolic value tracked per register while walking the instructions of
+/// `pthread_getspecific`.
+#[cfg(target_arch = "x86_64")]
+#[derive(Clone, Debug)]
+enum Expr {
+    /// Value the analysis could not model.
+    Unknown,
+    /// Known integer constant (immediate or memory displacement).
+    Const(i64),
+    /// The TSD key argument; `RDI` is seeded with it before decoding starts.
+    Key,
+    /// Address formed by an `%fs`-prefixed memory operand without base register;
+    /// the payload is the operand's displacement.
+    Fs(u64),
+    /// Sum of two sub-expressions.
+    Add(Box<Expr>, Box<Expr>),
+    /// Sub-expression multiplied by a constant factor (shift, `imul` or index scale).
+    Mul(Box<Expr>, i64),
+    /// Value loaded from the memory address described by the sub-expression.
+    Deref(Box<Expr>),
+}
+
+/// The `%fs`-relative part of a linear address expression, as recognised by
+/// `linear_components_optional_base`.
+#[cfg(target_arch = "x86_64")]
+#[derive(Clone, Copy, Debug)]
+enum BaseComponent {
+    /// The `%fs`-relative address itself; the payload is the displacement.
+    Fs(u64),
+    /// A pointer loaded through `%fs`: the value read from
+    /// `%fs:disp` plus the constant `offset`.
+    FsDeref { disp: u64, offset: i64 },
+}
+
+/// Symbolically execute `code` up to the first `ret` and interpret the value
+/// left in `RAX` as a TSD access.
+///
+/// `RDI` starts out as the key argument; `RAX`, `RBX`, `RCX`, `RDX` and `RSI`
+/// start out unknown. Only `mov`, `lea`, `add`, `shl` and `imul` update the
+/// register state; every other instruction is ignored. Returns `None` when no
+/// `ret` is decoded or the final expression matches no known layout.
+#[cfg(target_arch = "x86_64")]
+fn decode_tsd_info_with_disasm(code: &[u8]) -> Option<TSDInfo> {
+    let mut regs: AHashMap<Register, Expr> = [
+        Register::RAX,
+        Register::RBX,
+        Register::RCX,
+        Register::RDX,
+        Register::RSI,
+    ]
+    .iter()
+    .map(|&reg| (reg, Expr::Unknown))
+    .collect();
+    regs.insert(Register::RDI, Expr::Key);
+
+    for instr in Decoder::with_ip(64, code, 0, DecoderOptions::NONE) {
+        match instr.mnemonic() {
+            // The return value lives in RAX: classify it and stop.
+            Mnemonic::Ret => return match_tsd_expr(regs.get(&Register::RAX)?),
+            Mnemonic::Mov => handle_mov(&instr, &mut regs),
+            Mnemonic::Lea => handle_lea(&instr, &mut regs),
+            Mnemonic::Add => handle_add(&instr, &mut regs),
+            Mnemonic::Shl => handle_shl(&instr, &mut regs),
+            Mnemonic::Imul => handle_imul(&instr, &mut regs),
+            _ => {}
+        }
+    }
+
+    // Ran out of bytes without seeing a `ret`.
+    None
+}
+
+/// Classify a return-value expression, trying the glibc layout before the musl one.
+#[cfg(target_arch = "x86_64")]
+fn match_tsd_expr(expr: &Expr) -> Option<TSDInfo> {
+    if let Some(info) = match_glibc(expr) {
+        return Some(info);
+    }
+    match_musl(expr)
+}
+
+/// Recognise the glibc access shape: a load from
+/// `<thread pointer via %fs:0x10> + key * multiplier + offset`.
+///
+/// Returns `None` when `expr` is not a load, its address is not linear in an
+/// `%fs:0x10` base, the key does not take part, or the multiplier / offset do
+/// not fit the `u8` / `i16` fields of `TSDInfo` (they are never truncated).
+#[cfg(target_arch = "x86_64")]
+fn match_glibc(expr: &Expr) -> Option<TSDInfo> {
+    let Expr::Deref(inner) = expr else {
+        return None;
+    };
+    let (base, key_coeff, offset) = linear_components(inner)?;
+    let (fs_disp, base_offset) = match base {
+        BaseComponent::Fs(fs_disp) => (fs_disp, 0),
+        BaseComponent::FsDeref { disp, offset } => (disp, offset),
+    };
+    if fs_disp != 0x10 {
+        return None;
+    }
+    // The key must contribute with a positive coefficient that fits in `u8`;
+    // a plain `as u8` cast would wrap e.g. 256 to 0 or -8 to 248.
+    if !(1..=i64::from(u8::MAX)).contains(&key_coeff) {
+        return None;
+    }
+    let total_offset = offset.checked_add(base_offset)?;
+    if !(i64::from(i16::MIN)..=i64::from(i16::MAX)).contains(&total_offset) {
+        return None;
+    }
+    Some(TSDInfo {
+        offset: total_offset as i16,
+        multiplier: key_coeff as u8,
+        indirect: GLIBC_TSD_INDIRECT,
+    })
+}
+
+/// Recognise the musl access shape: a load from
+/// `*(<thread pointer via %fs:0> + offset) + key * multiplier`.
+///
+/// The address of the outer load is split into its summands. One summand must
+/// be a load whose own address is an `%fs:0` base plus a constant (the `tsd`
+/// pointer); the remaining summands provide the key coefficient and any extra
+/// constant.
+///
+/// Returns `None` when the shape does not match or when the multiplier / offset
+/// do not fit the `u8` / `i16` fields of `TSDInfo` (they are never truncated).
+#[cfg(target_arch = "x86_64")]
+fn match_musl(expr: &Expr) -> Option<TSDInfo> {
+    let Expr::Deref(inner) = expr else {
+        return None;
+    };
+    let mut parts = Vec::new();
+    flatten_add(inner, &mut parts);
+
+    let mut base = None;
+    let mut key_coeff = 0i64;
+    let mut offset = 0i64;
+
+    for part in parts {
+        match part {
+            // Candidate for the `tsd` pointer load: its address must not depend on the key.
+            Expr::Deref(addr) => {
+                if let Some((base_comp, 0, extra)) = linear_components(addr) {
+                    match base_comp {
+                        BaseComponent::Fs(0) => base = Some(extra),
+                        BaseComponent::FsDeref { disp: 0, offset: off } => {
+                            base = Some(off.checked_add(extra)?)
+                        }
+                        _ => {}
+                    }
+                }
+            }
+            // Everything else may only contribute key multiples and constants.
+            _ => {
+                if let Some((None, kc, off)) = linear_components_optional_base(part) {
+                    key_coeff = key_coeff.checked_add(kc)?;
+                    offset = offset.checked_add(off)?;
+                }
+            }
+        }
+    }
+
+    let base_off = base?;
+    // The key must contribute with a positive coefficient that fits in `u8`;
+    // a plain `as u8` cast would wrap e.g. 256 to 0 or -8 to 248.
+    if !(1..=i64::from(u8::MAX)).contains(&key_coeff) {
+        return None;
+    }
+    let total_offset = base_off.checked_add(offset)?;
+    if !(i64::from(i16::MIN)..=i64::from(i16::MAX)).contains(&total_offset) {
+        return None;
+    }
+    Some(TSDInfo {
+        offset: total_offset as i16,
+        multiplier: key_coeff as u8,
+        indirect: MUSL_TSD_INDIRECT,
+    })
+}
+
+/// Append the summands of a (possibly nested) `Expr::Add` tree to `parts`,
+/// left to right. A non-`Add` expression is appended as a single summand.
+#[cfg(target_arch = "x86_64")]
+fn flatten_add<'a>(expr: &'a Expr, parts: &mut Vec<&'a Expr>) {
+    // Explicit work stack instead of recursion; the right operand is pushed
+    // first so that the left operand is visited first.
+    let mut pending: Vec<&'a Expr> = vec![expr];
+    while let Some(node) = pending.pop() {
+        match node {
+            Expr::Add(lhs, rhs) => {
+                pending.push(rhs);
+                pending.push(lhs);
+            }
+            leaf => parts.push(leaf),
+        }
+    }
+}
+
+/// Like `linear_components_optional_base`, but additionally requires that a
+/// base component is present. Returns `(base, key coefficient, constant offset)`.
+#[cfg(target_arch = "x86_64")]
+fn linear_components(expr: &Expr) -> Option<(BaseComponent, i64, i64)> {
+    match linear_components_optional_base(expr)? {
+        (Some(base), key, offset) => Some((base, key, offset)),
+        (None, _, _) => None,
+    }
+}
+
+/// Decompose `expr` into `base + key_coeff * key + offset`.
+///
+/// Returns `(base, key_coeff, offset)` where `base` is the optional
+/// `%fs`-relative component, `key_coeff` the accumulated multiplier of
+/// `Expr::Key` and `offset` the accumulated constant. Returns `None` when the
+/// expression has a shape the walk rejects: more than one base, a scaled base,
+/// or a load whose address is not a plain `%fs` base plus constant.
+///
+/// NOTE(review): `Expr::Unknown` is accepted and contributes nothing, so an
+/// unmodelled term is effectively treated as zero — confirm this is intended.
+#[cfg(target_arch = "x86_64")]
+fn linear_components_optional_base(expr: &Expr) -> Option<(Option<BaseComponent>, i64, i64)> {
+    let mut base = None;
+    let mut key_coeff = 0i64;
+    let mut offset = 0i64;
+
+    // Recursive accumulator; returns `false` as soon as an unsupported shape is met.
+    fn walk(
+        expr: &Expr,
+        base: &mut Option<BaseComponent>,
+        key_coeff: &mut i64,
+        offset: &mut i64,
+    ) -> bool {
+        match expr {
+            // Both operands accumulate into the same totals.
+            Expr::Add(a, b) => walk(a, base, key_coeff, offset) && walk(b, base, key_coeff, offset),
+            Expr::Const(c) => {
+                *offset += *c;
+                true
+            }
+            Expr::Key => {
+                *key_coeff += 1;
+                true
+            }
+            Expr::Mul(inner, factor) => {
+                // Evaluate the operand in isolation, then scale its key
+                // coefficient and constant by the factor.
+                let mut inner_base = None;
+                let mut inner_key = 0;
+                let mut inner_off = 0;
+                if !walk(inner, &mut inner_base, &mut inner_key, &mut inner_off) {
+                    return false;
+                }
+                if inner_base.is_some() {
+                    // Do not try to scale base pointer; treat as unsupported
+                    return false;
+                }
+                *key_coeff += inner_key * *factor;
+                *offset += inner_off * *factor;
+                true
+            }
+            Expr::Fs(disp) => {
+                // Only a single base component is allowed.
+                if base.is_some() {
+                    return false;
+                }
+                *base = Some(BaseComponent::Fs(*disp));
+                true
+            }
+            Expr::Deref(inner) => {
+                // Allow one level of deref for musl base
+                let (inner_base, inner_key, inner_off) =
+                    match linear_components_optional_base(inner) {
+                        Some(v) => v,
+                        None => return false,
+                    };
+                // The loaded address must be key-independent and %fs-based.
+                if inner_key != 0 || inner_base.is_none() {
+                    return false;
+                }
+                if base.is_some() {
+                    return false;
+                }
+                // A base that is itself a loaded pointer (second deref level) is rejected.
+                let Some(BaseComponent::Fs(disp)) = inner_base else {
+                    return false;
+                };
+                *base = Some(BaseComponent::FsDeref {
+                    disp,
+                    offset: inner_off,
+                });
+                true
+            }
+            // Unmodelled terms are accepted without contributing anything.
+            Expr::Unknown => true,
+        }
+    }
+
+    if walk(expr, &mut base, &mut key_coeff, &mut offset) {
+        Some((base, key_coeff, offset))
+    } else {
+        None
+    }
+}
+
+/// Model `mov dst, src` for a register destination.
+///
+/// The destination (canonicalised to its 64-bit register) receives a copy of
+/// the source register's expression, a constant for immediate sources, or a
+/// `Deref` of the operand address for memory sources. Any other source kind
+/// makes the destination `Unknown`. Stores to memory are ignored.
+///
+/// `mov r64, imm32` is reported by the decoder as `OpKind::Immediate32to64`
+/// (sign-extended immediate) and is handled explicitly so the constant is not
+/// lost.
+#[cfg(target_arch = "x86_64")]
+fn handle_mov(instr: &Instruction, regs: &mut AHashMap<Register, Expr>) {
+    if instr.op0_kind() != OpKind::Register {
+        return;
+    }
+    let dst = canonical_reg(instr.op0_register());
+    let new_expr = match instr.op1_kind() {
+        OpKind::Register => regs
+            .get(&canonical_reg(instr.op1_register()))
+            .cloned()
+            .unwrap_or(Expr::Unknown),
+        OpKind::Immediate64 => Expr::Const(instr.immediate64() as i64),
+        OpKind::Immediate32 => Expr::Const(instr.immediate32() as i64),
+        // 64-bit destination with a sign-extended 32-bit immediate.
+        OpKind::Immediate32to64 => Expr::Const(instr.immediate32to64()),
+        OpKind::Memory => Expr::Deref(Box::new(build_address_expr(instr, regs))),
+        _ => Expr::Unknown,
+    };
+    regs.insert(dst, new_expr);
+}
+
+/// Model `lea dst, [mem]`: the destination register receives the address
+/// expression of the memory operand itself (no load). Other operand shapes are ignored.
+#[cfg(target_arch = "x86_64")]
+fn handle_lea(instr: &Instruction, regs: &mut AHashMap<Register, Expr>) {
+    let is_reg_from_mem =
+        instr.op0_kind() == OpKind::Register && instr.op1_kind() == OpKind::Memory;
+    if !is_reg_from_mem {
+        return;
+    }
+    let address = build_address_expr(instr, regs);
+    regs.insert(canonical_reg(instr.op0_register()), address);
+}
+
+/// Model `add dst, src` for a register destination: the destination becomes
+/// `Add(old dst, src)`.
+///
+/// The source may be a register, a memory operand (modelled as a `Deref`) or an
+/// immediate. For 32/64-bit destinations the decoder reports 8- and 32-bit
+/// immediates with the sign-extending kinds (`Immediate8to32`,
+/// `Immediate8to64`, `Immediate32to64`); these are handled explicitly,
+/// otherwise the added constant would degrade to `Unknown` and be dropped from
+/// the computed offset.
+#[cfg(target_arch = "x86_64")]
+fn handle_add(instr: &Instruction, regs: &mut AHashMap<Register, Expr>) {
+    if instr.op0_kind() != OpKind::Register {
+        return;
+    }
+    let dst = canonical_reg(instr.op0_register());
+    let left = regs.get(&dst).cloned().unwrap_or(Expr::Unknown);
+    let right = match instr.op1_kind() {
+        OpKind::Immediate32 => Expr::Const(instr.immediate32() as i64),
+        OpKind::Immediate8 => Expr::Const(instr.immediate8() as i64),
+        // Sign-extended immediates used by 32-bit and 64-bit destinations.
+        OpKind::Immediate8to32 => Expr::Const(i64::from(instr.immediate8to32())),
+        OpKind::Immediate8to64 => Expr::Const(instr.immediate8to64()),
+        OpKind::Immediate32to64 => Expr::Const(instr.immediate32to64()),
+        OpKind::Register => regs
+            .get(&canonical_reg(instr.op1_register()))
+            .cloned()
+            .unwrap_or(Expr::Unknown),
+        OpKind::Memory => Expr::Deref(Box::new(build_address_expr(instr, regs))),
+        _ => Expr::Unknown,
+    };
+    regs.insert(dst, Expr::Add(Box::new(left), Box::new(right)));
+}
+
+/// Model `shl reg, imm8` as a multiplication of the register's expression by
+/// `2^imm8`. Other operand shapes and shift counts of 63 or more leave the
+/// register state untouched.
+#[cfg(target_arch = "x86_64")]
+fn handle_shl(instr: &Instruction, regs: &mut AHashMap<Register, Expr>) {
+    let is_reg_by_imm8 =
+        instr.op0_kind() == OpKind::Register && instr.op1_kind() == OpKind::Immediate8;
+    if !is_reg_by_imm8 {
+        return;
+    }
+    let shift = instr.immediate8();
+    if shift >= 63 {
+        // The factor would not be representable as a positive i64.
+        return;
+    }
+    let dst = canonical_reg(instr.op0_register());
+    let shifted = regs.get(&dst).cloned().unwrap_or(Expr::Unknown);
+    regs.insert(dst, Expr::Mul(Box::new(shifted), 1i64 << shift));
+}
+
+/// Model the three-operand form `imul dst, src, imm` with register operands:
+/// the destination becomes `Mul(src, imm)`.
+///
+/// For 32/64-bit operands the decoder reports the immediate with the
+/// sign-extending kinds (`Immediate8to32`, `Immediate8to64`,
+/// `Immediate32to64`); these are handled explicitly, otherwise the 64-bit form
+/// would never be recognised. The source register may differ from the
+/// destination, since the instruction only reads `src` and writes `dst`.
+///
+/// One- and two-operand forms, memory sources and other immediate kinds are
+/// ignored and leave the register state untouched.
+#[cfg(target_arch = "x86_64")]
+fn handle_imul(instr: &Instruction, regs: &mut AHashMap<Register, Expr>) {
+    // Only handle the form: imul reg, reg, imm8/imm32
+    if instr.op_count() != 3 {
+        return;
+    }
+    if instr.op0_kind() != OpKind::Register || instr.op1_kind() != OpKind::Register {
+        return;
+    }
+    let dst = canonical_reg(instr.op0_register());
+    let src = canonical_reg(instr.op1_register());
+    let factor = match instr.op2_kind() {
+        OpKind::Immediate8 => instr.immediate8() as i64,
+        OpKind::Immediate32 => instr.immediate32() as i64,
+        // Sign-extended immediates used by 32-bit and 64-bit operands.
+        OpKind::Immediate8to32 => i64::from(instr.immediate8to32()),
+        OpKind::Immediate8to64 => instr.immediate8to64(),
+        OpKind::Immediate32to64 => instr.immediate32to64(),
+        _ => return,
+    };
+    let operand = regs.get(&src).cloned().unwrap_or(Expr::Unknown);
+    regs.insert(dst, Expr::Mul(Box::new(operand), factor));
+}
+
+/// Map the 8-bit (low), 16-bit and 32-bit names of `RAX`, `RBX`, `RCX`, `RDX`,
+/// `RSI`, `RDI` and `R8`..`R15` to the corresponding 64-bit register, so that
+/// all widths share one entry in the register state. Every other register is
+/// returned unchanged.
+#[cfg(target_arch = "x86_64")]
+fn canonical_reg(reg: Register) -> Register {
+    use Register as R;
+
+    match reg {
+        R::AL | R::AX | R::EAX | R::RAX => R::RAX,
+        R::BL | R::BX | R::EBX | R::RBX => R::RBX,
+        R::CL | R::CX | R::ECX | R::RCX => R::RCX,
+        R::DL | R::DX | R::EDX | R::RDX => R::RDX,
+        R::SIL | R::SI | R::ESI | R::RSI => R::RSI,
+        R::DIL | R::DI | R::EDI | R::RDI => R::RDI,
+        R::R8L | R::R8W | R::R8D | R::R8 => R::R8,
+        R::R9L | R::R9W | R::R9D | R::R9 => R::R9,
+        R::R10L | R::R10W | R::R10D | R::R10 => R::R10,
+        R::R11L | R::R11W | R::R11D | R::R11 => R::R11,
+        R::R12L | R::R12W | R::R12D | R::R12 => R::R12,
+        R::R13L | R::R13W | R::R13D | R::R13 => R::R13,
+        R::R14L | R::R14W | R::R14D | R::R14 => R::R14,
+        R::R15L | R::R15W | R::R15D | R::R15 => R::R15,
+        other => other,
+    }
+}
+
+/// Build the symbolic address of the memory operand of `instr`.
+///
+/// * RIP-relative operand: the constant absolute target address. The decoder
+///   already reports that address through `memory_displacement64()`, so the
+///   next-instruction pointer must not be added again.
+/// * No base register with an `%fs` prefix: `Expr::Fs(displacement)`.
+/// * No base register otherwise: `Expr::Const(displacement)`.
+/// * Base register: `displacement + <expression of the base register>`.
+///
+/// An index register, if present, is added as `<index expression> * scale`.
+/// Registers that are not tracked in `regs` contribute `Expr::Unknown`.
+///
+/// NOTE(review): an `%fs` prefix combined with a base register is modelled
+/// without the segment base, as before — confirm no supported libc uses it.
+#[cfg(target_arch = "x86_64")]
+fn build_address_expr(instr: &Instruction, regs: &AHashMap<Register, Expr>) -> Expr {
+    let base_reg = instr.memory_base();
+    let displacement = instr.memory_displacement64();
+
+    let mut expr = if base_reg == Register::RIP {
+        // RIP-relative: the displacement is already the absolute address.
+        Expr::Const(displacement as i64)
+    } else if base_reg == Register::None {
+        if instr.segment_prefix() == Register::FS {
+            Expr::Fs(displacement)
+        } else {
+            Expr::Const(displacement as i64)
+        }
+    } else {
+        // Base register
+        let base_expr = regs
+            .get(&canonical_reg(base_reg))
+            .cloned()
+            .unwrap_or(Expr::Unknown);
+        Expr::Add(
+            Box::new(Expr::Const(displacement as i64)),
+            Box::new(base_expr),
+        )
+    };
+
+    // Index register
+    if instr.memory_index() != Register::None {
+        let idx = canonical_reg(instr.memory_index());
+        let idx_expr = regs.get(&idx).cloned().unwrap_or(Expr::Unknown);
+        let scale = instr.memory_index_scale() as i64;
+        expr = Expr::Add(
+            Box::new(expr),
+            Box::new(Expr::Mul(Box::new(idx_expr), scale)),
+        );
+    }
+
+    expr
+}
+
+/// Produce built-in TSD parameters for `pid` without analysing any code.
+///
+/// The memory mappings are scanned in order; the first mapping whose path
+/// identifies musl or glibc selects the matching defaults. When the mappings
+/// cannot be read or no mapping matches, the glibc defaults are used.
+pub fn get_default_tsd_info(pid: u32) -> TSDInfo {
+    // `Some(true)` = musl, `Some(false)` = glibc, `None` = this mapping tells nothing.
+    let classify = |path: &str| -> Option<bool> {
+        if path.contains("musl") || path.contains("ld-musl") {
+            Some(true)
+        } else if path.contains("libc.so.6") || path.contains("libpthread.so.0") {
+            Some(false)
+        } else {
+            None
+        }
+    };
+
+    // Try to detect libc type from memory mappings
+    let is_musl = get_memory_mappings(pid)
+        .ok()
+        .and_then(|mappings| mappings.iter().find_map(|m| classify(&m.path)))
+        // Default to glibc-like parameters (most common)
+        .unwrap_or(false);
+
+    if is_musl {
+        TSDInfo {
+            offset: MUSL_TSD_DEFAULT_OFFSET,
+            multiplier: MUSL_TSD_MULTIPLIER,
+            indirect: MUSL_TSD_INDIRECT,
+        }
+    } else {
+        TSDInfo {
+            offset: GLIBC_TSD_DEFAULT_OFFSET,
+            multiplier: GLIBC_TSD_MULTIPLIER,
+            indirect: GLIBC_TSD_INDIRECT,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// libc, libpthread and musl loader paths are accepted; unrelated libraries are not.
+    #[test]
+    fn test_is_potential_tsd_dso() {
+        let accepted = [
+            "/lib/x86_64-linux-gnu/libc.so.6",
+            "/lib/x86_64-linux-gnu/libpthread.so.0",
+            "/lib/ld-musl-x86_64.so.1",
+            "/usr/lib/libc.musl-x86_64.so.1",
+        ];
+        for path in accepted {
+            assert!(is_potential_tsd_dso(path));
+        }
+
+        let rejected = ["/usr/lib/libpython3.10.so", "/lib/libm.so.6"];
+        for path in rejected {
+            assert!(!is_potential_tsd_dso(path));
+        }
+    }
+
+    /// musl shape: indirect access through the `tsd` pointer at offset 0x80, 8-byte entries.
+    #[test]
+    #[cfg(target_arch = "x86_64")]
+    fn test_extract_tsd_info_musl() {
+        // Simplified musl pthread_getspecific pattern
+        let musl_code = [
+            0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:0x0, %rax
+            0x48, 0x8b, 0x80, 0x80, 0x00, 0x00, 0x00, // mov 0x80(%rax), %rax
+            0x48, 0x8b, 0x04, 0xf8, // mov (%rax,%rdi,8), %rax
+            0xc3, // ret
+        ];
+        let outcome = extract_tsd_info_x86(&musl_code);
+        assert!(outcome.is_ok());
+        let tsd = outcome.unwrap();
+        assert_eq!(tsd.offset, 0x80);
+        assert_eq!(tsd.multiplier, 8);
+        assert_eq!(tsd.indirect, 1);
+    }
+
+    /// glibc shape with the address computation split over several instructions.
+    #[test]
+    #[cfg(target_arch = "x86_64")]
+    fn test_extract_tsd_info_glibc_split() {
+        // pattern: fs:0x10 -> shl -> add -> mov offset
+        let glibc_code = [
+            0x64, 0x48, 0x8b, 0x04, 0x25, 0x10, 0x00, 0x00, 0x00, // mov %fs:0x10, %rax
+            0x48, 0xc1, 0xe7, 0x04, // shl $0x4, %rdi
+            0x48, 0x01, 0xf8, // add %rdi, %rax
+            0x48, 0x8b, 0x80, 0x18, 0x03, 0x00, 0x00, // mov 0x318(%rax), %rax
+            0xc3, // ret
+        ];
+        let outcome = extract_tsd_info_x86(&glibc_code);
+        assert!(outcome.is_ok());
+        let tsd = outcome.unwrap();
+        assert_eq!(tsd.offset, 0x318);
+        assert_eq!(tsd.multiplier, 16);
+        assert_eq!(tsd.indirect, 0);
+    }
+}
diff --git a/agent/src/ebpf/kernel/include/perf_profiler.h b/agent/src/ebpf/kernel/include/perf_profiler.h
index 2a8acb9f8c7..5ca6794eeb2 100644
--- a/agent/src/ebpf/kernel/include/perf_profiler.h
+++ b/agent/src/ebpf/kernel/include/perf_profiler.h
@@ -102,6 +102,7 @@ struct stack_trace_key_t {
 
 typedef struct {
 	__u32 task_struct_stack_offset;
+	__u64 tpbase_offset;  /* Offset of fsbase/tpidr in task_struct for TLS access */
 } unwind_sysinfo_t;
 
 #define CLASS_NAME_LEN 32
diff --git a/agent/src/ebpf/kernel/perf_profiler.bpf.c b/agent/src/ebpf/kernel/perf_profiler.bpf.c
index 803cdafed43..b2b97b103b6 100644
--- a/agent/src/ebpf/kernel/perf_profiler.bpf.c
+++ b/agent/src/ebpf/kernel/perf_profiler.bpf.c
@@ -165,15 +165,13 @@ MAP_ARRAY(unwind_sysinfo, __u32, unwind_sysinfo_t, 1, FEATURE_FLAG_DWARF_UNWINDI
  *   - To increase capacity, modify max_entries below and rebuild
  */
 
-// Python: stores thread state address for stack unwinding
-// - python_tstate_addr_map: key=PID, value=thread_state_address (per-thread)
-//   Pre-allocated: 65536 * (4 + 8 + 32) ≈ 2.8 MB (htab_elem overhead included)
+// Python: stores per-process Python unwinding information
 // - python_unwind_info_map: key=PID, value=python_unwind_info_t (per-process)
-//   Pre-allocated: 65536 * (4 + 16 + 32) ≈ 3.3 MB (htab_elem overhead included)
+//   Contains auto_tls_key_addr, version, TSD info for multi-threaded PyThreadState lookup
+//   Pre-allocated: 65536 * (4 + 24 + 32) ≈ 3.8 MB (htab_elem overhead included)
 // - python_offsets_map: key=offsets_id, value=python_offsets_t (per-version, reference counted)
 //   Supports 1 Python version at a time (upgrades replace entry)
 //   Pre-allocated: 1 * (1 + 216 + 32) ≈ 249 bytes
-MAP_HASH(python_tstate_addr_map, __u32, __u64, 65536, FEATURE_FLAG_PROFILE_PYTHON)
 MAP_HASH(python_unwind_info_map, __u32, python_unwind_info_t, 65536, FEATURE_FLAG_PROFILE_PYTHON)
 MAP_HASH(python_offsets_map, __u8, python_offsets_t, 1, FEATURE_FLAG_PROFILE_PYTHON)
 
@@ -1278,6 +1276,116 @@ __u32 get_symbol_id(symbol_t * symbol)
 	return id;
 }
 
+/*
+ * TSD (Thread Specific Data) helper functions for multi-threaded Python support
+ *
+ * These functions read thread-local storage to get per-thread PyThreadState,
+ * enabling correct Python stack unwinding in multi-threaded applications.
+ */
+
+/*
+ * Read the thread pointer base (TPBASE) of the current task from its task_struct
+ * (fsbase on x86_64, tp_value on arm64). TPBASE is expected to point at the C
+ * library's per-thread structure (struct pthread), which holds the TSD slots.
+ *
+ * Returns 0 with *tsd_base set, or -1 if tpbase_offset is unset (0) or the read fails.
+ */
+static inline __attribute__ ((always_inline))
+int tsd_get_base(void **tsd_base)
+{
+	__u32 zero = 0;
+	unwind_sysinfo_t *sysinfo = unwind_sysinfo__lookup(&zero);
+	if (sysinfo == NULL || sysinfo->tpbase_offset == 0) {
+		bpf_debug("[TSD] sysinfo or tpbase_offset not available");
+		return -1;
+	}
+
+	struct task_struct *task = (struct task_struct *)bpf_get_current_task();
+
+	/*
+	 * tpbase_offset is supplied by user space through the unwind_sysinfo map
+	 * rather than hard-coded, because the task_struct layout varies by kernel
+	 * version. It locates task->thread.fsbase (x86_64) or its equivalent.
+	 */
+	void *tpbase_ptr = ((char *)task) + sysinfo->tpbase_offset;
+	if (bpf_probe_read_kernel(tsd_base, sizeof(void *), tpbase_ptr)) {
+		bpf_debug("[TSD] Failed to read tpbase value");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Read the Thread Specific Data (TSD) slot for `key` into *out.
+ * Returns 0 on success, -1 if a user-memory read fails.
+ *
+ * Indirect (tsi->indirect set; e.g. musl, where pthread->tsd is a pointer):
+ *   tsd_addr = *(tsd_base + tsi->offset) + key * tsi->multiplier
+ * Direct (e.g. glibc, where pthread->specific_1stblock is an inline array of
+ * struct pthread_key_data { uintptr_t seq; void *data; }):
+ *   tsd_addr = tsd_base + tsi->offset + key * tsi->multiplier
+ * NOTE(review): no constant is added here, so any fixed displacement (e.g. the
+ * data field inside a glibc entry) must be folded into tsi->offset - confirm.
+ */
+static inline __attribute__ ((always_inline))
+int tsd_read(const tsd_info_t *tsi, const void *tsd_base, int key, void **out)
+{
+	const void *tsd_addr = tsd_base + tsi->offset;
+
+	if (tsi->indirect) {
+		/* Indirect layout: load the pointer to the TSD array first */
+		if (bpf_probe_read_user(&tsd_addr, sizeof(tsd_addr), tsd_addr)) {
+			bpf_debug("[TSD] Failed to read indirect TSD pointer");
+			return -1;
+		}
+	}
+
+	/* Index the slot: multiplier is the per-key stride in bytes */
+	tsd_addr += key * tsi->multiplier;
+
+	bpf_debug("[TSD] Reading from tsd_addr=%lx (key=%d)", (unsigned long)tsd_addr, key);
+	if (bpf_probe_read_user(out, sizeof(*out), tsd_addr)) {
+		bpf_debug("[TSD] Failed to read TSD value");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Return the current thread's PyThreadState (NULL on failure): read the TSD key
+ * at py_info->auto_tls_key_addr, then that key's slot, which Python sets via
+ * pthread_setspecific(autoTLSkey, tstate). NOTE(review): the key is used without
+ * a range check - confirm a negative or uninitialized key cannot occur here.
+ */
+static inline __attribute__ ((always_inline))
+void *get_py_thread_state(python_unwind_info_t *py_info)
+{
+	void *tsd_base;
+	if (tsd_get_base(&tsd_base) != 0) {
+		return NULL;
+	}
+
+	/* Fetch the pthread key value from the target process's memory */
+	int auto_tls_key;
+	if (bpf_probe_read_user(&auto_tls_key, sizeof(auto_tls_key),
+				(void *)py_info->auto_tls_key_addr)) {
+		bpf_debug("[PYTHON] Failed to read autoTLSkey");
+		return NULL;
+	}
+
+	bpf_debug("[PYTHON] autoTLSkey=%d, tsd_base=%lx", auto_tls_key, (unsigned long)tsd_base);
+
+	/* Resolve the key to this thread's PyThreadState pointer */
+	void *thread_state = NULL;
+	if (tsd_read(&py_info->tsd_info, tsd_base, auto_tls_key, &thread_state) != 0) {
+		return NULL;
+	}
+
+	return thread_state;
+}
+
 static inline __attribute__ ((always_inline))
 int pre_python_unwind(void *ctx, unwind_state_t * state,
 		 map_group_t *maps, int jmp_idx) {
@@ -1294,20 +1402,11 @@ int pre_python_unwind(void *ctx, unwind_state_t * state,
         return 0;
 	}
 
-	void *thread_state;
-	if (bpf_probe_read_user
-	    (&thread_state, sizeof(thread_state),
-	     (void *)py_unwind_info->thread_state_address) != 0) {
-        return 0;
-	}
-
+	/* Get per-thread PyThreadState using TSD mechanism */
+	void *thread_state = get_py_thread_state(py_unwind_info);
 	if (thread_state == NULL) {
-        __u64 *addr = python_tstate_addr_map__lookup(&state->key.tgid);
-        if (addr && *addr != 0) {
-            thread_state = (void *)*addr;
-        } else {
-            return 0;
-		}
+		bpf_debug("[PYTHON] Failed to get thread state via TSD");
+        return 0;
 	}
 
 	if (bpf_probe_read_user
@@ -2067,18 +2166,11 @@ PROGPE(v8_unwind) (struct bpf_perf_event_data *ctx) {
 	return 0;
 }
 
-URETPROG(python_save_tstate_addr) (struct pt_regs * ctx) {
-	__u64 ret = PT_REGS_RC(ctx);
-	__u32 tgid = bpf_get_current_pid_tgid() >> 32;
-
-	__u64 *addr = python_tstate_addr_map__lookup(&tgid);
-	if (addr) {
-		*addr = ret;
-	} else {
-		python_tstate_addr_map__update(&tgid, &ret);
-	}
-	return 0;
-}
+/*
+ * NOTE: the python_save_tstate_addr uretprobe has been removed.
+ * The TSD mechanism now looks up the per-thread PyThreadState directly,
+ * so intercepting PyEval_SaveThread returns is no longer needed.
+ */
 
 PROGPE(oncpu_output) (struct bpf_perf_event_data * ctx) {
 	__u32 zero = 0;
diff --git a/agent/src/ebpf/user/unwind_tracer.c b/agent/src/ebpf/user/unwind_tracer.c
index b7bab0c1d4e..82f9379dc4b 100644
--- a/agent/src/ebpf/user/unwind_tracer.c
+++ b/agent/src/ebpf/user/unwind_tracer.c
@@ -241,11 +241,33 @@ static bool requires_dwarf_unwind_table(int pid) {
 }
 
 int unwind_tracer_init(struct bpf_tracer *tracer) {
-    int32_t offset = read_offset_of_stack_in_task_struct();
-    if (offset < 0) {
+    /* Initialize unwind_sysinfo with task_struct offsets */
+    int32_t stack_offset = read_offset_of_stack_in_task_struct();
+    if (stack_offset < 0) {
         ebpf_warning("unwind tracer init: failed to get field stack offset in task struct from btf");
         ebpf_warning("unwinder may not handle in kernel perf events correctly");
-    } else if (!bpf_table_set_value(tracer, MAP_UNWIND_SYSINFO_NAME, 0, &offset)) {
+    }
+
+    /* Get tpbase_offset for TLS access (needed for Python multi-threading support) */
+    int64_t tpbase_offset = read_tpbase_offset();
+    if (tpbase_offset < 0) {
+        ebpf_warning("unwind tracer init: failed to get tpbase offset from kernel");
+        ebpf_warning("Python multi-threaded profiling may not work correctly");
+        tpbase_offset = 0;
+    } else {
+        ebpf_info("unwind tracer init: tpbase_offset=%ld", tpbase_offset);
+    }
+
+    /* Update unwind_sysinfo map with both offsets */
+    struct {
+        uint32_t task_struct_stack_offset;
+        uint64_t tpbase_offset;
+    } sysinfo = {
+        .task_struct_stack_offset = stack_offset > 0 ? (uint32_t)stack_offset : 0,
+        .tpbase_offset = (uint64_t)tpbase_offset,
+    };
+
+    if (!bpf_table_set_value(tracer, MAP_UNWIND_SYSINFO_NAME, 0, &sysinfo)) {
         ebpf_warning("unwind tracer init: update %s error", MAP_UNWIND_SYSINFO_NAME);
         ebpf_warning("unwinder may not handle in kernel perf events correctly");
     }
@@ -337,48 +359,7 @@ int unwind_tracer_init(struct bpf_tracer *tracer) {
     return 0;
 }
 
-static struct symbol python_symbols[] = { { .type = PYTHON_UPROBE,
-                                            .symbol = "PyEval_SaveThread",
-                                            .probe_func = URETPROBE_FUNC_NAME(python_save_tstate_addr),
-                                            .is_probe_ret = true, }, };
 
-static void python_parse_and_register(int pid, struct tracer_probes_conf *conf) {
-    char *path = NULL;
-    int n = 0;
-
-    if (pid <= 1)
-        goto out;
-
-    if (!is_user_process(pid))
-        goto out;
-
-    // Python symbols may reside in the main executable or libpython.so
-    // Check both
-    path = get_elf_path_by_pid(pid);
-    if (path) {
-        n = add_probe_sym_to_tracer_probes(pid, path, conf, python_symbols, NELEMS(python_symbols));
-        if (n > 0) {
-            ebpf_info("python uprobe, pid:%d, path:%s\n", pid, path);
-            free(path);
-            return;
-        }
-    }
-
-    path = get_so_path_by_pid_and_name(pid, "python3");
-    if (!path) {
-        path = get_so_path_by_pid_and_name(pid, "python2");
-        if (!path) {
-            goto out;
-        }
-    }
-
-    ebpf_info("python uprobe, pid:%d, path:%s\n", pid, path);
-    add_probe_sym_to_tracer_probes(pid, path, conf, python_symbols, NELEMS(python_symbols));
-
-out:
-    free(path);
-    return;
-}
 
 static void lua_parse_and_register(int pid, struct tracer_probes_conf *conf) {
     lua_runtime_info_t info = {0};
@@ -567,11 +548,7 @@ void unwind_events_handle(void) {
         tracer = event->tracer;
         if (tracer && python_profiler_enabled() && is_python_process(event->pid)) {
             python_unwind_table_load(g_python_unwind_table, event->pid);
-            pthread_mutex_lock(&tracer->mutex_probes_lock);
-            python_parse_and_register(event->pid, tracer->tps);
-            tracer_uprobes_update(tracer);
-            tracer_hooks_process(tracer, HOOK_ATTACH, &count);
-            pthread_mutex_unlock(&tracer->mutex_probes_lock);
+            // Note: Python profiling uses TSD mechanism, no uprobes needed
         }
 
         if (tracer && php_profiler_enabled() && is_php_process(event->pid)) {