Skip to Content
Shadow Cheat EngineDev Historyv3 指针扫描实现计划

指针扫描完整实现计划 v3(两轮 Review 后终版)

架构决策(已确定)

全链路:内核收集 → Server BFS → 客户端纯展示

  • 内核:页表 walk + kmap + sorted VMA array 二分 → 收集所有有效指针对
  • Server(C,用户态):qsort + visited-set BFS → 输出指针链
  • 客户端:零计算,只显示结果 + 双击添加到 addr list
  • 一条 CMD (0xD6) 完成全部,客户端等 3-5 秒拿到结果

核心优势

  • 零用户态内存 API(无 ReadProcessMemory、无 /proc/pid/mem)
  • 直接页表 walk → kmap_local_page → 读物理页
  • 目标进程完全无感知

已跳过 Phase 1(纯客户端 Fallback)

直接实现内核收集 + server BFS,一步到位。


两轮 Review 确认的关键修正

Offset 顺序(已亲自 trace 确认)

  • BFS 输出 offsets[0] = 最靠近 target 的 offset,offsets[N-1] = 最靠近 base 的 offset
  • _resolve_pointerrange(len-1, -1, -1):先 deref 再加 offset,从 base 端开始
  • 不需要 reverse,BFS 原始输出即 CE order

架构修正:放弃滑动窗口,回到 vmalloc 模式

  • 滑动窗口 drop/reacquire mmap_read_lock 导致 VMA iterator use-after-free(P0 CRASH)
  • 改为:vmalloc(5M × 16 = 80MB),全部在一次 mmap_read_lock 下收集完
  • unlock 后单次 copy_to_user 到 server 的 malloc buffer
  • 12GB 手机 vmalloc 80MB 没问题

指针验证:预建 sorted VMA array(非 vma_lookup)

  • 65M 次 vma_lookup (maple tree) = 5-8 秒(cache thrashing)
  • 预建 sorted array (500 VMAs × 16B = 8KB,fit L1) + binary search = 0.6-1 秒
  • 在 vmalloc 模式下无 lock cycle 问题,VMA table 始终一致

内核模块:shadow_ce.h + shadow_ce.c

新增 shadow_ce.h

struct ce_ptr_entry { uint64_t source; // 指针存放地址 uint64_t target; // 指针的值(已 untag) }; struct ce_ptrscan_collect_req { int pid; uint32_t flags; // PTRSCAN_ALIGNED(1) | PTRSCAN_WRITABLE(2) uint32_t max_results; // 0 = 5M default unsigned long result_buf; // 用户态 buffer: ce_ptr_entry[] uint32_t result_count; // out: 实际数量 }; #define PTRSCAN_ALIGNED_ONLY (1 << 0) #define PTRSCAN_WRITABLE_ONLY (1 << 1) #define SHADOW_CE_PTRSCAN_COLLECT _IOWR(CE_IOC_MAGIC, 20, struct ce_ptrscan_collect_req)

do_ptrscan_collect() — vmalloc 模式

static int do_ptrscan_collect(struct ce_ptrscan_collect_req *req) { struct task_struct *task; struct mm_struct *mm; struct vm_area_struct *vma; uint32_t max_res = req->max_results ? min(req->max_results, 5000000u) : 5000000; uint32_t found = 0; int ret = 0; // 1. vmalloc 结果 buffer(最大 80MB) struct ce_ptr_entry *results = vmalloc(max_res * sizeof(*results)); if (!results) return -ENOMEM; // 2. 获取目标 mm rcu_read_lock(); task = pid_task(find_vpid(req->pid), PIDTYPE_PID); if (!task) { rcu_read_unlock(); vfree(results); return -ESRCH; } mm = get_task_mm(task); rcu_read_unlock(); if (!mm) { vfree(results); return -ESRCH; } mmap_read_lock(mm); // 3. 预建 sorted VMA range array(fit L1 cache) struct { uint64_t start, end; } *vranges; int vcount = 0; { VMA_ITERATOR(vmi_count, mm, 0); for_each_vma(vmi_count, vma) { if (vma->vm_flags & VM_READ) vcount++; } } vranges = vmalloc(vcount * sizeof(*vranges)); if (!vranges) { mmap_read_unlock(mm); mmput(mm); vfree(results); return -ENOMEM; } { int idx = 0; VMA_ITERATOR(vmi_fill, mm, 0); for_each_vma(vmi_fill, vma) { if (!(vma->vm_flags & VM_READ)) continue; vranges[idx].start = vma->vm_start; vranges[idx].end = vma->vm_end; idx++; } vcount = idx; } // vranges 已按 start 排序(VMA iterator 天然有序) // 4. 指针收集 VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { unsigned long addr, end; if (found >= max_res) break; if (!(vma->vm_flags & VM_READ)) continue; if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue; if ((req->flags & PTRSCAN_WRITABLE_ONLY) && !(vma->vm_flags & VM_WRITE)) continue; end = vma->vm_end; for (addr = vma->vm_start; addr < end && found < max_res; addr += PAGE_SIZE) { struct pte_result pr; void *kaddr; unsigned int off; if (signal_pending(current)) goto out_unlock; pr = walk_pte_safe(mm, vma, addr); if (!pr.page) continue; get_page(pr.page); if (pr.ptep) pte_unmap(pr.ptep); kaddr = kmap_local_page(pr.page); for (off = 0; off + 8 <= PAGE_SIZE; off += 8) { uint64_t val = *(uint64_t *)(kaddr + off); uint64_t clean = untagged_addr(val); if (!clean || clean >= TASK_SIZE) continue; // 二分查找 sorted VMA array(O(log N),N~500,全在 L1) int lo = 0, hi = vcount - 1; bool valid = false; while (lo <= hi) { int mid = (lo + hi) / 2; if (clean < vranges[mid].start) hi = mid - 1; else if (clean >= vranges[mid].end) lo = mid + 1; else { valid = true; break; } } if (valid && found < max_res) { results[found].source = (addr & PAGE_MASK) + off; results[found].target = clean; found++; } } kunmap_local(kaddr); put_page(pr.page); // 每 256 页 yield if (((addr >> PAGE_SHIFT) & 0xFF) == 0) cond_resched(); } } out_unlock: mmap_read_unlock(mm); // 5. 一次性 copy_to_user(unlock 后) if (found > 0) { if (copy_to_user((void __user *)req->result_buf, results, found * sizeof(*results))) ret = -EFAULT; } req->result_count = found; mmput(mm); vfree(vranges); vfree(results); return ret; }

关键设计点

  1. vmalloc 一整块:无 lock cycle,VMA iterator 始终有效
  2. sorted VMA array:~8KB fit L1,binary search ~10ns/lookup vs vma_lookup ~80ns
  3. untagged_addr():内核宏,自动适配 MTE/PAC/TBI + 39/48/52-bit VA
  4. signal_pending:每页检查,可被 SIGKILL 中断
  5. cond_resched:每 256 页让出 CPU(continue 之前,非死代码)
  6. copy_to_user 在 unlock 之后:和 do_scan 完全一致的模式

Server:server.c

CMD_PTRSCAN_RESOLVE (0xD6) — 一条命令全做

线程模型

  • 客户端创建专用 TCP 连接(第 5 条)
  • server accept → pthread_create → 独立线程处理
  • 不影响其他 4 条连接(scan/refresh/freeze/bp)

内存预算

数据大小
pairs (5M × 16B)80MB
sorted VMA array (kernel 内部)~8KB
cur_level (50K × ~80B)4MB
nxt_level (50K × ~80B)4MB
visited hash set (64K slots × 8B)512KB
chains (10K × ~80B)0.8MB
总计~89MB

BFS 关键修正

  1. visited hash set:push 前检查 if (visited[src]) continue,防止指数爆炸
  2. per-level cap 50K:防止 OOM
  3. module lookup binary search:排序 module_snapshot.entries by base,O(log N)
  4. create_module_snapshot 去掉 perm 过滤:包含 .rodata (r—) 段
  5. memcpy 只复制 depth×4 bytes:不是固定 64 bytes

Wire Protocol

Request: [1 byte] 0xD6 [4 bytes] pid [8 bytes] target_addr [4 bytes] max_depth (1-16) [4 bytes] max_offset (typically 4095) [4 bytes] flags (PTRSCAN_*) [4 bytes] max_chains (0 = 10000) Response: [4 bytes] status (0=ok) [4 bytes] module_count per module: [4 bytes] name_len [name_len bytes] name [8 bytes] base_addr [4 bytes] chain_count per chain: [4 bytes] depth [2 bytes] module_index (0xFFFF = not static) [8 bytes] base_offset (file-relative, uint64) [depth × 4 bytes] offsets[] (int32, signed)

base_offset 是 uint64(修正 uint32 截断 bug)。

BFS 伪代码(含 visited set)

#define MAX_LEVEL_WIDTH 50000 #define VISITED_SLOTS 65536 #define VISITED_MASK (VISITED_SLOTS - 1) uint64_t visited[VISITED_SLOTS]; // open-addressing hash set int visited_count = 0; bool visited_check_add(uint64_t addr) { uint32_t h = (uint32_t)(addr * 0x9E3779B97F4A7C15ULL >> 48); for (int i = 0; i < 8; i++) { // linear probe, max 8 uint32_t slot = (h + i) & VISITED_MASK; if (visited[slot] == addr) return true; // already seen if (visited[slot] == 0) { visited[slot] = addr; visited_count++; return false; // newly added } } return false; // table full, allow through } // BFS main loop for (int level = 0; level < max_depth && chain_count < max_chains; level++) { nxt_count = 0; for (int i = 0; i < cur_count && chain_count < max_chains; i++) { uint64_t find = cur_level[i].addr; uint64_t lo = (find > max_offset) ? find - max_offset : 0; // binary search lower_bound in sorted pairs int left = lower_bound(pairs, pair_count, lo); for (int j = left; j < pair_count && pairs[j].target <= find; j++) { uint64_t src = pairs[j].source; // DEDUP if (visited_check_add(src)) continue; int32_t offset = (int32_t)(find - pairs[j].target); // static check (binary search sorted modules) int mod_idx = find_module_bsearch(snap, src); if (mod_idx >= 0 && chain_count < max_chains) { // emit chain chains[chain_count].depth = level + 1; chains[chain_count].module_idx = mod_idx; chains[chain_count].base_offset = src - snap->entries[mod_idx].base + snap->entries[mod_idx].file_offset; memcpy(chains[chain_count].offsets, cur_level[i].offsets, level * sizeof(int32_t)); // 只复制已有的 depth 层 chains[chain_count].offsets[level] = offset; chain_count++; } // push to next level if (nxt_count < MAX_LEVEL_WIDTH) { nxt_level[nxt_count].addr = src; nxt_level[nxt_count].depth = level + 1; memcpy(nxt_level[nxt_count].offsets, cur_level[i].offsets, level * sizeof(int32_t)); nxt_level[nxt_count].offsets[level] = offset; nxt_count++; } } } // swap struct bfs_node *tmp = cur_level; cur_level = nxt_level; nxt_level = tmp; cur_count = nxt_count; }

客户端

ce_client.py: ptrscan_resolve()

  • 专用 CEClient 连接(不阻塞其他操作)
  • 单次 CMD 0xD6,等待 3-5 秒
  • 解析 module header + chains
  • base_offset 读 8 bytes (uint64)

ptrscan_results.py: PtrScanWorker

  • QThread 子类
  • run() 里创建专用 CEClient → 调 ptrscan_resolve → emit finished
  • UI 用 indeterminate progress bar(setRange(0,0)
  • Stop 按钮 close socket → _recv 抛 ConnectionError → 线程退出

双击 → 添加到 addr list

entry = AddrEntry( address=0, desc="ptrscan", vtype="4 Bytes", is_pointer=True, base_addr_str=f"{mod_name}+0x{base_offset:X}", # FILE-RELATIVE offset pointer_offsets=offsets, # BFS 原始输出,不 reverse ) self.pointer_selected.emit(entry)

base_addr_str 用 file-relative offset

  • "libgame.so+0x500100" 其中 0x500100 是 file offset
  • _parse_address 已有 segment 查找:foff <= offset < foff + size → base + (offset - foff)
  • ASLR 后 module 加载到新地址,segment lookup 自动转换为新 runtime address

性能预估(最终修正)

步骤耗时
内核收集 (500MB, sorted array binary search)1.5-3s
copy_to_user 80MB~50ms
qsort 5M entries (server)300-600ms
BFS depth 7, visited set, cap 50K/level (server)200-500ms
TCP 返回链 (~100KB)<10ms
总计2-4s

实现顺序

  1. shadow_ce.h: 加 struct + ioctl define
  2. shadow_ce.c: 实现 do_ptrscan_collect(~120 行,照 do_scan)
  3. server.c: 修 create_module_snapshot 去掉 perm 过滤 + 加 CMD 0xD6 handler + BFS(~250 行)
  4. ce_client.py: 加 ptrscan_resolve(~40 行)
  5. ptrscan_results.py: PtrScanWorker + 信号连接 + 双击 handler
  6. main_window.py: _open_ptrscan 传 host/port/pid + _add_ptr_from_scan

验证

  1. test_ptrscan.c: malloc A→B→C, 全局指针→A, 用 MCP 验证
  2. 集成测试: 真实游戏扫血量地址→指针扫描→双击添加→重启验证
Last updated on