指针扫描完整实现计划 v3(两轮 Review 后终版)
架构决策(已确定)
全链路:内核收集 → Server BFS → 客户端纯展示
- 内核:页表 walk + kmap + sorted VMA array 二分 → 收集所有有效指针对
- Server(C,用户态):qsort + visited-set BFS → 输出指针链
- 客户端:零计算,只显示结果 + 双击添加到 addr list
- 一条 CMD (0xD6) 完成全部,客户端等 3-5 秒拿到结果
核心优势
- 零用户态内存 API(无 ReadProcessMemory、无 /proc/pid/mem)
- 直接页表 walk → kmap_local_page → 读物理页
- 目标进程完全无感知
已跳过 Phase 1(纯客户端 Fallback)
直接实现内核收集 + server BFS,一步到位。
两轮 Review 确认的关键修正
Offset 顺序(已亲自 trace 确认)
- BFS 输出
offsets[0]= 最靠近 target 的 offset,offsets[N-1]= 最靠近 base 的 offset _resolve_pointer走range(len-1, -1, -1):先 deref 再加 offset,从 base 端开始- 不需要 reverse,BFS 原始输出即 CE order ✓
架构修正:放弃滑动窗口,回到 vmalloc 模式
- 滑动窗口 drop/reacquire mmap_read_lock 导致 VMA iterator use-after-free(P0 CRASH)
- 改为:vmalloc(5M × 16 = 80MB),全部在一次 mmap_read_lock 下收集完
- unlock 后单次 copy_to_user 到 server 的 malloc buffer
- 12GB 手机 vmalloc 80MB 没问题
指针验证:预建 sorted VMA array(非 vma_lookup)
- 65M 次 vma_lookup (maple tree) = 5-8 秒(cache thrashing)
- 预建 sorted array (500 VMAs × 16B = 8KB,fit L1) + binary search = 0.6-1 秒
- 在 vmalloc 模式下无 lock cycle 问题,VMA table 始终一致
内核模块:shadow_ce.h + shadow_ce.c
新增 shadow_ce.h
struct ce_ptr_entry {
uint64_t source; // 指针存放地址
uint64_t target; // 指针的值(已 untag)
};
struct ce_ptrscan_collect_req {
int pid;
uint32_t flags; // PTRSCAN_ALIGNED(1) | PTRSCAN_WRITABLE(2)
uint32_t max_results; // 0 = 5M default
unsigned long result_buf; // 用户态 buffer: ce_ptr_entry[]
uint32_t result_count; // out: 实际数量
};
#define PTRSCAN_ALIGNED_ONLY (1 << 0)
#define PTRSCAN_WRITABLE_ONLY (1 << 1)
#define SHADOW_CE_PTRSCAN_COLLECT _IOWR(CE_IOC_MAGIC, 20, struct ce_ptrscan_collect_req)do_ptrscan_collect() — vmalloc 模式
static int do_ptrscan_collect(struct ce_ptrscan_collect_req *req)
{
struct task_struct *task;
struct mm_struct *mm;
struct vm_area_struct *vma;
uint32_t max_res = req->max_results ? min(req->max_results, 5000000u) : 5000000;
uint32_t found = 0;
int ret = 0;
// 1. vmalloc 结果 buffer(最大 80MB)
struct ce_ptr_entry *results = vmalloc(max_res * sizeof(*results));
if (!results) return -ENOMEM;
// 2. 获取目标 mm
rcu_read_lock();
task = pid_task(find_vpid(req->pid), PIDTYPE_PID);
if (!task) { rcu_read_unlock(); vfree(results); return -ESRCH; }
mm = get_task_mm(task);
rcu_read_unlock();
if (!mm) { vfree(results); return -ESRCH; }
mmap_read_lock(mm);
// 3. 预建 sorted VMA range array(fit L1 cache)
struct { uint64_t start, end; } *vranges;
int vcount = 0;
{
VMA_ITERATOR(vmi_count, mm, 0);
for_each_vma(vmi_count, vma) {
if (vma->vm_flags & VM_READ) vcount++;
}
}
vranges = vmalloc(vcount * sizeof(*vranges));
if (!vranges) { mmap_read_unlock(mm); mmput(mm); vfree(results); return -ENOMEM; }
{
int idx = 0;
VMA_ITERATOR(vmi_fill, mm, 0);
for_each_vma(vmi_fill, vma) {
if (!(vma->vm_flags & VM_READ)) continue;
vranges[idx].start = vma->vm_start;
vranges[idx].end = vma->vm_end;
idx++;
}
vcount = idx;
}
// vranges 已按 start 排序(VMA iterator 天然有序)
// 4. 指针收集
VMA_ITERATOR(vmi, mm, 0);
for_each_vma(vmi, vma) {
unsigned long addr, end;
if (found >= max_res) break;
if (!(vma->vm_flags & VM_READ)) continue;
if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue;
if ((req->flags & PTRSCAN_WRITABLE_ONLY) && !(vma->vm_flags & VM_WRITE))
continue;
end = vma->vm_end;
for (addr = vma->vm_start; addr < end && found < max_res; addr += PAGE_SIZE) {
struct pte_result pr;
void *kaddr;
unsigned int off;
if (signal_pending(current)) goto out_unlock;
pr = walk_pte_safe(mm, vma, addr);
if (!pr.page) continue;
get_page(pr.page);
if (pr.ptep) pte_unmap(pr.ptep);
kaddr = kmap_local_page(pr.page);
for (off = 0; off + 8 <= PAGE_SIZE; off += 8) {
uint64_t val = *(uint64_t *)(kaddr + off);
uint64_t clean = untagged_addr(val);
if (!clean || clean >= TASK_SIZE) continue;
// 二分查找 sorted VMA array(O(log N),N~500,全在 L1)
int lo = 0, hi = vcount - 1;
bool valid = false;
while (lo <= hi) {
int mid = (lo + hi) / 2;
if (clean < vranges[mid].start) hi = mid - 1;
else if (clean >= vranges[mid].end) lo = mid + 1;
else { valid = true; break; }
}
if (valid && found < max_res) {
results[found].source = (addr & PAGE_MASK) + off;
results[found].target = clean;
found++;
}
}
kunmap_local(kaddr);
put_page(pr.page);
// 每 256 页 yield
if (((addr >> PAGE_SHIFT) & 0xFF) == 0)
cond_resched();
}
}
out_unlock:
mmap_read_unlock(mm);
// 5. 一次性 copy_to_user(unlock 后)
if (found > 0) {
if (copy_to_user((void __user *)req->result_buf, results,
found * sizeof(*results)))
ret = -EFAULT;
}
req->result_count = found;
mmput(mm);
vfree(vranges);
vfree(results);
return ret;
}关键设计点
- vmalloc 一整块:无 lock cycle,VMA iterator 始终有效
- sorted VMA array:~8KB fit L1,binary search ~10ns/lookup vs vma_lookup ~80ns
- untagged_addr():内核宏,自动适配 MTE/PAC/TBI + 39/48/52-bit VA
- signal_pending:每页检查,可被 SIGKILL 中断
- cond_resched:每 256 页让出 CPU(continue 之前,非死代码)
- copy_to_user 在 unlock 之后:和 do_scan 完全一致的模式
Server:server.c
CMD_PTRSCAN_RESOLVE (0xD6) — 一条命令全做
线程模型
- 客户端创建专用 TCP 连接(第 5 条)
- server accept → pthread_create → 独立线程处理
- 不影响其他 4 条连接(scan/refresh/freeze/bp)
内存预算
| 数据 | 大小 |
|---|---|
| pairs (5M × 16B) | 80MB |
| sorted VMA array (kernel 内部) | ~8KB |
| cur_level (50K × ~80B) | 4MB |
| nxt_level (50K × ~80B) | 4MB |
| visited hash set (64K slots × 8B) | 512KB |
| chains (10K × ~80B) | 0.8MB |
| 总计 | ~89MB |
BFS 关键修正
- visited hash set:push 前检查
if (visited[src]) continue,防止指数爆炸 - per-level cap 50K:防止 OOM
- module lookup binary search:排序 module_snapshot.entries by base,O(log N)
- create_module_snapshot 去掉 perm 过滤:包含 .rodata (r—) 段
- memcpy 只复制 depth×4 bytes:不是固定 64 bytes
Wire Protocol
Request:
[1 byte] 0xD6
[4 bytes] pid
[8 bytes] target_addr
[4 bytes] max_depth (1-16)
[4 bytes] max_offset (typically 4095)
[4 bytes] flags (PTRSCAN_*)
[4 bytes] max_chains (0 = 10000)
Response:
[4 bytes] status (0=ok)
[4 bytes] module_count
per module:
[4 bytes] name_len
[name_len bytes] name
[8 bytes] base_addr
[4 bytes] chain_count
per chain:
[4 bytes] depth
[2 bytes] module_index (0xFFFF = not static)
[8 bytes] base_offset (file-relative, uint64)
[depth × 4 bytes] offsets[] (int32, signed)base_offset 是 uint64(修正 uint32 截断 bug)。
BFS 伪代码(含 visited set)
#define MAX_LEVEL_WIDTH 50000
#define VISITED_SLOTS 65536
#define VISITED_MASK (VISITED_SLOTS - 1)
uint64_t visited[VISITED_SLOTS]; // open-addressing hash set
int visited_count = 0;
bool visited_check_add(uint64_t addr) {
uint32_t h = (uint32_t)(addr * 0x9E3779B97F4A7C15ULL >> 48);
for (int i = 0; i < 8; i++) { // linear probe, max 8
uint32_t slot = (h + i) & VISITED_MASK;
if (visited[slot] == addr) return true; // already seen
if (visited[slot] == 0) {
visited[slot] = addr;
visited_count++;
return false; // newly added
}
}
return false; // table full, allow through
}
// BFS main loop
for (int level = 0; level < max_depth && chain_count < max_chains; level++) {
nxt_count = 0;
for (int i = 0; i < cur_count && chain_count < max_chains; i++) {
uint64_t find = cur_level[i].addr;
uint64_t lo = (find > max_offset) ? find - max_offset : 0;
// binary search lower_bound in sorted pairs
int left = lower_bound(pairs, pair_count, lo);
for (int j = left; j < pair_count && pairs[j].target <= find; j++) {
uint64_t src = pairs[j].source;
// DEDUP
if (visited_check_add(src)) continue;
int32_t offset = (int32_t)(find - pairs[j].target);
// static check (binary search sorted modules)
int mod_idx = find_module_bsearch(snap, src);
if (mod_idx >= 0 && chain_count < max_chains) {
// emit chain
chains[chain_count].depth = level + 1;
chains[chain_count].module_idx = mod_idx;
chains[chain_count].base_offset = src - snap->entries[mod_idx].base
+ snap->entries[mod_idx].file_offset;
memcpy(chains[chain_count].offsets, cur_level[i].offsets,
level * sizeof(int32_t)); // 只复制已有的 depth 层
chains[chain_count].offsets[level] = offset;
chain_count++;
}
// push to next level
if (nxt_count < MAX_LEVEL_WIDTH) {
nxt_level[nxt_count].addr = src;
nxt_level[nxt_count].depth = level + 1;
memcpy(nxt_level[nxt_count].offsets, cur_level[i].offsets,
level * sizeof(int32_t));
nxt_level[nxt_count].offsets[level] = offset;
nxt_count++;
}
}
}
// swap
struct bfs_node *tmp = cur_level;
cur_level = nxt_level;
nxt_level = tmp;
cur_count = nxt_count;
}客户端
ce_client.py: ptrscan_resolve()
- 专用 CEClient 连接(不阻塞其他操作)
- 单次 CMD 0xD6,等待 3-5 秒
- 解析 module header + chains
- base_offset 读 8 bytes (uint64)
ptrscan_results.py: PtrScanWorker
- QThread 子类
- run() 里创建专用 CEClient → 调 ptrscan_resolve → emit finished
- UI 用 indeterminate progress bar(
setRange(0,0)) - Stop 按钮 close socket → _recv 抛 ConnectionError → 线程退出
双击 → 添加到 addr list
entry = AddrEntry(
address=0,
desc="ptrscan",
vtype="4 Bytes",
is_pointer=True,
base_addr_str=f"{mod_name}+0x{base_offset:X}", # FILE-RELATIVE offset
pointer_offsets=offsets, # BFS 原始输出,不 reverse
)
self.pointer_selected.emit(entry)base_addr_str 用 file-relative offset
"libgame.so+0x500100"其中0x500100是 file offset_parse_address已有 segment 查找:foff <= offset < foff + size → base + (offset - foff)- ASLR 后 module 加载到新地址,segment lookup 自动转换为新 runtime address
性能预估(最终修正)
| 步骤 | 耗时 |
|---|---|
| 内核收集 (500MB, sorted array binary search) | 1.5-3s |
| copy_to_user 80MB | ~50ms |
| qsort 5M entries (server) | 300-600ms |
| BFS depth 7, visited set, cap 50K/level (server) | 200-500ms |
| TCP 返回链 (~100KB) | <10ms |
| 总计 | 2-4s |
实现顺序
- shadow_ce.h: 加 struct + ioctl define
- shadow_ce.c: 实现 do_ptrscan_collect(~120 行,照 do_scan)
- server.c: 修 create_module_snapshot 去掉 perm 过滤 + 加 CMD 0xD6 handler + BFS(~250 行)
- ce_client.py: 加 ptrscan_resolve(~40 行)
- ptrscan_results.py: PtrScanWorker + 信号连接 + 双击 handler
- main_window.py: _open_ptrscan 传 host/port/pid + _add_ptr_from_scan
验证
- test_ptrscan.c: malloc A→B→C, 全局指针→A, 用 MCP 验证
- 集成测试: 真实游戏扫血量地址→指针扫描→双击添加→重启验证
Last updated on