漏洞成因

  • pipe维护了一个struct pipe_buffer的数组,每个pipe_buffer指向一个page,page里存的就是pipe的数据
  • 正常情况下,往pipe里写数据时会申请一个page,把数据拷贝到page里后再让pipe_buffer指向这个page。splice系统调用实现了一种零拷贝的技术,直接让pipe_buffer指向这个原始的数据page,这样就省去了内存拷贝的过程,提升效率
  • 往pipe里写数据时不可能每次都正好是page_size的整数倍,如果每次写数据都要重新分配一个新的page来存,必然会造成空间的浪费。但是如果pipe_buffer的PIPE_BUF_FLAG_CAN_MERGE标志被置位,且上一个page的剩余空间足够,数据就会接着上一次的数据在同一个page中写入,而不是申请新的page,减少了空间的浪费
  • 但是splice在给pipe_buffer赋值时没有初始化flags,这就造成之前被置位的PIPE_BUF_FLAG_CAN_MERGE标志不会被清除,所以只要先让所有的pipe_buffer的PIPE_BUF_FLAG_CAN_MERGE标志被置位,然后调用splice让pipe_buffer指向目标文件page cache,这时再向pipe里写数据就会直接修改page cache里的内容,造成任意文件覆盖漏洞

源码分析

以下源码来自Linux5.8.1

pipe

关键数据结构

pipe_inode_info

/**
 *  struct pipe_inode_info - a linux kernel pipe
 *  @mutex: mutex protecting the whole thing
 *  @rd_wait: reader wait point in case of empty pipe
 *  @wr_wait: writer wait point in case of full pipe
 *  @head: The point of buffer production (pipe_write masks it with
 *         @ring_size - 1 to select a slot in @bufs, then increments it)
 *  @tail: The point of buffer consumption
 *  @note_loss: The next read() should insert a data-lost message
 *              (only present with CONFIG_WATCH_QUEUE)
 *  @max_usage: The maximum number of slots that may be used in the ring
 *  @ring_size: total number of buffers (should be a power of 2)
 *  @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
 *  @tmp_page: cached released page (pipe_write takes the page for a new
 *             buffer from here, allocating one first if it is NULL)
 *  @readers: number of current readers of this pipe
 *  @writers: number of current writers of this pipe
 *  @files: number of struct file referring this pipe (protected by ->i_lock)
 *  @r_counter: reader counter
 *  @w_counter: writer counter
 *  @fasync_readers: reader side fasync
 *  @fasync_writers: writer side fasync
 *  @bufs: the circular array of pipe buffers
 *  @user: the user who created this pipe
 *  @watch_queue: If this pipe is a watch_queue, this is the stuff for that
 *                (only present with CONFIG_WATCH_QUEUE)
 **/
struct pipe_inode_info {
    struct mutex mutex;
    wait_queue_head_t rd_wait, wr_wait;
    unsigned int head;
    unsigned int tail;
    unsigned int max_usage;
    unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
    bool note_loss;
#endif
    unsigned int nr_accounted;
    unsigned int readers;
    unsigned int writers;
    unsigned int files;
    unsigned int r_counter;
    unsigned int w_counter;
    struct page *tmp_page;
    struct fasync_struct *fasync_readers;
    struct fasync_struct *fasync_writers;
    struct pipe_buffer *bufs;
    struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
    struct watch_queue *watch_queue;
#endif
};

其中bufs是一个struct pipe_buffer的数组,默认数量为16,每个pipe_buffer能存储一个page的数据。这16个page组成一个环形缓冲区,用来存储管道里的数据。

pipe_buffer

/**
 *  struct pipe_buffer - a linux kernel pipe buffer
 *  @page: the page containing the data for the pipe buffer
 *  @offset: offset of data inside the @page
 *  @len: length of data inside the @page
 *  @ops: operations associated with this buffer. See @pipe_buf_operations.
 *        (this file assigns &anon_pipe_buf_ops in pipe_write and
 *        &page_cache_pipe_buf_ops in copy_page_to_iter_pipe)
 *  @flags: pipe buffer flags. The flag definitions are not part of this
 *          excerpt; pipe_write stores either PIPE_BUF_FLAG_PACKET or
 *          PIPE_BUF_FLAG_CAN_MERGE here and tests PIPE_BUF_FLAG_CAN_MERGE
 *          before appending to an existing buffer's page.
 *  @private: private data owned by the ops.
 **/
struct pipe_buffer {
    struct page *page;
    unsigned int offset, len;
    const struct pipe_buf_operations *ops;
    unsigned int flags;
    unsigned long private;
};

PIPE_BUF_FLAG_CAN_MERGE就包含在flags字段中,它决定后续写入pipe的数据能否直接追加到page指向的内存页中

写pipe

调用write向pipe里写数据时会经过层层调用,最终实际调用pipe_write

/*
 * pipe_write - write the data described by @from into the pipe behind @iocb.
 *
 * @iocb: I/O control block; iocb->ki_filp->private_data is the pipe.
 * @from: iterator over the data to write.
 *
 * A zero-length write returns 0 immediately. Otherwise the function first
 * tries to append the sub-page remainder of the write to the page of the
 * most recently produced buffer (only when that buffer has
 * PIPE_BUF_FLAG_CAN_MERGE set and the data fits), then fills new ring slots
 * one page at a time.
 *
 * NOTE: this is an excerpt; the tail of the loop, the "out" label and the
 * final return are elided ("......") in the source shown here.
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
    struct file *filp = iocb->ki_filp;
    struct pipe_inode_info *pipe = filp->private_data;
    unsigned int head;
    ssize_t ret = 0;
    size_t total_len = iov_iter_count(from);
    ssize_t chars;
    bool was_empty = false;
    bool wake_next_writer = false;

    /* Null write succeeds. */
    if (unlikely(total_len == 0))
        return 0;

    __pipe_lock(pipe);

    // No readers: raise SIGPIPE and fail with -EPIPE
    if (!pipe->readers) {
        send_sig(SIGPIPE, current, 0);
        ret = -EPIPE;
        goto out;
    }

#ifdef CONFIG_WATCH_QUEUE
    if (pipe->watch_queue) {
        ret = -EXDEV;
        goto out;
    }
#endif

    /*
     * Only wake up if the pipe started out empty, since
     * otherwise there should be no readers waiting.
     *
     * If it wasn't empty we try to merge new data into
     * the last buffer.
     *
     * That naturally merges small writes, but it also
     * page-aligns the rest of the writes for large writes
     * spanning multiple pages.
     */
    head = pipe->head;
    was_empty = pipe_empty(head, pipe->tail);
    chars = total_len & (PAGE_SIZE-1); // write size modulo the page size
    // Only when there is a sub-page remainder and the pipe is not empty
    if (chars && !was_empty) {
        unsigned int mask = pipe->ring_size - 1;
        // The buffer just before head: try to merge the remainder with the data already there
        struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; 
        int offset = buf->offset + buf->len;

        // PIPE_BUF_FLAG_CAN_MERGE is set and the page still has room for chars bytes
        if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
            offset + chars <= PAGE_SIZE) {
            ret = pipe_buf_confirm(pipe, buf);
            if (ret)
                goto out;

            // Copy chars bytes into the existing buffer's page, right after its current data
            ret = copy_page_from_iter(buf->page, offset, chars, from);
            if (unlikely(ret < chars)) {
                ret = -EFAULT;
                goto out;
            }

            buf->len += ret;
            // Nothing left to write: done
            if (!iov_iter_count(from))
                goto out;
        }
    }

    for (;;) {
        // Make sure there is still at least one reader
        if (!pipe->readers) {
            send_sig(SIGPIPE, current, 0);
            if (!ret)
                ret = -EPIPE;
            break;
        }

        head = pipe->head;
        // The pipe is not full
        if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
            unsigned int mask = pipe->ring_size - 1;
            struct pipe_buffer *buf = &pipe->bufs[head & mask];
            struct page *page = pipe->tmp_page; // cached spare page, if any
            int copied;

            // No cached tmp_page yet: allocate one with alloc_page and cache it
            if (!page) {
                page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
                if (unlikely(!page)) {
                    ret = ret ? : -ENOMEM;
                    break;
                }
                pipe->tmp_page = page;
            }

            /* Allocate a slot in the ring in advance and attach an
             * empty buffer.  If we fault or otherwise fail to use
             * it, either the reader will consume it or it'll still
             * be there for the next write.
             */
            // Take the spinlock of the reader wait queue
            spin_lock_irq(&pipe->rd_wait.lock);

            head = pipe->head;
            // The pipe became full in the meantime: go around the loop again
            if (pipe_full(head, pipe->tail, pipe->max_usage)) {
                spin_unlock_irq(&pipe->rd_wait.lock);
                continue;
            }

            // Advance head first to claim the slot
            pipe->head = head + 1;
            spin_unlock_irq(&pipe->rd_wait.lock);// release the spinlock

            /* Insert it into the buffer array */
            buf = &pipe->bufs[head & mask];
            buf->page = page; // hand the page taken from tmp_page to the buffer
            buf->ops = &anon_pipe_buf_ops;
            buf->offset = 0;
            buf->len = 0;
            // If the pipe was created without O_DIRECT, flags is set to PIPE_BUF_FLAG_CAN_MERGE,
            // so creating the pipe with no flags is enough to get PIPE_BUF_FLAG_CAN_MERGE set on its buffers
            if (is_packetized(filp))
                buf->flags = PIPE_BUF_FLAG_PACKET;
            else
                buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
            pipe->tmp_page = NULL; // clear tmp_page; the page now sits in the ring

            // Copy up to one page of data into the page
            copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
            if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
                if (!ret)
                    ret = -EFAULT;
                break;
            }
            ret += copied;
            buf->offset = 0;
            buf->len = copied;

            if (!iov_iter_count(from))
                break;
        }
        /* (remainder of the function is elided in this excerpt) */
        ......
}

splice

splice系统调用主要由do_splice函数完成,do_splice根据输入的文件描述符进入不同的分支,在本次漏洞利用中因为in是普通文件,out是pipe,所以会进入if (opipe)这个分支

/*
 * Determine where to splice to/from.
 *
 * @in/@off_in:   source file and optional user pointer to its offset.
 * @out/@off_out: destination file and optional user pointer to its offset.
 * @len:          maximum number of bytes to move.
 * @flags:        SPLICE_F_* flags; SPLICE_F_NONBLOCK is added when @out is
 *                O_NONBLOCK in the file-to-pipe branch.
 *
 * Visible return values: -EBADF if @in is not readable or @out is not
 * writable; in the "only out is a pipe" branch -ESPIPE if @off_out is given,
 * -EINVAL if @off_in is given but @in lacks FMODE_PREAD, -EFAULT if copying
 * the offset from/to user space fails, otherwise the result of
 * wait_for_space()/do_splice_to(); -EINVAL if neither side is a pipe.
 *
 * NOTE: the pipe-to-pipe and pipe-to-file branches are elided ("......")
 * in this excerpt.
 */
long do_splice(struct file *in, loff_t __user *off_in,
        struct file *out, loff_t __user *off_out,
        size_t len, unsigned int flags)
{
    struct pipe_inode_info *ipipe;
    struct pipe_inode_info *opipe;
    loff_t offset;
    long ret;

    if (unlikely(!(in->f_mode & FMODE_READ) ||
             !(out->f_mode & FMODE_WRITE)))
        return -EBADF;

    ipipe = get_pipe_info(in, true);
    opipe = get_pipe_info(out, true);

    // Both in and out are pipes
    if (ipipe && opipe) {
        ......
    }

    // Only in is a pipe
    if (ipipe) {
        ......
    }
    // Only out is a pipe
    if (opipe) {
        // Sort out the in/out offsets
        if (off_out)
            return -ESPIPE;
        if (off_in) {
            if (!(in->f_mode & FMODE_PREAD))
                return -EINVAL;
            if (copy_from_user(&offset, off_in, sizeof(loff_t)))
                return -EFAULT;
        } else {
            offset = in->f_pos;
        }

        if (out->f_flags & O_NONBLOCK)
            flags |= SPLICE_F_NONBLOCK;

        pipe_lock(opipe);
        // Wait until the pipe has a usable buffer
        ret = wait_for_space(opipe, flags);
        if (!ret) {
            unsigned int p_space;

            /* Don't try to read more the pipe has space for. */
            p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail);// free slots in the pipe
            len = min_t(size_t, len, p_space << PAGE_SHIFT);// never read more than the pipe has room for

            ret = do_splice_to(in, &offset, opipe, len, flags); // do_splice_to does the main work
        }
        pipe_unlock(opipe);
        if (ret > 0)
            wakeup_pipe_readers(opipe);
        if (!off_in)
            in->f_pos = offset;
        else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
            ret = -EFAULT;

        return ret;
    }

    return -EINVAL;
}

在do_splice_to中又调用了输入文件的splice_read函数,之后又经过一系列的调用,最终由copy_page_to_iter_pipe完成关联page cache和pipe缓冲区的工作

tatic size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
             struct iov_iter *i)
{
    struct pipe_inode_info *pipe = i->pipe;
    struct pipe_buffer *buf;
    unsigned int p_tail = pipe->tail;
    unsigned int p_mask = pipe->ring_size - 1;
    unsigned int i_head = i->head;
    size_t off;

    if (unlikely(bytes > i->count))
        bytes = i->count;

    if (unlikely(!bytes))
        return 0;

    if (!sanity(i))
        return 0;

    off = i->iov_offset;
    buf = &pipe->bufs[i_head & p_mask];
    if (off) {
        // 如果要求的offset和实际的offset相同,且头部的buffer指向的就是当前的page cache
        // 则直接移动offset即可
        if (offset == off && buf->page == page) {
            /* merge with the last one */
            buf->len += bytes;
            i->iov_offset += bytes;
            goto out;
        }
        i_head++;
        buf = &pipe->bufs[i_head & p_mask];
    }
    if (pipe_full(i_head, p_tail, pipe->max_usage))
        return 0;

    buf->ops = &page_cache_pipe_buf_ops;
    // 增加page的应用计数
    get_page(page);
    // 将pipe缓冲区的page指针指向文件的page cache
    buf->page = page;
    buf->offset = offset;
    buf->len = bytes;

    pipe->head = i_head + 1;
    i->iov_offset = offset + bytes;
    i->head = i_head;
out:
    i->count -= bytes;
    return bytes;
}

可以看到copy_page_to_iter_pipe函数直接将page cache赋值给对应buffer的page指针,而没有对buffer的flags做初始化,使得之前被设置的PIPE_BUF_FLAG_CAN_MERGE仍然有效

能真正地覆盖文件内容吗

由上面的漏洞分析可知,最终完成的是对page cache的覆写,而不是覆盖磁盘上的文件内容。当文件的page cache存在时,之后读取该文件都将直接从page cache中获取,所以只要该page cache存在,就相当于覆盖了文件内容。经测试,重启系统后page cache就会消失,此时再读取文件将会得到原文件内容。

但是page cache不是有writeback机制吗,只要触发该机制是不是就能将覆写后的page cache写回磁盘呢?

为了验证这个问题,我调用sync来手动触发writeback

观察程序输出结果发现,调用sync之后读取文件内容仍然是篡改过后的内容,看起来sync似乎真的把page cache里的内容写回到了磁盘里

但当我重启系统之后发现文件内容又复原了,说明sync既没有把page cache写回到磁盘,也没有清除缓存中的内容,相当于直接忽略了这个被篡改过的page,这是为什么呢?

经过调试发现,在向普通文件写入数据时,调用的是generic_file_write_iter函数

经过如下图所示的调用,最终会调用set_page_dirty函数将该page置为dirty状态,所以最终会被writeback机制写回到磁盘中

正如源码分析中所说的,向pipe中写入数据时调用的是pipe_write,这时我给set_page_dirty函数设置断点发现,程序之后都没有调用这个函数,这点从源码中也可以证明。

这说明当我们利用漏洞修改page cache中的内容时,系统并没有将对应的page设置为dirty,所以这个修改对writeback机制来说是不可见的,自然会被忽略掉。

那为什么重启系统文件内容又会恢复呢?那是因为重启系统将所有的缓存都回收了,执行echo 1 > /proc/sys/vm/drop_caches命令能手动回收缓存,也能将文件内容恢复

参考资料

The Dirty Pipe Vulnerability

CVE-2022-0847-DirtyPipe-Exploit

Linux5.8.1源码

CVE-2022-0847 漏洞分析

VFS源码分析-Page Cache Writeback机制

点击收藏 | 0 关注 | 1
  • 动动手指,沙发就是你的了!
登录 后跟帖