Bug 202877

Summary: failure at fs/jbd2/commit.c:818/jbd2_journal_commit_transaction()!
Product: File System Reporter: Jungyeon (jungyeon)
Component: ext4Assignee: fs_ext4 (fs_ext4)
Status: NEW ---    
Severity: normal CC: 389387252, sandeen, tytso
Priority: P1    
Hardware: All   
OS: Linux   
Kernel Version: 5.0.0 Subsystem:
Regression: No Bisected commit-id:
Attachments: The (compressed) crafted image which causes crash
min_02.c.raw
minc_02.c

Description Jungyeon 2019-03-11 18:50:59 UTC
Created attachment 281723 [details]
The (compressed) crafted image which causes crash

- Overview
After mounting crafted image, I got this kernel panic while running attached program.
I also tried to reproduce on vm, but it only fails on lkl.

- Produces
./lkl/tools/lkl/ext4-combined -t ext4 -i tmp.img -p min_02.c.raw -v
(min_02.c shows it's internal programs)

- Messages

./lkl/tools/lkl/ext4-combined -t ext4 -i tmp.img -p min_02.c.raw -v
[ 0.000000] Linux version 5.0.0-rc6+ (jungyeon@copper) (gcc version 7.3.0 (Ubuntu 7.3.0-27ubuntu1~18.04)) #1 Fri Mar 8 14:10:53 EST 2019
[ 0.000000] memblock address range: 0x7fba88000000 - 0x7fba8ffff000
[ 0.000000] Built 1 zonelists, mobility grouping on. Total pages: 32319
[ 0.000000] Kernel command line: mem=128M virtio_mmio.device=316@0x1000000:1
[ 0.000000] Dentry cache hash table entries: 16384 (order: 5, 131072 bytes)
[ 0.000000] Inode-cache hash table entries: 8192 (order: 4, 65536 bytes)
[ 0.000000] Memory available: 129044k/131068k RAM
[ 0.000000] SLUB: HWalign=32, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
[ 0.000000] NR_IRQS: 4096
[ 0.000000] lkl: irqs initialized
[ 0.000000] clocksource: lkl: mask: 0xffffffffffffffff max_cycles: 0x1cd42e4dffb, max_idle_ns: 881590591483 ns
[ 0.000003] lkl: time and timers initialized (irq2)
[ 0.000009] pid_max: default: 4096 minimum: 301
[ 0.000056] Mount-cache hash table entries: 512 (order: 0, 4096 bytes)
[ 0.000064] Mountpoint-cache hash table entries: 512 (order: 0, 4096 bytes)
[ 0.002528] printk: console [lkl_console0] enabled
[ 0.002556] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
[ 0.003919] clocksource: Switched to clocksource lkl
[ 0.004137] virtio-mmio: Registering device virtio-mmio.0 at 0x1000000-0x100013b, IRQ 1.
[ 0.004390] workingset: timestamp_bits=62 max_order=15 bucket_order=0
[ 0.011219] virtio-mmio virtio-mmio.0: Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.
[ 0.011376] virtio_blk virtio0: [vda] 32768 512-byte logical blocks (16.8 MB/16.0 MiB)
[ 0.011809] random: get_random_bytes called from init_oops_id+0x35/0x40 with crng_init=0
[ 0.011975] Warning: unable to open an initial console.
[ 0.011999] This architecture does not have kernel memory protection.
[ 0.012003] Run /init as init process
[ 0.012966] EXT4-fs (vda): barriers disabled
[ 0.013321] [EXT4 FS bs=1024, gc=2, bpg=8192, ipg=2048, mo=e000c42c, mo2=0002]
[ 0.013338] System zones: 1-2, 66-581, 8193-8194
[ 0.013481] EXT4-fs (vda): mounting with "discard" option, but the device does not support discard
[ 0.013492] EXT4-fs (vda): mounted filesystem with journalled data mode. Opts: errors=remount-ro
v13 = syscall(SYS_open, (long)v2, 65536, 0);
syscall(SYS_getdents64, (long)v13, (long)v1, 2344);
syscall(SYS_fsync, (long)v13);
syscall(SYS_fsync, (long)v13);
syscall(SYS_readlink, (long)v10, (long)v1, 8192);
v15 = syscall(SYS_open, (long)v14, 66, 438);
syscall(SYS_write, (long)v15, (long)v1, 2229);
syscall(SYS_write, (long)v15, (long)v1, 3563);
syscall(SYS_ftruncate, (long)v15, 7336);
syscall(SYS_getdents64, (long)v13, (long)v1, 4633);
syscall(SYS_mkdir, (long)v16, 511);
syscall(SYS_fsync, (long)v13);
syscall(SYS_fsync, (long)v15);
syscall(SYS_unlink, (long)v8);
syscall(SYS_write, (long)v15, (long)v1, 7178);
syscall(SYS_readlink, (long)v14, (long)v1, 8192);
syscall(SYS_utimes, (long)v11, (long)v1);
syscall(SYS_ftruncate, (long)v15, 4018);
syscall(SYS_utimes, (long)v10, (long)v1);
syscall(SYS_ftruncate, (long)v15, 6005);
syscall(SYS_fsync, (long)v15);
syscall(SYS_rmdir, (long)v12);
syscall(SYS_pwrite64, (long)v15, (long)v1, 8027, 643);
syscall(SYS_getdents64, (long)v13, (long)v1, 3796);
syscall(SYS_mkdir, (long)v17, 511);
syscall(SYS_removexattr, (long)v3, (long)v18);
syscall(SYS_ftruncate, (long)v15, 53);
syscall(SYS_listxattr, (long)v5, (long)v1, 4138);
syscall(SYS_pwrite64, (long)v15, (long)v1, 7728, 1584);
syscall(SYS_fsync, (long)v15);
syscall(SYS_fsync, (long)v15);
syscall(SYS_write, (long)v15, (long)v1, 5164);
syscall(SYS_unlink, (long)v14);
syscall(SYS_write, (long)v15, (long)v1, 1752);
syscall(SYS_getdents64, (long)v13, (long)v1, 1582);
syscall(SYS_pwrite64, (long)v15, (long)v1, 5142, 5178);
syscall(SYS_removexattr, (long)v16, (long)v19);
v20 = syscall(SYS_open, (long)v3, 65536, 0);
syscall(SYS_fsync, (long)v15);
syscall(SYS_symlink, (long)v5, (long)v21);
syscall(SYS_link, (long)v10, (long)v22);
v23 = syscall(SYS_open, (long)v7, 2, 0);
syscall(SYS_ftruncate, (long)v15, 2545);
syscall(SYS_write, (long)v23, (long)v1, 901);
syscall(SYS_fdatasync, (long)v23);
syscall(SYS_link, (long)v10, (long)v24);
syscall(SYS_symlink, (long)v9, (long)v25);
syscall(SYS_fsync, (long)v15);
syscall(SYS_mkdir, (long)v26, 511);
[ 0.026492] random: fast init done
syscall(SYS_fdatasync, (long)v23);
syscall(SYS_write, (long)v23, (long)v1, 969);
syscall(SYS_readlink, (long)v2, (long)v1, 8192);
syscall(SYS_chmod, (long)v25, 3072);
syscall(SYS_fdatasync, (long)v23);
syscall(SYS_pwrite64, (long)v23, (long)v1, 1520, 1423);
syscall(SYS_fallocate, (long)v15, 65, 5353, 6797);
syscall(SYS_fsync, (long)v23);
syscall(SYS_listxattr, (long)v22, (long)v1, 1808);
syscall(SYS_pwrite64, (long)v23, (long)v1, 4742, 7814);
syscall(SYS_newlstat, (long)v21, (long)v1);
syscall(SYS_fsync, (long)v20);
syscall(SYS_write, (long)v15, (long)v1, 7312);
syscall(SYS_unlink, (long)v21);
syscall(SYS_fallocate, (long)v15, 1, 2243, 3657);
syscall(SYS_pread64, (long)v15, (long)v0, 3050, 4738);
syscall(SYS_fsync, (long)v15);
syscall(SYS_mkdir, (long)v27, 511);
syscall(SYS_fsync, (long)v20);
syscall(SYS_unlink, (long)v10);
syscall(SYS_access, (long)v27, 4);
syscall(SYS_rmdir, (long)v17);
syscall(SYS_pread64, (long)v15, (long)v0, 1206, 6444);
syscall(SYS_fdatasync, (long)v23);
syscall(SYS_rename, (long)v22, (long)v22);
syscall(SYS_fsync, (long)v15);
syscall(SYS_fsync, (long)v20);
syscall(SYS_pwrite64, (long)v15, (long)v1, 1720, 2884);
syscall(SYS_write, (long)v23, (long)v1, 3129);
syscall(SYS_access, (long)v9, 0);
syscall(SYS_write, (long)v15, (long)v1, 6250);
syscall(SYS_readlink, (long)v25, (long)v1, 8192);
[ 0.030600] JBD2: Spotted dirty metadata buffer (dev = vda, blocknr = 8541). There's a risk of filesystem corruption in case of system crash.
[ 0.030742] BUG: failure at fs/jbd2/commit.c:818/jbd2_journal_commit_transaction()!
[ 0.030754] Kernel panic - not syncing: BUG!
[ 0.030766] ---[ end Kernel panic - not syncing: BUG! ]---
ext4-combined: lib/posix-host.c:302: panic: Assertion `0' failed.
Aborted (core dumped)
Comment 1 Jungyeon 2019-03-11 18:59:30 UTC
Created attachment 281731 [details]
min_02.c.raw
Comment 2 Jungyeon 2019-03-11 18:59:49 UTC
Created attachment 281733 [details]
minc_02.c
Comment 3 Jungyeon 2019-03-13 16:25:13 UTC
https://gts3.org/~jungyeon/ext4-combined

at the link above, I uploaded the executable file required for this test
Comment 4 Eric Sandeen 2019-03-13 17:29:22 UTC
I'm not eager to download and run a random 16MB binary blob.  What is this "ext4-combined" utility?
Comment 5 Jungyeon 2019-03-14 18:47:10 UTC
Sorry for my lack of explanation. 
LKL is Linux Kernel Library. min_02.c is a program that calls lists of system calls in userspace and the craft image is a potentially faulty image to test error cases.

We are going to release our source code so that you can build the ext4-combined shortly. We needs some clean-up inside the codes before making it public.

I'm attaching stack dump at the last. I hope this could help you.
Additionally I used Linux version 5.0.0+ for this trace (and in the linked ext4-combined binary)

[    0.050478] BUG: failure at fs/jbd2/commit.c:818/jbd2_journal_commit_transaction()!
[    0.050491] Kernel panic - not syncing: BUG!
[    0.050497] Call Trace:
[    0.050506] (____ptrval____):  [<55555559bc94>] .LC81+0x5f/0xfb
[    0.050514] (____ptrval____):  [<5555555c6025>] major_names+0x75/0x80
[    0.050523] (____ptrval____):  [<555555604368>] kmem_cache_free+0x148/0x190
[    0.050533] (____ptrval____):  [<5555556aabff>] free_buffer_head+0x7f/0x190
[    0.050541] (____ptrval____):  [<5555555978f4>] .LC11+0x14/0x20
[    0.050551] (____ptrval____):  [<5555557f08db>] jbd2_journal_commit_transaction+0x398b/0x3fc0
[    0.050559] (____ptrval____):  [<555555597353>] .LC18+0x3/0x10
[    0.050568] (____ptrval____):  [<5555555b8fb9>] try_to_wake_up+0x169/0x190
[    0.050577] (____ptrval____):  [<5555558031be>] kjournald2+0x34e/0x400
[    0.050586] (____ptrval____):  [<5555555bfd30>] autoremove_wake_function+0x0/0x40
[    0.050593] (____ptrval____):  [<5555555978f4>] .LC11+0x14/0x20
[    0.050601] (____ptrval____):  [<5555555b3acb>] kthread+0x15b/0x170
[    0.050608] (____ptrval____):  [<555555802e70>] kjournald2+0x0/0x400
[    0.050615] (____ptrval____):  [<5555555b3970>] kthread+0x0/0x170
[    0.050623] (____ptrval____):  [<5555555970ab>] uidhash_table+0x3b/0x40
[    0.050630] 
[    0.050642] ---[ end Kernel panic - not syncing: BUG! ]---
Comment 6 Theodore Tso 2019-03-25 04:01:42 UTC
This looks like another case where the allocation bitmap has been corrupted exposing blocks that belong to the journal inode from getting reallocated.

Can you try two experiments?

1)  See if patching the image file as follows:

debugfs -w tmp.img
debugfs: setb 8258 1024
   <will spew a lot of messages of the form:
          setb: Warning: block NNN already set
    that's fine, ignore them.>
debugfs: quit

Causes your repro to no longer trigger a crash of your LKL binary.
(I guess I could do that, but it's late and I'm too lazy to set up a proper contained sandbox environment where I'd feel safe downloading a random 16MB binary executable from an unknown source and running it; so it's much more convenient if you could run this experiment for me.)

2)  Take the possible-fix patch which I just attached to kernel bugzilla #202879
 and see if it intercepts the failure and declares the file system corrupt such that the stack trace leading to the crash of your LKL binary no longer happens.   (This is something I *can't* do, since the sources and instructions for taking a patched version of the kernel and building a new LKL binary have not yet been made available.   So if you could also run this experiment and let me know the results, I would be much obliged.)

Thanks!!