On ARM server,we encounted a hardlockup problem baseed on sdei watchdog,watchdog period is 10s [ 6149.965863] CPU: 126 PID: 0 Comm: swapper/126 Kdump: loaded Tainted: G W 5.10.23-007.ali5000.alios7.aarch64 #1 [ 6149.977151] Hardware name: AlibabaCloud AliServer-Xuanwu2.0AM-02-2UC1P-5B/M1 EVB, BIOS 1.2.M1.AL.E.115.03 11/27/2021 [ 6149.987656] pstate: 604003c9 (nZCv DAIF +PAN -UAO -TCO BTYPE=--) [ 6149.993649] pc : machine_kexec+0x44/0x160 [ 6149.997645] lr : machine_kexec+0x44/0x160 [ 6150.001640] sp : ffff8000250f3c50 [ 6150.004940] x29: ffff8000250f3c50 x28: ffff0400064de300 [ 6150.010238] x27: 0000000000000000 x26: ffff800010b137f8 [ 6150.015535] x25: ffff800010010800 x24: 0000000000000008 [ 6150.020833] x23: ffff8000250f3cd8 x22: ffff8000119cd000 [ 6150.026130] x21: 0000000000000000 x20: ffff00080bc72c00 [ 6150.031428] x19: ffff00080bc72c00 x18: 0000000000000030 [ 6150.036725] x17: 0000000000000000 x16: 0000000000000000 [ 6150.042022] x15: ffff0400064de958 x14: ffffffffffffffff [ 6150.047320] x13: ffff8000a50f39a7 x12: ffff8000250f39af [ 6150.052617] x11: ffff80001171bcb8 x10: ffff800011590cb8 [ 6150.057914] x9 : ffff8000100da340 x8 : 0000000000055d6d [ 6150.063212] x7 : 800000010000cd40 x6 : ffff8000118c0e87 [ 6150.068509] x5 : ffff04473bbca048 x4 : ffff8000250f3ab0 [ 6150.073806] x3 : 0000000000000001 x2 : ffff04473bbca050 [ 6150.079104] x1 : 0000000000000000 x0 : 0000000000000000 [ 6150.084401] Call trace: [ 6150.086834] machine_kexec+0x44/0x160 [ 6150.090481] __crash_kexec+0x7c/0x12c [ 6150.094131] panic+0x198/0x368 [ 6150.097172] add_taint+0x0/0xac [ 6150.100300] watchdog_hardlockup_check+0xd4/0x160 [ 6150.104990] sdei_watchdog_callback+0x18/0x24 [ 6150.109333] sdei_event_handler+0x3c/0xb0 [ 6150.113328] _sdei_handler+0x88/0x150 [ 6150.116977] __sdei_handler+0x24/0x50 [ 6150.120625] __sdei_asm_handler+0xbc/0x150 [ 6150.124707] __cpu_do_idle+0x8/0x10 [ 6150.128182] arch_cpu_idle+0xc/0x3c [ 6150.131657] default_idle_call+0x40/0x190 [ 6150.135653] cpuidle_idle_call+0x160/0x1d4 [ 6150.139735] do_idle+0xb0/0x11c [ 6150.142862] cpu_startup_entry+0x28/0xa0 [ 6150.146770] secondary_start_kernel+0x104/0x170 [ 6150.151287] ---[ end trace aa3574da538122bc ]--- [ 6150.155891] Bye!
/* watchdog detector functions */ bool is_hardlockup(void) { unsigned long hrint = __this_cpu_read(hrtimer_interrupts); if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) return true; __this_cpu_write(hrtimer_interrupts_saved, hrint); return false; } ----------------- crash> p hrtimer_interrupts:126 per_cpu(hrtimer_interrupts, 126) = $2 = 1524 crash> p hrtimer_interrupts_saved:126 per_cpu(hrtimer_interrupts_saved, 126) = $3 = 1524 crash> watchdog_thresh watchdog_thresh = $4 = 10 10s period, not arch timer interrupt occured. crash> sys config | grep HZ CONFIG_NO_HZ_COMMON=y # CONFIG_HZ_PERIODIC is not set # CONFIG_NO_HZ_IDLE is not set CONFIG_NO_HZ_FULL=y CONFIG_NO_HZ=y # CONFIG_HZ_100 is not set CONFIG_HZ_250=y # CONFIG_HZ_300 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=250 nohz=off is configured,theoretically,every 4ms an timer interrupt occured. grub info: BOOT_IMAGE=/vmlinuz-5.10.23-007.ali5000.alios7.aarch64 root=/dev/vda3 ro crashkernel=512M vconsole.font=latarcyrheb-sun16 vconsole.keymap=us biosdevname=0 console=tty0 scsi_mod.scan=sync pci=pcie_bus_perf nohz=off console=ttyS0,115200 virtio_ring.vring_force_dma_api=1 module_blacklist=ast,hns3 ast.ast_modeset=0 cryptomgr.notests cma_pernuma=64M mpam=acpi console=ttyAMA0,115200 initcall_debug pci=earlydump earlycon=pl011,0x2A400000,115200 module_blacklist=acc
crash> p watchdog_hrtimer:126 per_cpu(watchdog_hrtimer, 126) = $5 = { node = { node = { __rb_parent_color = 18446603337777921265, rb_right = 0x0, rb_left = 0x0 }, expires = 6110016166520 }, _softexpires = 6110016166520, function = 0xffff80001017aa50 <watchdog_timer_fn>, base = 0xffff04473bbcc780, state = 1 '\001', is_rel = 0 '\000', is_soft = 0 '\000', is_hard = 1 '\001' }
crash> p tick_cpu_sched:126 per_cpu(tick_cpu_sched, 126) = $1 = { sched_timer = { node = { node = { __rb_parent_color = 18446603337117384112, rb_right = 0x0, rb_left = 0x0 }, expires = 6108564000000 }, _softexpires = 6108564000000, function = 0xffff800010122ec0 <tick_sched_timer>, base = 0xffff04473bbcc780, state = 1 '\001', is_rel = 0 '\000', is_soft = 0 '\000', is_hard = 1 '\001' }, check_clocks = 0, nohz_mode = NOHZ_MODE_INACTIVE, inidle = 1, tick_stopped = 0, idle_active = 1, do_timer_last = 0, got_idle_tick = 1, last_tick = 0, next_tick = 0, idle_jiffies = 0, idle_calls = 0, idle_sleeps = 0, idle_entrytime = 5012087709249, idle_waketime = 0, idle_exittime = 0, idle_sleeptime = 4936136669951, iowait_sleeptime = -1942739704, last_jiffies = 0, timer_expires = 0, timer_expires_base = 0, next_timer = 0, idle_expires = 0, tick_dep_mask = { counter = 0 } }
(In reply to Liguang from comment #3) > crash> p tick_cpu_sched:126 > per_cpu(tick_cpu_sched, 126) = $1 = { > sched_timer = { > node = { > node = { > __rb_parent_color = 18446603337117384112, > rb_right = 0x0, > rb_left = 0x0 > }, > expires = 6108564000000 > }, > _softexpires = 6108564000000, > function = 0xffff800010122ec0 <tick_sched_timer>, > base = 0xffff04473bbcc780, > state = 1 '\001', > is_rel = 0 '\000', > is_soft = 0 '\000', > is_hard = 1 '\001' > }, > check_clocks = 0, > nohz_mode = NOHZ_MODE_INACTIVE, > inidle = 1, > tick_stopped = 0, > idle_active = 1, > do_timer_last = 0, > got_idle_tick = 1, > last_tick = 0, > next_tick = 0, > idle_jiffies = 0, > idle_calls = 0, > idle_sleeps = 0, > idle_entrytime = 5012087709249, > idle_waketime = 0, > idle_exittime = 0, > idle_sleeptime = 4936136669951, > iowait_sleeptime = -1942739704, > last_jiffies = 0, > timer_expires = 0, > timer_expires_base = 0, > next_timer = 0, > idle_expires = 0, > tick_dep_mask = { > counter = 0 > } > } iowait_sleeptime = -1942739704 why iowait_sleeptime is a negative value?
nohz=off was configured in the startup parameters crash> tick_nohz_enabled tick_nohz_enabled = $1 = false crash> tick_nohz_active tick_nohz_active = $2 = 0 and current no cpufreq driver was used: crash> cpufreq_driver cpufreq_driver = $4 = (struct cpufreq_driver *) 0x0
my machine is 128 core, three cores have the same problem: crash> p tick_cpu_sched:a | grep iowait_sleeptime | grep "\-" iowait_sleeptime = -422390319, iowait_sleeptime = -365884628719, iowait_sleeptime = -1942739704,
(In reply to Liguang from comment #6) > my machine is 128 core, three cores have the same problem: > crash> p tick_cpu_sched:a | grep iowait_sleeptime | grep "\-" > iowait_sleeptime = -422390319, > iowait_sleeptime = -365884628719, > iowait_sleeptime = -1942739704, crash> p tick_cpu_sched:59 per_cpu(tick_cpu_sched, 59) = $331 = { sched_timer = { node = { node = { __rb_parent_color = 18446603337735190528, rb_right = 0x0, rb_left = 0x0 }, expires = 6108564000000 }, _softexpires = 6108564000000, function = 0xffff800010122ec0 <tick_sched_timer>, base = 0xffff00477bb78780, state = 1 '\001', is_rel = 0 '\000', is_soft = 0 '\000', is_hard = 1 '\001' }, check_clocks = 0, nohz_mode = NOHZ_MODE_INACTIVE, inidle = 1, tick_stopped = 0, idle_active = 1, do_timer_last = 0, got_idle_tick = 1, last_tick = 0, next_tick = 0, idle_jiffies = 0, idle_calls = 0, idle_sleeps = 0, idle_entrytime = 5009052372788, idle_waketime = 0, idle_exittime = 0, idle_sleeptime = 4925948942621, iowait_sleeptime = -422390319, last_jiffies = 0, timer_expires = 0, timer_expires_base = 0, next_timer = 0, idle_expires = 0, tick_dep_mask = { counter = 0 } } crash> p tick_cpu_sched:102 per_cpu(tick_cpu_sched, 102) = $332 = { sched_timer = { node = { node = { __rb_parent_color = 18446603336923004336, rb_right = 0x0, rb_left = 0x0 }, expires = 6108564000000 }, _softexpires = 6108564000000, function = 0xffff800010122ec0 <tick_sched_timer>, base = 0xffff04473b92c780, state = 1 '\001', is_rel = 0 '\000', is_soft = 0 '\000', is_hard = 1 '\001' }, check_clocks = 0, nohz_mode = NOHZ_MODE_INACTIVE, inidle = 1, tick_stopped = 0, idle_active = 1, do_timer_last = 0, got_idle_tick = 1, last_tick = 0, next_tick = 0, idle_jiffies = 0, idle_calls = 0, idle_sleeps = 0, idle_entrytime = 5009052372748, idle_waketime = 0, idle_exittime = 0, idle_sleeptime = 5281350974871, iowait_sleeptime = -365884628719, last_jiffies = 0, timer_expires = 0, timer_expires_base = 0, next_timer = 0, idle_expires = 0, tick_dep_mask = { counter = 0 } } crash> p tick_cpu_sched:126 per_cpu(tick_cpu_sched, 126) = $333 = { sched_timer = { node = { node = { __rb_parent_color = 18446603337117384112, rb_right = 0x0, rb_left = 0x0 }, expires = 6108564000000 }, _softexpires = 6108564000000, function = 0xffff800010122ec0 <tick_sched_timer>, base = 0xffff04473bbcc780, state = 1 '\001', is_rel = 0 '\000', is_soft = 0 '\000', is_hard = 1 '\001' }, check_clocks = 0, nohz_mode = NOHZ_MODE_INACTIVE, inidle = 1, tick_stopped = 0, idle_active = 1, do_timer_last = 0, got_idle_tick = 1, last_tick = 0, next_tick = 0, idle_jiffies = 0, idle_calls = 0, idle_sleeps = 0, idle_entrytime = 5012087709249, idle_waketime = 0, idle_exittime = 0, idle_sleeptime = 4936136669951, iowait_sleeptime = -1942739704, last_jiffies = 0, timer_expires = 0, timer_expires_base = 0, next_timer = 0, idle_expires = 0, tick_dep_mask = { counter = 0 } }