Bug 215458 - tick/sched: iowait_sleeptime in tick_cpu_sched structure is a negative value
Summary: tick/sched: iowait_sleeptime in tick_cpu_sched structure is a negative value
Status: NEW
Alias: None
Product: Timers
Classification: Unclassified
Component: Other (show other bugs)
Hardware: All Linux
: P1 normal
Assignee: john stultz
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2022-01-06 08:19 UTC by Liguang
Modified: 2022-01-07 01:36 UTC (History)
0 users

See Also:
Kernel Version: 5.10.23
Subsystem:
Regression: No
Bisected commit-id:


Attachments

Description Liguang 2022-01-06 08:19:12 UTC
On ARM server,we encounted a hardlockup problem baseed on sdei watchdog,watchdog period is 10s

[ 6149.965863] CPU: 126 PID: 0 Comm: swapper/126 Kdump: loaded Tainted: G        W         5.10.23-007.ali5000.alios7.aarch64 #1
[ 6149.977151] Hardware name: AlibabaCloud AliServer-Xuanwu2.0AM-02-2UC1P-5B/M1 EVB, BIOS 1.2.M1.AL.E.115.03 11/27/2021
[ 6149.987656] pstate: 604003c9 (nZCv DAIF +PAN -UAO -TCO BTYPE=--)
[ 6149.993649] pc : machine_kexec+0x44/0x160
[ 6149.997645] lr : machine_kexec+0x44/0x160
[ 6150.001640] sp : ffff8000250f3c50
[ 6150.004940] x29: ffff8000250f3c50 x28: ffff0400064de300 
[ 6150.010238] x27: 0000000000000000 x26: ffff800010b137f8 
[ 6150.015535] x25: ffff800010010800 x24: 0000000000000008 
[ 6150.020833] x23: ffff8000250f3cd8 x22: ffff8000119cd000 
[ 6150.026130] x21: 0000000000000000 x20: ffff00080bc72c00 
[ 6150.031428] x19: ffff00080bc72c00 x18: 0000000000000030 
[ 6150.036725] x17: 0000000000000000 x16: 0000000000000000 
[ 6150.042022] x15: ffff0400064de958 x14: ffffffffffffffff 
[ 6150.047320] x13: ffff8000a50f39a7 x12: ffff8000250f39af 
[ 6150.052617] x11: ffff80001171bcb8 x10: ffff800011590cb8 
[ 6150.057914] x9 : ffff8000100da340 x8 : 0000000000055d6d 
[ 6150.063212] x7 : 800000010000cd40 x6 : ffff8000118c0e87 
[ 6150.068509] x5 : ffff04473bbca048 x4 : ffff8000250f3ab0 
[ 6150.073806] x3 : 0000000000000001 x2 : ffff04473bbca050 
[ 6150.079104] x1 : 0000000000000000 x0 : 0000000000000000 
[ 6150.084401] Call trace:
[ 6150.086834]  machine_kexec+0x44/0x160
[ 6150.090481]  __crash_kexec+0x7c/0x12c
[ 6150.094131]  panic+0x198/0x368
[ 6150.097172]  add_taint+0x0/0xac
[ 6150.100300]  watchdog_hardlockup_check+0xd4/0x160
[ 6150.104990]  sdei_watchdog_callback+0x18/0x24
[ 6150.109333]  sdei_event_handler+0x3c/0xb0
[ 6150.113328]  _sdei_handler+0x88/0x150
[ 6150.116977]  __sdei_handler+0x24/0x50
[ 6150.120625]  __sdei_asm_handler+0xbc/0x150
[ 6150.124707]  __cpu_do_idle+0x8/0x10
[ 6150.128182]  arch_cpu_idle+0xc/0x3c
[ 6150.131657]  default_idle_call+0x40/0x190
[ 6150.135653]  cpuidle_idle_call+0x160/0x1d4
[ 6150.139735]  do_idle+0xb0/0x11c
[ 6150.142862]  cpu_startup_entry+0x28/0xa0
[ 6150.146770]  secondary_start_kernel+0x104/0x170
[ 6150.151287] ---[ end trace aa3574da538122bc ]---
[ 6150.155891] Bye!
Comment 1 Liguang 2022-01-06 08:34:56 UTC
/* watchdog detector functions */
bool is_hardlockup(void)
{
	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);

	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
		return true;

	__this_cpu_write(hrtimer_interrupts_saved, hrint);
	return false;
}

-----------------
crash> p hrtimer_interrupts:126
per_cpu(hrtimer_interrupts, 126) = $2 = 1524
crash> p hrtimer_interrupts_saved:126
per_cpu(hrtimer_interrupts_saved, 126) = $3 = 1524
crash> watchdog_thresh
watchdog_thresh = $4 = 10

10s period, not arch timer interrupt occured. 
crash> sys config | grep HZ
CONFIG_NO_HZ_COMMON=y
# CONFIG_HZ_PERIODIC is not set
# CONFIG_NO_HZ_IDLE is not set
CONFIG_NO_HZ_FULL=y
CONFIG_NO_HZ=y
# CONFIG_HZ_100 is not set
CONFIG_HZ_250=y
# CONFIG_HZ_300 is not set
# CONFIG_HZ_1000 is not set
CONFIG_HZ=250

nohz=off is configured,theoretically,every 4ms an timer interrupt occured.


grub info:
BOOT_IMAGE=/vmlinuz-5.10.23-007.ali5000.alios7.aarch64 root=/dev/vda3 ro crashkernel=512M vconsole.font=latarcyrheb-sun16 vconsole.keymap=us biosdevname=0 console=tty0 scsi_mod.scan=sync pci=pcie_bus_perf nohz=off console=ttyS0,115200 virtio_ring.vring_force_dma_api=1 module_blacklist=ast,hns3 ast.ast_modeset=0 cryptomgr.notests cma_pernuma=64M mpam=acpi console=ttyAMA0,115200 initcall_debug pci=earlydump earlycon=pl011,0x2A400000,115200 module_blacklist=acc
Comment 2 Liguang 2022-01-06 08:42:07 UTC
crash> p watchdog_hrtimer:126
per_cpu(watchdog_hrtimer, 126) = $5 = {
  node = {
    node = {
      __rb_parent_color = 18446603337777921265,
      rb_right = 0x0,
      rb_left = 0x0
    },
    expires = 6110016166520
  },
  _softexpires = 6110016166520,
  function = 0xffff80001017aa50 <watchdog_timer_fn>,
  base = 0xffff04473bbcc780,
  state = 1 '\001',
  is_rel = 0 '\000',
  is_soft = 0 '\000',
  is_hard = 1 '\001'
}
Comment 3 Liguang 2022-01-06 08:42:24 UTC
crash> p tick_cpu_sched:126
per_cpu(tick_cpu_sched, 126) = $1 = {
  sched_timer = {
    node = {
      node = {
        __rb_parent_color = 18446603337117384112,
        rb_right = 0x0,
        rb_left = 0x0
      },
      expires = 6108564000000
    },
    _softexpires = 6108564000000,
    function = 0xffff800010122ec0 <tick_sched_timer>,
    base = 0xffff04473bbcc780,
    state = 1 '\001',
    is_rel = 0 '\000',
    is_soft = 0 '\000',
    is_hard = 1 '\001'
  },
  check_clocks = 0,
  nohz_mode = NOHZ_MODE_INACTIVE,
  inidle = 1,
  tick_stopped = 0,
  idle_active = 1,
  do_timer_last = 0,
  got_idle_tick = 1,
  last_tick = 0,
  next_tick = 0,
  idle_jiffies = 0,
  idle_calls = 0,
  idle_sleeps = 0,
  idle_entrytime = 5012087709249,
  idle_waketime = 0,
  idle_exittime = 0,
  idle_sleeptime = 4936136669951,
  iowait_sleeptime = -1942739704,
  last_jiffies = 0,
  timer_expires = 0,
  timer_expires_base = 0,
  next_timer = 0,
  idle_expires = 0,
  tick_dep_mask = {
    counter = 0
  }
}
Comment 4 Liguang 2022-01-06 08:42:54 UTC
(In reply to Liguang from comment #3)
> crash> p tick_cpu_sched:126
> per_cpu(tick_cpu_sched, 126) = $1 = {
>   sched_timer = {
>     node = {
>       node = {
>         __rb_parent_color = 18446603337117384112,
>         rb_right = 0x0,
>         rb_left = 0x0
>       },
>       expires = 6108564000000
>     },
>     _softexpires = 6108564000000,
>     function = 0xffff800010122ec0 <tick_sched_timer>,
>     base = 0xffff04473bbcc780,
>     state = 1 '\001',
>     is_rel = 0 '\000',
>     is_soft = 0 '\000',
>     is_hard = 1 '\001'
>   },
>   check_clocks = 0,
>   nohz_mode = NOHZ_MODE_INACTIVE,
>   inidle = 1,
>   tick_stopped = 0,
>   idle_active = 1,
>   do_timer_last = 0,
>   got_idle_tick = 1,
>   last_tick = 0,
>   next_tick = 0,
>   idle_jiffies = 0,
>   idle_calls = 0,
>   idle_sleeps = 0,
>   idle_entrytime = 5012087709249,
>   idle_waketime = 0,
>   idle_exittime = 0,
>   idle_sleeptime = 4936136669951,
>   iowait_sleeptime = -1942739704,
>   last_jiffies = 0,
>   timer_expires = 0,
>   timer_expires_base = 0,
>   next_timer = 0,
>   idle_expires = 0,
>   tick_dep_mask = {
>     counter = 0
>   }
> }

iowait_sleeptime = -1942739704
why iowait_sleeptime is a negative value?
Comment 5 Liguang 2022-01-07 01:16:07 UTC
nohz=off was configured in the startup parameters

crash> tick_nohz_enabled
tick_nohz_enabled = $1 = false
crash> tick_nohz_active
tick_nohz_active = $2 = 0


and current no cpufreq driver was used:

crash> cpufreq_driver
cpufreq_driver = $4 = (struct cpufreq_driver *) 0x0
Comment 6 Liguang 2022-01-07 01:34:41 UTC
my machine is 128 core, three cores have the same problem:
crash> p tick_cpu_sched:a | grep iowait_sleeptime | grep "\-"
  iowait_sleeptime = -422390319,
  iowait_sleeptime = -365884628719,
  iowait_sleeptime = -1942739704,
Comment 7 Liguang 2022-01-07 01:36:16 UTC
(In reply to Liguang from comment #6)
> my machine is 128 core, three cores have the same problem:
> crash> p tick_cpu_sched:a | grep iowait_sleeptime | grep "\-"
>   iowait_sleeptime = -422390319,
>   iowait_sleeptime = -365884628719,
>   iowait_sleeptime = -1942739704,


crash> p tick_cpu_sched:59
per_cpu(tick_cpu_sched, 59) = $331 = {
  sched_timer = {
    node = {
      node = {
        __rb_parent_color = 18446603337735190528,
        rb_right = 0x0,
        rb_left = 0x0
      },
      expires = 6108564000000
    },
    _softexpires = 6108564000000,
    function = 0xffff800010122ec0 <tick_sched_timer>,
    base = 0xffff00477bb78780,
    state = 1 '\001',
    is_rel = 0 '\000',
    is_soft = 0 '\000',
    is_hard = 1 '\001'
  },
  check_clocks = 0,
  nohz_mode = NOHZ_MODE_INACTIVE,
  inidle = 1,
  tick_stopped = 0,
  idle_active = 1,
  do_timer_last = 0,
  got_idle_tick = 1,
  last_tick = 0,
  next_tick = 0,
  idle_jiffies = 0,
  idle_calls = 0,
  idle_sleeps = 0,
  idle_entrytime = 5009052372788,
  idle_waketime = 0,
  idle_exittime = 0,
  idle_sleeptime = 4925948942621,
  iowait_sleeptime = -422390319,
  last_jiffies = 0,
  timer_expires = 0,
  timer_expires_base = 0,
  next_timer = 0,
  idle_expires = 0,
  tick_dep_mask = {
    counter = 0
  }
}
crash> p tick_cpu_sched:102
per_cpu(tick_cpu_sched, 102) = $332 = {
  sched_timer = {
    node = {
      node = {
        __rb_parent_color = 18446603336923004336,
        rb_right = 0x0,
        rb_left = 0x0
      },
      expires = 6108564000000
    },
    _softexpires = 6108564000000,
    function = 0xffff800010122ec0 <tick_sched_timer>,
    base = 0xffff04473b92c780,
    state = 1 '\001',
    is_rel = 0 '\000',
    is_soft = 0 '\000',
    is_hard = 1 '\001'
  },
  check_clocks = 0,
  nohz_mode = NOHZ_MODE_INACTIVE,
  inidle = 1,
  tick_stopped = 0,
  idle_active = 1,
  do_timer_last = 0,
  got_idle_tick = 1,
  last_tick = 0,
  next_tick = 0,
  idle_jiffies = 0,
  idle_calls = 0,
  idle_sleeps = 0,
  idle_entrytime = 5009052372748,
  idle_waketime = 0,
  idle_exittime = 0,
  idle_sleeptime = 5281350974871,
  iowait_sleeptime = -365884628719,
  last_jiffies = 0,
  timer_expires = 0,
  timer_expires_base = 0,
  next_timer = 0,
  idle_expires = 0,
  tick_dep_mask = {
    counter = 0
  }
}
crash> p tick_cpu_sched:126
per_cpu(tick_cpu_sched, 126) = $333 = {
  sched_timer = {
    node = {
      node = {
        __rb_parent_color = 18446603337117384112,
        rb_right = 0x0,
        rb_left = 0x0
      },
      expires = 6108564000000
    },
    _softexpires = 6108564000000,
    function = 0xffff800010122ec0 <tick_sched_timer>,
    base = 0xffff04473bbcc780,
    state = 1 '\001',
    is_rel = 0 '\000',
    is_soft = 0 '\000',
    is_hard = 1 '\001'
  },
  check_clocks = 0,
  nohz_mode = NOHZ_MODE_INACTIVE,
  inidle = 1,
  tick_stopped = 0,
  idle_active = 1,
  do_timer_last = 0,
  got_idle_tick = 1,
  last_tick = 0,
  next_tick = 0,
  idle_jiffies = 0,
  idle_calls = 0,
  idle_sleeps = 0,
  idle_entrytime = 5012087709249,
  idle_waketime = 0,
  idle_exittime = 0,
  idle_sleeptime = 4936136669951,
  iowait_sleeptime = -1942739704,
  last_jiffies = 0,
  timer_expires = 0,
  timer_expires_base = 0,
  next_timer = 0,
  idle_expires = 0,
  tick_dep_mask = {
    counter = 0
  }
}

Note You need to log in before you can comment on or make changes to this bug.