Need guidance and help on this and wanting to understand this is a known problem Running a 64bit 2.6.24.7 kernel on a multiprocessor, multicore x86 hardware multiple stack prints indicating soft lockup (on a spinlock) logs coming on console preventing any other operation. After reboot, from the evidence it looks like a division macro do_div converting a 64bit divisor into 32 bit value (while the if check for it being 0 is above it). This is resulting in divide 0 error. This has happened while it is holding the siglock of another process. Pasted below is the stack trace, kernel code and object dump of relevant code piece ---- first stack trace divide error: 0000 [1] SMP CPU 0 Modules linked in: Pid: 28272, comm: pidof Not tainted 2.6.24.7 #1 RIP: 0010:[<ffffffff802e1247>] [<ffffffff802e1247>] task_utime+0x57/0x90 RSP: 0018:ffff810308fcfba8 EFLAGS: 00010046 RAX: fff3e3500d5d6260 RBX: fff3e3500d5d6260 RCX: 00000000fff4c8b6 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00988fd09479ebce RBP: ffff81042ff98080 R08: 0000000000000001 R09: ffff81042e819968 R10: 0000000000000002 R11: 0000000000000050 R12: 0000000100000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00002ae99f7196e0(0000) GS:ffffffff8056f000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 000000000065b1a8 CR3: 00000002e446e000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process pidof (pid: 28272, threadinfo ffff810308fce000, task ffff8104278b50c0) Stack: ffff81042ff98080 ffff81042ff98080 ffff81042d9c8b40 ffffffff802e1a8e ffff81042ff98080 ffffffff802ddc9f ffff81042f983e50 0000000000000001 ffff810308fcfea8 ffffffff802a77bf 00000000000041fd ffff81042e09dac0 Call Trace: [<ffffffff802e1a8e>] do_task_stat+0x7ae/0xb60 [<ffffffff802ddc9f>] pid_revalidate+0x3f/0x110 [<ffffffff802a77bf>] do_lookup+0x8f/0x210 [<ffffffff802b2b36>] dput+0xa6/0x140 
[<ffffffff802a9d98>] __link_path_walk+0xc28/0xdf0 [<ffffffff802dce23>] task_dumpable+0x23/0x40 [<ffffffff802b7057>] mntput_no_expire+0x27/0xb0 [<ffffffff802a9fe1>] link_path_walk+0x81/0x100 [<ffffffff802a2bbb>] sys_readlinkat+0x8b/0xc0 [<ffffffff8028c455>] vma_adjust+0x145/0x4c0 [<ffffffff8027e6f1>] __alloc_pages+0x61/0x3c0 [<ffffffff802def4a>] proc_info_read+0xba/0x100 [<ffffffff8029fe55>] vfs_read+0xc5/0x160 [<ffffffff802a0333>] sys_read+0x53/0x90 [<ffffffff8020c35e>] system_call+0x7e/0x83 kernel: Code: 48 f7 f6 48 89 c1 48 8b 9d 08 02 00 00 48 89 cf e8 f4 02 f6 RIP [<ffffffff802e1247>] task_utime+0x57/0x90 RSP <ffff810308fcfba8> ---[ end trace dcef4052efc887f7 ]--- ------------------------------------------------ 344 static cputime_t task_utime(struct task_struct *p) 345 { 346 clock_t utime = cputime_to_clock_t(p->utime), 347 total = utime + cputime_to_clock_t(p->stime); 348 u64 temp; 349 350 /* 351 * Use CFS's precise accounting: 352 */ 353 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 354 355 if (total) { 356 temp *= utime; 357 do_div(temp, total); 358 } 359 utime = (clock_t)temp; 360 361 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 362 return p->prev_utime; 363 } ffffffff802e11f0 <task_utime>: task_utime(): kernel/fs/proc/array.c:345 ffffffff802e11f0: 48 83 ec 18 sub $0x18,%rsp ffffffff802e11f4: 48 89 1c 24 mov %rbx,(%rsp) ffffffff802e11f8: 48 89 6c 24 08 mov %rbp,0x8(%rsp) ffffffff802e11fd: 4c 89 64 24 10 mov %r12,0x10(%rsp) ffffffff802e1202: 48 89 fd mov %rdi,%rbp kernel/fs/proc/array.c:346 ffffffff802e1205: 48 8b bf e0 01 00 00 mov 0x1e0(%rdi),%rdi ffffffff802e120c: e8 1f 03 f6 ff callq ffffffff80241530 <jiffies_to_clock_t> kernel/fs/proc/array.c:347 ffffffff802e1211: 48 8b bd e8 01 00 00 mov 0x1e8(%rbp),%rdi kernel/fs/proc/array.c:346 ffffffff802e1218: 48 89 c3 mov %rax,%rbx kernel/fs/proc/array.c:347 ffffffff802e121b: e8 10 03 f6 ff callq ffffffff80241530 <jiffies_to_clock_t> kernel/fs/proc/array.c:353 ffffffff802e1220: 48 8b bd 
80 00 00 00 mov 0x80(%rbp),%rdi kernel/fs/proc/array.c:347 ffffffff802e1227: 4c 8d 24 18 lea (%rax,%rbx,1),%r12 kernel/fs/proc/array.c:353 ffffffff802e122b: e8 80 03 f6 ff callq ffffffff802415b0 <nsec_to_clock_t> kernel/fs/proc/array.c:355 ffffffff802e1230: 4d 85 e4 test %r12,%r12 kernel/fs/proc/array.c:353 ffffffff802e1233: 48 89 c1 mov %rax,%rcx kernel/fs/proc/array.c:355 ffffffff802e1236: 74 15 je ffffffff802e124d <task_utime+0x5d> kernel/fs/proc/array.c:357 ffffffff802e1238: 44 89 e2 mov %r12d,%edx kernel/fs/proc/array.c:356 ffffffff802e123b: 48 0f af d8 imul %rax,%rbx kernel/fs/proc/array.c:357 ffffffff802e123f: 48 89 d6 mov %rdx,%rsi ffffffff802e1242: 48 89 d8 mov %rbx,%rax ffffffff802e1245: 31 d2 xor %edx,%edx ffffffff802e1247: 48 f7 f6 div %rsi ffffffff802e124a: 48 89 c1 mov %rax,%rcx kernel/fs/proc/array.c:361 ffffffff802e124d: 48 8b 9d 08 02 00 00 mov 0x208(%rbp),%rbx ffffffff802e1254: 48 89 cf mov %rcx,%rdi ffffffff802e1257: e8 f4 02 f6 ff callq ffffffff80241550 <clock_t_to_jiffies> ffffffff802e125c: 48 39 d8 cmp %rbx,%rax ffffffff802e125f: 48 0f 42 c3 cmovb %rbx,%rax ffffffff802e1263: 48 89 85 08 02 00 00 mov %rax,0x208(%rbp) kernel/fs/proc/array.c:363 ffffffff802e126a: 48 8b 1c 24 mov (%rsp),%rbx ffffffff802e126e: 48 8b 6c 24 08 mov 0x8(%rsp),%rbp ffffffff802e1273: 4c 8b 64 24 10 mov 0x10(%rsp),%r12 ffffffff802e1278: 48 83 c4 18 add $0x18,%rsp
Subsequently, any other process that tries to take this task's siglock (which is still held by the process that triggered the exception) spins forever on the spinlock.
This is happening because the device had been up and running for 497 days + 2 hrs, and we have an application whose instances are each bound to a dedicated core (nothing else runs on these cores except some non-maskable interrupts). As a result, the utime values for these processes have overflowed their lower 32 bits (it is a 64-bit kernel). There are other processes/scripts which monitor these applications by reading their stat files (or calling pidof).
*** Bug 43047 has been marked as a duplicate of this bug. ***