Bug 10680

Summary: bonding with arp_interval very low => Oops.
Product: Networking Reporter: Nicolas 2P (nicolas.2p.debian)
Component: Other Assignee: Arnaldo Carvalho de Melo (acme)
Status: CLOSED CODE_FIX    
Severity: normal CC: bunk, nicolas.2p.debian
Priority: P1    
Hardware: All   
OS: Linux   
Kernel Version: 2.6.24-1 Subsystem:
Regression: --- Bisected commit-id:

Description Nicolas 2P 2008-05-11 13:51:01 UTC
Latest working kernel version: don't know.
Earliest failing kernel version: don't know.
Distribution: debian
Hardware Environment: Tested on x86 and amd64, 5 different hardware.
Software Environment:
Problem Description: Setting arp_interval to a very low value (2 ms) => Oops (soft lockup).

Steps to reproduce:

# modprobe bonding
# echo 2 > /sys/class/net/bond0/bonding/arp_interval
# ifconfig bond0 up

Alternate steps to reproduce:

in /etc/modprobe.conf :
alias bond0 bonding
options bond0 arp_interval=2 arp_ip_target=192.168.0.254

# ifconfig bond0 up
Comment 1 Nicolas 2P 2008-05-11 15:50:26 UTC
Oops details :

# grep bond0 /etc/modprobe.conf
alias bond0 bonding
options bond0 arp_interval=2 arp_ip_target=192.168.0.254
# ifconfig bond0 up
Ethernet Channel Bonding Driver: v3.2.3 (December 6, 2007)
bonding: ARP monitoring set to 2 ms, validate none, with 1 target(s): 192.168.0.254
ADDRCONF(NETDEV_UP): bond0: link is not ready

# BUG: soft lockup - CPU#0 stuck for 11s! [bond0:3253]

Pid: 3253, comm: bond0 Not tainted (2.6.24-1-686 #1)
EIP: 0060:[<c02bdb31>] EFLAGS: 00000202 CPU: 0
EIP is at _spin_unlock_irqrestore+0xa/0x13
EAX: 00000202 EBX: c9319670 ECX: 00000202 EDX: 00000200
ESI: c93193e0 EDI: 00000000 EBP: 00000000 ESP: c93cff64
 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
CR0: 8005003b CR2: 08124008 CR3: 09347000 CR4: 000006d0
DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
DR6: ffff0ff0 DR7: 00000400
 [<c013274d>] queue_work+0x33/0x3c
 [<cfc8eca0>] bond_loadbalance_arp_mon+0x1ce/0x1da [bonding]
 [<cfc8ead2>] bond_loadbalance_arp_mon+0x0/0x1da [bonding]
 [<cfc8ead2>] bond_loadbalance_arp_mon+0x0/0x1da [bonding]
 [<c013220d>] run_workqueue+0x7d/0x109
 [<c013550c>] prepare_to_wait+0x12/0x49
 [<c0132a83>] worker_thread+0x0/0xc5
 [<c0132b3d>] worker_thread+0xba/0xc5
 [<c01353f9>] autoremove_wake_function+0x0/0x35
 [<c0135332>] kthread+0x38/0x5e
 [<c01352fa>] kthread+0x0/0x5e
 [<c0104b0f>] kernel_thread_helper+0x7/0x10
 =======================
BUG: soft lockup - CPU#0 stuck for 11s! [bond0:3253]

Pid: 3253, comm: bond0 Not tainted (2.6.24-1-686 #1)
EIP: 0060:[<c01321f4>] EFLAGS: 00000287 CPU: 0
EIP is at run_workqueue+0x64/0x109
EAX: c93a9600 EBX: c93a9640 ECX: c9319674 EDX: c93a9644
ESI: c9319670 EDI: cfc8ead2 EBP: 00000000 ESP: c93cff9c
 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
CR0: 8005003b CR2: 08124008 CR3: 09347000 CR4: 000006d0
DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
DR6: ffff0ff0 DR7: 00000400
 [<c013550c>] prepare_to_wait+0x12/0x49
 [<c0132a83>] worker_thread+0x0/0xc5
 [<c0132b3d>] worker_thread+0xba/0xc5
 [<c01353f9>] autoremove_wake_function+0x0/0x35
 [<c0135332>] kthread+0x38/0x5e
 [<c01352fa>] kthread+0x0/0x5e
 [<c0104b0f>] kernel_thread_helper+0x7/0x10
 =======================
BUG: soft lockup - CPU#0 stuck for 11s! [bond0:3253]

Pid: 3253, comm: bond0 Not tainted (2.6.24-1-686 #1)
EIP: 0060:[<cfc8eb06>] EFLAGS: 00000246 CPU: 0
EIP is at bond_loadbalance_arp_mon+0x34/0x1da [bonding]
EAX: 00000000 EBX: c9319670 ECX: 00000004 EDX: 00000002
ESI: c93193e0 EDI: cfc8ead2 EBP: 00000000 ESP: c93cff70
 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
CR0: 8005003b CR2: 08124008 CR3: 09347000 CR4: 000006d0
DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
DR6: ffff0ff0 DR7: 00000400
 [<cfc8ead2>] bond_loadbalance_arp_mon+0x0/0x1da [bonding]
 [<cfc8ead2>] bond_loadbalance_arp_mon+0x0/0x1da [bonding]
 [<c013220d>] run_workqueue+0x7d/0x109
 [<c013550c>] prepare_to_wait+0x12/0x49
 [<c0132a83>] worker_thread+0x0/0xc5
 [<c0132b3d>] worker_thread+0xba/0xc5
 [<c01353f9>] autoremove_wake_function+0x0/0x35
 [<c0135332>] kthread+0x38/0x5e
 [<c01352fa>] kthread+0x0/0x5e
 [<c0104b0f>] kernel_thread_helper+0x7/0x10
 =======================
BUG: soft lockup - CPU#0 stuck for 11s! [bond0:3253]

Pid: 3253, comm: bond0 Not tainted (2.6.24-1-686 #1)
EIP: 0060:[<c01321f4>] EFLAGS: 00000287 CPU: 0
EIP is at run_workqueue+0x64/0x109
EAX: c93a9600 EBX: c93a9640 ECX: c9319674 EDX: c93a9644
ESI: c9319670 EDI: cfc8ead2 EBP: 00000000 ESP: c93cff9c
 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
CR0: 8005003b CR2: 08124008 CR3: 09347000 CR4: 000006d0
DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
DR6: ffff0ff0 DR7: 00000400
 [<c013550c>] prepare_to_wait+0x12/0x49
 [<c0132a83>] worker_thread+0x0/0xc5
 [<c0132b3d>] worker_thread+0xba/0xc5
 [<c01353f9>] autoremove_wake_function+0x0/0x35
 [<c0135332>] kthread+0x38/0x5e
 [<c01352fa>] kthread+0x0/0x5e
 [<c0104b0f>] kernel_thread_helper+0x7/0x10
 =======================
BUG: soft lockup - CPU#0 stuck for 11s! [bond0:3253]

Pid: 3253, comm: bond0 Not tainted (2.6.24-1-686 #1)
EIP: 0060:[<c02bdb31>] EFLAGS: 00000202 CPU: 0
EIP is at _spin_unlock_irqrestore+0xa/0x13
EAX: 00000202 EBX: c9319670 ECX: 00000202 EDX: 00000200
ESI: c93193e0 EDI: 00000000 EBP: 00000000 ESP: c93cff64
 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
CR0: 8005003b CR2: 08124008 CR3: 09347000 CR4: 000006d0
DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
DR6: ffff0ff0 DR7: 00000400
 [<c013274d>] queue_work+0x33/0x3c
 [<cfc8eca0>] bond_loadbalance_arp_mon+0x1ce/0x1da [bonding]
 [<cfc8ead2>] bond_loadbalance_arp_mon+0x0/0x1da [bonding]
 [<cfc8ead2>] bond_loadbalance_arp_mon+0x0/0x1da [bonding]
 [<c013220d>] run_workqueue+0x7d/0x109
 [<c013550c>] prepare_to_wait+0x12/0x49
 [<c0132a83>] worker_thread+0x0/0xc5
 [<c0132b3d>] worker_thread+0xba/0xc5
 [<c01353f9>] autoremove_wake_function+0x0/0x35
 [<c0135332>] kthread+0x38/0x5e
 [<c01352fa>] kthread+0x0/0x5e
 [<c0104b0f>] kernel_thread_helper+0x7/0x10
 =======================
BUG: soft lockup - CPU#0 stuck for 11s! [bond0:3253]

Pid: 3253, comm: bond0 Not tainted (2.6.24-1-686 #1)
EIP: 0060:[<c01321f4>] EFLAGS: 00000287 CPU: 0
EIP is at run_workqueue+0x64/0x109
EAX: c93a9600 EBX: c93a9640 ECX: c9319674 EDX: c93a9644
ESI: c9319670 EDI: cfc8ead2 EBP: 00000000 ESP: c93cff9c
 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
CR0: 8005003b CR2: 08124008 CR3: 09347000 CR4: 000006d0
DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
DR6: ffff0ff0 DR7: 00000400
 [<c013550c>] prepare_to_wait+0x12/0x49
 [<c0132a83>] worker_thread+0x0/0xc5
 [<c0132b3d>] worker_thread+0xba/0xc5
 [<c01353f9>] autoremove_wake_function+0x0/0x35
 [<c0135332>] kthread+0x38/0x5e
 [<c01352fa>] kthread+0x0/0x5e
 [<c0104b0f>] kernel_thread_helper+0x7/0x10
 =======================
BUG: soft lockup - CPU#0 stuck for 11s! [bond0:3253]

Pid: 3253, comm: bond0 Not tainted (2.6.24-1-686 #1)
EIP: 0060:[<c02bdb31>] EFLAGS: 00000202 CPU: 0
EIP is at _spin_unlock_irqrestore+0xa/0x13
EAX: 00000202 EBX: c9319670 ECX: 00000202 EDX: 00000200
ESI: c93193e0 EDI: 00000000 EBP: 00000000 ESP: c93cff64
 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
CR0: 8005003b CR2: 08124008 CR3: 09347000 CR4: 000006d0
DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000
DR6: ffff0ff0 DR7: 00000400
 [<c013274d>] queue_work+0x33/0x3c
 [<cfc8eca0>] bond_loadbalance_arp_mon+0x1ce/0x1da [bonding]
 [<cfc8ead2>] bond_loadbalance_arp_mon+0x0/0x1da [bonding]
 [<cfc8ead2>] bond_loadbalance_arp_mon+0x0/0x1da [bonding]
 [<c013220d>] run_workqueue+0x7d/0x109
 [<c013550c>] prepare_to_wait+0x12/0x49
 [<c0132a83>] worker_thread+0x0/0xc5
 [<c0132b3d>] worker_thread+0xba/0xc5
 [<c01353f9>] autoremove_wake_function+0x0/0x35
 [<c0135332>] kthread+0x38/0x5e
 [<c01352fa>] kthread+0x0/0x5e
 [<c0104b0f>] kernel_thread_helper+0x7/0x10
 =======================
Comment 2 Nicolas 2P 2008-05-15 13:39:59 UTC
Possible patch :

Signed-off-by: Nicolas de Pesloüan <nicolas.2p.debian@free.fr> 

--- /usr/src/linux/drivers/net/bonding/bond_main_orig.c 2008-05-13 02:00:01.000000000 +0200
+++ /usr/src/linux/drivers/net/bonding/bond_main.c      2008-05-14 14:55:53.000000000 +0200
@@ -2391,7 +2391,7 @@
                read_lock(&bond->lock);
        }

-       delay = ((bond->params.miimon * HZ) / 1000) ? : 1;
+       delay = msecs_to_jiffies(bond->params.miimon);
        read_unlock(&bond->lock);
        queue_delayed_work(bond->wq, &bond->mii_work, delay);
 }
@@ -2704,7 +2704,7 @@

        read_lock(&bond->lock);

-       delta_in_ticks = (bond->params.arp_interval * HZ) / 1000;
+       delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);

        if (bond->kill_timers) {
                goto out;
@@ -2837,7 +2837,7 @@

        read_lock(&bond->lock);

-       delta_in_ticks = (bond->params.arp_interval * HZ) / 1000;
+       delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval * HZ);

        if (bond->kill_timers) {
                goto out;
Comment 3 Nicolas 2P 2008-05-18 09:25:31 UTC
Corrected patch (removing the erroneous "* HZ" in the 3rd change):

Signed-off-by: Nicolas de Pesloüan <nicolas.2p.debian@free.fr> 

--- /usr/src/linux/drivers/net/bonding/bond_main_orig.c 2008-05-13 02:00:01.000000000 +0200
+++ /usr/src/linux/drivers/net/bonding/bond_main.c      2008-05-14 14:55:53.000000000 +0200
@@ -2391,7 +2391,7 @@
                read_lock(&bond->lock);
        }

-       delay = ((bond->params.miimon * HZ) / 1000) ? : 1;
+       delay = msecs_to_jiffies(bond->params.miimon);
        read_unlock(&bond->lock);
        queue_delayed_work(bond->wq, &bond->mii_work, delay);
 }
@@ -2704,7 +2704,7 @@

        read_lock(&bond->lock);

-       delta_in_ticks = (bond->params.arp_interval * HZ) / 1000;
+       delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);

        if (bond->kill_timers) {
                goto out;
@@ -2837,7 +2837,7 @@

        read_lock(&bond->lock);

-       delta_in_ticks = (bond->params.arp_interval * HZ) / 1000;
+       delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);

        if (bond->kill_timers) {
                goto out;
Comment 4 Adrian Bunk 2008-07-21 01:37:05 UTC
included as commit 5ce0da8f0386b62345312ec8fed31303732f4220