Bug 199109

Summary: pptp: kernel printk "recursion detected", and then reboot itself
Product: Networking Reporter: xuheng (xuheng333)
Component: OtherAssignee: Stephen Hemminger (stephen)
Status: RESOLVED CODE_FIX    
Severity: high    
Priority: P1    
Hardware: Mips32   
OS: Linux   
Kernel Version: 4.9.77 Subsystem:
Regression: No Bisected commit-id:
Attachments: system log

Description xuheng 2018-03-14 06:56:09 UTC
Created attachment 274715 [details]
system log

Use openwrt LEDE, (gcc version 5.5.0 (OpenWrt GCC 5.5.0 r5932-9c4fe10) ).
When use pptp, make WAN(eth1) down and then make WAN(eth1) up, pptp will redial itself. Do this in a loop.

After a while, kernel print "recursion detected" and "net_ratelimit: 49422 callbacks suppressed", print this many times, then system reboot.

When add more printk in driver "driver/net/ppp/ppp_generic.c", fond it fall in loop in "static void __ppp_xmit_process(struct ppp *ppp)", which called by "static void __ppp_channel_push(struct channel *pch)". 
__ppp_channel_push() locked ppp->xmit_recursion, and loop for long time( while (!ppp->xmit_pending && (skb = skb_dequeue(&ppp->file.xq))) ppp_send_frame(ppp, skb); ). Some other thread want to lock ppp->xmit_recursion, but failed.


static void __ppp_channel_push(struct channel *pch)
{
        struct sk_buff *skb;
        struct ppp *ppp;

        spin_lock_bh(&pch->downl);
        if (pch->chan) {
                while (!skb_queue_empty(&pch->file.xq)) {
                        skb = skb_dequeue(&pch->file.xq);
                        if (!pch->chan->ops->start_xmit(pch->chan, skb)) {
                                /* put the packet back and try again later */
                                skb_queue_head(&pch->file.xq, skb);
                                break;
                        }
                }
        } else {
                /* channel got deregistered */
                skb_queue_purge(&pch->file.xq);
        }
        spin_unlock_bh(&pch->downl);
        /* see if there is anything from the attached unit to be sent */
        if (skb_queue_empty(&pch->file.xq)) {
                ppp = pch->ppp;
                if (ppp)
                        __ppp_xmit_process(ppp);
        }
}
Comment 1 xuheng 2018-03-14 09:07:42 UTC
root@OpenWrt:/# route -n
Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         10.1.1.1        0.0.0.0         UG    0      0        0 pptp-pptp_test
10.1.1.1        0.0.0.0         255.255.255.255 UH    0      0        0 pptp-pptp_test
10.128.65.220   0.0.0.0         255.255.255.252 U     0      0        0 eth1
192.168.1.0     0.0.0.0         255.255.255.0   U     0      0        0 br-lan
218.108.76.135  10.128.65.221   255.255.255.255 UGH   0      0        0 eth1
root@OpenWrt:/#
Comment 2 xuheng 2018-03-15 01:16:03 UTC
Add printk in static void __ppp_xmit_process(struct ppp *ppp), like this:

/*                                                                                                                                                                                        
 * Transmit-side routines.                                                                                                                                                                
 */

/* Called to do any work queued up on the transmit side that can now be done */
static void __ppp_xmit_process(struct ppp *ppp)
{
        int i=0;
        struct sk_buff *skb;

        ppp_xmit_lock(ppp);
        if (!ppp->closing) {
                ppp_push(ppp);
                while (!ppp->xmit_pending &&
                       (skb = skb_dequeue(&ppp->file.xq))){
                    if(0!=i){
                        printk("%s:%d loop [%d], per-cpu ppp->xmit_recursion[%d]\n", __FUNCTION__,__LINE__,i,*this_cpu_ptr(ppp->xmit_recursion));
                    }
                        ppp_send_frame(ppp, skb);
                        i++;
                }
                /* If there's no work left to do, tell the core net                                                                                                                       
                   code that we can accept some more. */
                if (!ppp->xmit_pending && !skb_peek(&ppp->file.xq))
                        netif_wake_queue(ppp->dev);
                else
                        netif_stop_queue(ppp->dev);
        }
        ppp_xmit_unlock(ppp);
}




when kernel reboot itself, the last print is:
[44237.690049] pptp-pptp_test: renamed from ppp0
[44266.985247] pptp-pptp_test: recursion detected
[44266.989970] pptp-pptp_test: recursion detected
[44266.994511] __ppp_xmit_process:1429 loop [1], per-cpu ppp->xmit_recursion[1]
[44267.001813] pptp-pptp_test: recursion detected
[44267.006328] __ppp_xmit_process:1429 loop [2], per-cpu ppp->xmit_recursion[1]
[44267.013631] pptp-pptp_test: recursion detected
[44267.018182] __ppp_xmit_process:1429 loop [3], per-cpu ppp->xmit_recursion[1]
[44267.025450] pptp-pptp_test: recursion detected
[44267.030001] __ppp_xmit_process:1429 loop [4], per-cpu ppp->xmit_recursion[1]
[44267.037287] pptp-pptp_test: recursion detected
[44267.041800] __ppp_xmit_process:1429 loop [5], per-cpu ppp->xmit_recursion[1]
[44267.049119] pptp-pptp_test: recursion detected
[44267.053636] __ppp_xmit_process:1429 loop [6], per-cpu ppp->xmit_recursion[1]
[44267.060947] pptp-pptp_test: recursion detected
[44267.065462] __ppp_xmit_process:1429 loop [7], per-cpu ppp->xmit_recursion[1]
[44267.072765] pptp-pptp_test: recursion detected
[44267.077317] __ppp_xmit_process:1429 loop [8], per-cpu ppp->xmit_recursion[1]
[44267.084594] pptp-pptp_test: recursion detected
[44267.089149] __ppp_xmit_process:1429 loop [9], per-cpu ppp->xmit_recursion[1]
.
.
.
[44293.215320] __ppp_xmit_process:1429 loop [3434], per-cpu ppp->xmit_recursion[1]
[44293.222972] __ppp_xmit_process:1429 loop [3435], per-cpu ppp->xmit_recursion[1]
[44293.230547] __ppp_xmit_process:1429 loop [3436], per-cpu ppp->xmit_recursion[1]
[44293.238120] __ppp_xmit_process:1429 loop [3437], per-cpu ppp->xmit_recursion[1]
[44293.245642] __ppp_xmit_process:1429 loop [3438], per-cpu ppp->xmit_recursion[1]
[44293.253219] __ppp_xmit_process:1429 loop [3439], per-cpu ppp->xmit_recursion[1]
[44293.260814] __ppp_xmit_process:1429 loop [3440], per-cpu ppp->xmit_recursion[1]
[44293.268402] __ppp_xmit_process:1429 loop [3441], per-cpu ppp->xmit_recursion[1]
[44293.276011] __ppp_xmit_process:1429 loop [3442], per-cpu ppp->xmit_recursion[1]
[44293.283596] __ppp_xmit_process:1429 loop [3443], per-cpu ppp->xmit_recursion[1]
[44293.291176] __ppp_xmit_process:1429 loop [3444], per-cpu ppp->xmit_recursion[1]
[44293.298756] __ppp_xmit_process:1429 loop [3445], per-cpu ppp->xmit_recursion[1]
[44293.306335] __ppp_xmit_process:1429 loop [3446], per-cpu ppp->xmit_recursion[1]
[44293.313866] __ppp_xmit_process:1429 loop [3447], per-cpu ppp->xmit_recursion[1]
[44293.321448] __ppp_xmit_process:1429 loop [3448], per-cp

U-Boot 1.1.4 (Oct 29 2015 - 14:36:21)

DB120
DRAM:  
sri
Wasp 1.3
wasp_ddr_initial_config(249): (32bit) ddr2 init
wasp_ddr_initial_config(426): Wasp ddr init done
Tap value selected = 0xe [0x0 - 0x1d]
128 MB
Top of RAM usable for U-Boot at: 88000000
Reserving 206k for U-Boot at: 87fcc000
Reserving 192k for malloc() at: 87f9c000
Reserving 44 Bytes for Board Info at: 87f9bfd4
Reserving 36 Bytes for Global Data at: 87f9bfb0
Reserving 128k for boot params() at: 87f7bfb0
Stack Pointer at: 87f7bf98
Now running in RAM - U-Boot at: 87fcc000
Flash Manuf Id 0xef, DeviceId0 0x40, DeviceId1 0x18
flash size 16MB, sector count = 256
Flash: 16 MB
Comment 3 xuheng 2018-03-16 01:12:12 UTC
For testing, in __ppp_channel_push(), disable sending anything from the attached unit, just disable __ppp_xmit_process(ppp) in __ppp_channel_push(). In my opinion, __ppp_xmit_process() should only called by ppp_xmit_process(), because of ppp->xmit_recursion __percpu.

And, as a result of the test, the "recursion detected" not happened for 82519 seconds. I will test this for a long time.


1935| /* Try to send data out on a channel */
1936| static void __ppp_channel_push(struct channel *pch)
1937| {
1938|         struct sk_buff *skb; 
1939|         struct ppp *ppp;
1940| 
1941|         spin_lock_bh(&pch->downl);
1942|         if (pch->chan) {
1943|                 while (!skb_queue_empty(&pch->file.xq)) {
1944|                         skb = skb_dequeue(&pch->file.xq);
1945|                         if (!pch->chan->ops->start_xmit(pch->chan, skb)) {
1946|                                 /* put the packet back and try again later */
1947|                                 skb_queue_head(&pch->file.xq, skb);
1948|                                 break;
1949|                         }
1950|                 }
1951|         } else {
1952|                 /* channel got deregistered */
1953|                 skb_queue_purge(&pch->file.xq);
1954|         }
1955|         spin_unlock_bh(&pch->downl);
1956| #if 0
1957|         /* see if there is anything from the attached unit to be sent */
1958|         if (skb_queue_empty(&pch->file.xq)) {
1959|                 ppp = pch->ppp;
1960|                 if (ppp)
1961|                         __ppp_xmit_process(ppp);
1962|         }
1963| #endif
1964| }
Comment 4 xuheng 2018-03-21 01:43:59 UTC
Reported-by: xu heng <xuheng333@zoho.com>
Fixes: 55454a565836 ("ppp: avoid dealock on recursive xmit")
Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
---
drivers/net/ppp/ppp_generic.c | 26 ++++++++++++++------------
1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index fa2a9bdd1866..da1937832c99 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -257,7 +257,7 @@ struct ppp_net {
/* Prototypes. */
static int ppp_unattached_ioctl(struct net *net, struct ppp_file *pf,
            struct file *file, unsigned int cmd, unsigned long arg);
-static void ppp_xmit_process(struct ppp *ppp);
+static void ppp_xmit_process(struct ppp *ppp, struct sk_buff *skb);
static void ppp_send_frame(struct ppp *ppp, struct sk_buff *skb);
static void ppp_push(struct ppp *ppp);
static void ppp_channel_push(struct channel *pch);
@@ -513,13 +513,12 @@ static ssize_t ppp_write(struct file *file, const char __user *buf,
        goto out;
    }

-    skb_queue_tail(&pf->xq, skb);
-
    switch (pf->kind) {
    case INTERFACE:
-        ppp_xmit_process(PF_TO_PPP(pf));
+        ppp_xmit_process(PF_TO_PPP(pf), skb);
        break;
    case CHANNEL:
+        skb_queue_tail(&pf->xq, skb);
        ppp_channel_push(PF_TO_CHANNEL(pf));
        break;
    }
@@ -1267,8 +1266,8 @@ ppp_start_xmit(struct sk_buff *skb, struct net_device *dev)
    put_unaligned_be16(proto, pp);

    skb_scrub_packet(skb, !net_eq(ppp->ppp_net, dev_net(dev)));
-    skb_queue_tail(&ppp->file.xq, skb);
-    ppp_xmit_process(ppp);
+    ppp_xmit_process(ppp, skb);
+
    return NETDEV_TX_OK;

outf:
@@ -1420,13 +1419,14 @@ static void ppp_setup(struct net_device *dev)
*/

/* Called to do any work queued up on the transmit side that can now be done */
-static void __ppp_xmit_process(struct ppp *ppp)
+static void __ppp_xmit_process(struct ppp *ppp, struct sk_buff *skb)
{
-    struct sk_buff *skb;
-
    ppp_xmit_lock(ppp);
    if (!ppp->closing) {
        ppp_push(ppp);
+
+        if (skb)
+            skb_queue_tail(&ppp->file.xq, skb);
        while (!ppp->xmit_pending &&
         (skb = skb_dequeue(&ppp->file.xq)))
            ppp_send_frame(ppp, skb);
@@ -1440,7 +1440,7 @@ static void __ppp_xmit_process(struct ppp *ppp)
    ppp_xmit_unlock(ppp);
}

-static void ppp_xmit_process(struct ppp *ppp)
+static void ppp_xmit_process(struct ppp *ppp, struct sk_buff *skb)
{
    local_bh_disable();

@@ -1448,7 +1448,7 @@ static void ppp_xmit_process(struct ppp *ppp)
        goto err;

    (*this_cpu_ptr(ppp->xmit_recursion))++;
-    __ppp_xmit_process(ppp);
+    __ppp_xmit_process(ppp, skb);
    (*this_cpu_ptr(ppp->xmit_recursion))--;

    local_bh_enable();
@@ -1458,6 +1458,8 @@ static void ppp_xmit_process(struct ppp *ppp)
err:
    local_bh_enable();

+    kfree_skb(skb);
+
    if (net_ratelimit())
        netdev_err(ppp->dev, "recursion detected\n");
}
@@ -1942,7 +1944,7 @@ static void __ppp_channel_push(struct channel *pch)
    if (skb_queue_empty(&pch->file.xq)) {
        ppp = pch->ppp;
        if (ppp)
-            __ppp_xmit_process(ppp);
+            __ppp_xmit_process(ppp, NULL);
    }
}