Bug 14805

Summary: a kernel bug about ip defrag and ip conntrack
Product: Networking Reporter: Chong Qiao (qiaochong)
Component: Netfilter/Iptables Assignee: networking_netfilter-iptables (networking_netfilter-iptables)
Status: RESOLVED CODE_FIX    
Severity: normal CC: kaber
Priority: P1    
Hardware: All   
OS: Linux   
Kernel Version: 2.6.25.17,2.6.27 Subsystem:
Regression: No Bisected commit-id:
Attachments: Put fragments originating from a bridge into their own defrag namespace

Description Chong Qiao 2009-12-14 06:39:18 UTC
I have seen the following reports:
http://bugzilla.kernel.org/show_bug.cgi?id=13550
http://bugzilla.netfilter.org/show_bug.cgi?id=339
My description is more detailed.

I found a kernel bug while testing a Linux firewall.
I have tried kernels 2.6.25.17 and 2.6.27; both have the bug.
This bug causes a kernel oops.

Environment:
Prepare a motherboard with two network adapters plugged in.
Compile the kernel with the following options:
CONFIG_NF_NAT=y
CONFIG_BRIDGE=y
CONFIG_BRIDGE_NETFILTER=y

Compile the kernel and boot the board.
Type the commands below:

 ifconfig eth0 up
 ifconfig eth1 up
 brctl addif br0 eth0
 ifconfig br0 up
 echo 1 > /proc/sys/net/ipv4/ip_forward

Plug network cables into the two network adapters; the other ends of the two
cables are plugged into the same switch, so both adapters receive the same
broadcast packets.

Then use another machine to send large UDP broadcast packets, e.g. 10000 bytes.
Each packet is fragmented into multiple small IP packets, which are received by
both network adapters.
./udp  send 255.255.255.255 8899 10000
This sends UDP broadcasts to port 8899 with a size of 10000 bytes.
The source code is in the attachment (udp.c, reproduced below).

Then you will see panic messages on the terminal:
CPU 0 Unable to handle kernel paging request at virtual address
0000000000000018, epc == ffffffff8071c490, ra == ffffffff8078
Oops[#1]:
Cpu 0
$ 0   : 0000000000000000 ffffffffcfffffff 0000000000000020 0000000000000001
$ 4   : 980000009e1ce6a0 0000000000000001 0000000000000002 0000000000000000
$ 8   : 980000009e18e000 0000000000000000 ffffffff8071c46c 0000000000000000
$12   : 0000000000000028 ffffffff80205e24 00000000000186a0 0000000000000000
$16   : 0000000000000000 980000009e1ce6a0 0000000000000000 ffffffff80716dc0
$20   : 0000000000000000 980000009e48a000 0000000000000000 980000009e1ce6a0
$24   : 0000000000000000 ffffffff80224a90
$28   : ffffffff808cc000 ffffffff808cfa50 0000000000000007 ffffffff8071d538
Hi    : 0000000000000ecf
Lo    : 00000000038f0000
epc   : ffffffff8071c490 br_nf_pre_routing_finish+0x24/0x5a4
    Not tainted
ra    : ffffffff8071d538 br_nf_pre_routing+0xaa0/0xae8
Status: 140044e3    KX SX UX KERNEL EXL IE
Cause : 10008808
BadVA : 0000000000000018
PrId  : 00006303 (ICT Loongson-2)
Modules linked in:
Process swapper (pid: 0, threadinfo=ffffffff808cc000, task=ffffffff808d0000,
tls=0000000000000000)
Stack : ffffffff80000000 ffffffff80683f68 980000009e1ce6a0 0000000000000000
        0000000000000000 ffffffff8071c46c 0000000000000000 980000009e18e000
        0000000000000000 0000000000000010 ffffffff806840a0 980000009b8bb460
        ffffffff80fc8020 ffffffff80000000 0000000000000000 980000009e1ce6a0
        ffffffff80000000 ffffffff80716dc0 0000000000000000 980000009e1ce6a0
        ffffffff80000000 ffffffff80716dc0 0000000000000000 ffffffff8071d538
        ffffffff808cfb90 ffffffff80fc82a0 ffffffff80000000 ffffffff80716dc0
        ffffffff80683f68 ffffffff80683f68 980000009e1ce6a0 0000000000000000
        0000000000000000 ffffffff80716dc0 0000000000000000 980000009e48a000
        0000000000000000 0000000000000038 ffffffff806840a0 0000000000000010
        ...
Call Trace:
[<ffffffff8071c490>] br_nf_pre_routing_finish+0x24/0x5a4
[<ffffffff8071d538>] br_nf_pre_routing+0xaa0/0xae8
[<ffffffff80683f68>] nf_iterate+0x94/0x108
[<ffffffff806840a0>] nf_hook_slow+0xc4/0x178
[<ffffffff807172f4>] br_handle_frame+0x2f0/0x348
[<ffffffff80658d2c>] netif_receive_skb+0x5bc/0x7b8
[<ffffffff8053e1e0>] e1000_clean_rx_irq+0x35c/0x4ac
[<ffffffff8053da30>] e1000_clean+0x528/0x83c
[<ffffffff806592e0>] net_rx_action+0x140/0x3e8
[<ffffffff80260498>] __do_softirq+0xb0/0x170
[<ffffffff802605c4>] do_softirq+0x6c/0xac
[<ffffffff80260904>] irq_exit+0x54/0xb8
[<ffffffff80224818>] plat_irq_dispatch+0x1b0/0x414
[<ffffffff80200424>] ret_from_irq+0x0/0x4
[<ffffffff802212bc>] ls2f_cpu_wait+0x5c/0x7c
[<ffffffff80225b0c>] cpu_idle+0x4c/0x7c


I added print statements to the kernel and found the reason for the panic.

Because both network adapters receive the broadcast packet, both copies are
passed to ip_defrag. When all the fragments have been collected, the last skb
runs ip_frag_reasm.

See the code below, from ip_frag_reasm:

456 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
457              struct net_device *dev)
458 {
459     struct iphdr *iph;
460     struct sk_buff *fp, *head = qp->q.fragments;
461     int len;
462     int ihlen;
463     int err;
464
465     ipq_kill(qp);
466
467     /* Make the one we just received the head. */
468     if (prev) {
469         head = prev->next;
470         fp = skb_clone(head, GFP_ATOMIC);
471         if (!fp)
472             goto out_nomem;
473
474         fp->next = head->next;
475         prev->next = fp;
476
477         skb_morph(head, qp->q.fragments);
478         head->next = qp->q.fragments->next;
479
480         kfree_skb(qp->q.fragments);
481         qp->q.fragments = head;
482     }
483

At line 477 the backtrace is as below:
1. dst->nf_bridge  = src->nf_bridge;
2. __nf_copy(new, old);
3. __copy_skb_header
4. __skb_clone
5. skb_morph
6.ip_frag_reasm
7.ip_frag_queue
8.ip_defrag

When the first packet on the queue (qp) is from eth1 and the last one is from
br0, then qp->q.fragments->nf_bridge is 0 and head->nf_bridge is a real
nf_bridge pointer.

But skb_morph frees nf_bridge and sets head->nf_bridge to
qp->q.fragments->nf_bridge, that is, 0.


537 static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff
*skb,
 538                       const struct net_device *in,
 539                       const struct net_device *out,
 540                       int (*okfn)(struct sk_buff *))
 541 {
...
 595     NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
 596         br_nf_pre_routing_finish);
...
}
 324 static int br_nf_pre_routing_finish(struct sk_buff *skb)
 325 {
 326     struct net_device *dev = skb->dev;
 327     struct iphdr *iph = ip_hdr(skb);
 328     struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 329     int err;
 330
 331     if (nf_bridge->mask & BRNF_PKT_TYPE) {
 332         skb->pkt_type = PACKET_OTHERHOST;
 333         nf_bridge->mask ^= BRNF_PKT_TYPE;
 334     }

So on return from netfilter, the kernel dies at line 331, because
skb->nf_bridge has been set to zero.

I do not know how to fix this bug, but I think it is a serious bug.

That is all!


The code below was changed by me to print debug info.


298 /* Add new segment to existing queue. */
299 static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
300 {
301     
...
436     if (dev) {
437         qp->iif = dev->ifindex;
438         //skb->dev = NULL; // for print debug reason
439     }
...

467 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
468              struct net_device *dev)
469 {
470     struct iphdr *iph;
471     struct sk_buff *fp, *head = qp->q.fragments;
472     int len;
473     int ihlen;
474     int err;
475 {struct nf_bridge_info   *oldbr;  //add code here to check bug reason
476 struct sk_buff *myskb=head;
477 oldbr=myskb->nf_bridge;
478
479     ipq_kill(qp);
480
481     /* Make the one we just received the head. */
482     if (prev) {
483         head = prev->next;
484         myskb=head;
485 oldbr=myskb->nf_bridge;
486 if(oldbr && !qp->q.fragments->nf_bridge)
487 {
488 if(myskb->dev)
489
printk("%s,head=%p,data=%p,networkheader=%x,ifindex=%x\n",myskb->dev->name,myskb->head,myskb->data,myskb->network_header,
    myskb->dev->ifindex);
490 print_ip_sym(myskb->dev->hard_start_xmit);
491 if(qp->q.fragments->dev)
492
printk("%s,head=%p,data=%p,networkheader=%x,ifindex=%x\n",qp->q.fragments->dev->name,qp->q.fragments->head,qp->q.fragment
    s->data,qp->q.fragments->network_header,qp->q.fragments->dev->ifindex);
493 print_ip_sym(qp->q.fragments->dev->hard_start_xmit);
494 }
495     

br0,head=980000009b7a9800,data=980000009b7a9834,networkheader=20,ifindex=4
[<ffffffff80714334>] br_dev_xmit+0x0/0xc0
eth1,head=980000009b8bf000,data=980000009b8bf034,networkheader=20,ifindex=3
[<ffffffff8053c1c8>] e1000_xmit_frame+0x0/0xf68
nf_bridge is 0 on ip_frag_reasm:504
CPU 0 Unable to handle kernel paging request at virtual address
0000000000000018, epc == ffffffff8071c490, ra == ffffffff8078
Oops[#1]:
Cpu 0
$ 0   : 0000000000000000 ffffffffcfffffff 0000000000000020 0000000000000001
$ 4   : 980000009e1ce6a0 0000000000000001 0000000000000002 0000000000000000
$ 8   : 980000009e18e000 0000000000000000 ffffffff8071c46c 0000000000000000
$12   : 0000000000000028 ffffffff80205e24 00000000000186a0 0000000000000000
$16   : 0000000000000000 980000009e1ce6a0 0000000000000000 ffffffff80716dc0
$20   : 0000000000000000 980000009e48a000 0000000000000000 980000009e1ce6a0
$24   : 0000000000000000 ffffffff80224a90
$28   : ffffffff808cc000 ffffffff808cfa50 0000000000000007 ffffffff8071d538
Hi    : 0000000000000ecf
Lo    : 00000000038f0000
epc   : ffffffff8071c490 br_nf_pre_routing_finish+0x24/0x5a4
    Not tainted
ra    : ffffffff8071d538 br_nf_pre_routing+0xaa0/0xae8
Status: 140044e3    KX SX UX KERNEL EXL IE
Cause : 10008808
BadVA : 0000000000000018
PrId  : 00006303 (ICT Loongson-2)
Modules linked in:
Process swapper (pid: 0, threadinfo=ffffffff808cc000, task=ffffffff808d0000,
tls=0000000000000000)
Stack : ffffffff80000000 ffffffff80683f68 980000009e1ce6a0 0000000000000000
        0000000000000000 ffffffff8071c46c 0000000000000000 980000009e18e000
        0000000000000000 0000000000000010 ffffffff806840a0 980000009b8bb460
        ffffffff80fc8020 ffffffff80000000 0000000000000000 980000009e1ce6a0
        ffffffff80000000 ffffffff80716dc0 0000000000000000 980000009e1ce6a0
        ffffffff80000000 ffffffff80716dc0 0000000000000000 ffffffff8071d538
        ffffffff808cfb90 ffffffff80fc82a0 ffffffff80000000 ffffffff80716dc0
        ffffffff80683f68 ffffffff80683f68 980000009e1ce6a0 0000000000000000
        0000000000000000 ffffffff80716dc0 0000000000000000 980000009e48a000
        0000000000000000 0000000000000038 ffffffff806840a0 0000000000000010
        ...
Call Trace:
[<ffffffff8071c490>] br_nf_pre_routing_finish+0x24/0x5a4
[<ffffffff8071d538>] br_nf_pre_routing+0xaa0/0xae8
[<ffffffff80683f68>] nf_iterate+0x94/0x108
[<ffffffff806840a0>] nf_hook_slow+0xc4/0x178
[<ffffffff807172f4>] br_handle_frame+0x2f0/0x348
[<ffffffff80658d2c>] netif_receive_skb+0x5bc/0x7b8
[<ffffffff8053e1e0>] e1000_clean_rx_irq+0x35c/0x4ac
[<ffffffff8053da30>] e1000_clean+0x528/0x83c
[<ffffffff806592e0>] net_rx_action+0x140/0x3e8
[<ffffffff80260498>] __do_softirq+0xb0/0x170
[<ffffffff802605c4>] do_softirq+0x6c/0xac
[<ffffffff80260904>] irq_exit+0x54/0xb8
[<ffffffff80224818>] plat_irq_dispatch+0x1b0/0x414
[<ffffffff80200424>] ret_from_irq+0x0/0x4
[<ffffffff802212bc>] ls2f_cpu_wait+0x5c/0x7c
[<ffffffff80225b0c>] cpu_idle+0x4c/0x7c


Code: ffb200a0  dc920098  9c8200bc <8e430018> dc8500d0  30630001  0080882d 
00a2802d  1060000d
Kernel panic - not syncing: Fatal exception in interrupt

udp.c

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/*
 * Receive mode: bind a UDP socket to the port given in argv[2] on all local
 * addresses and print the length of every datagram received.
 *
 * argc/argv: the (possibly shifted) command line; argv[2] is the port number,
 *            parsed with strtoul() base 0.
 *
 * Returns 0 once recvfrom() stops returning data (error or empty datagram),
 * or -1 when the port argument is missing or the socket cannot be set up.
 */
int server(int argc,char **argv)
{
	struct sockaddr_in addr;
	struct sockaddr from;
	socklen_t fromlen;
	char buf[10240];
	int len;
	int s;

	/* argv[2] is dereferenced below; refuse to run without it. */
	if (argc < 3)
		return -1;

	s = socket(AF_INET, SOCK_DGRAM, 0);
	if (s < 0) {
		perror("socket error");
		return -1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons((unsigned short)strtoul(argv[2], 0, 0));

	if (bind(s, (struct sockaddr *)&addr, sizeof(addr))) {
		perror("bind error");
		/* Nothing can arrive on an unbound socket; give up instead of blocking. */
		close(s);
		return -1;
	}

	for (;;) {
		/* fromlen is value-result: it must hold the buffer size on every call. */
		fromlen = sizeof(from);
		len = (int)recvfrom(s, buf, sizeof(buf), 0, &from, &fromlen);
		if (len <= 0)
			break;
		printf("len=%d\n", len);
	}

	close(s);
	return 0;
}

/*
 * Send mode: flood UDP datagrams of a fixed size to a destination until a
 * send fails (or the process is terminated from outside).
 *
 * argc/argv: the (possibly shifted) command line, exactly 5 entries:
 *            argv[2] = destination IPv4 address (broadcast allowed),
 *            argv[3] = destination port, argv[4] = payload size in bytes;
 *            the numbers are parsed with strtoul() base 0.
 *
 * Returns -1 on a bad argument count, an oversized payload, a socket()
 * failure or a sendto() error; returns 0 if sendto() reports 0 bytes sent.
 */
int client(int argc,char **argv)
{
	/* Largest UDP payload that fits in a single IPv4 datagram. */
	enum { MAX_UDP_PAYLOAD = 65507 };
	/* static: zero-filled, so no stack contents leak onto the wire. */
	static char buf[MAX_UDP_PAYLOAD];
	struct sockaddr_in addr;
	unsigned long sendlen;
	int so_broadcast = 1;
	int len;
	int s;

	/* Validate everything before a descriptor is opened, so nothing leaks. */
	if (argc != 5)
		return -1;

	sendlen = strtoul(argv[4], 0, 0);
	if (sendlen > sizeof(buf)) {
		fprintf(stderr, "size %lu exceeds the maximum UDP payload of %d bytes\n",
			sendlen, (int)MAX_UDP_PAYLOAD);
		return -1;
	}

	s = socket(AF_INET, SOCK_DGRAM, 0);
	if (s < 0) {
		perror("socket error");
		return -1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = inet_addr(argv[2]);
	addr.sin_port = htons((unsigned short)strtoul(argv[3], 0, 0));

	/* Only needed for broadcast destinations, so a failure is reported but not fatal. */
	if (setsockopt(s, SOL_SOCKET, SO_BROADCAST, &so_broadcast, sizeof(so_broadcast)))
		perror("setsockopt error");

	do {
		len = (int)sendto(s, buf, (size_t)sendlen, 0,
				  (struct sockaddr *)&addr, sizeof(addr));
	} while (len > 0);

	if (len < 0)
		perror("sendto error");

	close(s);
	return len < 0 ? -1 : 0;
}

/*
 * Signal handler: terminates the process with exit status 0.
 *
 * sig: number of the delivered signal; not inspected.
 *
 * NOTE(review): exit() is not on the POSIX async-signal-safe list; it is kept
 * here because it flushes buffered stdout, which _exit() would not do.
 */
void myexit(int sig)
{
	(void)sig;
	exit(0);
}
int main(int argc,char **argv)
{
if(argc<3){printf("usage:%s [-a] send ip port size |[-a] recv
port\n",argv[0]);return -1;}
if(!strncmp(argv[1],"-a",2))
{
signal(SIGALRM,myexit);
alarm(strtoul(&argv[1][2],0,0));
argc--;
argv++;
}

if(!strcmp(argv[1],"send")) client(argc,argv);
else server(argc,argv);
return 0;
}
Comment 1 Patrick McHardy 2009-12-14 12:08:44 UTC
Created attachment 24176 [details]
Put fragments originating from a bridge into their own defrag namespace

Thanks for the detailed description. Does this patch fix the problem?
Comment 2 Chong Qiao 2009-12-14 22:50:48 UTC
This is a smart patch. Yes, it fixes the problem. Thanks a lot!
Comment 3 Chong Qiao 2009-12-15 01:11:14 UTC
By the way, there is a typo in your patch:
+		if (nf_ct_ipv4_gather_frags(skb, user));
 			return NF_STOLEN;
should be
+		if (nf_ct_ipv4_gather_frags(skb, user))
 			return NF_STOLEN;
:)
Comment 4 Patrick McHardy 2009-12-15 15:02:08 UTC
Thanks, I'll fix it up and will send it upstream.