Bug 4

Summary: Dcache spirals out of control on 2.5.43-mm2
Product: IO/Storage Reporter: Martin J. Bligh (mbligh)
Component: OtherAssignee: Martin J. Bligh (mbligh)
Status: CLOSED PATCH_ALREADY_AVAILABLE    
Severity: blocking    
Priority: P2    
Hardware: IA-32   
OS: Linux   
Kernel Version: Tree: Mainline
Regression: ---

Description Martin J. Bligh 2002-11-13 19:05:57 UTC
Exact Kernel version: 2.5.43-mm2
Distribution: Debian woody
Harware Environment: 16x NUMA-Q

My big NUMA box went OOM over the weekend and started killing things
for no good reason (2.5.43-mm2). Probably running some background
updatedb for locate thing, not doing any real work.

meminfo:

MemTotal:     16077728 kB
MemFree:      14950708 kB
MemShared:           0 kB
Buffers:           492 kB
Cached:         384976 kB
SwapCached:          0 kB
Active:         372608 kB
Inactive:        13380 kB
HighTotal:    15335424 kB
HighFree:     14949000 kB
LowTotal:       742304 kB
LowFree:          1708 kB
SwapTotal:           0 kB
SwapFree:            0 kB
Dirty:               0 kB
Writeback:           0 kB
Mapped:           2248 kB
Slab:           724744 kB
Reserved:       570464 kB
Committed_AS:     1100 kB
PageTables:        140 kB
ReverseMaps:      1518

Big things out of slabinfo:

ext2_inode_cache  554556 554598    416 61622 61622    1 :  120   60
dentry_cache      2791320 2791320    160 116305 116305    1 :  248  124

By my reckoning, that's over 450Mb of dentry cache that's refusing to shrink
under pressure. ext2_inode_cache ain't exactly anorexic either. Hmmm ..... 
Any good ways to debug this?
Comment 1 Martin J. Bligh 2002-11-13 19:09:55 UTC
This turned out to be a bug somewhere in RCU where we never hit a quiescent
period. Now fixed in later -mm's. Per Dipankar:

> Something broke.

Yes, in RCU and ever since interrupt counters were changed to use
thread_info->preempt_count :)

There is a check in RCU for idle CPUs which signifies quiescent
state (and hence no reference to RCU protected data) which was
broken -

--- kernel/rcupdate.c	Sat Oct 19 09:31:07 2002
+++ /tmp/rcupdate.c.mod	Tue Oct 22 21:20:07 2002
@@ -192,7 +192,8 @@
 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user || 
-	    (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= 1))
+	    (idle_cpu(cpu) && !in_softirq() && 
+				hardirq_count() <= (1 << HARDIRQ_SHIFT)))
 		RCU_qsctr(cpu)++;
 	tasklet_schedule(&RCU_tasklet(cpu));
 }

Martin's machine with large number of CPUs which were idle was not
completing any RCU grace period because RCU was forever waiting
for idle CPUs to context switch. Had the idle check worked, this
would not have happened. With no RCU happening, the dentries were
getting "freed" (dentry stats showing that) but not getting returned
to slab. This would not show up in systems that are generally busy
as context switches then would happen in all CPUs and the per-CPU
quiescent state counter would get incremented during context switch.

I have included here a patch that adds statistics to RCU which are
very helpful in detecting problems. That patch also includes the
idle CPU check fix. The stats are in /proc/rcu. I still need to
check if seq_file access of a /proc file is serialized or not.

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.


diff -ruN linux-2.5.44-base/fs/proc/proc_misc.c linux-2.5.44-
rcu/fs/proc/proc_misc.c
--- linux-2.5.44-base/fs/proc/proc_misc.c	Sat Oct 19 09:31:14 2002
+++ linux-2.5.44-rcu/fs/proc/proc_misc.c	Tue Oct 22 18:47:00 2002
@@ -253,6 +253,18 @@
 	.release	= seq_release,
 };
 
+extern struct seq_operations rcu_op;
+static int rcu_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &rcu_op);
+}
+static struct file_operations proc_rcu_operations = {
+	.open		= rcu_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 extern struct seq_operations vmstat_op;
 static int vmstat_open(struct inode *inode, struct file *file)
 {
@@ -631,6 +643,7 @@
 	if (entry)
 		entry->proc_fops = &proc_kmsg_operations;
 	create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
+	create_seq_entry("rcu", 0, &proc_rcu_operations);
 	create_seq_entry("partitions", 0, &proc_partitions_operations);
 #if !defined(CONFIG_ARCH_S390)
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
diff -ruN linux-2.5.44-base/include/linux/rcupdate.h linux-2.5.44-
rcu/include/linux/rcupdate.h
--- linux-2.5.44-base/include/linux/rcupdate.h	Sat Oct 19 09:31:18 2002
+++ linux-2.5.44-rcu/include/linux/rcupdate.h	Tue Oct 22 18:47:00 2002
@@ -68,6 +68,7 @@
 	long		maxbatch;	/* Max requested batch number.        */
 	unsigned long	rcu_cpu_mask; 	/* CPUs that need to switch in order  */
 					/* for current batch to proceed.      */
+	long		nr_batches;
 };
 
 /* Is batch a before batch b ? */
@@ -94,6 +95,10 @@
         long  	       	batch;           /* Batch # for current RCU batch */
         struct list_head  nxtlist;
         struct list_head  curlist;
+	long		nr_newreqs;
+	long		nr_curreqs;
+	long		nr_pendreqs;
+	long		nr_rcupdates;
 } ____cacheline_aligned_in_smp;
 
 extern struct rcu_data rcu_data[NR_CPUS];
@@ -104,6 +109,10 @@
 #define RCU_batch(cpu) 		(rcu_data[(cpu)].batch)
 #define RCU_nxtlist(cpu) 	(rcu_data[(cpu)].nxtlist)
 #define RCU_curlist(cpu) 	(rcu_data[(cpu)].curlist)
+#define RCU_nr_newreqs(cpu) 	(rcu_data[(cpu)].nr_newreqs)
+#define RCU_nr_curreqs(cpu) 	(rcu_data[(cpu)].nr_curreqs)
+#define RCU_nr_pendreqs(cpu) 	(rcu_data[(cpu)].nr_pendreqs)
+#define RCU_nr_rcupdates(cpu) 	(rcu_data[(cpu)].nr_rcupdates)
 
 #define RCU_QSCTR_INVALID	0
 
diff -ruN linux-2.5.44-base/kernel/rcupdate.c linux-2.5.44-rcu/kernel/rcupdate.c
--- linux-2.5.44-base/kernel/rcupdate.c	Sat Oct 19 09:31:07 2002
+++ linux-2.5.44-rcu/kernel/rcupdate.c	Tue Oct 22 18:54:04 2002
@@ -41,6 +41,7 @@
 #include <linux/module.h>
 #include <linux/completion.h>
 #include <linux/percpu.h>
+#include <linux/seq_file.h>
 #include <linux/rcupdate.h>
 
 /* Definition for rcupdate control block. */
@@ -74,6 +75,7 @@
 	local_irq_save(flags);
 	cpu = smp_processor_id();
 	list_add_tail(&head->list, &RCU_nxtlist(cpu));
+	RCU_nr_newreqs(cpu)++;
 	local_irq_restore(flags);
 }
 
@@ -81,7 +83,7 @@
  * Invoke the completed RCU callbacks. They are expected to be in
  * a per-cpu list.
  */
-static void rcu_do_batch(struct list_head *list)
+static void rcu_do_batch(int cpu, struct list_head *list)
 {
 	struct list_head *entry;
 	struct rcu_head *head;
@@ -91,7 +93,9 @@
 		list_del(entry);
 		head = list_entry(entry, struct rcu_head, list);
 		head->func(head->arg);
+		RCU_nr_rcupdates(cpu)++;
 	}
+	RCU_nr_pendreqs(cpu) = 0;
 }
 
 /*
@@ -99,7 +103,7 @@
  * active batch and the batch to be registered has not already occurred.
  * Caller must hold the rcu_ctrlblk lock.
  */
-static void rcu_start_batch(long newbatch)
+static void rcu_start_batch(int cpu, long newbatch)
 {
 	if (rcu_batch_before(rcu_ctrlblk.maxbatch, newbatch)) {
 		rcu_ctrlblk.maxbatch = newbatch;
@@ -109,6 +113,8 @@
 		return;
 	}
 	rcu_ctrlblk.rcu_cpu_mask = cpu_online_map;
+	RCU_nr_pendreqs(cpu) = RCU_nr_curreqs(cpu);
+	RCU_nr_curreqs(cpu) = 0;
 }
 
 /*
@@ -149,7 +155,8 @@
 		return;
 	}
 	rcu_ctrlblk.curbatch++;
-	rcu_start_batch(rcu_ctrlblk.maxbatch);
+	rcu_ctrlblk.nr_batches++;
+	rcu_start_batch(cpu, rcu_ctrlblk.maxbatch);
 	spin_unlock(&rcu_ctrlblk.mutex);
 }
 
@@ -172,6 +179,8 @@
 	if (!list_empty(&RCU_nxtlist(cpu)) && list_empty(&RCU_curlist(cpu))) {
 		list_splice(&RCU_nxtlist(cpu), &RCU_curlist(cpu));
 		INIT_LIST_HEAD(&RCU_nxtlist(cpu));
+		RCU_nr_curreqs(cpu) = RCU_nr_newreqs(cpu);
+		RCU_nr_newreqs(cpu) = 0;
 		local_irq_enable();
 
 		/*
@@ -179,20 +188,21 @@
 		 */
 		spin_lock(&rcu_ctrlblk.mutex);
 		RCU_batch(cpu) = rcu_ctrlblk.curbatch + 1;
-		rcu_start_batch(RCU_batch(cpu));
+		rcu_start_batch(cpu, RCU_batch(cpu));
 		spin_unlock(&rcu_ctrlblk.mutex);
 	} else {
 		local_irq_enable();
 	}
 	rcu_check_quiescent_state();
 	if (!list_empty(&list))
-		rcu_do_batch(&list);
+		rcu_do_batch(cpu, &list);
 }
 
 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user || 
-	    (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= 1))
+	    (idle_cpu(cpu) && !in_softirq() && 
+				hardirq_count() <= (1 << HARDIRQ_SHIFT)))
 		RCU_qsctr(cpu)++;
 	tasklet_schedule(&RCU_tasklet(cpu));
 }
@@ -240,3 +250,60 @@
 
 EXPORT_SYMBOL(call_rcu);
 EXPORT_SYMBOL(synchronize_kernel);
+
+#ifdef	CONFIG_PROC_FS
+
+static void *rcu_start(struct seq_file *m, loff_t *pos)
+{
+	static int cpu;
+	cpu = *pos;
+	return *pos < NR_CPUS ? &cpu : NULL;
+}
+		
+static void *rcu_next(struct seq_file *m, void *v, loff_t *pos) 
+{
+	++*pos;
+	return rcu_start(m, pos);
+
+}
+
+static void rcu_stop(struct seq_file *m, void *v)
+{
+}
+
+static int show_rcu(struct seq_file *m, void *v)
+{
+	int cpu = *(int *)v;
+
+	if (!cpu_online(cpu))
+		return 0;
+	if (cpu == 0) {
+		seq_printf(m, "RCU Current Batch : %ld\n",
+				rcu_ctrlblk.curbatch);
+		seq_printf(m, "RCU Max Batch : %ld\n", 
+				rcu_ctrlblk.maxbatch);
+		seq_printf(m, "RCU Global Cpumask : 0x%lx\n",
+				rcu_ctrlblk.rcu_cpu_mask);
+		seq_printf(m, "RCU Total Batches : %ld\n\n\n", 
+				rcu_ctrlblk.nr_batches);
+	}
+	seq_printf(m, "CPU : %d\n", cpu);
+	seq_printf(m, "RCU qsctr : %ld\n", RCU_qsctr(cpu));
+	seq_printf(m, "RCU last qsctr : %ld\n", RCU_last_qsctr(cpu));
+	seq_printf(m, "RCU batch : %ld\n", RCU_batch(cpu));
+	seq_printf(m, "RCU new requests : %ld\n", RCU_nr_newreqs(cpu));
+	seq_printf(m, "RCU current requests : %ld\n", RCU_nr_curreqs(cpu));
+	seq_printf(m, "RCU pending requests : %ld\n", RCU_nr_pendreqs(cpu));
+	seq_printf(m, "RCU updated : %ld\n\n\n", RCU_nr_rcupdates(cpu));
+	return 0;
+}
+
+struct seq_operations rcu_op = {
+	.start	= rcu_start,
+	.next	= rcu_next,
+	.stop	= rcu_stop,
+	.show	= show_rcu,
+};
+
+#endif
+