Bug 202081

Summary: [REGRESSION] nvme-pci: Deadlock in nvme_remove on SURPRISE!!! removal
Product: Drivers    Reporter: Alex G. (mr.nuke.me)
Component: Other    Assignee: drivers_other
Status: NEW ---    
Severity: normal CC: austin.bolen, bjorn, kbusch, lukas, stuart.w.hayes
Priority: P1    
Hardware: All   
OS: Linux   
Kernel Version: 4.20 Subsystem:
Regression: No Bisected commit-id:
Attachments: System log as problem happens (ANSI color sequences -- use cat or less to read)
attachment-28040-0.html
Modifications to pciehp_isr/ist for debug info

Description Alex G. 2018-12-27 19:47:52 UTC
Created attachment 280165 [details]
System log as problem happens (ANSI color sequences -- use cat or less to read)

On SURPRISE!!! removal of the underlying PCIe device, the nvme driver deadlocks in nvme_remove.

The pciehp_ist thread calls nvme_remove, which then deadlocks. No further hotplug interrupts can be processed from the upstream port. The kernel can no longer respond to a new device being added.

pciehp_isr still fires when the hardware sends an interrupt, but the event never reaches pciehp_ist. This was not a problem in v4.19.

# ls /sys/bus/pci/devices/0000\:b0\:04.0/msi_irqs/
58

# ps aux |grep 58-pcie
root       539  0.0  0.0      0     0 ?        D    13:31   0:00 [irq/58-pciehp]

# sudo cat /proc/539/stack
[<0>] __flush_work+0x11a/0x1d0
[<0>] nvme_remove+0x50/0x170 [nvme]
[<0>] pci_device_remove+0x3b/0xc0
[<0>] device_release_driver_internal+0x180/0x240
[<0>] pci_stop_bus_device+0x69/0x90
[<0>] pci_stop_and_remove_bus_device+0xe/0x20
[<0>] pciehp_unconfigure_device+0x84/0x140
[<0>] pciehp_disable_slot+0x67/0x110
[<0>] pciehp_handle_presence_or_link_change+0x25f/0x400
[<0>] pciehp_ist+0x1bb/0x1c0
[<0>] irq_thread_fn+0x1f/0x50
[<0>] irq_thread+0xe7/0x170
[<0>] kthread+0x112/0x130
[<0>] ret_from_fork+0x35/0x40
[<0>] 0xffffffffffffffff
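
The stack suggests (my reading, simplified; not the exact driver code) that nvme_remove() is blocked flushing the controller's reset work, and __flush_work() cannot return until that work item finishes -- so if reset_work is itself stuck waiting on the removed device, the pciehp_ist thread hangs with it:

	/* Paraphrased sketch of the blocked path in nvme_remove(): */
	static void nvme_remove(struct pci_dev *pdev)
	{
		struct nvme_dev *dev = pci_get_drvdata(pdev);

		...
		flush_work(&dev->ctrl.reset_work);	/* the __flush_work() frame in the stack above */
		...					/* never reached if reset_work never completes */
	}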
Comment 1 Keith Busch 2018-12-27 19:48:23 UTC
Created attachment 280167 [details]
attachment-28040-0.html

Thank you for the message. I am out of the office with intermittent internet access until December 28; responses will be delayed.
Comment 2 Alex G. 2019-01-03 22:49:57 UTC
I enabled tracing, yanked out one drive, and it seems that there is an admin command that doesn't get back a completion:

# echo 1 > /sys/kernel/debug/tracing/events/nvme/enable 

# cat /sys/kernel/debug/tracing/trace
# tracer: nop
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
  kworker/u130:3-904   [033] ....  1286.523559: nvme_setup_cmd: nvme4: qid=0, cmdid=5, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_identify cns=1, ctrlid=0)
          <idle>-0     [063] ..s.  1286.523710: nvme_complete_rq: nvme4: qid=0, cmdid=5, res=0, retries=0, flags=0x0, status=0
  kworker/u130:3-904   [033] ....  1286.523733: nvme_setup_cmd: nvme4: qid=0, cmdid=5, nsid=4294967295, flags=0x0, meta=0x0, cmd=(nvme_admin_get_log_page cdw10=05 00 ff 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00)
          <idle>-0     [063] ..s.  1286.523765: nvme_complete_rq: nvme4: qid=0, cmdid=5, res=0, retries=0, flags=0x0, status=0
  kworker/u130:3-904   [033] ....  1286.523790: nvme_setup_cmd: nvme4: qid=0, cmdid=5, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features cdw10=07 00 00 00 3f 00 3f 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00)
          <idle>-0     [063] ..s.  1286.523806: nvme_complete_rq: nvme4: qid=0, cmdid=5, res=8323199, retries=0, flags=0x0, status=0
  kworker/u130:3-904   [033] ....  1286.526503: nvme_setup_cmd: nvme4: qid=0, cmdid=5, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_create_cq cqid=1, qsize=1023, cq_flags=0x3, irq_vector=1)
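
The last nvme_admin_create_cq above never gets a matching nvme_complete_rq. Admin commands on this path are submitted synchronously, roughly like the sketch below (a simplification, not the exact nvme_core code), so a lost completion parks the submitting worker until the blk-mq timeout handler fires -- or forever, if it doesn't:

	/* Rough shape of __nvme_submit_sync_cmd() (simplified sketch): */
	req = nvme_alloc_request(q, cmd, flags, qid);
	if (IS_ERR(req))
		return PTR_ERR(req);

	blk_execute_rq(req->q, NULL, req, at_head);	/* waits for a completion that never arrives */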
Comment 3 Alex G. 2019-01-18 22:42:26 UTC
Simpler mode of failure:
 * Drive is disconnected; wait a few seconds
 * Drive is re-connected

#  cat /sys/kernel/debug/tracing/trace
# tracer: nop
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
  kworker/u162:5-1078  [013] ....   331.236313: nvme_setup_cmd: nvme4: qid=0, cmdid=12, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_identify cns=1, ctrlid=0)

# ps aux |grep D[[:space:]]
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root       626  0.0  0.0      0     0 ?        D    15:52   0:00 [irq/44-pciehp]
root      1066  0.0  0.0      0     0 ?        D    15:53   0:00 [kworker/u162:2+events_unbound]
root      1078  0.0  0.0      0     0 ?        D    15:53   0:00 [kworker/u162:5+nvme-reset-wq]

# cat /proc/1066/stack 
[<0>] __flush_work+0x11a/0x1d0
[<0>] nvme_reset_ctrl_sync+0x26/0x40 [nvme_core]
[<0>] nvme_async_probe+0x15/0x30 [nvme]
[<0>] async_run_entry_fn+0x39/0x160
[<0>] process_one_work+0x1a1/0x3a0
[<0>] worker_thread+0x30/0x380
[<0>] kthread+0x112/0x130
[<0>] ret_from_fork+0x35/0x40
[<0>] 0xffffffffffffffff

# cat /proc/1078/stack 
[<0>] blk_execute_rq+0x69/0xa0
[<0>] __nvme_submit_sync_cmd+0xaf/0x1c0 [nvme_core]
[<0>] nvme_identify_ctrl.isra.40+0x7e/0xc0 [nvme_core]
[<0>] nvme_init_identify+0x8e/0xdf0 [nvme_core]
[<0>] nvme_reset_work+0x46e/0x17a0 [nvme]
[<0>] process_one_work+0x1a1/0x3a0
[<0>] worker_thread+0x1c9/0x380
[<0>] kthread+0x112/0x130
[<0>] ret_from_fork+0x35/0x40
[<0>] 0xffffffffffffffff

# cat /proc/626/stack 
[<0>] __flush_work+0x11a/0x1d0
[<0>] nvme_remove+0x50/0x170 [nvme]
[<0>] pci_device_remove+0x3b/0xc0
[<0>] device_release_driver_internal+0x183/0x240
[<0>] pci_stop_bus_device+0x69/0x90
[<0>] pci_stop_and_remove_bus_device+0xe/0x20
[<0>] pciehp_unconfigure_device+0x84/0x140
[<0>] pciehp_disable_slot+0x67/0x110
[<0>] pciehp_handle_presence_or_link_change+0x25f/0x400
[<0>] pciehp_ist+0x1bb/0x1c0
[<0>] irq_thread_fn+0x1f/0x60
[<0>] irq_thread+0xe7/0x170
[<0>] kthread+0x112/0x130
[<0>] ret_from_fork+0x35/0x40
[<0>] 0xffffffffffffffff
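
Reading the three stacks together, the dependency chain appears to be (my interpretation):

  irq/44-pciehp (626)    nvme_remove() -> flush_work(reset_work)         waits on PID 1078
  kworker/u162:2 (1066)  nvme_async_probe() -> nvme_reset_ctrl_sync()    waits on PID 1078
  kworker/u162:5 (1078)  nvme_reset_work() -> nvme_init_identify()
                           -> blk_execute_rq()                           waits on a completion the
                                                                         yanked device will never send

Everything funnels into one synchronous admin command against a device that is no longer there.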
Comment 4 Keith Busch 2019-01-18 23:41:39 UTC
I think you must be removing the device almost immediately after the controller said it was ready, but before the driver set up the admin tags. There might be a gap where we try to tear down the admin queue resources before they are completely initialized.
Comment 5 Alex G. 2019-01-21 17:36:25 UTC
Everything seems to work fine on removal. Then, when the drive is reinserted, the kernel sees a Link Up event followed by a Card not present interrupt.

[237028.914948] pciehp 0000:b0:04.0:pcie204: Slot(178): Link Up
[...]
[237029.216339] pcieport 0000:b0:04.0: PCI bridge to [bus b1]
[...]
[237029.218130] nvme nvme4: pci function 0000:b1:00.0
[237029.218161] nvme 0000:b1:00.0: enabling device (0000 -> 0002)
[237029.313898] pciehp 0000:b0:04.0:pcie204: Slot(178): Card not present

I'm going to try to print out the hardware bits in the isr, because the current sequence of events doesn't make sense.
Comment 6 Alex G. 2019-01-22 00:56:57 UTC
This may be an interaction issue with pciehp. There's a "card not present" event, or at least the kernel thinks so.
But! The trigger for that event has the "presence detect" bit set... and we're still interpreting it as a "card not present" event.

[  838.784439] pcieport 0000:b0:04.0: ZOPA: Status(0x00000000):
[  838.784447] pcieport 0000:b0:04.0: ZOPA: Events(0x00000100): DLL-changed
[  838.784454] pcieport 0000:b0:04.0: ZOPA: Queued up(0x00000100): DLL-changed
[  838.784457] pcieport 0000:b0:04.0: ZOPA: pciehp_isr: exiting
[  838.784524] pcieport 0000:b0:04.0: ZOPA: pciehp_ist: Entered with events(0x00000100): DLL-changed
[  838.784529] pciehp 0000:b0:04.0:pcie204: ZOPA: pciehp_handle_presence_or_link_change: locked &ctrl->state_lock
[  838.784532] pciehp 0000:b0:04.0:pcie204: ZOPA: pciehp_handle_presence_or_link_change unlocked (no action)
[  838.784538] pciehp 0000:b0:04.0:pcie204: ZOPA: pciehp_handle_presence_or_link_change unlocked (POWERON)
[  838.784541] pciehp 0000:b0:04.0:pcie204: Slot(178): Link Up
[...]
[  839.093787] nvme 0000:b1:00.0: enabling device (0000 -> 0002)
[  839.093984] pcieport 0000:b0:04.0: ZOPA: pciehp_ist handled
[  839.183445] pcieport 0000:b0:04.0: ZOPA: Status(0x00000040): presence-detected
[  839.183452] pcieport 0000:b0:04.0: ZOPA: Events(0x00000008): presence-detect-changed
[  839.183458] pcieport 0000:b0:04.0: ZOPA: Queued up(0x00000008): presence-detect-changed
[  839.183461] pcieport 0000:b0:04.0: ZOPA: pciehp_isr: exiting
[  839.183494] pcieport 0000:b0:04.0: ZOPA: pciehp_ist: Entered with events(0x00000008): presence-detect-changed
[  839.183499] pciehp 0000:b0:04.0:pcie204: ZOPA: pciehp_handle_presence_or_link_change: locked &ctrl->state_lock
[  839.183503] pciehp 0000:b0:04.0:pcie204: ZOPA: pciehp_handle_presence_or_link_change unlocked (POWEROFF)
[  839.183506] pciehp 0000:b0:04.0:pcie204: Slot(178): Card not present
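
For reference, "Status" in the lines above is the live presence bit from the Slot Status register and "Events" are the latched change bits; the debug patch (attached in the next comment) prints them roughly like this sketch (simplified, bit names from pci_regs.h):

	u16 status;

	pcie_capability_read_word(ctrl->pcie->port, PCI_EXP_SLTSTA, &status);

	present = !!(status & PCI_EXP_SLTSTA_PDS);	/* "Status": slot currently occupied       */
	events  = status & (PCI_EXP_SLTSTA_PDC |	/* "Events": latched change notifications,  */
			    PCI_EXP_SLTSTA_DLLSC);	/* handled later by pciehp_ist()            */

So the hardware reports the slot as occupied (PDS set) even while the presence-detect-changed event is being turned into a "Card not present" action.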
Comment 7 Alex G. 2019-01-22 16:50:40 UTC
Created attachment 280671 [details]
Modifications to pciehp_isr/ist for debug info
Comment 8 Keith Busch 2019-01-22 16:52:06 UTC
Okay, so the slot is notifying us of a PDC (presence detect changed) event, but the Link Up event set the slot state to ON_STATE before the PDC event was seen, so the subsequent PDC handler assumes something changed. That's not good; we should see both events together.
Comment 9 Keith Busch 2019-01-22 19:37:09 UTC
I looked a little more. I think there are two problems. The first is that nvme shouldn't lock up no matter what happens, so we'll need to fix that driver. The second is how to handle a Link Up followed by a PDC event. The only thing I can think of is to add some arbitrary debouncing time.

The first issue has priority, IMO.
Comment 10 Alex G. 2019-01-22 20:01:15 UTC
I agree. It would be great if you could look into the lockup issue.

As for the second issue, I think it's possible for PD (presence detect) to come up after the data link layer. IIRC there's a 500ms hardware debouncer on PD, hence the delay.
PD *should* be a logical OR of in-band and out-of-band presence, but neither Dell nor HPE follow that. I'll look into the hotplug part and see if I can make it more resilient.
Comment 12 Keith Busch 2019-01-22 20:47:35 UTC
Could we try the following patch?

---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9bc585415d9b..022ea1ee63f8 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2557,16 +2557,7 @@ static void nvme_reset_work(struct work_struct *work)
 	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);
 
-	/*
-	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
-	 * initializing procedure here.
-	 */
-	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
-		dev_warn(dev->ctrl.device,
-			"failed to mark controller CONNECTING\n");
-		goto out;
-	}
-
+	mutex_lock(&dev->shutdown_lock);
 	result = nvme_pci_enable(dev);
 	if (result)
 		goto out;
@@ -2585,6 +2576,17 @@ static void nvme_reset_work(struct work_struct *work)
 	 */
 	dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1;
 	dev->ctrl.max_segments = NVME_MAX_SEGS;
+	mutex_unlock(&dev->shutdown_lock);
+
+	/*
+	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
+	 * initializing procedure here.
+	 */
+	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
+		dev_warn(dev->ctrl.device,
+			"failed to mark controller CONNECTING\n");
+		goto out;
+	}
 
 	result = nvme_init_identify(&dev->ctrl);
 	if (result)
--
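
For context (paraphrased, not the exact upstream code): nvme_dev_disable() already runs under dev->shutdown_lock, so holding the same mutex across nvme_pci_enable() serializes controller bring-up against a concurrent teardown from surprise removal:

	/* nvme_dev_disable(), roughly: */
	static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
	{
		mutex_lock(&dev->shutdown_lock);
		...				/* quiesce queues, free IRQs, disable the PCI device */
		mutex_unlock(&dev->shutdown_lock);
	}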
Comment 13 Keith Busch 2019-01-23 18:56:02 UTC
I was wondering why the blk-mq timeout handler isn't kicking in, and it looks like it's because of a silly PowerPC workaround added to the nvme pci timeout handler, where we don't do anything to recover if the PCI channel isn't online -- and it isn't online during a device removal ...
Comment 14 Alex G. 2019-01-23 23:45:25 UTC
I tried the patch, and it seems things are back to normal.
Comment 15 Keith Busch 2019-01-23 23:47:33 UTC
Thanks. I think it will take me longer to write a solid changelog than the patch itself :)

I'll get this posted by tomorrow and try to make a case for 5.0-rc4.
Comment 16 Keith Busch 2019-01-29 17:10:50 UTC
(In reply to Alex G. from comment #14)
> I tried the patch, and it seems things are back to normal.

Could you possibly reply to the patch with a Tested-by? Just want to get some attention on it since no one's replied to it yet. :(

http://lists.infradead.org/pipermail/linux-nvme/2019-January/022096.html
Comment 17 stuart hayes 2020-02-07 19:25:33 UTC
I'm still seeing failures with an Intel P4800 (Optane) NVMe drive when it gets disconnected before reset_work finishes setting up the drive.

The patch below makes more use of the shutdown_lock and should abort the setup if the device has started to disconnect. I can't reproduce a failure with this patch.

Keith--do you see any issues with it?  I could reproduce the issue I see (running without this patch) a few times and post the kernel logs, if it would help.  Thanks!


diff -pur linux-4.18.0-135.el8.x86_64.unmodified/drivers/nvme/host/pci.c linux-4.18.0-135.el8.x86_64.nvmemutexpatch/drivers/nvme/host/pci.c
--- linux-4.18.0-135.el8.x86_64.unmodified/drivers/nvme/host/pci.c	2019-08-16 15:02:40.000000000 -0400
+++ linux-4.18.0-135.el8.x86_64.nvmemutexpatch/drivers/nvme/host/pci.c	2019-09-24 16:35:52.910077111 -0400
@@ -1585,7 +1585,11 @@ static int nvme_create_queue(struct nvme
 	nvme_init_queue(nvmeq, qid);
 
 	if (vector != -1) {
-		result = queue_request_irq(nvmeq);
+		result = -EIO;
+		if (mutex_trylock(&dev->shutdown_lock)) {
+			result = queue_request_irq(nvmeq);
+			mutex_unlock(&dev->shutdown_lock);
+		}
 		if (result < 0)
 			goto release_sq;
 	}
@@ -2153,6 +2157,19 @@ static int nvme_setup_io_queues(struct n
 	adminq->q_db = dev->dbs;
 
  retry:
+	/*
+	 * Try to obtain shutdown_lock mutex, and abort if we can't--that
+	 * means nvme_dev_disable() is already running.
+	 * This will prevent nvme_dev_disable() from also freeing these IRQs
+	 * (we get an ugly warning if we try to free them twice).
+	 * Also, queue_request_irq() would race with nvme_dev_disable()
+	 * because one sets up IRQs while the other tears them down, which
+	 * can trigger a BUG_ON(irq_has_action()) in free_msi_irqs().
+	 */
+	if (!mutex_trylock(&dev->shutdown_lock)) {
+		return -EIO;
+	}
+
 	/* Deregister the admin queue's interrupt */
 	pci_free_irq(pdev, 0, adminq);
 
@@ -2163,8 +2180,10 @@ static int nvme_setup_io_queues(struct n
 	pci_free_irq_vectors(pdev);
 
 	result = nvme_setup_irqs(dev, nr_io_queues);
-	if (result <= 0)
+	if (result <= 0) {
+		mutex_unlock(&dev->shutdown_lock);
 		return -EIO;
+	}
 
 	dev->num_vecs = result;
 	result = max(result - 1, 1);
@@ -2177,6 +2196,9 @@ static int nvme_setup_io_queues(struct n
 	 * number of interrupts.
 	 */
 	result = queue_request_irq(adminq);
+
+	mutex_unlock(&dev->shutdown_lock);
+
 	if (result) {
 		adminq->cq_vector = -1;
 		return result;
Comment 18 Keith Busch 2020-02-07 19:38:13 UTC
I suspect you'll encounter the same problem if the shutdown thread completes fast enough that the "mutex_trylock()" in nvme_create_io_queues succeeds.
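
To spell out the window (a sketch of the interleaving, not actual code):

  reset/setup path                        removal path
  ----------------                        ------------
                                          nvme_dev_disable()
                                            mutex_lock(&dev->shutdown_lock)
                                            ... frees vectors and IRQs ...
                                            mutex_unlock(&dev->shutdown_lock)
  mutex_trylock(&dev->shutdown_lock)      /* succeeds -- teardown already finished */
  queue_request_irq(nvmeq)                /* sets up an IRQ on a device that is already gone */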