From be23e837333a914df3f24bf0b32e87b0331ab8d1 Mon Sep 17 00:00:00 2001
From: Zhiqiang Liu <liuzhiqiang26@huawei.com>
Date: Mon, 15 Jun 2020 00:53:30 +0800
Subject: [PATCH 01/13] bcache: fix potential deadlock problem in
 btree_gc_coalesce

coccicheck reports:
  drivers/md//bcache/btree.c:1538:1-7: preceding lock on line 1417

In btree_gc_coalesce func, if the coalescing process fails, we will goto
to out_nocoalesce tag directly without releasing new_nodes[i]->write_lock.
Then, it will cause a deadlock when trying to acquire new_nodes[i]->
write_lock for freeing new_nodes[i] before return.

btree_gc_coalesce func details as follows:
	if alloc new_nodes[i] fails:
		goto out_nocoalesce;
	// obtain new_nodes[i]->write_lock
	mutex_lock(&new_nodes[i]->write_lock)
	// main coalescing process
	for (i = nodes - 1; i > 0; --i)
		[snipped]
		if coalescing process fails:
			// Here, directly goto out_nocoalesce
			 // tag will cause a deadlock
			goto out_nocoalesce;
		[snipped]
	// release new_nodes[i]->write_lock
	mutex_unlock(&new_nodes[i]->write_lock)
	// coalesing succ, return
	return;
out_nocoalesce:
	btree_node_free(new_nodes[i])	// free new_nodes[i]
	// obtain new_nodes[i]->write_lock
	mutex_lock(&new_nodes[i]->write_lock);
	// set flag for reuse
	clear_bit(BTREE_NODE_dirty, &ew_nodes[i]->flags);
	// release new_nodes[i]->write_lock
	mutex_unlock(&new_nodes[i]->write_lock);

To fix the problem, we add a new tag 'out_unlock_nocoalesce' for
releasing new_nodes[i]->write_lock before out_nocoalesce tag. If
coalescing process fails, we will go to out_unlock_nocoalesce tag
for releasing new_nodes[i]->write_lock before free new_nodes[i] in
out_nocoalesce tag.

(Coly Li helps to clean up commit log format.)

Fixes: 2a285686c109816 ("bcache: btree locking rework")
Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/btree.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 39de94edd73a..6548a601edf0 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1389,7 +1389,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 			if (__set_blocks(n1, n1->keys + n2->keys,
 					 block_bytes(b->c)) >
 			    btree_blocks(new_nodes[i]))
-				goto out_nocoalesce;
+				goto out_unlock_nocoalesce;
 
 			keys = n2->keys;
 			/* Take the key of the node we're getting rid of */
@@ -1418,7 +1418,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 
 		if (__bch_keylist_realloc(&keylist,
 					  bkey_u64s(&new_nodes[i]->key)))
-			goto out_nocoalesce;
+			goto out_unlock_nocoalesce;
 
 		bch_btree_node_write(new_nodes[i], &cl);
 		bch_keylist_add(&keylist, &new_nodes[i]->key);
@@ -1464,6 +1464,10 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 	/* Invalidated our iterator */
 	return -EINTR;
 
+out_unlock_nocoalesce:
+	for (i = 0; i < nodes; i++)
+		mutex_unlock(&new_nodes[i]->write_lock);
+
 out_nocoalesce:
 	closure_sync(&cl);
 

From dcacbc1242c71e18fa9d2eadc5647e115c9c627d Mon Sep 17 00:00:00 2001
From: Mauricio Faria de Oliveira <mfo@canonical.com>
Date: Mon, 15 Jun 2020 00:53:31 +0800
Subject: [PATCH 02/13] bcache: check and adjust logical block size for backing
 devices

It's possible for a block driver to set logical block size to
a value greater than page size incorrectly; e.g. bcache takes
the value from the superblock, set by the user w/ make-bcache.

This causes a BUG/NULL pointer dereference in the path:

  __blkdev_get()
  -> set_init_blocksize() // set i_blkbits based on ...
     -> bdev_logical_block_size()
        -> queue_logical_block_size() // ... this value
  -> bdev_disk_changed()
     ...
     -> blkdev_readpage()
        -> block_read_full_page()
           -> create_page_buffers() // size = 1 << i_blkbits
              -> create_empty_buffers() // give size/take pointer
                 -> alloc_page_buffers() // return NULL
                 .. BUG!

Because alloc_page_buffers() is called with size > PAGE_SIZE,
thus it initializes head = NULL, skips the loop, return head;
then create_empty_buffers() gets (and uses) the NULL pointer.

This has been around longer than commit ad6bf88a6c19 ("block:
fix an integer overflow in logical block size"); however, it
increased the range of values that can trigger the issue.

Previously only 8k/16k/32k (on x86/4k page size) would do it,
as greater values overflow unsigned short to zero, and queue_
logical_block_size() would then use the default of 512.

Now the range with unsigned int is much larger, and users w/
the 512k value, which happened to be zero'ed previously and
work fine, started to hit this issue -- as the zero is gone,
and queue_logical_block_size() does return 512k (>PAGE_SIZE.)

Fix this by checking the bcache device's logical block size,
and if it's greater than page size, fallback to the backing/
cached device's logical page size.

This doesn't affect cache devices as those are still checked
for block/page size in read_super(); only the backing/cached
devices are not.

Apparently it's a regression from commit 2903381fce71 ("bcache:
Take data offset from the bdev superblock."), moving the check
into BCACHE_SB_VERSION_CDEV only. Now that we have superblocks
of backing devices out there with this larger value, we cannot
refuse to load them (i.e., have a similar check in _BDEV.)

Ideally perhaps bcache should use all values from the backing
device (physical/logical/io_min block size)? But for now just
fix the problematic case.

Test-case:

    # IMG=/root/disk.img
    # dd if=/dev/zero of=$IMG bs=1 count=0 seek=1G
    # DEV=$(losetup --find --show $IMG)
    # make-bcache --bdev $DEV --block 8k
      < see dmesg >

Before:

    # uname -r
    5.7.0-rc7

    [   55.944046] BUG: kernel NULL pointer dereference, address: 0000000000000000
    ...
    [   55.949742] CPU: 3 PID: 610 Comm: bcache-register Not tainted 5.7.0-rc7 #4
    ...
    [   55.952281] RIP: 0010:create_empty_buffers+0x1a/0x100
    ...
    [   55.966434] Call Trace:
    [   55.967021]  create_page_buffers+0x48/0x50
    [   55.967834]  block_read_full_page+0x49/0x380
    [   55.972181]  do_read_cache_page+0x494/0x610
    [   55.974780]  read_part_sector+0x2d/0xaa
    [   55.975558]  read_lba+0x10e/0x1e0
    [   55.977904]  efi_partition+0x120/0x5a6
    [   55.980227]  blk_add_partitions+0x161/0x390
    [   55.982177]  bdev_disk_changed+0x61/0xd0
    [   55.982961]  __blkdev_get+0x350/0x490
    [   55.983715]  __device_add_disk+0x318/0x480
    [   55.984539]  bch_cached_dev_run+0xc5/0x270
    [   55.986010]  register_bcache.cold+0x122/0x179
    [   55.987628]  kernfs_fop_write+0xbc/0x1a0
    [   55.988416]  vfs_write+0xb1/0x1a0
    [   55.989134]  ksys_write+0x5a/0xd0
    [   55.989825]  do_syscall_64+0x43/0x140
    [   55.990563]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
    [   55.991519] RIP: 0033:0x7f7d60ba3154
    ...

After:

    # uname -r
    5.7.0.bcachelbspgsz

    [   31.672460] bcache: bcache_device_init() bcache0: sb/logical block size (8192) greater than page size (4096) falling back to device logical block size (512)
    [   31.675133] bcache: register_bdev() registered backing device loop0

    # grep ^ /sys/block/bcache0/queue/*_block_size
    /sys/block/bcache0/queue/logical_block_size:512
    /sys/block/bcache0/queue/physical_block_size:8192

Reported-by: Ryan Finnie <ryan@finnie.org>
Reported-by: Sebastian Marsching <sebastian@marsching.com>
Signed-off-by: Mauricio Faria de Oliveira <mfo@canonical.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f9975c22bf7e..904ea9bbb5c4 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -819,7 +819,8 @@ static void bcache_device_free(struct bcache_device *d)
 }
 
 static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
-			      sector_t sectors, make_request_fn make_request_fn)
+			      sector_t sectors, make_request_fn make_request_fn,
+			      struct block_device *cached_bdev)
 {
 	struct request_queue *q;
 	const size_t max_stripes = min_t(size_t, INT_MAX,
@@ -885,6 +886,21 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
 	q->limits.io_min		= block_size;
 	q->limits.logical_block_size	= block_size;
 	q->limits.physical_block_size	= block_size;
+
+	if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
+		/*
+		 * This should only happen with BCACHE_SB_VERSION_BDEV.
+		 * Block/page size is checked for BCACHE_SB_VERSION_CDEV.
+		 */
+		pr_info("%s: sb/logical block size (%u) greater than page size "
+			"(%lu) falling back to device logical block size (%u)",
+			d->disk->disk_name, q->limits.logical_block_size,
+			PAGE_SIZE, bdev_logical_block_size(cached_bdev));
+
+		/* This also adjusts physical block size/min io size if needed */
+		blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
+	}
+
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
 	blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
@@ -1340,7 +1356,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
 
 	ret = bcache_device_init(&dc->disk, block_size,
 			 dc->bdev->bd_part->nr_sects - dc->sb.data_offset,
-			 cached_dev_make_request);
+			 cached_dev_make_request, dc->bdev);
 	if (ret)
 		return ret;
 
@@ -1453,7 +1469,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
 	kobject_init(&d->kobj, &bch_flash_dev_ktype);
 
 	if (bcache_device_init(d, block_bytes(c), u->sectors,
-			flash_dev_make_request))
+			flash_dev_make_request, NULL))
 		goto err;
 
 	bcache_device_attach(d, c, u - c->uuids);

From ee4a36f414617aaae01f93133e828363e7e9dd17 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Mon, 15 Jun 2020 00:53:32 +0800
Subject: [PATCH 03/13] bcache: use delayed kworker fo asynchronous devices
 registration

This patch changes the asynchronous registration kworker to a delayed
kworker. There is probability queue_work() queues the async registration
kworker to the same CPU (even though very little), then the process
which writing sysfs interface to reigster bcache device may won't return
immeidately. queue_delayed_work() in this patch will delay 10 jiffies
before insert the kworker to run queue, which makes sure the registering
process may always returns to user space in time.

Fixes: 9e23ccf8f0a22 ("bcache: asynchronous devices registration")
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 904ea9bbb5c4..a6e79083010a 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -19,6 +19,7 @@
 #include <linux/genhd.h>
 #include <linux/idr.h>
 #include <linux/kthread.h>
+#include <linux/workqueue.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/reboot.h>
@@ -2380,7 +2381,7 @@ static bool bch_is_open(struct block_device *bdev)
 }
 
 struct async_reg_args {
-	struct work_struct reg_work;
+	struct delayed_work reg_work;
 	char *path;
 	struct cache_sb *sb;
 	struct cache_sb_disk *sb_disk;
@@ -2391,7 +2392,7 @@ static void register_bdev_worker(struct work_struct *work)
 {
 	int fail = false;
 	struct async_reg_args *args =
-		container_of(work, struct async_reg_args, reg_work);
+		container_of(work, struct async_reg_args, reg_work.work);
 	struct cached_dev *dc;
 
 	dc = kzalloc(sizeof(*dc), GFP_KERNEL);
@@ -2421,7 +2422,7 @@ static void register_cache_worker(struct work_struct *work)
 {
 	int fail = false;
 	struct async_reg_args *args =
-		container_of(work, struct async_reg_args, reg_work);
+		container_of(work, struct async_reg_args, reg_work.work);
 	struct cache *ca;
 
 	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -2449,11 +2450,12 @@ static void register_cache_worker(struct work_struct *work)
 static void register_device_aync(struct async_reg_args *args)
 {
 	if (SB_IS_BDEV(args->sb))
-		INIT_WORK(&args->reg_work, register_bdev_worker);
+		INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker);
 	else
-		INIT_WORK(&args->reg_work, register_cache_worker);
+		INIT_DELAYED_WORK(&args->reg_work, register_cache_worker);
 
-	queue_work(system_wq, &args->reg_work);
+	/* 10 jiffies is enough for a delay */
+	queue_delayed_work(system_wq, &args->reg_work, 10);
 }
 
 static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,

From 4b25bbf52a55b58f03473333eafe61c0d46125aa Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Mon, 15 Jun 2020 00:53:33 +0800
Subject: [PATCH 04/13] bcache: pr_info() format clean up in
 bcache_device_init()

scripts/checkpatch.pl reports following warning for patch
("bcache: check and adjust logical block size for backing devices"),
    WARNING: quoted string split across lines
    #146: FILE: drivers/md/bcache/super.c:896:
    +  pr_info("%s: sb/logical block size (%u) greater than page size "
    +	       "(%lu) falling back to device logical block size (%u)",

There are two things to fix up,
- The kernel message print should be in a single line.
- pr_info() won't automatically add new line since v5.8, a '\n' should
  be added.

This patch just does the above cleanup in bcache_device_init().

Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index a6e79083010a..2014016f9a60 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -893,8 +893,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
 		 * This should only happen with BCACHE_SB_VERSION_BDEV.
 		 * Block/page size is checked for BCACHE_SB_VERSION_CDEV.
 		 */
-		pr_info("%s: sb/logical block size (%u) greater than page size "
-			"(%lu) falling back to device logical block size (%u)",
+		pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
 			d->disk->disk_name, q->limits.logical_block_size,
 			PAGE_SIZE, bdev_logical_block_size(cached_bdev));
 

From a8a5e383cf41c3852621f8cdfb9141c77b004cdd Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Mon, 15 Jun 2020 17:12:23 +0800
Subject: [PATCH 05/13] blk-mq: Remove redundant 'return' statement

The blk_mq_all_tag_iter() is a void function, thus remove
the redundant 'return' statement in this function.

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-tag.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 44f3d0967cb4..ae722f8b13fb 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -376,7 +376,7 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
 void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
 		void *priv)
 {
-	return __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS);
+	__blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS);
 }
 
 /**

From 5bcc066c05909d2af5c954911309f02312b0b02e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 15 Jun 2020 12:22:53 -0700
Subject: [PATCH 06/13] trace/events/block.h: drop kernel-doc for dropped
 function parameter

Fix kernel-doc warning: the parameter was removed, so also remove
the kernel-doc notation for it.

../include/trace/events/block.h:278: warning: Excess function parameter 'error' description in 'trace_block_bio_complete'

Fixes: d24de76af836 ("block: remove the error argument to the block_bio_complete tracepoint")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/block.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 1257f26bb887..93b114226af8 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -254,7 +254,6 @@ TRACE_EVENT(block_bio_bounce,
  * block_bio_complete - completed all work on the block operation
  * @q: queue holding the block operation
  * @bio: block operation completed
- * @error: io error value
  *
  * This tracepoint indicates there is no further work to do on this
  * block IO operation @bio.

From 2d3a8e2deddea6c89961c422ec0c5b851e648c14 Mon Sep 17 00:00:00 2001
From: Jason Yan <yanaijie@huawei.com>
Date: Tue, 16 Jun 2020 20:16:55 +0800
Subject: [PATCH 07/13] block: Fix use-after-free in blkdev_get()

In blkdev_get() we call __blkdev_get() to do some internal jobs and if
there is some errors in __blkdev_get(), the bdput() is called which
means we have released the refcount of the bdev (actually the refcount of
the bdev inode). This means we cannot access bdev after that point. But
acctually bdev is still accessed in blkdev_get() after calling
__blkdev_get(). This results in use-after-free if the refcount is the
last one we released in __blkdev_get(). Let's take a look at the
following scenerio:

  CPU0            CPU1                    CPU2
blkdev_open     blkdev_open           Remove disk
                  bd_acquire
		  blkdev_get
		    __blkdev_get      del_gendisk
					bdev_unhash_inode
  bd_acquire          bdev_get_gendisk
    bd_forget           failed because of unhashed
	  bdput
	              bdput (the last one)
		        bdev_evict_inode

	  	    access bdev => use after free

[  459.350216] BUG: KASAN: use-after-free in __lock_acquire+0x24c1/0x31b0
[  459.351190] Read of size 8 at addr ffff88806c815a80 by task syz-executor.0/20132
[  459.352347]
[  459.352594] CPU: 0 PID: 20132 Comm: syz-executor.0 Not tainted 4.19.90 #2
[  459.353628] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[  459.354947] Call Trace:
[  459.355337]  dump_stack+0x111/0x19e
[  459.355879]  ? __lock_acquire+0x24c1/0x31b0
[  459.356523]  print_address_description+0x60/0x223
[  459.357248]  ? __lock_acquire+0x24c1/0x31b0
[  459.357887]  kasan_report.cold+0xae/0x2d8
[  459.358503]  __lock_acquire+0x24c1/0x31b0
[  459.359120]  ? _raw_spin_unlock_irq+0x24/0x40
[  459.359784]  ? lockdep_hardirqs_on+0x37b/0x580
[  459.360465]  ? _raw_spin_unlock_irq+0x24/0x40
[  459.361123]  ? finish_task_switch+0x125/0x600
[  459.361812]  ? finish_task_switch+0xee/0x600
[  459.362471]  ? mark_held_locks+0xf0/0xf0
[  459.363108]  ? __schedule+0x96f/0x21d0
[  459.363716]  lock_acquire+0x111/0x320
[  459.364285]  ? blkdev_get+0xce/0xbe0
[  459.364846]  ? blkdev_get+0xce/0xbe0
[  459.365390]  __mutex_lock+0xf9/0x12a0
[  459.365948]  ? blkdev_get+0xce/0xbe0
[  459.366493]  ? bdev_evict_inode+0x1f0/0x1f0
[  459.367130]  ? blkdev_get+0xce/0xbe0
[  459.367678]  ? destroy_inode+0xbc/0x110
[  459.368261]  ? mutex_trylock+0x1a0/0x1a0
[  459.368867]  ? __blkdev_get+0x3e6/0x1280
[  459.369463]  ? bdev_disk_changed+0x1d0/0x1d0
[  459.370114]  ? blkdev_get+0xce/0xbe0
[  459.370656]  blkdev_get+0xce/0xbe0
[  459.371178]  ? find_held_lock+0x2c/0x110
[  459.371774]  ? __blkdev_get+0x1280/0x1280
[  459.372383]  ? lock_downgrade+0x680/0x680
[  459.373002]  ? lock_acquire+0x111/0x320
[  459.373587]  ? bd_acquire+0x21/0x2c0
[  459.374134]  ? do_raw_spin_unlock+0x4f/0x250
[  459.374780]  blkdev_open+0x202/0x290
[  459.375325]  do_dentry_open+0x49e/0x1050
[  459.375924]  ? blkdev_get_by_dev+0x70/0x70
[  459.376543]  ? __x64_sys_fchdir+0x1f0/0x1f0
[  459.377192]  ? inode_permission+0xbe/0x3a0
[  459.377818]  path_openat+0x148c/0x3f50
[  459.378392]  ? kmem_cache_alloc+0xd5/0x280
[  459.379016]  ? entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  459.379802]  ? path_lookupat.isra.0+0x900/0x900
[  459.380489]  ? __lock_is_held+0xad/0x140
[  459.381093]  do_filp_open+0x1a1/0x280
[  459.381654]  ? may_open_dev+0xf0/0xf0
[  459.382214]  ? find_held_lock+0x2c/0x110
[  459.382816]  ? lock_downgrade+0x680/0x680
[  459.383425]  ? __lock_is_held+0xad/0x140
[  459.384024]  ? do_raw_spin_unlock+0x4f/0x250
[  459.384668]  ? _raw_spin_unlock+0x1f/0x30
[  459.385280]  ? __alloc_fd+0x448/0x560
[  459.385841]  do_sys_open+0x3c3/0x500
[  459.386386]  ? filp_open+0x70/0x70
[  459.386911]  ? trace_hardirqs_on_thunk+0x1a/0x1c
[  459.387610]  ? trace_hardirqs_off_caller+0x55/0x1c0
[  459.388342]  ? do_syscall_64+0x1a/0x520
[  459.388930]  do_syscall_64+0xc3/0x520
[  459.389490]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  459.390248] RIP: 0033:0x416211
[  459.390720] Code: 75 14 b8 02 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83
04 19 00 00 c3 48 83 ec 08 e8 0a fa ff ff 48 89 04 24 b8 02 00 00 00 0f
   05 <48> 8b 3c 24 48 89 c2 e8 53 fa ff ff 48 89 d0 48 83 c4 08 48 3d
      01
[  459.393483] RSP: 002b:00007fe45dfe9a60 EFLAGS: 00000293 ORIG_RAX: 0000000000000002
[  459.394610] RAX: ffffffffffffffda RBX: 00007fe45dfea6d4 RCX: 0000000000416211
[  459.395678] RDX: 00007fe45dfe9b0a RSI: 0000000000000002 RDI: 00007fe45dfe9b00
[  459.396758] RBP: 000000000076bf20 R08: 0000000000000000 R09: 000000000000000a
[  459.397930] R10: 0000000000000075 R11: 0000000000000293 R12: 00000000ffffffff
[  459.399022] R13: 0000000000000bd9 R14: 00000000004cdb80 R15: 000000000076bf2c
[  459.400168]
[  459.400430] Allocated by task 20132:
[  459.401038]  kasan_kmalloc+0xbf/0xe0
[  459.401652]  kmem_cache_alloc+0xd5/0x280
[  459.402330]  bdev_alloc_inode+0x18/0x40
[  459.402970]  alloc_inode+0x5f/0x180
[  459.403510]  iget5_locked+0x57/0xd0
[  459.404095]  bdget+0x94/0x4e0
[  459.404607]  bd_acquire+0xfa/0x2c0
[  459.405113]  blkdev_open+0x110/0x290
[  459.405702]  do_dentry_open+0x49e/0x1050
[  459.406340]  path_openat+0x148c/0x3f50
[  459.406926]  do_filp_open+0x1a1/0x280
[  459.407471]  do_sys_open+0x3c3/0x500
[  459.408010]  do_syscall_64+0xc3/0x520
[  459.408572]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  459.409415]
[  459.409679] Freed by task 1262:
[  459.410212]  __kasan_slab_free+0x129/0x170
[  459.410919]  kmem_cache_free+0xb2/0x2a0
[  459.411564]  rcu_process_callbacks+0xbb2/0x2320
[  459.412318]  __do_softirq+0x225/0x8ac

Fix this by delaying bdput() to the end of blkdev_get() which means we
have finished accessing bdev.

Fixes: 77ea887e433a ("implement in-kernel gendisk events handling")
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Jason Yan <yanaijie@huawei.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 47860e589388..08c87db3a92b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1565,10 +1565,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	 */
 	if (!for_part) {
 		ret = devcgroup_inode_permission(bdev->bd_inode, perm);
-		if (ret != 0) {
-			bdput(bdev);
+		if (ret != 0)
 			return ret;
-		}
 	}
 
  restart:
@@ -1637,8 +1635,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_clear;
 			BUG_ON(for_part);
 			ret = __blkdev_get(whole, mode, 1);
-			if (ret)
+			if (ret) {
+				bdput(whole);
 				goto out_clear;
+			}
 			bdev->bd_contains = whole;
 			bdev->bd_part = disk_get_part(disk, partno);
 			if (!(disk->flags & GENHD_FL_UP) ||
@@ -1688,7 +1688,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk_unblock_events(disk);
 	put_disk_and_module(disk);
  out:
-	bdput(bdev);
 
 	return ret;
 }
@@ -1755,6 +1754,9 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 		bdput(whole);
 	}
 
+	if (res)
+		bdput(bdev);
+
 	return res;
 }
 EXPORT_SYMBOL(blkdev_get);

From 1b0b283648163dae2a214ca28ed5a99f62a77319 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Fri, 5 Jun 2020 16:58:36 +0200
Subject: [PATCH 08/13] blktrace: break out of blktrace setup on concurrent
 calls

We use one blktrace per request_queue, that means one per the entire
disk.  So we cannot run one blktrace on say /dev/vda and then /dev/vda1,
or just two calls on /dev/vda.

We check for concurrent setup only at the very end of the blktrace setup though.

If we try to run two concurrent blktraces on the same block device the
second one will fail, and the first one seems to go on. However when
one tries to kill the first one one will see things like this:

The kernel will show these:

```
debugfs: File 'dropped' in directory 'nvme1n1' already present!
debugfs: File 'msg' in directory 'nvme1n1' already present!
debugfs: File 'trace0' in directory 'nvme1n1' already present!
``

And userspace just sees this error message for the second call:

```
blktrace /dev/nvme1n1
BLKTRACESETUP(2) /dev/nvme1n1 failed: 5/Input/output error
```

The first userspace process #1 will also claim that the files
were taken underneath their nose as well. The files are taken
away form the first process given that when the second blktrace
fails, it will follow up with a BLKTRACESTOP and BLKTRACETEARDOWN.
This means that even if go-happy process #1 is waiting for blktrace
data, we *have* been asked to take teardown the blktrace.

This can easily be reproduced with break-blktrace [0] run_0005.sh test.

Just break out early if we know we're already going to fail, this will
prevent trying to create the files all over again, which we know still
exist.

[0] https://github.com/mcgrof/break-blktrace

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/trace/blktrace.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 5773f0ba7e76..8fd36e827f14 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -3,6 +3,9 @@
  * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
  *
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
 #include <linux/blktrace_api.h>
@@ -494,6 +497,16 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	 */
 	strreplace(buts->name, '/', '_');
 
+	/*
+	 * bdev can be NULL, as with scsi-generic, this is a helpful as
+	 * we can be.
+	 */
+	if (q->blk_trace) {
+		pr_warn("Concurrent blktraces are not allowed on %s\n",
+			buts->name);
+		return -EBUSY;
+	}
+
 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
 	if (!bt)
 		return -ENOMEM;

From c3dbe541ef77754729de5e82be2d6e5d267c6c8c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 5 Jun 2020 16:58:37 +0200
Subject: [PATCH 09/13] blktrace: Avoid sparse warnings when assigning
 q->blk_trace

Mostly for historical reasons, q->blk_trace is assigned through xchg()
and cmpxchg() atomic operations. Although this is correct, sparse
complains about this because it violates rcu annotations since commit
c780e86dd48e ("blktrace: Protect q->blk_trace with RCU") which started
to use rcu for accessing q->blk_trace. Furthermore there's no real need
for atomic operations anymore since all changes to q->blk_trace happen
under q->blk_trace_mutex and since it also makes more sense to check if
q->blk_trace is set with the mutex held earlier.

So let's just replace xchg() with rcu_replace_pointer() and cmpxchg()
with explicit check and rcu_assign_pointer(). This makes the code more
efficient and sparse happy.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/trace/blktrace.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8fd36e827f14..5ef0484513ec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -347,7 +347,8 @@ static int __blk_trace_remove(struct request_queue *q)
 {
 	struct blk_trace *bt;
 
-	bt = xchg(&q->blk_trace, NULL);
+	bt = rcu_replace_pointer(q->blk_trace, NULL,
+				 lockdep_is_held(&q->blk_trace_mutex));
 	if (!bt)
 		return -EINVAL;
 
@@ -501,7 +502,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	 * bdev can be NULL, as with scsi-generic, this is a helpful as
 	 * we can be.
 	 */
-	if (q->blk_trace) {
+	if (rcu_dereference_protected(q->blk_trace,
+				      lockdep_is_held(&q->blk_trace_mutex))) {
 		pr_warn("Concurrent blktraces are not allowed on %s\n",
 			buts->name);
 		return -EBUSY;
@@ -556,10 +558,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	bt->pid = buts->pid;
 	bt->trace_state = Blktrace_setup;
 
-	ret = -EBUSY;
-	if (cmpxchg(&q->blk_trace, NULL, bt))
-		goto err;
-
+	rcu_assign_pointer(q->blk_trace, bt);
 	get_probe_ref();
 
 	ret = 0;
@@ -1642,7 +1641,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
 {
 	struct blk_trace *bt;
 
-	bt = xchg(&q->blk_trace, NULL);
+	bt = rcu_replace_pointer(q->blk_trace, NULL,
+				 lockdep_is_held(&q->blk_trace_mutex));
 	if (bt == NULL)
 		return -EINVAL;
 
@@ -1674,10 +1674,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
 
 	blk_trace_setup_lba(bt, bdev);
 
-	ret = -EBUSY;
-	if (cmpxchg(&q->blk_trace, NULL, bt))
-		goto free_bt;
-
+	rcu_assign_pointer(q->blk_trace, bt);
 	get_probe_ref();
 	return 0;
 

From fe35ec58f0d339221643287bbb7cee15c93a5389 Mon Sep 17 00:00:00 2001
From: Weiping Zhang <zhangweiping@didiglobal.com>
Date: Wed, 17 Jun 2020 14:18:37 +0800
Subject: [PATCH 10/13] block: update hctx map when use multiple maps

There is an issue when tune the number for read and write queues,
if the total queue count was not changed. The hctx->type cannot
be updated, since __blk_mq_update_nr_hw_queues will return directly
if the total queue count has not been changed.

Reproduce:

dmesg | grep "default/read/poll"
[    2.607459] nvme nvme0: 48/0/0 default/read/poll queues
cat /sys/kernel/debug/block/nvme0n1/hctx*/type | sort | uniq -c
     48 default

tune the write queues to 24:
echo 24 > /sys/module/nvme/parameters/write_queues
echo 1 > /sys/block/nvme0n1/device/reset_controller

dmesg | grep "default/read/poll"
[  433.547235] nvme nvme0: 24/24/0 default/read/poll queues

cat /sys/kernel/debug/block/nvme0n1/hctx*/type | sort | uniq -c
     48 default

The driver's hardware queue mapping is not same as block layer.

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4f57d27bfa73..a9aa6d1e44cf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3479,7 +3479,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 
 	if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
 		nr_hw_queues = nr_cpu_ids;
-	if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
+	if (nr_hw_queues < 1)
+		return;
+	if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
 		return;
 
 	list_for_each_entry(q, &set->tag_list, tag_set_list)

From bc163c2046c5867df7d2b03e85bca29adf83fa32 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 22 Apr 2020 16:03:17 +0300
Subject: [PATCH 11/13] partitions/ldm: Replace uuid_copy() with import_uuid()
 where it makes sense

There is a specific API to treat raw data as UUID, i.e. import_uuid().
Use it instead of uuid_copy() with explicit casting.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/partitions/ldm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
index 6fdfcb40c537..d333786b5c7e 100644
--- a/block/partitions/ldm.c
+++ b/block/partitions/ldm.c
@@ -910,7 +910,7 @@ static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
 		return false;
 
 	disk = &vb->vblk.disk;
-	uuid_copy(&disk->disk_id, (uuid_t *)(buffer + 0x18 + r_name));
+	import_uuid(&disk->disk_id, buffer + 0x18 + r_name);
 	return true;
 }
 

From f4bd34b139a3fa2808c4205f12714c65e1548c6c Mon Sep 17 00:00:00 2001
From: Zheng Bin <zhengbin13@huawei.com>
Date: Thu, 18 Jun 2020 12:21:37 +0800
Subject: [PATCH 12/13] loop: replace kill_bdev with invalidate_bdev

When a filesystem is mounted on a loop device and on a loop ioctl
LOOP_SET_STATUS64, because of kill_bdev, buffer_head mappings are getting
destroyed.
kill_bdev
  truncate_inode_pages
    truncate_inode_pages_range
      do_invalidatepage
        block_invalidatepage
          discard_buffer  -->clear BH_Mapped flag

sb_bread
  __bread_gfp
  bh = __getblk_gfp
  -->discard_buffer clear BH_Mapped flag
  __bread_slow
    submit_bh
      submit_bh_wbc
        BUG_ON(!buffer_mapped(bh))  --> hit this BUG_ON

Fixes: 5db470e229e2 ("loop: drop caches if offset or block_size are changed")
Signed-off-by: Zheng Bin <zhengbin13@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c33bbbfd1bd9..475e1a738560 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1368,14 +1368,14 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	    lo->lo_sizelimit != info->lo_sizelimit) {
 		size_changed = true;
 		sync_blockdev(lo->lo_device);
-		kill_bdev(lo->lo_device);
+		invalidate_bdev(lo->lo_device);
 	}
 
 	/* I/O need to be drained during transfer transition */
 	blk_mq_freeze_queue(lo->lo_queue);
 
 	if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) {
-		/* If any pages were dirtied after kill_bdev(), try again */
+		/* If any pages were dirtied after invalidate_bdev(), try again */
 		err = -EAGAIN;
 		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",
 			__func__, lo->lo_number, lo->lo_file_name,
@@ -1615,11 +1615,11 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 		return 0;
 
 	sync_blockdev(lo->lo_device);
-	kill_bdev(lo->lo_device);
+	invalidate_bdev(lo->lo_device);
 
 	blk_mq_freeze_queue(lo->lo_queue);
 
-	/* kill_bdev should have truncated all the pages */
+	/* invalidate_bdev should have truncated all the pages */
 	if (lo->lo_device->bd_inode->i_mapping->nrpages) {
 		err = -EAGAIN;
 		pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n",

From 3373a3461aa15b7f9a871fa4cb2c9ef21ac20b47 Mon Sep 17 00:00:00 2001
From: Zheng Bin <zhengbin13@huawei.com>
Date: Thu, 18 Jun 2020 12:21:38 +0800
Subject: [PATCH 13/13] block: make function 'kill_bdev' static

kill_bdev does not have any external user, so make it static.

Signed-off-by: Zheng Bin <zhengbin13@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c     | 5 ++---
 include/linux/fs.h | 2 --
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 08c87db3a92b..0ae656e022fd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -75,7 +75,7 @@ static void bdev_write_inode(struct block_device *bdev)
 }
 
 /* Kill _all_ buffers and pagecache , dirty or not.. */
-void kill_bdev(struct block_device *bdev)
+static void kill_bdev(struct block_device *bdev)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
@@ -84,8 +84,7 @@ void kill_bdev(struct block_device *bdev)
 
 	invalidate_bh_lrus();
 	truncate_inode_pages(mapping, 0);
-}	
-EXPORT_SYMBOL(kill_bdev);
+}
 
 /* Invalidate clean unused buffers and pagecache. */
 void invalidate_bdev(struct block_device *bdev)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6c4ab4dc1cd7..3f881a892ea7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2592,7 +2592,6 @@ extern void bdput(struct block_device *);
 extern void invalidate_bdev(struct block_device *);
 extern void iterate_bdevs(void (*)(struct block_device *, void *), void *);
 extern int sync_blockdev(struct block_device *bdev);
-extern void kill_bdev(struct block_device *);
 extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern void emergency_thaw_bdev(struct super_block *sb);
@@ -2608,7 +2607,6 @@ static inline bool sb_is_blkdev_sb(struct super_block *sb)
 #else
 static inline void bd_forget(struct inode *inode) {}
 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
-static inline void kill_bdev(struct block_device *bdev) {}
 static inline void invalidate_bdev(struct block_device *bdev) {}
 
 static inline struct super_block *freeze_bdev(struct block_device *sb)