From dd840087086f3b93ac20f7472b4fca59aff7b79f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Aug 2014 23:16:32 +0800 Subject: [PATCH 01/15] blk-mq: fix WARNING "percpu_ref_kill() called more than once!" Before doing queue release, the queue has been freezed already by blk_cleanup_queue(), so needn't to freeze queue for deleting tag set. This patch fixes the WARNING of "percpu_ref_kill() called more than once!" which is triggered during unloading block driver. Cc: Tejun Heo Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5189cb1e478a..ac8a0413664e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1713,14 +1713,10 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; - blk_mq_freeze_queue(q); - mutex_lock(&set->tag_list_lock); list_del_init(&q->tag_set_list); blk_mq_update_tag_set_depth(set); mutex_unlock(&set->tag_list_lock); - - blk_mq_unfreeze_queue(q); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, From 8a58d1f1f373238cb0d6d7f8d3dd723aa164b8ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2014 12:38:41 -0600 Subject: [PATCH 02/15] blk-mq: get rid of unused BLK_MQ_F_SHOULD_SORT flag We used to use this for determining whether to sort the dispatch list, but it's unused now. Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index eb726b9c5762..a1e31f274fcd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -127,10 +127,9 @@ enum { BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */ BLK_MQ_F_SHOULD_MERGE = 1 << 0, - BLK_MQ_F_SHOULD_SORT = 1 << 1, - BLK_MQ_F_TAG_SHARED = 1 << 2, - BLK_MQ_F_SG_MERGE = 1 << 3, - BLK_MQ_F_SYSFS_UP = 1 << 4, + BLK_MQ_F_TAG_SHARED = 1 << 1, + BLK_MQ_F_SG_MERGE = 1 << 2, + BLK_MQ_F_SYSFS_UP = 1 << 3, BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, From 274a5843ff2f08a89464589d90c64eb65f2c0847 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2014 12:44:08 -0600 Subject: [PATCH 03/15] blk-mq: don't allow merges if turned off for the queue blk-mq uses BLK_MQ_F_SHOULD_MERGE, as set by the driver at init time, to determine whether it should merge IO or not. However, this could also be disabled by the admin, if merging is switched off through sysfs. So check the general queue state as well before attempting to merge IO. Reported-by: Rob Elliott Tested-by: Rob Elliott Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ac8a0413664e..c9e89a8792e3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1068,13 +1068,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) blk_account_io_start(rq, 1); } +static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) +{ + return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && + !blk_queue_nomerges(hctx->queue); +} + static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq, struct bio *bio) { - struct request_queue *q = hctx->queue; - - if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) { + if (!hctx_allow_merges(hctx)) { blk_mq_bio_to_request(rq, bio); spin_lock(&ctx->lock); insert_rq: @@ -1082,6 +1086,8 @@ static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, spin_unlock(&ctx->lock); return false; } else { + struct request_queue *q = hctx->queue; + spin_lock(&ctx->lock); if (!blk_mq_attempt_merge(q, ctx, bio)) { blk_mq_bio_to_request(rq, bio); From 16f408dc6b1c87f5e3a767626df09c1399c6bf70 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 13 Aug 2014 14:49:31 +0300 Subject: [PATCH 04/15] block: Fix BUG_ON when pi errors occur When getting a pi error we get to bio_integrity_end_io with bi_remaining already decremented to 0 where we will eventually need to call bio_endio with restored original bio completion handler. Calling bio_endio invokes a BUG_ON(). We should call bio_endio_nodec instead, like what is done in bio_integrity_verify_fn. Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index bc423f7b02da..f14b4abbebd8 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -520,7 +520,7 @@ void bio_integrity_endio(struct bio *bio, int error) */ if (error) { bio->bi_end_io = bip->bip_end_io; - bio_endio(bio, error); + bio_endio_nodec(bio, error); return; } From a68aafa5b297d99c2d0c38689089a752126e9e79 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Aug 2014 13:19:15 -0600 Subject: [PATCH 05/15] blk-mq: correct a few wrong/bad comments Just grammar or spelling errors, nothing major. Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c9e89a8792e3..a0565bb20fd5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1580,7 +1580,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, hctx->tags = set->tags[i]; /* - * Allocate space for all possible cpus to avoid allocation in + * Allocate space for all possible cpus to avoid allocation at * runtime */ hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), @@ -1668,8 +1668,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { /* - * If not software queues are mapped to this hardware queue, - * disable it and free the request entries + * If no software queues are mapped to this hardware queue, + * disable it and free the request entries. */ if (!hctx->nr_ctx) { struct blk_mq_tag_set *set = q->tag_set; From cddd5d17642cc6881352732693c2ae6930e9ce65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 16 Aug 2014 08:02:24 -0400 Subject: [PATCH 06/15] blk-mq: blk_mq_freeze_queue() should allow nesting While converting to percpu_ref for freezing, add703fda981 ("blk-mq: use percpu_ref for mq usage count") incorrectly made blk_mq_freeze_queue() misbehave when freezing is nested due to percpu_ref_kill() being invoked on an already killed ref. Fix it by making blk_mq_freeze_queue() kill and kick the queue only for the outermost freeze attempt. All the nested ones can simply wait for the ref to reach zero. While at it, remove unnecessary @wake initialization from blk_mq_unfreeze_queue(). Signed-off-by: Tejun Heo Reported-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a0565bb20fd5..7950f8d7c1bb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -112,18 +112,22 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) */ void blk_mq_freeze_queue(struct request_queue *q) { + bool freeze; + spin_lock_irq(q->queue_lock); - q->mq_freeze_depth++; + freeze = !q->mq_freeze_depth++; spin_unlock_irq(q->queue_lock); - percpu_ref_kill(&q->mq_usage_counter); - blk_mq_run_queues(q, false); + if (freeze) { + percpu_ref_kill(&q->mq_usage_counter); + blk_mq_run_queues(q, false); + } wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); } static void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake = false; + bool wake; spin_lock_irq(q->queue_lock); wake = !--q->mq_freeze_depth; From ffb5db73ebe9b5c37fd741e1684833dd674372bd Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 13 Aug 2014 13:59:50 +0200 Subject: [PATCH 07/15] block: systemace: Remove .owner field for driver There is no need to init .owner field. Based on the patch from Peter Griffin "mmc: remove .owner field for drivers using module_platform_driver" This patch removes the superflous .owner field for drivers which use the module_platform_driver API, as this is overriden in platform_driver_register anyway." Signed-off-by: Michal Simek Signed-off-by: Jens Axboe --- drivers/block/xsysace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index ab3ea62e5dfc..c4328d9d9981 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -1203,7 +1203,6 @@ static struct platform_driver ace_platform_driver = { .probe = ace_probe, .remove = ace_remove, .driver = { - .owner = THIS_MODULE, .name = "xsysace", .of_match_table = ace_of_match, }, From aeac31819475ad0980cb3a13d5599f5a1127e83d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 18 Aug 2014 12:49:08 +0400 Subject: [PATCH 08/15] brd: add ram disk visibility option Currenly ram disk is not visiable inside /proc/partitions. This was done for compatibility reasons here: 53978d0a7a27. But some utilities expect disk presents in /proc/partitions. Let's add module's option and let's administrator chose visibility behaviour. By default, old behaviour preserved. Signed-off-by: Dmitry Monakhov Signed-off-by: Jens Axboe --- drivers/block/brd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c7d138eca731..3598110d2cef 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -442,12 +442,15 @@ static int rd_nr; int rd_size = CONFIG_BLK_DEV_RAM_SIZE; static int max_part; static int part_shift; +static int part_show = 0; module_param(rd_nr, int, S_IRUGO); MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); module_param(rd_size, int, S_IRUGO); MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); module_param(max_part, int, S_IRUGO); MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk"); +module_param(part_show, int, S_IRUGO); +MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); MODULE_ALIAS("rd"); @@ -501,7 +504,8 @@ static struct brd_device *brd_alloc(int i) disk->fops = &brd_fops; disk->private_data = brd; disk->queue = brd->brd_queue; - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + if (!part_show) + disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); From 2cada584b20007470931080c49310d85908449b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Aug 2014 20:38:27 -0500 Subject: [PATCH 09/15] block: cleanup error handling in sg_io Make sure we always clean up through the out label and just have a single place to put the request. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 51bf5155ee75..ba8706f6cded 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -279,7 +279,6 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, r = blk_rq_unmap_user(bio); if (!ret) ret = r; - blk_put_request(rq); return ret; } @@ -322,10 +321,9 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, return -ENOMEM; blk_rq_set_block_pc(rq); - if (blk_fill_sghdr_rq(q, rq, hdr, mode)) { - blk_put_request(rq); - return -EFAULT; - } + ret = -EFAULT; + if (blk_fill_sghdr_rq(q, rq, hdr, mode)) + goto out; if (hdr->iovec_count) { size_t iov_data_len; @@ -376,7 +374,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, hdr->duration = jiffies_to_msecs(jiffies - start_time); - return blk_complete_sghdr_rq(rq, hdr, bio); + ret = blk_complete_sghdr_rq(rq, hdr, bio); out: blk_put_request(rq); return ret; From a57821cac6bb6e46abea118e34d0e86444ec1410 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Aug 2014 20:39:53 -0500 Subject: [PATCH 10/15] block: support > 16 byte CDBs for SG_IO Signed-off-by: Christoph Hellwig Reviewed-by: Boaz Harrosh Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index ba8706f6cded..1d78e6cf9d61 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -296,8 +296,6 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->interface_id != 'S') return -EINVAL; - if (hdr->cmd_len > BLK_MAX_CDB) - return -EINVAL; if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) return -EIO; @@ -316,14 +314,21 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->flags & SG_FLAG_Q_AT_HEAD) at_head = 1; + ret = -ENOMEM; rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); if (!rq) - return -ENOMEM; + goto out; blk_rq_set_block_pc(rq); + if (hdr->cmd_len > BLK_MAX_CDB) { + rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); + if (!rq->cmd) + goto out_put_request; + } + ret = -EFAULT; if (blk_fill_sghdr_rq(q, rq, hdr, mode)) - goto out; + goto out_free_cdb; if (hdr->iovec_count) { size_t iov_data_len; @@ -333,7 +338,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, 0, NULL, &iov); if (ret < 0) { kfree(iov); - goto out; + goto out_free_cdb; } iov_data_len = ret; @@ -356,7 +361,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, GFP_KERNEL); if (ret) - goto out; + goto out_free_cdb; bio = rq->bio; memset(sense, 0, sizeof(sense)); @@ -375,8 +380,13 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, hdr->duration = jiffies_to_msecs(jiffies - start_time); ret = blk_complete_sghdr_rq(rq, hdr, bio); -out: + +out_free_cdb: + if (rq->cmd != rq->__cmd) + kfree(rq->cmd); +out_put_request: blk_put_request(rq); +out: return ret; } From 6f4a16266fb3e58cd3e200eab51d2220ef92d604 Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Fri, 22 Aug 2014 15:53:39 -0400 Subject: [PATCH 11/15] scsi-mq: fix requests that use a separate CDB buffer This patch fixes code such as the following with scsi-mq enabled: rq = blk_get_request(...); blk_rq_set_block_pc(rq); rq->cmd = my_cmd_buffer; /* separate CDB buffer */ blk_execute_rq_nowait(...); Code like this appears in e.g. sg_start_req() in drivers/scsi/sg.c (for large CDBs only). Without this patch, scsi_mq_prep_fn() will set rq->cmd back to rq->__cmd, causing the wrong CDB to be sent to the device. Signed-off-by: Tony Battersby Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - block/blk-mq.c | 2 ++ drivers/scsi/scsi_lib.c | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index c359d72e9d76..bf930f481d43 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1252,7 +1252,6 @@ void blk_rq_set_block_pc(struct request *rq) rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; memset(rq->__cmd, 0, sizeof(rq->__cmd)); - rq->cmd = rq->__cmd; } EXPORT_SYMBOL(blk_rq_set_block_pc); diff --git a/block/blk-mq.c b/block/blk-mq.c index 7950f8d7c1bb..4aac82615a46 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -176,6 +176,8 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, /* tag was already set */ rq->errors = 0; + rq->cmd = rq->__cmd; + rq->extra_len = 0; rq->sense_len = 0; rq->resid_len = 0; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9c44392b748f..d86808f051e0 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1808,7 +1808,6 @@ static int scsi_mq_prep_fn(struct request *req) cmd->tag = req->tag; - req->cmd = req->__cmd; cmd->cmnd = req->cmd; cmd->prot_op = SCSI_PROT_NORMAL; From 2ba136daa3ae1e881c9f586f283fcaa164767dce Mon Sep 17 00:00:00 2001 From: Tony Battersby Date: Fri, 22 Aug 2014 15:53:35 -0400 Subject: [PATCH 12/15] fix regression in SCSI_IOCTL_SEND_COMMAND blk_rq_set_block_pc() memsets rq->cmd to 0, so it should come immediately after blk_get_request() to avoid overwriting the user-supplied CDB. Also check for failure to allocate rq. Fixes: f27b087b81b7 ("block: add blk_rq_set_block_pc()") Cc: # 3.16.x Signed-off-by: Tony Battersby Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 1d78e6cf9d61..5dd477bfb4bc 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -456,6 +456,11 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); + if (!rq) { + err = -ENOMEM; + goto error; + } + blk_rq_set_block_pc(rq); cmdlen = COMMAND_SIZE(opcode); @@ -509,7 +514,6 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, memset(sense, 0, sizeof(sense)); rq->sense = sense; rq->sense_len = 0; - blk_rq_set_block_pc(rq); blk_execute_rq(q, disk, rq, 0); @@ -529,7 +533,8 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, error: kfree(buffer); - blk_put_request(rq); + if (rq) + blk_put_request(rq); return err; } EXPORT_SYMBOL_GPL(sg_scsi_ioctl); From d19d744685f47f1bb3d39b3ec51eb50afe5ff47d Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 26 Aug 2014 16:14:02 +0200 Subject: [PATCH 13/15] block: fix error handling in sg_io Before commit 2cada584b200 ("block: cleanup error handling in sg_io"), we had ret = 0 before entering the last big if block of sg_io. Since 2cada584b200, ret = -EFAULT, which breaks hdparm: /dev/sda: setting Advanced Power Management level to 0xc8 (200) HDIO_DRIVE_CMD failed: Bad address APM_level = 128 Signed-off-by: Sabrina Dubroca Fixes: 2cada584b200 ("block: cleanup error handling in sg_io") Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 5dd477bfb4bc..9b8eaeca6a79 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -330,6 +330,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (blk_fill_sghdr_rq(q, rq, hdr, mode)) goto out_free_cdb; + ret = 0; if (hdr->iovec_count) { size_t iov_data_len; struct iovec *iov = NULL; From e15693ef18e13e3e6bffe891fe140f18b8ff6d07 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Tue, 26 Aug 2014 20:56:36 +0900 Subject: [PATCH 14/15] cfq-iosched: Fix wrong children_weight calculation cfq_group_service_tree_add() is applying new_weight at the beginning of the function via cfq_update_group_weight(). This actually allows weight to change between adding it to and subtracting it from children_weight, and triggers WARN_ON_ONCE() in cfq_group_service_tree_del(), or even causes oops by divide error during vfr calculation in cfq_group_service_tree_add(). The detailed scenario is as follows: 1. Create blkio cgroups X and Y as a child of X. Set X's weight to 500 and perform some I/O to apply new_weight. This X's I/O completes before starting Y's I/O. 2. Y starts I/O and cfq_group_service_tree_add() is called with Y. 3. cfq_group_service_tree_add() walks up the tree during children_weight calculation and adds parent X's weight (500) to children_weight of root. children_weight becomes 500. 4. Set X's weight to 1000. 5. X starts I/O and cfq_group_service_tree_add() is called with X. 6. cfq_group_service_tree_add() applies its new_weight (1000). 7. I/O of Y completes and cfq_group_service_tree_del() is called with Y. 8. I/O of X completes and cfq_group_service_tree_del() is called with X. 9. cfq_group_service_tree_del() subtracts X's weight (1000) from children_weight of root. children_weight becomes -500. This triggers WARN_ON_ONCE(). 10. Set X's weight to 500. 11. X starts I/O and cfq_group_service_tree_add() is called with X. 12. cfq_group_service_tree_add() applies its new_weight (500) and adds it to children_weight of root. children_weight becomes 0. Calcularion of vfr triggers oops by divide error. weight should be updated right before adding it to children_weight. Reported-by: Ruki Sekiya Signed-off-by: Toshiaki Makita Acked-by: Tejun Heo Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index cadc37841744..d7494637c5db 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1275,12 +1275,16 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) static void cfq_update_group_weight(struct cfq_group *cfqg) { - BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - if (cfqg->new_weight) { cfqg->weight = cfqg->new_weight; cfqg->new_weight = 0; } +} + +static void +cfq_update_group_leaf_weight(struct cfq_group *cfqg) +{ + BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); if (cfqg->new_leaf_weight) { cfqg->leaf_weight = cfqg->new_leaf_weight; @@ -1299,7 +1303,7 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) /* add to the service tree */ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - cfq_update_group_weight(cfqg); + cfq_update_group_leaf_weight(cfqg); __cfq_group_service_tree_add(st, cfqg); /* @@ -1323,6 +1327,7 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) */ while ((parent = cfqg_parent(pos))) { if (propagate) { + cfq_update_group_weight(pos); propagate = !parent->nr_active++; parent->children_weight += pos->weight; } From 7b5af5cffce569298b1d03af1ddf1dc43570ab42 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 28 Aug 2014 17:14:58 +0900 Subject: [PATCH 15/15] cfq-iosched: Add comments on update timing of weight Explain that weight has to be updated on activation. This complements previous fix e15693ef18e1 ("cfq-iosched: Fix wrong children_weight calculation"). Signed-off-by: Toshiaki Makita Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d7494637c5db..3f31cf9508e6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1272,6 +1272,9 @@ __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) rb_insert_color(&cfqg->rb_node, &st->rb); } +/* + * This has to be called only on activation of cfqg + */ static void cfq_update_group_weight(struct cfq_group *cfqg) { @@ -1303,6 +1306,11 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) /* add to the service tree */ BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + /* + * Update leaf_weight. We cannot update weight at this point + * because cfqg might already have been activated and is + * contributing its current weight to the parent's child_weight. + */ cfq_update_group_leaf_weight(cfqg); __cfq_group_service_tree_add(st, cfqg);