From 3681c85dffda70e551dead31c8d102bd69033fe8 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Sat, 5 Mar 2016 00:27:04 +0800 Subject: [PATCH 01/11] null_blk: add lightnvm null_blk device to the nullb_list After registering null_blk devices with lightnvm, we forget to add these devices to the nullb_list, which makes them invisible to the null_blk driver. Signed-off-by: Wenwei Tao Fixes: a514379b0c77 ("null_blk: oops when initializing without lightnvm") Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 64a7b5971b57..cab97593ba54 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -742,10 +742,11 @@ static int null_add_dev(void) add_disk(disk); +done: mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); mutex_unlock(&lock); -done: + return 0; out_cleanup_lightnvm: From 4c9dacb82d5aa36aa2568df60d897f2eb3d8819b Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Thu, 3 Mar 2016 15:06:37 +0100 Subject: [PATCH 02/11] lightnvm: specify target's logical address area MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can create more than one target on a lightnvm device by specifying its begin lun and end lun. But specifying only the physical address area is not enough; we also need to get the corresponding non-intersecting logical address area division from the backend device's logical address space. Otherwise the targets on the device might use the same logical addresses, causing incorrect information in the device's l2p table.
Signed-off-by: Wenwei Tao Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 1 + drivers/lightnvm/gennvm.c | 67 +++++++++++++++++++++++++++++++++++++++ drivers/lightnvm/gennvm.h | 6 ++++ drivers/lightnvm/rrpc.c | 35 ++++++++++++++++++-- drivers/lightnvm/rrpc.h | 1 + include/linux/lightnvm.h | 8 +++++ 6 files changed, 116 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 0d1fb6b40c46..2925fd0b82bb 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -466,6 +466,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->total_secs = dev->nr_luns * dev->sec_per_lun; INIT_LIST_HEAD(&dev->online_targets); mutex_init(&dev->mlock); + spin_lock_init(&dev->lock); return 0; } diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index d65ec36a2231..d460b37bb016 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -20,6 +20,68 @@ #include "gennvm.h" +static int gennvm_get_area(struct nvm_dev *dev, sector_t *lba, sector_t len) +{ + struct gen_nvm *gn = dev->mp; + struct gennvm_area *area, *prev, *next; + sector_t begin = 0; + sector_t max_sectors = (dev->sec_size * dev->total_secs) >> 9; + + if (len > max_sectors) + return -EINVAL; + + area = kmalloc(sizeof(struct gennvm_area), GFP_KERNEL); + if (!area) + return -ENOMEM; + + prev = NULL; + + spin_lock(&dev->lock); + list_for_each_entry(next, &gn->area_list, list) { + if (begin + len > next->begin) { + begin = next->end; + prev = next; + continue; + } + break; + } + + if ((begin + len) > max_sectors) { + spin_unlock(&dev->lock); + kfree(area); + return -EINVAL; + } + + area->begin = *lba = begin; + area->end = begin + len; + + if (prev) /* insert into sorted order */ + list_add(&area->list, &prev->list); + else + list_add(&area->list, &gn->area_list); + spin_unlock(&dev->lock); + + return 0; +} + +static void gennvm_put_area(struct nvm_dev *dev, sector_t begin) +{ + struct gen_nvm *gn = dev->mp; + 
struct gennvm_area *area; + + spin_lock(&dev->lock); + list_for_each_entry(area, &gn->area_list, list) { + if (area->begin != begin) + continue; + + list_del(&area->list); + spin_unlock(&dev->lock); + kfree(area); + return; + } + spin_unlock(&dev->lock); +} + static void gennvm_blocks_free(struct nvm_dev *dev) { struct gen_nvm *gn = dev->mp; @@ -229,6 +291,7 @@ static int gennvm_register(struct nvm_dev *dev) gn->dev = dev; gn->nr_luns = dev->nr_luns; + INIT_LIST_HEAD(&gn->area_list); dev->mp = gn; ret = gennvm_luns_init(dev, gn); @@ -465,6 +528,10 @@ static struct nvmm_type gennvm = { .get_lun = gennvm_get_lun, .lun_info_print = gennvm_lun_info_print, + + .get_area = gennvm_get_area, + .put_area = gennvm_put_area, + }; static int __init gennvm_module_init(void) diff --git a/drivers/lightnvm/gennvm.h b/drivers/lightnvm/gennvm.h index 9c24b5b32dac..04d7c23cfc61 100644 --- a/drivers/lightnvm/gennvm.h +++ b/drivers/lightnvm/gennvm.h @@ -39,8 +39,14 @@ struct gen_nvm { int nr_luns; struct gen_lun *luns; + struct list_head area_list; }; +struct gennvm_area { + struct list_head list; + sector_t begin; + sector_t end; /* end is excluded */ +}; #define gennvm_for_each_lun(bm, lun, i) \ for ((i) = 0, lun = &(bm)->luns[0]; \ (i) < (bm)->nr_luns; (i)++, lun = &(bm)->luns[(i)]) diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 82343783aa47..c1e3c83f06b3 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -1053,8 +1053,11 @@ static int rrpc_map_init(struct rrpc *rrpc) { struct nvm_dev *dev = rrpc->dev; sector_t i; + u64 slba; int ret; + slba = rrpc->soffset >> (ilog2(dev->sec_size) - 9); + rrpc->trans_map = vzalloc(sizeof(struct rrpc_addr) * rrpc->nr_sects); if (!rrpc->trans_map) return -ENOMEM; @@ -1076,7 +1079,7 @@ static int rrpc_map_init(struct rrpc *rrpc) return 0; /* Bring up the mapping table from device */ - ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs, rrpc_l2p_update, + ret = dev->ops->get_l2p_tbl(dev, slba, rrpc->nr_sects, 
rrpc_l2p_update, rrpc); if (ret) { pr_err("nvm: rrpc: could not read L2P table.\n"); @@ -1086,7 +1089,6 @@ static int rrpc_map_init(struct rrpc *rrpc) return 0; } - /* Minimum pages needed within a lun */ #define PAGE_POOL_SIZE 16 #define ADDR_POOL_SIZE 64 @@ -1200,12 +1202,33 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) return -ENOMEM; } +/* returns 0 on success and stores the beginning address in *begin */ +static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin) +{ + struct nvm_dev *dev = rrpc->dev; + struct nvmm_type *mt = dev->mt; + sector_t size = rrpc->nr_sects * dev->sec_size; + + size >>= 9; + + return mt->get_area(dev, begin, size); +} + +static void rrpc_area_free(struct rrpc *rrpc) +{ + struct nvm_dev *dev = rrpc->dev; + struct nvmm_type *mt = dev->mt; + + mt->put_area(dev, rrpc->soffset); +} + static void rrpc_free(struct rrpc *rrpc) { rrpc_gc_free(rrpc); rrpc_map_free(rrpc); rrpc_core_free(rrpc); rrpc_luns_free(rrpc); + rrpc_area_free(rrpc); kfree(rrpc); } @@ -1327,6 +1350,7 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk, struct request_queue *bqueue = dev->q; struct request_queue *tqueue = tdisk->queue; struct rrpc *rrpc; + sector_t soffset; int ret; if (!(dev->identity.dom & NVM_RSP_L2P)) { @@ -1352,6 +1376,13 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk, /* simple round-robin strategy */ atomic_set(&rrpc->next_lun, -1); + ret = rrpc_area_init(rrpc, &soffset); + if (ret < 0) { + pr_err("nvm: rrpc: could not initialize area\n"); + return ERR_PTR(ret); + } + rrpc->soffset = soffset; + ret = rrpc_luns_init(rrpc, lun_begin, lun_end); if (ret) { pr_err("nvm: rrpc: could not initialize luns\n"); diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h index 855f4a5ca7dd..2653484a3b40 100644 --- a/drivers/lightnvm/rrpc.h +++ b/drivers/lightnvm/rrpc.h @@ -97,6 +97,7 @@ struct rrpc { struct nvm_dev *dev; struct gendisk *disk; + sector_t soffset; /* logical sector offset 
*/ u64 poffset; /* physical page offset */ int lun_offset; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index c3c43184a787..b466bd9f2cf8 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -355,6 +355,7 @@ struct nvm_dev { char name[DISK_NAME_LEN]; struct mutex mlock; + spinlock_t lock; }; static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, @@ -467,6 +468,9 @@ typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *); +typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); +typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t); + struct nvmm_type { const char *name; unsigned int version[3]; @@ -491,6 +495,10 @@ struct nvmm_type { /* Statistics */ nvmm_lun_info_print_fn *lun_info_print; + + nvmm_get_area_fn *get_area; + nvmm_put_area_fn *put_area; + struct list_head list; }; From da1e284919b0b99c5bf0618b6c98cbaf2c17e62e Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Thu, 3 Mar 2016 15:06:38 +0100 Subject: [PATCH 03/11] lightnvm: add a bitmap of luns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a bitmap of luns to indicate the status of luns: inuse/available. When create targets do the necessary check to avoid allocating luns that are already allocated. Signed-off-by: Wenwei Tao Freed dev->lun_map if nvm_core_init later failed in the init process. 
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 6 ++++ drivers/lightnvm/gennvm.c | 18 ++++++++++ drivers/lightnvm/rrpc.c | 74 ++++++++++++++++++++++++--------------- include/linux/lightnvm.h | 5 +++ 4 files changed, 74 insertions(+), 29 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 2925fd0b82bb..0dc9a80adb94 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -464,6 +464,10 @@ static int nvm_core_init(struct nvm_dev *dev) dev->nr_luns = dev->luns_per_chnl * dev->nr_chnls; dev->total_secs = dev->nr_luns * dev->sec_per_lun; + dev->lun_map = kcalloc(BITS_TO_LONGS(dev->nr_luns), + sizeof(unsigned long), GFP_KERNEL); + if (!dev->lun_map) + return -ENOMEM; INIT_LIST_HEAD(&dev->online_targets); mutex_init(&dev->mlock); spin_lock_init(&dev->lock); @@ -586,6 +590,7 @@ int nvm_register(struct request_queue *q, char *disk_name, return 0; err_init: + kfree(dev->lun_map); kfree(dev); return ret; } @@ -608,6 +613,7 @@ void nvm_unregister(char *disk_name) up_write(&nvm_lock); nvm_exit(dev); + kfree(dev->lun_map); kfree(dev); } EXPORT_SYMBOL(nvm_unregister); diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index d460b37bb016..b97801c00099 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -192,6 +192,9 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) lun_id = div_u64(pba, dev->sec_per_lun); lun = &gn->luns[lun_id]; + if (!test_bit(lun_id, dev->lun_map)) + __set_bit(lun_id, dev->lun_map); + /* Calculate block offset into lun */ pba = pba - (dev->sec_per_lun * lun_id); blk = &lun->vlun.blocks[div_u64(pba, dev->sec_per_blk)]; @@ -482,10 +485,23 @@ static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, return nvm_erase_ppa(dev, &addr, 1); } +static int gennvm_reserve_lun(struct nvm_dev *dev, int lunid) +{ + return test_and_set_bit(lunid, dev->lun_map); +} + +static void gennvm_release_lun(struct nvm_dev 
*dev, int lunid) +{ + WARN_ON(!test_and_clear_bit(lunid, dev->lun_map)); +} + static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid) { struct gen_nvm *gn = dev->mp; + if (unlikely(lunid >= dev->nr_luns)) + return NULL; + return &gn->luns[lunid].vlun; } @@ -527,6 +543,8 @@ static struct nvmm_type gennvm = { .erase_blk = gennvm_erase_blk, .get_lun = gennvm_get_lun, + .reserve_lun = gennvm_reserve_lun, + .release_lun = gennvm_release_lun, .lun_info_print = gennvm_lun_info_print, .get_area = gennvm_get_area, diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index c1e3c83f06b3..3ab6495c3fd8 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -965,25 +965,11 @@ static void rrpc_requeue(struct work_struct *work) static void rrpc_gc_free(struct rrpc *rrpc) { - struct rrpc_lun *rlun; - int i; - if (rrpc->krqd_wq) destroy_workqueue(rrpc->krqd_wq); if (rrpc->kgc_wq) destroy_workqueue(rrpc->kgc_wq); - - if (!rrpc->luns) - return; - - for (i = 0; i < rrpc->nr_luns; i++) { - rlun = &rrpc->luns[i]; - - if (!rlun->blocks) - break; - vfree(rlun->blocks); - } } static int rrpc_gc_init(struct rrpc *rrpc) @@ -1143,6 +1129,23 @@ static void rrpc_core_free(struct rrpc *rrpc) static void rrpc_luns_free(struct rrpc *rrpc) { + struct nvm_dev *dev = rrpc->dev; + struct nvm_lun *lun; + struct rrpc_lun *rlun; + int i; + + if (!rrpc->luns) + return; + + for (i = 0; i < rrpc->nr_luns; i++) { + rlun = &rrpc->luns[i]; + lun = rlun->parent; + if (!lun) + break; + dev->mt->release_lun(dev, lun->id); + vfree(rlun->blocks); + } + kfree(rrpc->luns); } @@ -1150,7 +1153,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) { struct nvm_dev *dev = rrpc->dev; struct rrpc_lun *rlun; - int i, j; + int i, j, ret = -EINVAL; if (dev->sec_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) { pr_err("rrpc: number of pages per block too high."); @@ -1166,25 +1169,26 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) /* 
1:1 mapping */ for (i = 0; i < rrpc->nr_luns; i++) { - struct nvm_lun *lun = dev->mt->get_lun(dev, lun_begin + i); + int lunid = lun_begin + i; + struct nvm_lun *lun; + + if (dev->mt->reserve_lun(dev, lunid)) { + pr_err("rrpc: lun %u is already allocated\n", lunid); + goto err; + } + + lun = dev->mt->get_lun(dev, lunid); + if (!lun) + goto err; rlun = &rrpc->luns[i]; - rlun->rrpc = rrpc; rlun->parent = lun; - INIT_LIST_HEAD(&rlun->prio_list); - INIT_LIST_HEAD(&rlun->open_list); - INIT_LIST_HEAD(&rlun->closed_list); - - INIT_WORK(&rlun->ws_gc, rrpc_lun_gc); - spin_lock_init(&rlun->lock); - - rrpc->total_blocks += dev->blks_per_lun; - rrpc->nr_sects += dev->sec_per_lun; - rlun->blocks = vzalloc(sizeof(struct rrpc_block) * rrpc->dev->blks_per_lun); - if (!rlun->blocks) + if (!rlun->blocks) { + ret = -ENOMEM; goto err; + } for (j = 0; j < rrpc->dev->blks_per_lun; j++) { struct rrpc_block *rblk = &rlun->blocks[j]; @@ -1195,11 +1199,23 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) INIT_LIST_HEAD(&rblk->prio); spin_lock_init(&rblk->lock); } + + rlun->rrpc = rrpc; + INIT_LIST_HEAD(&rlun->prio_list); + INIT_LIST_HEAD(&rlun->open_list); + INIT_LIST_HEAD(&rlun->closed_list); + + INIT_WORK(&rlun->ws_gc, rrpc_lun_gc); + spin_lock_init(&rlun->lock); + + rrpc->total_blocks += dev->blks_per_lun; + rrpc->nr_sects += dev->sec_per_lun; + } return 0; err: - return -ENOMEM; + return ret; } /* returns 0 on success and stores the beginning address in *begin */ diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index b466bd9f2cf8..0ee2c2c78ffd 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -346,6 +346,7 @@ struct nvm_dev { int nr_luns; unsigned max_pages_per_blk; + unsigned long *lun_map; void *ppalist_pool; struct nvm_id identity; @@ -466,6 +467,8 @@ typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, unsigned long); typedef struct 
 nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); +typedef int (nvmm_reserve_lun)(struct nvm_dev *, int); +typedef void (nvmm_release_lun)(struct nvm_dev *, int); typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *); typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); @@ -492,6 +495,8 @@ struct nvmm_type { /* Configuration management */ nvmm_get_lun_fn *get_lun; + nvmm_reserve_lun *reserve_lun; + nvmm_release_lun *release_lun; /* Statistics */ nvmm_lun_info_print_fn *lun_info_print; From 9f867268436d799549909437e627e7cf279e1127 Mon Sep 17 00:00:00 2001 From: Matias Bjorling Date: Thu, 3 Mar 2016 15:06:39 +0100 Subject: [PATCH 04/11] nvme: lightnvm: return ppa completion status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PPAs sent to the device are separately acknowledged in a 64-bit status variable. The status is stored in DW0 and DW1 of the completion queue entry. Store this status inside the nvm_rq for further processing. This can later be used to implement retry techniques for failed writes and reads. Reviewed-by: Christoph Hellwig Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 17 +++++++++++++++-- include/linux/lightnvm.h | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 42a01a931989..9461dd639acd 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -146,6 +146,14 @@ struct nvme_nvm_command { }; }; +struct nvme_nvm_completion { + __le64 result; /* Used by LightNVM to return ppa completions */ + __le16 sq_head; /* how much of this queue may be reclaimed */ + __le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why?
*/ +}; + #define NVME_NVM_LP_MLC_PAIRS 886 struct nvme_nvm_lp_mlc { __u16 num_pairs; @@ -507,6 +515,10 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, static void nvme_nvm_end_io(struct request *rq, int error) { struct nvm_rq *rqd = rq->end_io_data; + struct nvme_nvm_completion *cqe = rq->special; + + if (cqe) + rqd->ppa_status = le64_to_cpu(cqe->result); nvm_end_io(rqd, error); @@ -526,7 +538,8 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) if (IS_ERR(rq)) return -ENOMEM; - cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); + cmd = kzalloc(sizeof(struct nvme_nvm_command) + + sizeof(struct nvme_nvm_completion), GFP_KERNEL); if (!cmd) { blk_mq_free_request(rq); return -ENOMEM; @@ -545,7 +558,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) rq->cmd = (unsigned char *)cmd; rq->cmd_len = sizeof(struct nvme_nvm_command); - rq->special = (void *)0; + rq->special = cmd + 1; rq->end_io_data = rqd; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 0ee2c2c78ffd..cdcb2ccbefa8 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -242,6 +242,7 @@ struct nvm_rq { uint16_t nr_pages; uint16_t flags; + u64 ppa_status; /* ppa media status */ int error; }; From 719b59172cdcd5a2ba532b4bb4d56c36df20c28e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Thu, 3 Mar 2016 15:06:40 +0100 Subject: [PATCH 05/11] lightnvm: do not reserve lun on l2p loading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the l2p table is loaded, addresses are checked for the lun they belong to and luns are reserved accordingly. This assumes that metadata is being stored in the backend device to recover the previous target configuration. Since this is not yet implemented, this check collides with some of the core initialization (e.g., sysblock initialization when a page is formed by several sectors). 
We take this check out and for now rely on that the right target will be created instead. When metadata is stored to recover a target, this check will come natural as part of the recovery strategy. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/gennvm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index b97801c00099..42c1c2ab6cc3 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -192,9 +192,6 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) lun_id = div_u64(pba, dev->sec_per_lun); lun = &gn->luns[lun_id]; - if (!test_bit(lun_id, dev->lun_map)) - __set_bit(lun_id, dev->lun_map); - /* Calculate block offset into lun */ pba = pba - (dev->sec_per_lun * lun_id); blk = &lun->vlun.blocks[div_u64(pba, dev->sec_per_blk)]; From 29fd20b8e68a4d31a82909265b1e650b7b860f54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Thu, 3 Mar 2016 15:06:41 +0100 Subject: [PATCH 06/11] lightnvm: do not load L2P table if not supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An Open-Channel SSD can work on two modes: (i) hybrid mode, where the L2P table is maintained both by the host and by the device; and (ii) full host-based, where the L2P table is uniquely maintained by the host. In the advent of a new target implementing the full host-based mode, do not assume that the L2P table must be loaded on the generic media manager; check device properties loaded on the identify command instead. Signed-off-by: Javier González Moved into the following statement. 
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/gennvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 42c1c2ab6cc3..72e124a3927d 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -257,7 +257,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) } } - if (dev->ops->get_l2p_tbl) { + if ((dev->identity.dom & NVM_RSP_L2P) && dev->ops->get_l2p_tbl) { ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs, gennvm_block_map, dev); if (ret) { From 5173cb814b36439a9d9537016965e75798b9f130 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 19 Mar 2016 01:35:54 +0300 Subject: [PATCH 07/11] mtip32xx: fix checks for dma mapping errors exec_drive_taskfile() checks for dma mapping errors by comparison returned address with zero, while pci_dma_mapping_error() should be used. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index cc2e71d0a77f..25824c1697c5 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2051,7 +2051,7 @@ static int exec_drive_taskfile(struct driver_data *dd, outbuf, taskout, DMA_TO_DEVICE); - if (outbuf_dma == 0) { + if (pci_dma_mapping_error(dd->pdev, outbuf_dma)) { err = -ENOMEM; goto abort; } @@ -2068,7 +2068,7 @@ static int exec_drive_taskfile(struct driver_data *dd, inbuf_dma = pci_map_single(dd->pdev, inbuf, taskin, DMA_FROM_DEVICE); - if (inbuf_dma == 0) { + if (pci_dma_mapping_error(dd->pdev, inbuf_dma)) { err = -ENOMEM; goto abort; } From 897bb0c7f1ea82d7cc882b19790b5e1df00ffc29 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Mar 2016 11:30:33 +0100 Subject: [PATCH 08/11] blk-mq: Use proper 
cpumask iterator queue_for_each_ctx() iterates over per_cpu variables under the assumption that the possible cpu mask cannot have holes. That's wrong as all cpumasks can have holes. In case there are holes the iteration ends up accessing uninitialized memory and crashing as a result. Replace the macro by a proper for_each_possible_cpu() loop and drop the unused macro blk_ctx_sum() which references queue_for_each_ctx(). Reported-by: Xiong Zhou Signed-off-by: Thomas Gleixner Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 6 ++++-- block/blk-mq.c | 3 ++- include/linux/blk-mq.h | 14 -------------- 3 files changed, 6 insertions(+), 17 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 431fdda21737..4ea4dd8a1eed 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -416,12 +416,14 @@ void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) static void blk_mq_sysfs_init(struct request_queue *q) { struct blk_mq_ctx *ctx; - int i; + int cpu; kobject_init(&q->mq_kobj, &blk_mq_ktype); - queue_for_each_ctx(q, ctx, i) + for_each_possible_cpu(cpu) { + ctx = per_cpu_ptr(q->queue_ctx, cpu); kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); + } } int blk_mq_register_disk(struct gendisk *disk) diff --git a/block/blk-mq.c b/block/blk-mq.c index 050f7a13021b..1699baf39b78 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1798,11 +1798,12 @@ static void blk_mq_map_swqueue(struct request_queue *q, /* * Map software to hardware queues */ - queue_for_each_ctx(q, ctx, i) { + for_each_possible_cpu(i) { /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpumask_test_cpu(i, online_mask)) continue; + ctx = per_cpu_ptr(q->queue_ctx, i); hctx = q->mq_ops->map_queue(q, i); cpumask_set_cpu(i, hctx->cpumask); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 15a73d49fd1d..9ac9799b702b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -263,22 +263,8 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) for 
((i) = 0; (i) < (q)->nr_hw_queues && \ ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) -#define queue_for_each_ctx(q, ctx, i) \ - for ((i) = 0; (i) < (q)->nr_queues && \ - ({ ctx = per_cpu_ptr((q)->queue_ctx, (i)); 1; }); (i)++) - #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) -#define blk_ctx_sum(q, sum) \ -({ \ - struct blk_mq_ctx *__x; \ - unsigned int __ret = 0, __i; \ - \ - queue_for_each_ctx((q), __x, __i) \ - __ret += sum; \ - __ret; \ -}) - #endif From 614a4e3773148a31f58dc174bbf578ceb63510c2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Mar 2016 13:50:03 -0400 Subject: [PATCH 09/11] writeback, cgroup: fix premature wb_put() in locked_inode_to_wb_and_lock_list() locked_inode_to_wb_and_lock_list() wb_get()'s the wb associated with the target inode, unlocks inode, locks the wb's list_lock and verifies that the inode is still associated with the wb. To prevent the wb going away between dropping inode lock and acquiring list_lock, the wb is pinned while inode lock is held. The wb reference is put right after acquiring list_lock citing that the wb won't be dereferenced anymore. This isn't true. If the inode is still associated with the wb, the inode has a reference and it's safe to return the wb; however, if inode has been switched, the wb still needs to be unlocked which is a dereference and can lead to use-after-free if it races with wb destruction. Fix it by putting the reference after releasing list_lock.
Signed-off-by: Tejun Heo Fixes: 87e1d789bf55 ("writeback: implement [locked_]inode_to_wb_and_lock_list()") Cc: stable@vger.kernel.org # v4.2+ Tested-by: Tahsin Erdogan Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5c46ed9f3e14..7b9582ed26f2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -281,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode) wb_get(wb); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); - wb_put(wb); /* not gonna deref it anymore */ /* i_wb may have changed inbetween, can't use inode_to_wb() */ - if (likely(wb == inode->i_wb)) - return wb; /* @inode already has ref */ + if (likely(wb == inode->i_wb)) { + wb_put(wb); /* @inode already has ref */ + return wb; + } spin_unlock(&wb->list_lock); + wb_put(wb); cpu_relax(); spin_lock(&inode->i_lock); } From aaf2559332ba272671bb870464a99b909b29a3a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Mar 2016 13:52:04 -0400 Subject: [PATCH 10/11] writeback, cgroup: fix use of the wrong bdi_writeback which mismatches the inode When cgroup writeback is in use, there can be multiple wb's (bdi_writeback's) per bdi and an inode may switch among them dynamically. In a couple places, the wrong wb was used leading to performing operations on the wrong list under the wrong lock corrupting the io lists. * writeback_single_inode() was taking @wb parameter and used it to remove the inode from io lists if it becomes clean after writeback. The callers of this function were always passing in the root wb regardless of the actual wb that the inode was associated with, which could also change while writeback is in progress. Fix it by dropping the @wb parameter and using inode_to_wb_and_lock_list() to determine and lock the associated wb. * After writeback_sb_inodes() writes out an inode, it re-locks @wb and inode to remove it from or move it to the right io list. 
It assumes that the inode is still associated with @wb; however, the inode may have switched to another wb while writeback was in progress. Fix it by using inode_to_wb_and_lock_list() to determine and lock the associated wb after writeback is complete. As the function requires the original @wb->list_lock locked for the next iteration, in the unlikely case where the inode has changed association, switch the locks. Kudos to Tahsin for pinpointing these subtle breakages. Signed-off-by: Tejun Heo Fixes: d10c80955265 ("writeback: implement foreign cgroup inode bdi_writeback switching") Link: http://lkml.kernel.org/g/CAAeU0aMYeM_39Y2+PaRvyB1nqAPYZSNngJ1eBRmrxn7gKAt2Mg@mail.gmail.com Reported-and-diagnosed-by: Tahsin Erdogan Tested-by: Tahsin Erdogan Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7b9582ed26f2..fee81e8768c9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1339,10 +1339,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() * and does more profound writeback list handling in writeback_sb_inodes(). */ -static int -writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, - struct writeback_control *wbc) +static int writeback_single_inode(struct inode *inode, + struct writeback_control *wbc) { + struct bdi_writeback *wb; int ret = 0; spin_lock(&inode->i_lock); @@ -1380,7 +1380,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, ret = __writeback_single_inode(inode, wbc); wbc_detach_inode(wbc); - spin_lock(&wb->list_lock); + + wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* * If inode is clean, remove it from writeback lists. 
Otherwise don't @@ -1455,6 +1456,7 @@ static long writeback_sb_inodes(struct super_block *sb, while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); + struct bdi_writeback *tmp_wb; if (inode->i_sb != sb) { if (work->sb) { @@ -1545,15 +1547,23 @@ static long writeback_sb_inodes(struct super_block *sb, cond_resched(); } - - spin_lock(&wb->list_lock); + /* + * Requeue @inode if still dirty. Be careful as @inode may + * have been switched to another wb in the meantime. + */ + tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) wrote++; - requeue_inode(inode, wb, &wbc); + requeue_inode(inode, tmp_wb, &wbc); inode_sync_complete(inode); spin_unlock(&inode->i_lock); + if (unlikely(tmp_wb != wb)) { + spin_unlock(&tmp_wb->list_lock); + spin_lock(&wb->list_lock); + } + /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. @@ -2340,7 +2350,6 @@ EXPORT_SYMBOL(sync_inodes_sb); */ int write_inode_now(struct inode *inode, int sync) { - struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = sync ? 
WB_SYNC_ALL : WB_SYNC_NONE, @@ -2352,7 +2361,7 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - return writeback_single_inode(inode, wb, &wbc); + return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(write_inode_now); @@ -2369,7 +2378,7 @@ EXPORT_SYMBOL(write_inode_now); */ int sync_inode(struct inode *inode, struct writeback_control *wbc) { - return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc); + return writeback_single_inode(inode, wbc); } EXPORT_SYMBOL(sync_inode); From d783e0bd02e700e7a893ef4fa71c69438ac1c276 Mon Sep 17 00:00:00 2001 From: Marta Rybczynska Date: Tue, 22 Mar 2016 16:02:06 +0100 Subject: [PATCH 11/11] nvme: avoid cqe corruption when update at the same time as read Make sure the CQE phase (validity) is read before the rest of the structure. The phase bit is the highest address and the CQE read will happen on most platforms from lower to upper addresses and will be done by multiple non-atomic loads. If the structure is updated by PCI during the reads from the processor, the processor may get a corrupted copy. The addition of the new nvme_cqe_valid function that verifies the validity bit also allows refactoring of the other CQE read sequences. 
Signed-off-by: Marta Rybczynska Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f8db70ae172d..24ccda303efb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -723,6 +723,13 @@ static void nvme_complete_rq(struct request *req) blk_mq_end_request(req, error); } +/* We read the CQE phase first to check if the rest of the entry is valid */ +static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head, + u16 phase) +{ + return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase; +} + static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) { u16 head, phase; @@ -730,13 +737,10 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) head = nvmeq->cq_head; phase = nvmeq->cq_phase; - for (;;) { + while (nvme_cqe_valid(nvmeq, head, phase)) { struct nvme_completion cqe = nvmeq->cqes[head]; - u16 status = le16_to_cpu(cqe.status); struct request *req; - if ((status & 1) != phase) - break; if (++head == nvmeq->q_depth) { head = 0; phase = !phase; @@ -767,7 +771,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special) memcpy(req->special, &cqe, sizeof(cqe)); - blk_mq_complete_request(req, status >> 1); + blk_mq_complete_request(req, le16_to_cpu(cqe.status) >> 1); } @@ -808,18 +812,16 @@ static irqreturn_t nvme_irq(int irq, void *data) static irqreturn_t nvme_irq_check(int irq, void *data) { struct nvme_queue *nvmeq = data; - struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; - if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) - return IRQ_NONE; - return IRQ_WAKE_THREAD; + if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) + 
return IRQ_WAKE_THREAD; + return IRQ_NONE; } static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) { struct nvme_queue *nvmeq = hctx->driver_data; - if ((le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) == - nvmeq->cq_phase) { + if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { spin_lock_irq(&nvmeq->q_lock); __nvme_process_cq(nvmeq, &tag); spin_unlock_irq(&nvmeq->q_lock);