blk-mq: improve layout of blk_mq_hw_ctx

Various cache line optimizations:

- Move delay_work towards the end. It's huge, and we don't use it
  a lot (only SCSI).

- Move the atomic state into the same cacheline as the dispatch
  list and lock.

- Rearrange a few members to pack it better.

- Shrink the max-order for dispatch accounting from 10 to 7. This
  means that ->dispatched[] and ->run now take up their own
  cacheline.

This shrinks struct blk_mq_hw_ctx down to 8 cachelines.

Signed-off-by: Jens Axboe <axboe@fb.com>
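
[Editorial note, not part of the original commit message.] The arithmetic behind the dispatch-accounting change: seven unsigned longs (56 bytes) for ->dispatched[] plus 8 bytes for ->run is exactly one 64-byte cacheline, whereas the old order of 10 (80 + 8 = 88 bytes) spilled into a second line. The sketch below is a purely illustrative userspace mock-up, not the kernel header: the struct name is hypothetical, all types are stand-ins, and a 64-byte line size is assumed. It uses offsetof() to print which cacheline each member starts on.

/*
 * Illustrative userspace mock-up (not the kernel header): stand-in
 * types, hypothetical struct name, assumed 64-byte cachelines.
 */
#include <stdio.h>
#include <stddef.h>

#define CACHELINE_BYTES		64
#define MAX_DISPATCH_ORDER	7	/* was 10 before this change */

struct mock_hw_ctx {
	/* hot: lock, dispatch list and state packed into one line */
	struct {
		unsigned long	lock;		/* stand-in for spinlock_t */
		void		*dispatch[2];	/* stand-in for list_head */
		unsigned long	state;
	} hot __attribute__((aligned(CACHELINE_BYTES)));

	/* run + dispatched[7] is 8 + 56 = 64 bytes: one full line */
	unsigned long	run __attribute__((aligned(CACHELINE_BYTES)));
	unsigned long	dispatched[MAX_DISPATCH_ORDER];

	/* cold and big: pushed towards the end, like delay_work */
	char		delay_work[88];		/* stand-in for struct delayed_work */
};

#define SHOW(m)							\
	printf("%-12s offset %3zu  cacheline %zu\n", #m,	\
	       offsetof(struct mock_hw_ctx, m),			\
	       offsetof(struct mock_hw_ctx, m) / CACHELINE_BYTES)

int main(void)
{
	SHOW(hot);
	SHOW(run);
	SHOW(dispatched);
	SHOW(delay_work);
	printf("total: %zu bytes, %zu cachelines\n",
	       sizeof(struct mock_hw_ctx),
	       sizeof(struct mock_hw_ctx) / CACHELINE_BYTES);
	return 0;
}

On a built kernel with debug info, the real layout (per-member offsets, holes, and the total cacheline count) can be inspected with pahole, e.g. pahole -C blk_mq_hw_ctx vmlinux.
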
commit 8d354f133e
parent 27489a3c82
Author: Jens Axboe <axboe@fb.com>
Date:   2016-08-25 08:00:28 -06:00

include/linux/blk-mq.h

@@ -22,11 +22,10 @@ struct blk_mq_hw_ctx {
 	struct {
 		spinlock_t		lock;
 		struct list_head	dispatch;
+		unsigned long		state;		/* BLK_MQ_S_* flags */
 	} ____cacheline_aligned_in_smp;
 
-	unsigned long		state;		/* BLK_MQ_S_* flags */
 	struct work_struct	run_work;
-	struct delayed_work	delay_work;
 	cpumask_var_t		cpumask;
 	int			next_cpu;
 	int			next_cpu_batch;
@@ -40,8 +39,8 @@ struct blk_mq_hw_ctx {
 
 	struct blk_mq_ctxmap	ctx_map;
 
-	unsigned int		nr_ctx;
 	struct blk_mq_ctx	**ctxs;
+	unsigned int		nr_ctx;
 
 	atomic_t		wait_index;
 
@@ -49,7 +48,7 @@ struct blk_mq_hw_ctx {
 
 	unsigned long		queued;
 	unsigned long		run;
-#define BLK_MQ_MAX_DISPATCH_ORDER	10
+#define BLK_MQ_MAX_DISPATCH_ORDER	7
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
 	unsigned int		numa_node;
@@ -57,6 +56,8 @@ struct blk_mq_hw_ctx {
 
 	atomic_t		nr_active;
 
+	struct delayed_work	delay_work;
+
 	struct blk_mq_cpu_notifier	cpu_notifier;
 	struct kobject		kobj;
 