From c23f3445e68e1db0e74099f264bc5ff5d55ebdeb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 2 Jun 2010 20:40:00 +0200 Subject: [PATCH] vhost: replace vhost_workqueue with per-vhost kthread Replace vhost_workqueue with per-vhost kthread. Other than callback argument change from struct work_struct * to struct vhost_work *, there's no visible change to vhost_poll_*() interface. This conversion is to make each vhost use a dedicated kthread so that resource control via cgroup can be applied. Partially based on Sridhar Samudrala's patch. * Updated to use sub structure vhost_work instead of directly using vhost_poll at Michael's suggestion. * Added flusher wake_up() optimization at Michael's suggestion. Changes by MST: * Converted atomics/barrier use to a spinlock. * Create thread on SET_OWNER * Fix flushing Signed-off-by: Tejun Heo Signed-off-by: Michael S. Tsirkin Cc: Sridhar Samudrala --- drivers/vhost/net.c | 56 +++++++---------- drivers/vhost/vhost.c | 143 +++++++++++++++++++++++++++++++++--------- drivers/vhost/vhost.h | 38 +++++++---- 3 files changed, 164 insertions(+), 73 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f11e6bb5b036..d395b59289ae 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -302,54 +302,58 @@ static void handle_rx(struct vhost_net *net) unuse_mm(net->dev.mm); } -static void handle_tx_kick(struct work_struct *work) +static void handle_tx_kick(struct vhost_work *work) { - struct vhost_virtqueue *vq; - struct vhost_net *net; - vq = container_of(work, struct vhost_virtqueue, poll.work); - net = container_of(vq->dev, struct vhost_net, dev); + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); + handle_tx(net); } -static void handle_rx_kick(struct work_struct *work) +static void handle_rx_kick(struct vhost_work *work) { - struct vhost_virtqueue *vq; - struct vhost_net *net; - vq = container_of(work, struct vhost_virtqueue, poll.work); - net = container_of(vq->dev, struct vhost_net, dev); + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, + poll.work); + struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev); + handle_rx(net); } -static void handle_tx_net(struct work_struct *work) +static void handle_tx_net(struct vhost_work *work) { - struct vhost_net *net; - net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); + struct vhost_net *net = container_of(work, struct vhost_net, + poll[VHOST_NET_VQ_TX].work); handle_tx(net); } -static void handle_rx_net(struct work_struct *work) +static void handle_rx_net(struct vhost_work *work) { - struct vhost_net *net; - net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); + struct vhost_net *net = container_of(work, struct vhost_net, + poll[VHOST_NET_VQ_RX].work); handle_rx(net); } static int vhost_net_open(struct inode *inode, struct file *f) { struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); + struct vhost_dev *dev; int r; + if (!n) return -ENOMEM; + + dev = &n->dev; n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; - r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX); + r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX); if (r < 0) { kfree(n); return r; } - vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); - vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev); n->tx_poll_state = VHOST_NET_POLL_DISABLED; f->private_data = n; @@ -656,25 +660,13 @@ static struct miscdevice vhost_net_misc = { static int vhost_net_init(void) { - int r = vhost_init(); - if (r) - goto err_init; - r = misc_register(&vhost_net_misc); - if (r) - goto err_reg; - return 0; -err_reg: - vhost_cleanup(); -err_init: - return r; - + return misc_register(&vhost_net_misc); } module_init(vhost_net_init); static void vhost_net_exit(void) { misc_deregister(&vhost_net_misc); - vhost_cleanup(); } module_exit(vhost_net_exit); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 248ed2db0711..30d93c2b45b8 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -17,12 +17,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include @@ -37,8 +37,6 @@ enum { VHOST_MEMORY_F_LOG = 0x1, }; -static struct workqueue_struct *vhost_workqueue; - static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { @@ -52,23 +50,31 @@ static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) { - struct vhost_poll *poll; - poll = container_of(wait, struct vhost_poll, wait); + struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait); + if (!((unsigned long)key & poll->mask)) return 0; - queue_work(vhost_workqueue, &poll->work); + vhost_poll_queue(poll); return 0; } /* Init poll structure */ -void vhost_poll_init(struct vhost_poll *poll, work_func_t func, - unsigned long mask) +void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, + unsigned long mask, struct vhost_dev *dev) { - INIT_WORK(&poll->work, func); + struct vhost_work *work = &poll->work; + init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); init_poll_funcptr(&poll->table, vhost_poll_func); poll->mask = mask; + poll->dev = dev; + + INIT_LIST_HEAD(&work->node); + work->fn = fn; + init_waitqueue_head(&work->done); + work->flushing = 0; + work->queue_seq = work->done_seq = 0; } /* Start polling a file. We add ourselves to file's wait queue. The caller must @@ -92,12 +98,40 @@ void vhost_poll_stop(struct vhost_poll *poll) * locks that are also used by the callback. */ void vhost_poll_flush(struct vhost_poll *poll) { - flush_work(&poll->work); + struct vhost_work *work = &poll->work; + unsigned seq; + int left; + int flushing; + + spin_lock_irq(&poll->dev->work_lock); + seq = work->queue_seq; + work->flushing++; + spin_unlock_irq(&poll->dev->work_lock); + wait_event(work->done, ({ + spin_lock_irq(&poll->dev->work_lock); + left = seq - work->done_seq <= 0; + spin_unlock_irq(&poll->dev->work_lock); + left; + })); + spin_lock_irq(&poll->dev->work_lock); + flushing = --work->flushing; + spin_unlock_irq(&poll->dev->work_lock); + BUG_ON(flushing < 0); } void vhost_poll_queue(struct vhost_poll *poll) { - queue_work(vhost_workqueue, &poll->work); + struct vhost_dev *dev = poll->dev; + struct vhost_work *work = &poll->work; + unsigned long flags; + + spin_lock_irqsave(&dev->work_lock, flags); + if (list_empty(&work->node)) { + list_add_tail(&work->node, &dev->work_list); + work->queue_seq++; + wake_up_process(dev->worker); + } + spin_unlock_irqrestore(&dev->work_lock, flags); } static void vhost_vq_reset(struct vhost_dev *dev, @@ -125,10 +159,51 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->log_ctx = NULL; } +static int vhost_worker(void *data) +{ + struct vhost_dev *dev = data; + struct vhost_work *work = NULL; + unsigned uninitialized_var(seq); + + for (;;) { + /* mb paired w/ kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + + spin_lock_irq(&dev->work_lock); + if (work) { + work->done_seq = seq; + if (work->flushing) + wake_up_all(&work->done); + } + + if (kthread_should_stop()) { + spin_unlock_irq(&dev->work_lock); + __set_current_state(TASK_RUNNING); + return 0; + } + if (!list_empty(&dev->work_list)) { + work = list_first_entry(&dev->work_list, + struct vhost_work, node); + list_del_init(&work->node); + seq = work->queue_seq; + } else + work = NULL; + spin_unlock_irq(&dev->work_lock); + + if (work) { + __set_current_state(TASK_RUNNING); + work->fn(work); + } else + schedule(); + + } +} + long vhost_dev_init(struct vhost_dev *dev, struct vhost_virtqueue *vqs, int nvqs) { int i; + dev->vqs = vqs; dev->nvqs = nvqs; mutex_init(&dev->mutex); @@ -136,6 +211,9 @@ long vhost_dev_init(struct vhost_dev *dev, dev->log_file = NULL; dev->memory = NULL; dev->mm = NULL; + spin_lock_init(&dev->work_lock); + INIT_LIST_HEAD(&dev->work_list); + dev->worker = NULL; for (i = 0; i < dev->nvqs; ++i) { dev->vqs[i].dev = dev; @@ -143,9 +221,9 @@ long vhost_dev_init(struct vhost_dev *dev, vhost_vq_reset(dev, dev->vqs + i); if (dev->vqs[i].handle_kick) vhost_poll_init(&dev->vqs[i].poll, - dev->vqs[i].handle_kick, - POLLIN); + dev->vqs[i].handle_kick, POLLIN, dev); } + return 0; } @@ -159,12 +237,31 @@ long vhost_dev_check_owner(struct vhost_dev *dev) /* Caller should have device mutex */ static long vhost_dev_set_owner(struct vhost_dev *dev) { + struct task_struct *worker; + int err; /* Is there an owner already? */ - if (dev->mm) - return -EBUSY; + if (dev->mm) { + err = -EBUSY; + goto err_mm; + } /* No owner, become one */ dev->mm = get_task_mm(current); + worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); + if (IS_ERR(worker)) { + err = PTR_ERR(worker); + goto err_worker; + } + + dev->worker = worker; + wake_up_process(worker); /* avoid contributing to loadavg */ + return 0; +err_worker: + if (dev->mm) + mmput(dev->mm); + dev->mm = NULL; +err_mm: + return err; } /* Caller should have device mutex */ @@ -217,6 +314,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev) if (dev->mm) mmput(dev->mm); dev->mm = NULL; + + WARN_ON(!list_empty(&dev->work_list)); + kthread_stop(dev->worker); } static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) @@ -1115,16 +1215,3 @@ void vhost_disable_notify(struct vhost_virtqueue *vq) vq_err(vq, "Failed to enable notification at %p: %d\n", &vq->used->flags, r); } - -int vhost_init(void) -{ - vhost_workqueue = create_singlethread_workqueue("vhost"); - if (!vhost_workqueue) - return -ENOMEM; - return 0; -} - -void vhost_cleanup(void) -{ - destroy_workqueue(vhost_workqueue); -} diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 11ee13dba0f7..3693327549b3 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -5,13 +5,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include struct vhost_device; @@ -20,19 +20,31 @@ enum { VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2, }; +struct vhost_work; +typedef void (*vhost_work_fn_t)(struct vhost_work *work); + +struct vhost_work { + struct list_head node; + vhost_work_fn_t fn; + wait_queue_head_t done; + int flushing; + unsigned queue_seq; + unsigned done_seq; +}; + /* Poll a file (eventfd or socket) */ /* Note: there's nothing vhost specific about this structure. */ struct vhost_poll { poll_table table; wait_queue_head_t *wqh; wait_queue_t wait; - /* struct which will handle all actual work. */ - struct work_struct work; + struct vhost_work work; unsigned long mask; + struct vhost_dev *dev; }; -void vhost_poll_init(struct vhost_poll *poll, work_func_t func, - unsigned long mask); +void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn, + unsigned long mask, struct vhost_dev *dev); void vhost_poll_start(struct vhost_poll *poll, struct file *file); void vhost_poll_stop(struct vhost_poll *poll); void vhost_poll_flush(struct vhost_poll *poll); @@ -63,7 +75,7 @@ struct vhost_virtqueue { struct vhost_poll poll; /* The routine to call when the Guest pings us, or timeout. */ - work_func_t handle_kick; + vhost_work_fn_t handle_kick; /* Last available index we saw. */ u16 last_avail_idx; @@ -86,11 +98,11 @@ struct vhost_virtqueue { struct iovec hdr[VHOST_NET_MAX_SG]; size_t hdr_size; /* We use a kind of RCU to access private pointer. - * All readers access it from workqueue, which makes it possible to - * flush the workqueue instead of synchronize_rcu. Therefore readers do + * All readers access it from worker, which makes it possible to + * flush the vhost_work instead of synchronize_rcu. Therefore readers do * not need to call rcu_read_lock/rcu_read_unlock: the beginning of - * work item execution acts instead of rcu_read_lock() and the end of - * work item execution acts instead of rcu_read_lock(). + * vhost_work execution acts instead of rcu_read_lock() and the end of + * vhost_work execution acts instead of rcu_read_lock(). * Writers use virtqueue mutex. */ void *private_data; /* Log write descriptors */ @@ -110,6 +122,9 @@ struct vhost_dev { int nvqs; struct file *log_file; struct eventfd_ctx *log_ctx; + spinlock_t work_lock; + struct list_head work_list; + struct task_struct *worker; }; long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); @@ -136,9 +151,6 @@ bool vhost_enable_notify(struct vhost_virtqueue *); int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, unsigned int log_num, u64 len); -int vhost_init(void); -void vhost_cleanup(void); - #define vq_err(vq, fmt, ...) do { \ pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ if ((vq)->error_ctx) \