- a patch that removes crush_workspace_mutex (myself).  CRUSH
   computations are no longer serialized and can run in parallel.
 
 - a couple new filesystem client metrics for "ceph fs top" command
   (Xiubo Li)
 
 - a fix for a very old messenger bug that affected the filesystem,
   marked for stable (myself)
 
 - assorted fixups and cleanups throughout the codebase from Jeff
   and others.
 -----BEGIN PGP SIGNATURE-----
 
 iQFHBAABCAAxFiEEydHwtzie9C7TfviiSn/eOAIR84sFAl+QN8cTHGlkcnlvbW92
 QGdtYWlsLmNvbQAKCRBKf944AhHzi4iwCACLwUvO0ONTzUUb2D8ftfyXjo/jIod5
 eO7RHfCHOP2A83GQpdYdktVDSVeOUIOiPhxAxo1dL4GieI2/saXrnoevam7ogZkA
 OmR4drdtRVUqF/aATrtjiDMg2ge0dnx5gfjMxSP/FiPPpXOdrtxng/7tv8yo+03q
 AlMqg/YcxO06t1M1qh9SEyfzjcHyPnJU2i0ienngxnGxQ7QiMOR6anF1LNhtN803
 4fTBX2tLqDNAa+x5yF1kKSn9OmFNhc0oUsqef+Ck0Vw1LC0/SuxzE2J904iehAwy
 /HzJCer0zcp+eZhn3HhoHmp0UZpOC1dzMxa3CHyRJFrxCSTEhY8iqPiJ
 =wGr7
 -----END PGP SIGNATURE-----

Merge tag 'ceph-for-5.10-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:

 - a patch that removes crush_workspace_mutex (myself). CRUSH
   computations are no longer serialized and can run in parallel.

 - a couple new filesystem client metrics for "ceph fs top" command
   (Xiubo Li)

 - a fix for a very old messenger bug that affected the filesystem,
   marked for stable (myself)

 - assorted fixups and cleanups throughout the codebase from Jeff and
   others.

* tag 'ceph-for-5.10-rc1' of git://github.com/ceph/ceph-client: (27 commits)
  libceph: clear con->out_msg on Policy::stateful_server faults
  libceph: format ceph_entity_addr nonces as unsigned
  libceph: fix ENTITY_NAME format suggestion
  libceph: move a dout in queue_con_delay()
  ceph: comment cleanups and clarifications
  ceph: break up send_cap_msg
  ceph: drop separate mdsc argument from __send_cap
  ceph: promote to unsigned long long before shifting
  ceph: don't SetPageError on readpage errors
  ceph: mark ceph_fmt_xattr() as printf-like for better type checking
  ceph: fold ceph_update_writeable_page into ceph_write_begin
  ceph: fold ceph_sync_writepages into writepage_nounlock
  ceph: fold ceph_sync_readpages into ceph_readpage
  ceph: don't call ceph_update_writeable_page from page_mkwrite
  ceph: break out writeback of incompatible snap context to separate function
  ceph: add a note explaining session reject error string
  libceph: switch to the new "osd blocklist add" command
  libceph, rbd, ceph: "blacklist" -> "blocklist"
  ceph: have ceph_writepages_start call pagevec_lookup_range_tag
  ceph: use kill_anon_super helper
  ...
This commit is contained in:
Linus Torvalds 2020-10-21 10:34:10 -07:00
commit ed7cfefe44
26 changed files with 689 additions and 443 deletions

View File

@ -163,14 +163,14 @@ Mount Options
to the default VFS implementation if this option is used.
recover_session=<no|clean>
Set auto reconnect mode in the case where the client is blacklisted. The
Set auto reconnect mode in the case where the client is blocklisted. The
available modes are "no" and "clean". The default is "no".
* no: never attempt to reconnect when client detects that it has been
blacklisted. Operations will generally fail after being blacklisted.
blocklisted. Operations will generally fail after being blocklisted.
* clean: client reconnects to the ceph cluster automatically when it
detects that it has been blacklisted. During reconnect, client drops
detects that it has been blocklisted. During reconnect, client drops
dirty data/metadata, invalidates page caches and writable file handles.
After reconnect, file locks become stale because the MDS loses track
of them. If an inode contains any stale file locks, read/write on the

View File

@ -4010,10 +4010,10 @@ static int rbd_try_lock(struct rbd_device *rbd_dev)
rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
ENTITY_NAME(lockers[0].id.name));
ret = ceph_monc_blacklist_add(&client->monc,
ret = ceph_monc_blocklist_add(&client->monc,
&lockers[0].info.addr);
if (ret) {
rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
ENTITY_NAME(lockers[0].id.name), ret);
goto out;
}
@ -4077,7 +4077,7 @@ static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
ret = rbd_try_lock(rbd_dev);
if (ret < 0) {
rbd_warn(rbd_dev, "failed to lock header: %d", ret);
if (ret == -EBLACKLISTED)
if (ret == -EBLOCKLISTED)
goto out;
ret = 1; /* request lock anyway */
@ -4613,7 +4613,7 @@ static void rbd_reregister_watch(struct work_struct *work)
ret = __rbd_register_watch(rbd_dev);
if (ret) {
rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
if (ret != -EBLACKLISTED && ret != -ENOENT) {
if (ret != -EBLOCKLISTED && ret != -ENOENT) {
queue_delayed_work(rbd_dev->task_wq,
&rbd_dev->watch_dwork,
RBD_RETRY_DELAY);

View File

@ -182,58 +182,15 @@ static int ceph_releasepage(struct page *page, gfp_t g)
return !PagePrivate(page);
}
/*
* Read some contiguous pages. If we cross a stripe boundary, shorten
* *plen. Return number of bytes read, or error.
*/
static int ceph_sync_readpages(struct ceph_fs_client *fsc,
struct ceph_vino vino,
struct ceph_file_layout *layout,
u64 off, u64 *plen,
u32 truncate_seq, u64 truncate_size,
struct page **pages, int num_pages,
int page_align)
{
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
int rc = 0;
dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
vino.snap, off, *plen);
req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
NULL, truncate_seq, truncate_size,
false);
if (IS_ERR(req))
return PTR_ERR(req);
/* it may be a short read due to an object boundary */
osd_req_op_extent_osd_data_pages(req, 0,
pages, *plen, page_align, false, false);
dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
off, *plen, *plen, page_align);
rc = ceph_osdc_start_request(osdc, req, false);
if (!rc)
rc = ceph_osdc_wait_request(osdc, req);
ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, rc);
ceph_osdc_put_request(req);
dout("readpages result %d\n", rc);
return rc;
}
/*
* read a single page, without unlocking it.
*/
/* read a single page, without unlocking it. */
static int ceph_do_readpage(struct file *filp, struct page *page)
{
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
struct ceph_vino vino = ceph_vino(inode);
int err = 0;
u64 off = page_offset(page);
u64 len = PAGE_SIZE;
@ -260,19 +217,33 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
if (err == 0)
return -EINPROGRESS;
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
err = ceph_sync_readpages(fsc, ceph_vino(inode),
&ci->i_layout, off, &len,
ci->i_truncate_seq, ci->i_truncate_size,
&page, 1, 0);
dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
vino.ino, vino.snap, filp, off, len, page, page->index);
req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1,
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL,
ci->i_truncate_seq, ci->i_truncate_size,
false);
if (IS_ERR(req))
return PTR_ERR(req);
osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
err = ceph_osdc_start_request(osdc, req, false);
if (!err)
err = ceph_osdc_wait_request(osdc, req);
ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, err);
ceph_osdc_put_request(req);
dout("readpage result %d\n", err);
if (err == -ENOENT)
err = 0;
if (err < 0) {
SetPageError(page);
ceph_fscache_readpage_cancel(inode, page);
if (err == -EBLACKLISTED)
fsc->blacklisted = true;
if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
goto out;
}
if (err < PAGE_SIZE)
@ -312,8 +283,8 @@ static void finish_read(struct ceph_osd_request *req)
int i;
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
if (rc == -EBLACKLISTED)
ceph_inode_to_client(inode)->blacklisted = true;
if (rc == -EBLOCKLISTED)
ceph_inode_to_client(inode)->blocklisted = true;
/* unlock all pages, zeroing any data we didn't read */
osd_data = osd_req_op_extent_osd_data(req, 0);
@ -619,50 +590,6 @@ static u64 get_writepages_data_length(struct inode *inode,
return end > start ? end - start : 0;
}
/*
* do a synchronous write on N pages
*/
static int ceph_sync_writepages(struct ceph_fs_client *fsc,
struct ceph_vino vino,
struct ceph_file_layout *layout,
struct ceph_snap_context *snapc,
u64 off, u64 len,
u32 truncate_seq, u64 truncate_size,
struct timespec64 *mtime,
struct page **pages, int num_pages)
{
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
int rc = 0;
int page_align = off & ~PAGE_MASK;
req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
snapc, truncate_seq, truncate_size,
true);
if (IS_ERR(req))
return PTR_ERR(req);
/* it may be a short write due to an object boundary */
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
false, false);
dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
req->r_mtime = *mtime;
rc = ceph_osdc_start_request(osdc, req, true);
if (!rc)
rc = ceph_osdc_wait_request(osdc, req);
ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, rc);
ceph_osdc_put_request(req);
if (rc == 0)
rc = len;
dout("writepages result %d\n", rc);
return rc;
}
/*
* Write a single page, but leave the page locked.
*
@ -671,20 +598,19 @@ static int ceph_sync_writepages(struct ceph_fs_client *fsc,
*/
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_fs_client *fsc;
struct inode *inode = page->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
int err, len = PAGE_SIZE;
int err;
loff_t len = PAGE_SIZE;
struct ceph_writeback_ctl ceph_wbc;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
dout("writepage %p idx %lu\n", page, page->index);
inode = page->mapping->host;
ci = ceph_inode(inode);
fsc = ceph_inode_to_client(inode);
/* verify this is a writeable snap context */
snapc = page_snap_context(page);
if (!snapc) {
@ -713,7 +639,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (ceph_wbc.i_size < page_off + len)
len = ceph_wbc.i_size - page_off;
dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n",
dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
inode, page, page->index, page_off, len, snapc, snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
@ -721,11 +647,33 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
err = ceph_sync_writepages(fsc, ceph_vino(inode),
&ci->i_layout, snapc, page_off, len,
ceph_wbc.truncate_seq,
ceph_wbc.truncate_size,
&inode->i_mtime, &page, 1);
req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
true);
if (IS_ERR(req)) {
redirty_page_for_writepage(wbc, page);
end_page_writeback(page);
return PTR_ERR(req);
}
/* it may be a short write due to an object boundary */
WARN_ON_ONCE(len > PAGE_SIZE);
osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
req->r_mtime = inode->i_mtime;
err = ceph_osdc_start_request(osdc, req, true);
if (!err)
err = ceph_osdc_wait_request(osdc, req);
ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, err);
ceph_osdc_put_request(req);
if (err == 0)
err = len;
if (err < 0) {
struct writeback_control tmp_wbc;
if (!wbc)
@ -737,8 +685,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
end_page_writeback(page);
return err;
}
if (err == -EBLACKLISTED)
fsc->blacklisted = true;
if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
dout("writepage setting page/mapping error %d %p\n",
err, page);
mapping_set_error(&inode->i_data, err);
@ -801,8 +749,8 @@ static void writepages_finish(struct ceph_osd_request *req)
if (rc < 0) {
mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
if (rc == -EBLACKLISTED)
fsc->blacklisted = true;
if (rc == -EBLOCKLISTED)
fsc->blocklisted = true;
} else {
ceph_clear_error_write(ci);
}
@ -962,9 +910,8 @@ static int ceph_writepages_start(struct address_space *mapping,
max_pages = wsize >> PAGE_SHIFT;
get_more_pages:
pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index,
end, PAGECACHE_TAG_DIRTY,
max_pages - locked_pages);
pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
end, PAGECACHE_TAG_DIRTY);
dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
if (!pvec_pages && !locked_pages)
break;
@ -1299,110 +1246,60 @@ static int context_is_writeable_or_written(struct inode *inode,
return ret;
}
/*
* We are only allowed to write into/dirty the page if the page is
* clean, or already dirty within the same snap context.
/**
* ceph_find_incompatible - find an incompatible context and return it
* @page: page being dirtied
*
* called with page locked.
* return success with page locked,
* or any failure (incl -EAGAIN) with page unlocked.
* We are only allowed to write into/dirty a page if the page is
* clean, or already dirty within the same snap context. Returns a
* conflicting context if there is one, NULL if there isn't, or a
* negative error code on other errors.
*
* Must be called with page lock held.
*/
static int ceph_update_writeable_page(struct file *file,
loff_t pos, unsigned len,
struct page *page)
static struct ceph_snap_context *
ceph_find_incompatible(struct page *page)
{
struct inode *inode = file_inode(file);
struct inode *inode = page->mapping->host;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
loff_t page_off = pos & PAGE_MASK;
int pos_in_page = pos & ~PAGE_MASK;
int end_in_page = pos_in_page + len;
loff_t i_size;
int r;
struct ceph_snap_context *snapc, *oldest;
if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
dout(" page %p forced umount\n", page);
unlock_page(page);
return -EIO;
return ERR_PTR(-EIO);
}
retry_locked:
/* writepages currently holds page lock, but if we change that later, */
wait_on_page_writeback(page);
for (;;) {
struct ceph_snap_context *snapc, *oldest;
wait_on_page_writeback(page);
snapc = page_snap_context(page);
if (!snapc || snapc == ci->i_head_snapc)
break;
snapc = page_snap_context(page);
if (snapc && snapc != ci->i_head_snapc) {
/*
* this page is already dirty in another (older) snap
* context! is it writeable now?
*/
oldest = get_oldest_context(inode, NULL, NULL);
if (snapc->seq > oldest->seq) {
/* not writeable -- return it for the caller to deal with */
ceph_put_snap_context(oldest);
dout(" page %p snapc %p not current or oldest\n",
page, snapc);
/*
* queue for writeback, and wait for snapc to
* be writeable or written
*/
snapc = ceph_get_snap_context(snapc);
unlock_page(page);
ceph_queue_writeback(inode);
r = wait_event_killable(ci->i_cap_wq,
context_is_writeable_or_written(inode, snapc));
ceph_put_snap_context(snapc);
if (r == -ERESTARTSYS)
return r;
return -EAGAIN;
dout(" page %p snapc %p not current or oldest\n", page, snapc);
return ceph_get_snap_context(snapc);
}
ceph_put_snap_context(oldest);
/* yay, writeable, do it now (without dropping page lock) */
dout(" page %p snapc %p not current, but oldest\n",
page, snapc);
if (!clear_page_dirty_for_io(page))
goto retry_locked;
r = writepage_nounlock(page, NULL);
if (r < 0)
goto fail_unlock;
goto retry_locked;
dout(" page %p snapc %p not current, but oldest\n", page, snapc);
if (clear_page_dirty_for_io(page)) {
int r = writepage_nounlock(page, NULL);
if (r < 0)
return ERR_PTR(r);
}
}
if (PageUptodate(page)) {
dout(" page %p already uptodate\n", page);
return 0;
}
/* full page? */
if (pos_in_page == 0 && len == PAGE_SIZE)
return 0;
/* past end of file? */
i_size = i_size_read(inode);
if (page_off >= i_size ||
(pos_in_page == 0 && (pos+len) >= i_size &&
end_in_page - pos_in_page != PAGE_SIZE)) {
dout(" zeroing %p 0 - %d and %d - %d\n",
page, pos_in_page, end_in_page, (int)PAGE_SIZE);
zero_user_segments(page,
0, pos_in_page,
end_in_page, PAGE_SIZE);
return 0;
}
/* we need to read it. */
r = ceph_do_readpage(file, page);
if (r < 0) {
if (r == -EINPROGRESS)
return -EAGAIN;
goto fail_unlock;
}
goto retry_locked;
fail_unlock:
unlock_page(page);
return r;
return NULL;
}
/*
@ -1414,26 +1311,78 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
struct page *page;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
struct page *page = NULL;
pgoff_t index = pos >> PAGE_SHIFT;
int r;
int pos_in_page = pos & ~PAGE_MASK;
int r = 0;
do {
/* get a page */
dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len);
for (;;) {
page = grab_cache_page_write_begin(mapping, index, 0);
if (!page)
return -ENOMEM;
if (!page) {
r = -ENOMEM;
break;
}
dout("write_begin file %p inode %p page %p %d~%d\n", file,
inode, page, (int)pos, (int)len);
r = ceph_update_writeable_page(file, pos, len, page);
if (r < 0)
snapc = ceph_find_incompatible(page);
if (snapc) {
if (IS_ERR(snapc)) {
r = PTR_ERR(snapc);
break;
}
unlock_page(page);
put_page(page);
else
*pagep = page;
} while (r == -EAGAIN);
page = NULL;
ceph_queue_writeback(inode);
r = wait_event_killable(ci->i_cap_wq,
context_is_writeable_or_written(inode, snapc));
ceph_put_snap_context(snapc);
if (r != 0)
break;
continue;
}
if (PageUptodate(page)) {
dout(" page %p already uptodate\n", page);
break;
}
/*
* In some cases we don't need to read at all:
* - full page write
* - write that lies completely beyond EOF
* - write that covers the the page from start to EOF or beyond it
*/
if ((pos_in_page == 0 && len == PAGE_SIZE) ||
(pos >= i_size_read(inode)) ||
(pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
zero_user_segments(page, 0, pos_in_page,
pos_in_page + len, PAGE_SIZE);
break;
}
/*
* We need to read it. If we get back -EINPROGRESS, then the page was
* handed off to fscache and it will be unlocked when the read completes.
* Refind the page in that case so we can reacquire the page lock. Otherwise
* we got a hard error or the read was completed synchronously.
*/
r = ceph_do_readpage(file, page);
if (r != -EINPROGRESS)
break;
}
if (r < 0) {
if (page) {
unlock_page(page);
put_page(page);
}
} else {
*pagep = page;
}
return r;
}
@ -1522,7 +1471,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
struct page *pinned_page = NULL;
loff_t off = vmf->pgoff << PAGE_SHIFT;
loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
int want, got, err;
sigset_t oldset;
vm_fault_t ret = VM_FAULT_SIGBUS;
@ -1668,6 +1617,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
inode_inc_iversion_raw(inode);
do {
struct ceph_snap_context *snapc;
lock_page(page);
if (page_mkwrite_check_truncate(page, inode) < 0) {
@ -1676,13 +1627,26 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
break;
}
err = ceph_update_writeable_page(vma->vm_file, off, len, page);
if (err >= 0) {
snapc = ceph_find_incompatible(page);
if (!snapc) {
/* success. we'll keep the page locked. */
set_page_dirty(page);
ret = VM_FAULT_LOCKED;
break;
}
} while (err == -EAGAIN);
unlock_page(page);
if (IS_ERR(snapc)) {
ret = VM_FAULT_SIGBUS;
break;
}
ceph_queue_writeback(inode);
err = wait_event_killable(ci->i_cap_wq,
context_is_writeable_or_written(inode, snapc));
ceph_put_snap_context(snapc);
} while (err == 0);
if (ret == VM_FAULT_LOCKED ||
ci->i_inline_version != CEPH_INLINE_NONE) {
@ -2039,16 +2003,16 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
if (err >= 0 || err == -ENOENT)
have |= POOL_READ;
else if (err != -EPERM) {
if (err == -EBLACKLISTED)
fsc->blacklisted = true;
if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
goto out_unlock;
}
if (err2 == 0 || err2 == -EEXIST)
have |= POOL_WRITE;
else if (err2 != -EPERM) {
if (err2 == -EBLACKLISTED)
fsc->blacklisted = true;
if (err2 == -EBLOCKLISTED)
fsc->blocklisted = true;
err = err2;
goto out_unlock;
}

View File

@ -1222,36 +1222,27 @@ struct cap_msg_args {
};
/*
* Build and send a cap message to the given MDS.
*
* Caller should be holding s_mutex.
* cap struct size + flock buffer size + inline version + inline data size +
* osd_epoch_barrier + oldest_flush_tid
*/
static int send_cap_msg(struct cap_msg_args *arg)
#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
/* Marshal up the cap msg to the MDS */
static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
{
struct ceph_mds_caps *fc;
struct ceph_msg *msg;
void *p;
size_t extra_len;
struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
" seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
" xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
arg->cid, arg->ino, ceph_cap_string(arg->caps),
ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
arg->mseq, arg->follows, arg->size, arg->max_size,
arg->xattr_version,
dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
__func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
arg->size, arg->max_size, arg->xattr_version,
arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
/* flock buffer size + inline version + inline data size +
* osd_epoch_barrier + oldest_flush_tid */
extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
GFP_NOFS, false);
if (!msg)
return -ENOMEM;
msg->hdr.version = cpu_to_le16(10);
msg->hdr.tid = cpu_to_le64(arg->flush_tid);
@ -1323,9 +1314,6 @@ static int send_cap_msg(struct cap_msg_args *arg)
/* Advisory flags (version 10) */
ceph_encode_32(&p, arg->flags);
ceph_con_send(&arg->session->s_con, msg);
return 0;
}
/*
@ -1454,25 +1442,25 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
*
* Caller should hold snap_rwsem (read), s_mutex.
*/
static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg,
struct ceph_inode_info *ci)
static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
{
struct ceph_msg *msg;
struct inode *inode = &ci->vfs_inode;
int ret;
ret = send_cap_msg(arg);
if (ret < 0) {
pr_err("error sending cap msg, ino (%llx.%llx) "
"flushing %s tid %llu, requeue\n",
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
if (!msg) {
pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
ceph_vinop(inode), ceph_cap_string(arg->dirty),
arg->flush_tid);
spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci);
__cap_delay_requeue(arg->session->s_mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
return;
}
encode_cap_msg(msg, arg);
ceph_con_send(&arg->session->s_con, msg);
ceph_buffer_put(arg->old_xattr_buf);
if (arg->wake)
wake_up_all(&ci->i_cap_wq);
}
@ -1483,6 +1471,11 @@ static inline int __send_flush_snap(struct inode *inode,
u32 mseq, u64 oldest_flush_tid)
{
struct cap_msg_args arg;
struct ceph_msg *msg;
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
if (!msg)
return -ENOMEM;
arg.session = session;
arg.ino = ceph_vino(inode).ino;
@ -1521,7 +1514,9 @@ static inline int __send_flush_snap(struct inode *inode,
arg.flags = 0;
arg.wake = false;
return send_cap_msg(&arg);
encode_cap_msg(msg, &arg);
ceph_con_send(&arg.session->s_con, msg);
return 0;
}
/*
@ -1906,9 +1901,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
int file_wanted, used, cap_used;
@ -1928,12 +1922,24 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
retry:
spin_lock(&ci->i_ceph_lock);
retry_locked:
/* Caps wanted by virtue of active open files. */
file_wanted = __ceph_caps_file_wanted(ci);
/* Caps which have active references against them */
used = __ceph_caps_used(ci);
/*
* "issued" represents the current caps that the MDS wants us to have.
* "implemented" is the set that we have been granted, and includes the
* ones that have not yet been returned to the MDS (the "revoking" set,
* usually because they have outstanding references).
*/
issued = __ceph_caps_issued(ci, &implemented);
revoking = implemented & ~issued;
want = file_wanted;
/* The ones we currently want to retain (may be adjusted below) */
retain = file_wanted | used | CEPH_CAP_PIN;
if (!mdsc->stopping && inode->i_nlink > 0) {
if (file_wanted) {
@ -2011,6 +2017,10 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
/* NOTE: no side-effects allowed, until we take s_mutex */
/*
* If we have an auth cap, we don't need to consider any
* overlapping caps as used.
*/
cap_used = used;
if (ci->i_auth_cap && cap != ci->i_auth_cap)
cap_used &= ~ci->i_auth_cap->issued;
@ -2148,7 +2158,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
want, retain, flushing, flush_tid, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
__send_cap(mdsc, &arg, ci);
__send_cap(&arg, ci);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
@ -2222,7 +2232,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
flushing, flush_tid, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
__send_cap(mdsc, &arg, ci);
__send_cap(&arg, ci);
} else {
if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
@ -2436,7 +2446,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
(cap->issued | cap->implemented),
cf->caps, cf->tid, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
__send_cap(mdsc, &arg, ci);
__send_cap(&arg, ci);
} else {
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
@ -4284,13 +4294,30 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
int i;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
int bits = (fmode << 1) | 1;
bool is_opened = false;
int i;
if (count == 1)
atomic64_inc(&mdsc->metric.opened_files);
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i))
ci->i_nr_by_mode[i] += count;
/*
* If any of the mode ref is larger than 1,
* that means it has been already opened by
* others. Just skip checking the PIN ref.
*/
if (i && ci->i_nr_by_mode[i] > 1)
is_opened = true;
}
if (!is_opened)
percpu_counter_inc(&mdsc->metric.opened_inodes);
spin_unlock(&ci->i_ceph_lock);
}
@ -4301,15 +4328,32 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
int i;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
int bits = (fmode << 1) | 1;
bool is_closed = true;
int i;
if (count == 1)
atomic64_dec(&mdsc->metric.opened_files);
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i)) {
BUG_ON(ci->i_nr_by_mode[i] < count);
ci->i_nr_by_mode[i] -= count;
}
/*
* If any of the mode ref is not 0 after
* decreased, that means it is still opened
* by others. Just skip checking the PIN ref.
*/
if (i && ci->i_nr_by_mode[i])
is_closed = false;
}
if (is_closed)
percpu_counter_dec(&mdsc->metric.opened_inodes);
spin_unlock(&ci->i_ceph_lock);
}

View File

@ -148,6 +148,17 @@ static int metric_show(struct seq_file *s, void *p)
int nr_caps = 0;
s64 total, sum, avg, min, max, sq;
sum = percpu_counter_sum(&m->total_inodes);
seq_printf(s, "item total\n");
seq_printf(s, "------------------------------------------\n");
seq_printf(s, "%-35s%lld / %lld\n", "opened files / total inodes",
atomic64_read(&m->opened_files), sum);
seq_printf(s, "%-35s%lld / %lld\n", "pinned i_caps / total inodes",
atomic64_read(&m->total_caps), sum);
seq_printf(s, "%-35s%lld / %lld\n", "opened inodes / total inodes",
percpu_counter_sum(&m->opened_inodes), sum);
seq_printf(s, "\n");
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
seq_printf(s, "-----------------------------------------------------------------------------------\n");
@ -202,7 +213,8 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
{
struct seq_file *s = p;
seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
cap->session->s_mds,
ceph_cap_string(cap->issued),
ceph_cap_string(cap->implemented));
return 0;
@ -222,8 +234,8 @@ static int caps_show(struct seq_file *s, void *p)
"reserved\t%d\n"
"min\t\t%d\n\n",
total, avail, used, reserved, min);
seq_printf(s, "ino issued implemented\n");
seq_printf(s, "-----------------------------------------------\n");
seq_printf(s, "ino mds issued implemented\n");
seq_printf(s, "--------------------------------------------------\n");
mutex_lock(&mdsc->mutex);
for (i = 0; i < mdsc->max_sessions; i++) {

View File

@ -38,8 +38,7 @@ static int __dir_lease_try_check(const struct dentry *dentry);
static int ceph_d_init(struct dentry *dentry)
{
struct ceph_dentry_info *di;
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
if (!di)
@ -738,7 +737,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
int op;
int mask;
@ -827,8 +826,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@ -889,8 +887,7 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
static int ceph_symlink(struct inode *dir, struct dentry *dentry,
const char *dest)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err;
@ -942,8 +939,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
int err = -EROFS;
@ -1010,8 +1006,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
int err;
@ -1192,8 +1187,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
struct ceph_mds_request *req;
int op = CEPH_MDS_OP_RENAME;
int err;

View File

@ -182,8 +182,7 @@ static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
static struct ceph_mds_request *
prepare_open_request(struct super_block *sb, int flags, int create_mode)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
struct ceph_mds_request *req;
int want_auth = USE_ANY_MDS;
int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@ -256,8 +255,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFDIR:
ret = ceph_init_file_info(inode, file, fmode,
S_ISDIR(inode->i_mode));
if (ret)
return ret;
break;
case S_IFLNK:
@ -285,7 +282,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
*/
int ceph_renew_caps(struct inode *inode, int fmode)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
int err, flags, wanted;
@ -865,6 +862,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
size_t page_off;
u64 i_size;
bool more;
int idx;
size_t left;
req = ceph_osdc_new_request(osdc, &ci->i_layout,
ci->i_vino, off, &len, 0, 1,
@ -878,29 +877,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
more = len < iov_iter_count(to);
if (unlikely(iov_iter_is_pipe(to))) {
ret = iov_iter_get_pages_alloc(to, &pages, len,
&page_off);
if (ret <= 0) {
ceph_osdc_put_request(req);
ret = -ENOMEM;
break;
}
num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
if (ret < len) {
len = ret;
osd_req_op_extent_update(req, 0, len);
more = false;
}
} else {
num_pages = calc_pages_for(off, len);
page_off = off & ~PAGE_MASK;
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
ceph_osdc_put_request(req);
ret = PTR_ERR(pages);
break;
}
num_pages = calc_pages_for(off, len);
page_off = off & ~PAGE_MASK;
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
ceph_osdc_put_request(req);
ret = PTR_ERR(pages);
break;
}
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
@ -931,36 +914,27 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
ret += zlen;
}
if (unlikely(iov_iter_is_pipe(to))) {
if (ret > 0) {
iov_iter_advance(to, ret);
off += ret;
} else {
iov_iter_advance(to, 0);
idx = 0;
left = ret > 0 ? ret : 0;
while (left > 0) {
size_t len, copied;
page_off = off & ~PAGE_MASK;
len = min_t(size_t, left, PAGE_SIZE - page_off);
SetPageUptodate(pages[idx]);
copied = copy_page_to_iter(pages[idx++],
page_off, len, to);
off += copied;
left -= copied;
if (copied < len) {
ret = -EFAULT;
break;
}
ceph_put_page_vector(pages, num_pages, false);
} else {
int idx = 0;
size_t left = ret > 0 ? ret : 0;
while (left > 0) {
size_t len, copied;
page_off = off & ~PAGE_MASK;
len = min_t(size_t, left, PAGE_SIZE - page_off);
copied = copy_page_to_iter(pages[idx++],
page_off, len, to);
off += copied;
left -= copied;
if (copied < len) {
ret = -EFAULT;
break;
}
}
ceph_release_page_vector(pages, num_pages);
}
ceph_release_page_vector(pages, num_pages);
if (ret < 0) {
if (ret == -EBLACKLISTED)
fsc->blacklisted = true;
if (ret == -EBLOCKLISTED)
fsc->blocklisted = true;
break;
}
@ -1052,8 +1026,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct inode *inode = req->r_inode;
struct ceph_aio_request *aio_req = req->r_priv;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_client_metric *metric = &fsc->mdsc->metric;
struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
BUG_ON(!osd_data->num_bvecs);

View File

@ -42,10 +42,13 @@ static void ceph_inode_work(struct work_struct *work);
static int ceph_set_ino_cb(struct inode *inode, void *data)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
ci->i_vino = *(struct ceph_vino *)data;
inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
inode_set_iversion_raw(inode, 0);
percpu_counter_inc(&mdsc->metric.total_inodes);
return 0;
}
@ -538,11 +541,14 @@ void ceph_free_inode(struct inode *inode)
void ceph_evict_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_frag *frag;
struct rb_node *n;
dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
percpu_counter_dec(&mdsc->metric.total_inodes);
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
@ -558,8 +564,6 @@ void ceph_evict_inode(struct inode *inode)
* caps in i_snap_caps.
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
ceph_inode_to_client(inode)->mdsc;
if (ceph_snap(inode) == CEPH_NOSNAP) {
struct ceph_snap_realm *realm = ci->i_snap_realm;
dout(" dropping residual ref to snap realm %p\n",
@ -739,7 +743,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
struct ceph_mds_session *session, int cap_fmode,
struct ceph_cap_reservation *caps_reservation)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
int issued, new_issued, info_caps;

View File

@ -63,7 +63,7 @@ static const struct file_lock_operations ceph_fl_lock_ops = {
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
int cmd, u8 wait, struct file_lock *fl)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_mds_request *req;
int err;
u64 length = 0;

View File

@ -3303,7 +3303,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
}
static int __decode_session_metadata(void **p, void *end,
bool *blacklisted)
bool *blocklisted)
{
/* map<string,string> */
u32 n;
@ -3317,8 +3317,12 @@ static int __decode_session_metadata(void **p, void *end,
*p += len;
ceph_decode_32_safe(p, end, len, bad);
ceph_decode_need(p, end, len, bad);
/*
* Match "blocklisted (blacklisted)" from newer MDSes,
* or "blacklisted" from older MDSes.
*/
if (err_str && strnstr(*p, "blacklisted", len))
*blacklisted = true;
*blocklisted = true;
*p += len;
}
return 0;
@ -3341,7 +3345,7 @@ static void handle_session(struct ceph_mds_session *session,
u32 op;
u64 seq, features = 0;
int wake = 0;
bool blacklisted = false;
bool blocklisted = false;
/* decode */
ceph_decode_need(&p, end, sizeof(*h), bad);
@ -3354,7 +3358,7 @@ static void handle_session(struct ceph_mds_session *session,
if (msg_version >= 3) {
u32 len;
/* version >= 2, metadata */
if (__decode_session_metadata(&p, end, &blacklisted) < 0)
if (__decode_session_metadata(&p, end, &blocklisted) < 0)
goto bad;
/* version >= 3, feature bits */
ceph_decode_32_safe(&p, end, len, bad);
@ -3445,8 +3449,8 @@ static void handle_session(struct ceph_mds_session *session,
session->s_state = CEPH_MDS_SESSION_REJECTED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
if (blacklisted)
mdsc->fsc->blacklisted = true;
if (blocklisted)
mdsc->fsc->blocklisted = true;
wake = 2; /* for good measure */
break;
@ -3612,6 +3616,39 @@ static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
return err;
}
static struct dentry* d_find_primary(struct inode *inode)
{
struct dentry *alias, *dn = NULL;
if (hlist_empty(&inode->i_dentry))
return NULL;
spin_lock(&inode->i_lock);
if (hlist_empty(&inode->i_dentry))
goto out_unlock;
if (S_ISDIR(inode->i_mode)) {
alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
if (!IS_ROOT(alias))
dn = dget(alias);
goto out_unlock;
}
hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
spin_lock(&alias->d_lock);
if (!d_unhashed(alias) &&
(ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
dn = dget_dlock(alias);
}
spin_unlock(&alias->d_lock);
if (dn)
break;
}
out_unlock:
spin_unlock(&inode->i_lock);
return dn;
}
/*
* Encode information about a cap for a reconnect with the MDS.
*/
@ -3625,13 +3662,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_inode_info *ci = cap->ci;
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
int err;
struct dentry *dentry;
char *path;
int pathlen, err;
u64 pathbase;
u64 snap_follows;
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
inode, ceph_vinop(inode), cap, cap->cap_id,
ceph_cap_string(cap->issued));
dentry = d_find_primary(inode);
if (dentry) {
/* set pathbase to parent dir when msg_version >= 2 */
path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
recon_state->msg_version >= 2);
dput(dentry);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out_err;
}
} else {
path = NULL;
pathlen = 0;
pathbase = 0;
}
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
@ -3652,7 +3708,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v2.pathbase = 0;
rec.v2.pathbase = cpu_to_le64(pathbase);
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
@ -3663,7 +3719,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v1.pathbase = 0;
rec.v1.pathbase = cpu_to_le64(pathbase);
}
if (list_empty(&ci->i_cap_snaps)) {
@ -3725,7 +3781,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
struct_len += sizeof(u32) + sizeof(rec.v2);
struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
@ -3749,7 +3805,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
ceph_pagelist_encode_8(pagelist, 1);
ceph_pagelist_encode_32(pagelist, struct_len);
}
ceph_pagelist_encode_string(pagelist, NULL, 0);
ceph_pagelist_encode_string(pagelist, path, pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
ceph_locks_to_pagelist(flocks, pagelist,
num_fcntl_locks, num_flock_locks);
@ -3758,39 +3814,20 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
out_freeflocks:
kfree(flocks);
} else {
u64 pathbase = 0;
int pathlen = 0;
char *path = NULL;
struct dentry *dentry;
dentry = d_find_alias(inode);
if (dentry) {
path = ceph_mdsc_build_path(dentry,
&pathlen, &pathbase, 0);
dput(dentry);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out_err;
}
rec.v1.pathbase = cpu_to_le64(pathbase);
}
err = ceph_pagelist_reserve(pagelist,
sizeof(u64) + sizeof(u32) +
pathlen + sizeof(rec.v1));
if (err) {
goto out_freepath;
}
if (err)
goto out_err;
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
ceph_pagelist_encode_string(pagelist, path, pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
out_freepath:
ceph_mdsc_free_path(path, pathlen);
}
out_err:
if (err >= 0)
ceph_mdsc_free_path(path, pathlen);
if (!err)
recon_state->nr_caps++;
return err;
}
@ -4334,14 +4371,14 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
return;
if (!READ_ONCE(fsc->blacklisted))
if (!READ_ONCE(fsc->blocklisted))
return;
if (fsc->last_auto_reconnect &&
time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
return;
pr_info("auto reconnect after blacklisted\n");
pr_info("auto reconnect after blocklisted\n");
fsc->last_auto_reconnect = jiffies;
ceph_force_reconnect(fsc->sb);
}

View File

@ -393,7 +393,7 @@ struct ceph_mds_client {
struct ceph_mds_session **sessions; /* NULL for mds if no session */
atomic_t num_sessions;
int max_sessions; /* len of s_mds_sessions */
int max_sessions; /* len of sessions array */
int stopping; /* true if shutting down */
atomic64_t quotarealms_count; /* # realms with quota */

View File

@ -192,11 +192,23 @@ int ceph_metric_init(struct ceph_client_metric *m)
m->total_metadatas = 0;
m->metadata_latency_sum = 0;
atomic64_set(&m->opened_files, 0);
ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL);
if (ret)
goto err_opened_inodes;
ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL);
if (ret)
goto err_total_inodes;
m->session = NULL;
INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
return 0;
err_total_inodes:
percpu_counter_destroy(&m->opened_inodes);
err_opened_inodes:
percpu_counter_destroy(&m->i_caps_mis);
err_i_caps_mis:
percpu_counter_destroy(&m->i_caps_hit);
err_i_caps_hit:
@ -212,6 +224,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
if (!m)
return;
percpu_counter_destroy(&m->total_inodes);
percpu_counter_destroy(&m->opened_inodes);
percpu_counter_destroy(&m->i_caps_mis);
percpu_counter_destroy(&m->i_caps_hit);
percpu_counter_destroy(&m->d_lease_mis);

View File

@ -115,6 +115,13 @@ struct ceph_client_metric {
ktime_t metadata_latency_min;
ktime_t metadata_latency_max;
/* The total number of directories and files that are opened */
atomic64_t opened_files;
/* The total number of inodes that have opened files or directories */
struct percpu_counter opened_inodes;
struct percpu_counter total_inodes;
struct ceph_mds_session *session;
struct delayed_work delayed_work; /* delayed work */
};

View File

@ -12,7 +12,7 @@
void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
if (inc)
atomic64_inc(&mdsc->quotarealms_count);
else
@ -21,8 +21,8 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
static inline bool ceph_has_realms_with_quotas(struct inode *inode)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct super_block *sb = mdsc->fsc->sb;
struct super_block *sb = inode->i_sb;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
struct inode *root = d_inode(sb->s_root);
if (atomic64_read(&mdsc->quotarealms_count) > 0)
@ -266,7 +266,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
struct ceph_snap_realm *old_realm, *new_realm;
bool is_same;
@ -313,7 +313,7 @@ enum quota_check_op {
static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
loff_t delta)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci;
struct ceph_snap_realm *realm, *next;
struct inode *in;

View File

@ -602,7 +602,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
struct inode *inode = &ci->vfs_inode;
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
BUG_ON(capsnap->writing);
capsnap->size = inode->i_size;

View File

@ -1205,14 +1205,13 @@ static int ceph_init_fs_context(struct fs_context *fc)
static void ceph_kill_sb(struct super_block *s)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
dev_t dev = s->s_dev;
dout("kill_sb %p\n", s);
ceph_mdsc_pre_umount(fsc->mdsc);
flush_fs_workqueues(fsc);
generic_shutdown_super(s);
kill_anon_super(s);
fsc->client->extra_mon_dispatch = NULL;
ceph_fs_debugfs_cleanup(fsc);
@ -1220,7 +1219,6 @@ static void ceph_kill_sb(struct super_block *s)
ceph_fscache_unregister_fs(fsc);
destroy_fs_client(fsc);
free_anon_bdev(dev);
}
static struct file_system_type ceph_fs_type = {
@ -1243,13 +1241,13 @@ int ceph_force_reconnect(struct super_block *sb)
* see remove_session_caps_cb() */
flush_workqueue(fsc->inode_wq);
/* In case that we were blacklisted. This also reset
/* In case that we were blocklisted. This also reset
* all mon/osd connections */
ceph_reset_client_addr(fsc->client);
ceph_osdc_clear_abort_err(&fsc->client->osdc);
fsc->blacklisted = false;
fsc->blocklisted = false;
fsc->mount_state = CEPH_MOUNT_MOUNTED;
if (sb->s_root) {

View File

@ -32,7 +32,7 @@
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */
#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
@ -109,7 +109,7 @@ struct ceph_fs_client {
unsigned long mount_state;
unsigned long last_auto_reconnect;
bool blacklisted;
bool blocklisted;
bool have_copy_from2;
@ -160,7 +160,8 @@ struct ceph_cap {
int issued; /* latest, from the mds */
int implemented; /* implemented superset of
issued (for revocation) */
int mds, mds_wanted;
int mds; /* mds index for this cap */
int mds_wanted; /* caps wanted from this mds */
};
/* caps to release */
struct {
@ -451,6 +452,12 @@ ceph_sb_to_client(const struct super_block *sb)
return (struct ceph_fs_client *)sb->s_fs_info;
}
static inline struct ceph_mds_client *
ceph_sb_to_mdsc(const struct super_block *sb)
{
return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc;
}
static inline struct ceph_vino
ceph_vino(const struct inode *inode)
{

View File

@ -116,7 +116,8 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
* NULL terminates however, so call it on a temporary buffer and then memcpy
* the result into place.
*/
static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
static __printf(3, 4)
int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
{
int ret;
va_list args;

View File

@ -54,7 +54,7 @@ struct ceph_connection_operations {
int (*check_message_signature) (struct ceph_msg *msg);
};
/* use format string %s%d */
/* use format string %s%lld */
#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
struct ceph_messenger {

View File

@ -142,7 +142,7 @@ int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
ceph_monc_callback_t cb, u64 private_data);
int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
struct ceph_entity_addr *client_addr);
extern int ceph_monc_open_session(struct ceph_mon_client *monc);

View File

@ -137,6 +137,17 @@ int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
const char *fmt, ...);
void ceph_oid_destroy(struct ceph_object_id *oid);
struct workspace_manager {
struct list_head idle_ws;
spinlock_t ws_lock;
/* Number of free workspaces */
int free_ws;
/* Total number of allocated workspaces */
atomic_t total_ws;
/* Waiters for a free workspace */
wait_queue_head_t ws_wait;
};
struct ceph_pg_mapping {
struct rb_node node;
struct ceph_pg pgid;
@ -184,8 +195,7 @@ struct ceph_osdmap {
* the list of osds that store+replicate them. */
struct crush_map *crush;
struct mutex crush_workspace_mutex;
void *crush_workspace;
struct workspace_manager crush_wsm;
};
static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)

View File

@ -424,7 +424,7 @@ enum {
};
#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
#define EBLACKLISTED ESHUTDOWN /* blacklisted */
#define EBLOCKLISTED ESHUTDOWN /* blocklisted */
/* xattr comparison */
enum {

View File

@ -346,6 +346,9 @@ struct crush_work_bucket {
struct crush_work {
struct crush_work_bucket **work; /* Per-bucket working store */
#ifdef __KERNEL__
struct list_head item;
#endif
};
#ifdef __KERNEL__

View File

@ -2016,11 +2016,11 @@ static int process_banner(struct ceph_connection *con)
sizeof(con->peer_addr)) != 0 &&
!(addr_is_blank(&con->actual_peer_addr) &&
con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
pr_warn("wrong peer, want %s/%d, got %s/%d\n",
pr_warn("wrong peer, want %s/%u, got %s/%u\n",
ceph_pr_addr(&con->peer_addr),
(int)le32_to_cpu(con->peer_addr.nonce),
le32_to_cpu(con->peer_addr.nonce),
ceph_pr_addr(&con->actual_peer_addr),
(int)le32_to_cpu(con->actual_peer_addr.nonce));
le32_to_cpu(con->actual_peer_addr.nonce));
con->error_msg = "wrong peer at address";
return -1;
}
@ -2811,13 +2811,13 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
return -ENOENT;
}
dout("%s %p %lu\n", __func__, con, delay);
if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
dout("%s %p - already queued\n", __func__, con);
con->ops->put(con);
return -EBUSY;
}
dout("%s %p %lu\n", __func__, con, delay);
return 0;
}
@ -2998,6 +2998,11 @@ static void con_fault(struct ceph_connection *con)
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
}
if (con->out_msg) {
BUG_ON(con->out_msg->con != con);
ceph_msg_put(con->out_msg);
con->out_msg = NULL;
}
/* Requeue anything that hasn't been acked */
list_splice_init(&con->out_sent, &con->out_queue);

View File

@ -896,8 +896,9 @@ static void handle_command_ack(struct ceph_mon_client *monc,
ceph_msg_dump(msg);
}
int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
struct ceph_entity_addr *client_addr)
static __printf(2, 0)
int do_mon_command_vargs(struct ceph_mon_client *monc, const char *fmt,
va_list ap)
{
struct ceph_mon_generic_request *req;
struct ceph_mon_command *h;
@ -925,29 +926,65 @@ int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
h->monhdr.session_mon_tid = 0;
h->fsid = monc->monmap->fsid;
h->num_strs = cpu_to_le32(1);
len = sprintf(h->str, "{ \"prefix\": \"osd blacklist\", \
\"blacklistop\": \"add\", \
\"addr\": \"%pISpc/%u\" }",
&client_addr->in_addr, le32_to_cpu(client_addr->nonce));
len = vsprintf(h->str, fmt, ap);
h->str_len = cpu_to_le32(len);
send_generic_request(monc, req);
mutex_unlock(&monc->mutex);
ret = wait_generic_request(req);
if (!ret)
/*
* Make sure we have the osdmap that includes the blacklist
* entry. This is needed to ensure that the OSDs pick up the
* new blacklist before processing any future requests from
* this client.
*/
ret = ceph_wait_for_latest_osdmap(monc->client, 0);
out:
put_generic_request(req);
return ret;
}
EXPORT_SYMBOL(ceph_monc_blacklist_add);
static __printf(2, 3)
int do_mon_command(struct ceph_mon_client *monc, const char *fmt, ...)
{
va_list ap;
int ret;
va_start(ap, fmt);
ret = do_mon_command_vargs(monc, fmt, ap);
va_end(ap);
return ret;
}
int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
struct ceph_entity_addr *client_addr)
{
int ret;
ret = do_mon_command(monc,
"{ \"prefix\": \"osd blocklist\", \
\"blocklistop\": \"add\", \
\"addr\": \"%pISpc/%u\" }",
&client_addr->in_addr,
le32_to_cpu(client_addr->nonce));
if (ret == -EINVAL) {
/*
* The monitor returns EINVAL on an unrecognized command.
* Try the legacy command -- it is exactly the same except
* for the name.
*/
ret = do_mon_command(monc,
"{ \"prefix\": \"osd blacklist\", \
\"blacklistop\": \"add\", \
\"addr\": \"%pISpc/%u\" }",
&client_addr->in_addr,
le32_to_cpu(client_addr->nonce));
}
if (ret)
return ret;
/*
* Make sure we have the osdmap that includes the blocklist
* entry. This is needed to ensure that the OSDs pick up the
* new blocklist before processing any future requests from
* this client.
*/
return ceph_wait_for_latest_osdmap(monc->client, 0);
}
EXPORT_SYMBOL(ceph_monc_blocklist_add);
/*
* Resend pending generic requests.

View File

@ -964,6 +964,143 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
return -EINVAL;
}
/*
* CRUSH workspaces
*
* workspace_manager framework borrowed from fs/btrfs/compression.c.
* Two simplifications: there is only one type of workspace and there
* is always at least one workspace.
*/
static struct crush_work *alloc_workspace(const struct crush_map *c)
{
struct crush_work *work;
size_t work_size;
WARN_ON(!c->working_size);
work_size = crush_work_size(c, CEPH_PG_MAX_SIZE);
dout("%s work_size %zu bytes\n", __func__, work_size);
work = ceph_kvmalloc(work_size, GFP_NOIO);
if (!work)
return NULL;
INIT_LIST_HEAD(&work->item);
crush_init_workspace(c, work);
return work;
}
static void free_workspace(struct crush_work *work)
{
WARN_ON(!list_empty(&work->item));
kvfree(work);
}
static void init_workspace_manager(struct workspace_manager *wsm)
{
INIT_LIST_HEAD(&wsm->idle_ws);
spin_lock_init(&wsm->ws_lock);
atomic_set(&wsm->total_ws, 0);
wsm->free_ws = 0;
init_waitqueue_head(&wsm->ws_wait);
}
static void add_initial_workspace(struct workspace_manager *wsm,
struct crush_work *work)
{
WARN_ON(!list_empty(&wsm->idle_ws));
list_add(&work->item, &wsm->idle_ws);
atomic_set(&wsm->total_ws, 1);
wsm->free_ws = 1;
}
static void cleanup_workspace_manager(struct workspace_manager *wsm)
{
struct crush_work *work;
while (!list_empty(&wsm->idle_ws)) {
work = list_first_entry(&wsm->idle_ws, struct crush_work,
item);
list_del_init(&work->item);
free_workspace(work);
}
atomic_set(&wsm->total_ws, 0);
wsm->free_ws = 0;
}
/*
* Finds an available workspace or allocates a new one. If it's not
* possible to allocate a new one, waits until there is one.
*/
static struct crush_work *get_workspace(struct workspace_manager *wsm,
const struct crush_map *c)
{
struct crush_work *work;
int cpus = num_online_cpus();
again:
spin_lock(&wsm->ws_lock);
if (!list_empty(&wsm->idle_ws)) {
work = list_first_entry(&wsm->idle_ws, struct crush_work,
item);
list_del_init(&work->item);
wsm->free_ws--;
spin_unlock(&wsm->ws_lock);
return work;
}
if (atomic_read(&wsm->total_ws) > cpus) {
DEFINE_WAIT(wait);
spin_unlock(&wsm->ws_lock);
prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE);
if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws)
schedule();
finish_wait(&wsm->ws_wait, &wait);
goto again;
}
atomic_inc(&wsm->total_ws);
spin_unlock(&wsm->ws_lock);
work = alloc_workspace(c);
if (!work) {
atomic_dec(&wsm->total_ws);
wake_up(&wsm->ws_wait);
/*
* Do not return the error but go back to waiting. We
* have the inital workspace and the CRUSH computation
* time is bounded so we will get it eventually.
*/
WARN_ON(atomic_read(&wsm->total_ws) < 1);
goto again;
}
return work;
}
/*
* Puts a workspace back on the list or frees it if we have enough
* idle ones sitting around.
*/
static void put_workspace(struct workspace_manager *wsm,
struct crush_work *work)
{
spin_lock(&wsm->ws_lock);
if (wsm->free_ws <= num_online_cpus()) {
list_add(&work->item, &wsm->idle_ws);
wsm->free_ws++;
spin_unlock(&wsm->ws_lock);
goto wake;
}
spin_unlock(&wsm->ws_lock);
free_workspace(work);
atomic_dec(&wsm->total_ws);
wake:
if (wq_has_sleeper(&wsm->ws_wait))
wake_up(&wsm->ws_wait);
}
/*
* osd map
*/
@ -981,7 +1118,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
map->primary_temp = RB_ROOT;
map->pg_upmap = RB_ROOT;
map->pg_upmap_items = RB_ROOT;
mutex_init(&map->crush_workspace_mutex);
init_workspace_manager(&map->crush_wsm);
return map;
}
@ -989,8 +1127,11 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
dout("osdmap_destroy %p\n", map);
if (map->crush)
crush_destroy(map->crush);
cleanup_workspace_manager(&map->crush_wsm);
while (!RB_EMPTY_ROOT(&map->pg_temp)) {
struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->pg_temp),
@ -1029,7 +1170,6 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
kvfree(map->osd_weight);
kvfree(map->osd_addr);
kvfree(map->osd_primary_affinity);
kvfree(map->crush_workspace);
kfree(map);
}
@ -1104,26 +1244,22 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
{
void *workspace;
size_t work_size;
struct crush_work *work;
if (IS_ERR(crush))
return PTR_ERR(crush);
work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
dout("%s work_size %zu bytes\n", __func__, work_size);
workspace = ceph_kvmalloc(work_size, GFP_NOIO);
if (!workspace) {
work = alloc_workspace(crush);
if (!work) {
crush_destroy(crush);
return -ENOMEM;
}
crush_init_workspace(crush, workspace);
if (map->crush)
crush_destroy(map->crush);
kvfree(map->crush_workspace);
cleanup_workspace_manager(&map->crush_wsm);
map->crush = crush;
map->crush_workspace = workspace;
add_initial_workspace(&map->crush_wsm, work);
return 0;
}
@ -2322,6 +2458,7 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
s64 choose_args_index)
{
struct crush_choose_arg_map *arg_map;
struct crush_work *work;
int r;
BUG_ON(result_max > CEPH_PG_MAX_SIZE);
@ -2332,12 +2469,11 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
arg_map = lookup_choose_arg_map(&map->crush->choose_args,
CEPH_DEFAULT_CHOOSE_ARGS);
mutex_lock(&map->crush_workspace_mutex);
work = get_workspace(&map->crush_wsm, map->crush);
r = crush_do_rule(map->crush, ruleno, x, result, result_max,
weight, weight_max, map->crush_workspace,
weight, weight_max, work,
arg_map ? arg_map->args : NULL);
mutex_unlock(&map->crush_workspace_mutex);
put_workspace(&map->crush_wsm, work);
return r;
}