forked from luck/tmp_suning_uos_patched
We have:
- a patch that removes crush_workspace_mutex (myself). CRUSH computations are no longer serialized and can run in parallel. - a couple new filesystem client metrics for "ceph fs top" command (Xiubo Li) - a fix for a very old messenger bug that affected the filesystem, marked for stable (myself) - assorted fixups and cleanups throughout the codebase from Jeff and others. -----BEGIN PGP SIGNATURE----- iQFHBAABCAAxFiEEydHwtzie9C7TfviiSn/eOAIR84sFAl+QN8cTHGlkcnlvbW92 QGdtYWlsLmNvbQAKCRBKf944AhHzi4iwCACLwUvO0ONTzUUb2D8ftfyXjo/jIod5 eO7RHfCHOP2A83GQpdYdktVDSVeOUIOiPhxAxo1dL4GieI2/saXrnoevam7ogZkA OmR4drdtRVUqF/aATrtjiDMg2ge0dnx5gfjMxSP/FiPPpXOdrtxng/7tv8yo+03q AlMqg/YcxO06t1M1qh9SEyfzjcHyPnJU2i0ienngxnGxQ7QiMOR6anF1LNhtN803 4fTBX2tLqDNAa+x5yF1kKSn9OmFNhc0oUsqef+Ck0Vw1LC0/SuxzE2J904iehAwy /HzJCer0zcp+eZhn3HhoHmp0UZpOC1dzMxa3CHyRJFrxCSTEhY8iqPiJ =wGr7 -----END PGP SIGNATURE----- Merge tag 'ceph-for-5.10-rc1' of git://github.com/ceph/ceph-client Pull ceph updates from Ilya Dryomov: - a patch that removes crush_workspace_mutex (myself). CRUSH computations are no longer serialized and can run in parallel. - a couple new filesystem client metrics for "ceph fs top" command (Xiubo Li) - a fix for a very old messenger bug that affected the filesystem, marked for stable (myself) - assorted fixups and cleanups throughout the codebase from Jeff and others. * tag 'ceph-for-5.10-rc1' of git://github.com/ceph/ceph-client: (27 commits) libceph: clear con->out_msg on Policy::stateful_server faults libceph: format ceph_entity_addr nonces as unsigned libceph: fix ENTITY_NAME format suggestion libceph: move a dout in queue_con_delay() ceph: comment cleanups and clarifications ceph: break up send_cap_msg ceph: drop separate mdsc argument from __send_cap ceph: promote to unsigned long long before shifting ceph: don't SetPageError on readpage errors ceph: mark ceph_fmt_xattr() as printf-like for better type checking ceph: fold ceph_update_writeable_page into ceph_write_begin ceph: fold ceph_sync_writepages into writepage_nounlock ceph: fold ceph_sync_readpages into ceph_readpage ceph: don't call ceph_update_writeable_page from page_mkwrite ceph: break out writeback of incompatible snap context to separate function ceph: add a note explaining session reject error string libceph: switch to the new "osd blocklist add" command libceph, rbd, ceph: "blacklist" -> "blocklist" ceph: have ceph_writepages_start call pagevec_lookup_range_tag ceph: use kill_anon_super helper ...
This commit is contained in:
commit
ed7cfefe44
|
@ -163,14 +163,14 @@ Mount Options
|
|||
to the default VFS implementation if this option is used.
|
||||
|
||||
recover_session=<no|clean>
|
||||
Set auto reconnect mode in the case where the client is blacklisted. The
|
||||
Set auto reconnect mode in the case where the client is blocklisted. The
|
||||
available modes are "no" and "clean". The default is "no".
|
||||
|
||||
* no: never attempt to reconnect when client detects that it has been
|
||||
blacklisted. Operations will generally fail after being blacklisted.
|
||||
blocklisted. Operations will generally fail after being blocklisted.
|
||||
|
||||
* clean: client reconnects to the ceph cluster automatically when it
|
||||
detects that it has been blacklisted. During reconnect, client drops
|
||||
detects that it has been blocklisted. During reconnect, client drops
|
||||
dirty data/metadata, invalidates page caches and writable file handles.
|
||||
After reconnect, file locks become stale because the MDS loses track
|
||||
of them. If an inode contains any stale file locks, read/write on the
|
||||
|
|
|
@ -4010,10 +4010,10 @@ static int rbd_try_lock(struct rbd_device *rbd_dev)
|
|||
rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
|
||||
ENTITY_NAME(lockers[0].id.name));
|
||||
|
||||
ret = ceph_monc_blacklist_add(&client->monc,
|
||||
ret = ceph_monc_blocklist_add(&client->monc,
|
||||
&lockers[0].info.addr);
|
||||
if (ret) {
|
||||
rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
|
||||
rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
|
||||
ENTITY_NAME(lockers[0].id.name), ret);
|
||||
goto out;
|
||||
}
|
||||
|
@ -4077,7 +4077,7 @@ static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
|
|||
ret = rbd_try_lock(rbd_dev);
|
||||
if (ret < 0) {
|
||||
rbd_warn(rbd_dev, "failed to lock header: %d", ret);
|
||||
if (ret == -EBLACKLISTED)
|
||||
if (ret == -EBLOCKLISTED)
|
||||
goto out;
|
||||
|
||||
ret = 1; /* request lock anyway */
|
||||
|
@ -4613,7 +4613,7 @@ static void rbd_reregister_watch(struct work_struct *work)
|
|||
ret = __rbd_register_watch(rbd_dev);
|
||||
if (ret) {
|
||||
rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
|
||||
if (ret != -EBLACKLISTED && ret != -ENOENT) {
|
||||
if (ret != -EBLOCKLISTED && ret != -ENOENT) {
|
||||
queue_delayed_work(rbd_dev->task_wq,
|
||||
&rbd_dev->watch_dwork,
|
||||
RBD_RETRY_DELAY);
|
||||
|
|
416
fs/ceph/addr.c
416
fs/ceph/addr.c
|
@ -182,58 +182,15 @@ static int ceph_releasepage(struct page *page, gfp_t g)
|
|||
return !PagePrivate(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* Read some contiguous pages. If we cross a stripe boundary, shorten
|
||||
* *plen. Return number of bytes read, or error.
|
||||
*/
|
||||
static int ceph_sync_readpages(struct ceph_fs_client *fsc,
|
||||
struct ceph_vino vino,
|
||||
struct ceph_file_layout *layout,
|
||||
u64 off, u64 *plen,
|
||||
u32 truncate_seq, u64 truncate_size,
|
||||
struct page **pages, int num_pages,
|
||||
int page_align)
|
||||
{
|
||||
struct ceph_osd_client *osdc = &fsc->client->osdc;
|
||||
struct ceph_osd_request *req;
|
||||
int rc = 0;
|
||||
|
||||
dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
|
||||
vino.snap, off, *plen);
|
||||
req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
|
||||
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
|
||||
NULL, truncate_seq, truncate_size,
|
||||
false);
|
||||
if (IS_ERR(req))
|
||||
return PTR_ERR(req);
|
||||
|
||||
/* it may be a short read due to an object boundary */
|
||||
osd_req_op_extent_osd_data_pages(req, 0,
|
||||
pages, *plen, page_align, false, false);
|
||||
|
||||
dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
|
||||
off, *plen, *plen, page_align);
|
||||
|
||||
rc = ceph_osdc_start_request(osdc, req, false);
|
||||
if (!rc)
|
||||
rc = ceph_osdc_wait_request(osdc, req);
|
||||
|
||||
ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
|
||||
req->r_end_latency, rc);
|
||||
|
||||
ceph_osdc_put_request(req);
|
||||
dout("readpages result %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* read a single page, without unlocking it.
|
||||
*/
|
||||
/* read a single page, without unlocking it. */
|
||||
static int ceph_do_readpage(struct file *filp, struct page *page)
|
||||
{
|
||||
struct inode *inode = file_inode(filp);
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
||||
struct ceph_osd_client *osdc = &fsc->client->osdc;
|
||||
struct ceph_osd_request *req;
|
||||
struct ceph_vino vino = ceph_vino(inode);
|
||||
int err = 0;
|
||||
u64 off = page_offset(page);
|
||||
u64 len = PAGE_SIZE;
|
||||
|
@ -260,19 +217,33 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
|
|||
if (err == 0)
|
||||
return -EINPROGRESS;
|
||||
|
||||
dout("readpage inode %p file %p page %p index %lu\n",
|
||||
inode, filp, page, page->index);
|
||||
err = ceph_sync_readpages(fsc, ceph_vino(inode),
|
||||
&ci->i_layout, off, &len,
|
||||
ci->i_truncate_seq, ci->i_truncate_size,
|
||||
&page, 1, 0);
|
||||
dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
|
||||
vino.ino, vino.snap, filp, off, len, page, page->index);
|
||||
req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1,
|
||||
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL,
|
||||
ci->i_truncate_seq, ci->i_truncate_size,
|
||||
false);
|
||||
if (IS_ERR(req))
|
||||
return PTR_ERR(req);
|
||||
|
||||
osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
|
||||
|
||||
err = ceph_osdc_start_request(osdc, req, false);
|
||||
if (!err)
|
||||
err = ceph_osdc_wait_request(osdc, req);
|
||||
|
||||
ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
|
||||
req->r_end_latency, err);
|
||||
|
||||
ceph_osdc_put_request(req);
|
||||
dout("readpage result %d\n", err);
|
||||
|
||||
if (err == -ENOENT)
|
||||
err = 0;
|
||||
if (err < 0) {
|
||||
SetPageError(page);
|
||||
ceph_fscache_readpage_cancel(inode, page);
|
||||
if (err == -EBLACKLISTED)
|
||||
fsc->blacklisted = true;
|
||||
if (err == -EBLOCKLISTED)
|
||||
fsc->blocklisted = true;
|
||||
goto out;
|
||||
}
|
||||
if (err < PAGE_SIZE)
|
||||
|
@ -312,8 +283,8 @@ static void finish_read(struct ceph_osd_request *req)
|
|||
int i;
|
||||
|
||||
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
|
||||
if (rc == -EBLACKLISTED)
|
||||
ceph_inode_to_client(inode)->blacklisted = true;
|
||||
if (rc == -EBLOCKLISTED)
|
||||
ceph_inode_to_client(inode)->blocklisted = true;
|
||||
|
||||
/* unlock all pages, zeroing any data we didn't read */
|
||||
osd_data = osd_req_op_extent_osd_data(req, 0);
|
||||
|
@ -619,50 +590,6 @@ static u64 get_writepages_data_length(struct inode *inode,
|
|||
return end > start ? end - start : 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* do a synchronous write on N pages
|
||||
*/
|
||||
static int ceph_sync_writepages(struct ceph_fs_client *fsc,
|
||||
struct ceph_vino vino,
|
||||
struct ceph_file_layout *layout,
|
||||
struct ceph_snap_context *snapc,
|
||||
u64 off, u64 len,
|
||||
u32 truncate_seq, u64 truncate_size,
|
||||
struct timespec64 *mtime,
|
||||
struct page **pages, int num_pages)
|
||||
{
|
||||
struct ceph_osd_client *osdc = &fsc->client->osdc;
|
||||
struct ceph_osd_request *req;
|
||||
int rc = 0;
|
||||
int page_align = off & ~PAGE_MASK;
|
||||
|
||||
req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
|
||||
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
|
||||
snapc, truncate_seq, truncate_size,
|
||||
true);
|
||||
if (IS_ERR(req))
|
||||
return PTR_ERR(req);
|
||||
|
||||
/* it may be a short write due to an object boundary */
|
||||
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
|
||||
false, false);
|
||||
dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
|
||||
|
||||
req->r_mtime = *mtime;
|
||||
rc = ceph_osdc_start_request(osdc, req, true);
|
||||
if (!rc)
|
||||
rc = ceph_osdc_wait_request(osdc, req);
|
||||
|
||||
ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
|
||||
req->r_end_latency, rc);
|
||||
|
||||
ceph_osdc_put_request(req);
|
||||
if (rc == 0)
|
||||
rc = len;
|
||||
dout("writepages result %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* Write a single page, but leave the page locked.
|
||||
*
|
||||
|
@ -671,20 +598,19 @@ static int ceph_sync_writepages(struct ceph_fs_client *fsc,
|
|||
*/
|
||||
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
|
||||
{
|
||||
struct inode *inode;
|
||||
struct ceph_inode_info *ci;
|
||||
struct ceph_fs_client *fsc;
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
||||
struct ceph_snap_context *snapc, *oldest;
|
||||
loff_t page_off = page_offset(page);
|
||||
int err, len = PAGE_SIZE;
|
||||
int err;
|
||||
loff_t len = PAGE_SIZE;
|
||||
struct ceph_writeback_ctl ceph_wbc;
|
||||
struct ceph_osd_client *osdc = &fsc->client->osdc;
|
||||
struct ceph_osd_request *req;
|
||||
|
||||
dout("writepage %p idx %lu\n", page, page->index);
|
||||
|
||||
inode = page->mapping->host;
|
||||
ci = ceph_inode(inode);
|
||||
fsc = ceph_inode_to_client(inode);
|
||||
|
||||
/* verify this is a writeable snap context */
|
||||
snapc = page_snap_context(page);
|
||||
if (!snapc) {
|
||||
|
@ -713,7 +639,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
|
|||
if (ceph_wbc.i_size < page_off + len)
|
||||
len = ceph_wbc.i_size - page_off;
|
||||
|
||||
dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n",
|
||||
dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
|
||||
inode, page, page->index, page_off, len, snapc, snapc->seq);
|
||||
|
||||
if (atomic_long_inc_return(&fsc->writeback_count) >
|
||||
|
@ -721,11 +647,33 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
|
|||
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
|
||||
|
||||
set_page_writeback(page);
|
||||
err = ceph_sync_writepages(fsc, ceph_vino(inode),
|
||||
&ci->i_layout, snapc, page_off, len,
|
||||
ceph_wbc.truncate_seq,
|
||||
ceph_wbc.truncate_size,
|
||||
&inode->i_mtime, &page, 1);
|
||||
req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
|
||||
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
|
||||
ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
|
||||
true);
|
||||
if (IS_ERR(req)) {
|
||||
redirty_page_for_writepage(wbc, page);
|
||||
end_page_writeback(page);
|
||||
return PTR_ERR(req);
|
||||
}
|
||||
|
||||
/* it may be a short write due to an object boundary */
|
||||
WARN_ON_ONCE(len > PAGE_SIZE);
|
||||
osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
|
||||
dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
|
||||
|
||||
req->r_mtime = inode->i_mtime;
|
||||
err = ceph_osdc_start_request(osdc, req, true);
|
||||
if (!err)
|
||||
err = ceph_osdc_wait_request(osdc, req);
|
||||
|
||||
ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
|
||||
req->r_end_latency, err);
|
||||
|
||||
ceph_osdc_put_request(req);
|
||||
if (err == 0)
|
||||
err = len;
|
||||
|
||||
if (err < 0) {
|
||||
struct writeback_control tmp_wbc;
|
||||
if (!wbc)
|
||||
|
@ -737,8 +685,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
|
|||
end_page_writeback(page);
|
||||
return err;
|
||||
}
|
||||
if (err == -EBLACKLISTED)
|
||||
fsc->blacklisted = true;
|
||||
if (err == -EBLOCKLISTED)
|
||||
fsc->blocklisted = true;
|
||||
dout("writepage setting page/mapping error %d %p\n",
|
||||
err, page);
|
||||
mapping_set_error(&inode->i_data, err);
|
||||
|
@ -801,8 +749,8 @@ static void writepages_finish(struct ceph_osd_request *req)
|
|||
if (rc < 0) {
|
||||
mapping_set_error(mapping, rc);
|
||||
ceph_set_error_write(ci);
|
||||
if (rc == -EBLACKLISTED)
|
||||
fsc->blacklisted = true;
|
||||
if (rc == -EBLOCKLISTED)
|
||||
fsc->blocklisted = true;
|
||||
} else {
|
||||
ceph_clear_error_write(ci);
|
||||
}
|
||||
|
@ -962,9 +910,8 @@ static int ceph_writepages_start(struct address_space *mapping,
|
|||
max_pages = wsize >> PAGE_SHIFT;
|
||||
|
||||
get_more_pages:
|
||||
pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index,
|
||||
end, PAGECACHE_TAG_DIRTY,
|
||||
max_pages - locked_pages);
|
||||
pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
|
||||
end, PAGECACHE_TAG_DIRTY);
|
||||
dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
|
||||
if (!pvec_pages && !locked_pages)
|
||||
break;
|
||||
|
@ -1299,110 +1246,60 @@ static int context_is_writeable_or_written(struct inode *inode,
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* We are only allowed to write into/dirty the page if the page is
|
||||
* clean, or already dirty within the same snap context.
|
||||
/**
|
||||
* ceph_find_incompatible - find an incompatible context and return it
|
||||
* @page: page being dirtied
|
||||
*
|
||||
* called with page locked.
|
||||
* return success with page locked,
|
||||
* or any failure (incl -EAGAIN) with page unlocked.
|
||||
* We are only allowed to write into/dirty a page if the page is
|
||||
* clean, or already dirty within the same snap context. Returns a
|
||||
* conflicting context if there is one, NULL if there isn't, or a
|
||||
* negative error code on other errors.
|
||||
*
|
||||
* Must be called with page lock held.
|
||||
*/
|
||||
static int ceph_update_writeable_page(struct file *file,
|
||||
loff_t pos, unsigned len,
|
||||
struct page *page)
|
||||
static struct ceph_snap_context *
|
||||
ceph_find_incompatible(struct page *page)
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
loff_t page_off = pos & PAGE_MASK;
|
||||
int pos_in_page = pos & ~PAGE_MASK;
|
||||
int end_in_page = pos_in_page + len;
|
||||
loff_t i_size;
|
||||
int r;
|
||||
struct ceph_snap_context *snapc, *oldest;
|
||||
|
||||
if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
|
||||
dout(" page %p forced umount\n", page);
|
||||
unlock_page(page);
|
||||
return -EIO;
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
retry_locked:
|
||||
/* writepages currently holds page lock, but if we change that later, */
|
||||
wait_on_page_writeback(page);
|
||||
for (;;) {
|
||||
struct ceph_snap_context *snapc, *oldest;
|
||||
|
||||
wait_on_page_writeback(page);
|
||||
|
||||
snapc = page_snap_context(page);
|
||||
if (!snapc || snapc == ci->i_head_snapc)
|
||||
break;
|
||||
|
||||
snapc = page_snap_context(page);
|
||||
if (snapc && snapc != ci->i_head_snapc) {
|
||||
/*
|
||||
* this page is already dirty in another (older) snap
|
||||
* context! is it writeable now?
|
||||
*/
|
||||
oldest = get_oldest_context(inode, NULL, NULL);
|
||||
if (snapc->seq > oldest->seq) {
|
||||
/* not writeable -- return it for the caller to deal with */
|
||||
ceph_put_snap_context(oldest);
|
||||
dout(" page %p snapc %p not current or oldest\n",
|
||||
page, snapc);
|
||||
/*
|
||||
* queue for writeback, and wait for snapc to
|
||||
* be writeable or written
|
||||
*/
|
||||
snapc = ceph_get_snap_context(snapc);
|
||||
unlock_page(page);
|
||||
ceph_queue_writeback(inode);
|
||||
r = wait_event_killable(ci->i_cap_wq,
|
||||
context_is_writeable_or_written(inode, snapc));
|
||||
ceph_put_snap_context(snapc);
|
||||
if (r == -ERESTARTSYS)
|
||||
return r;
|
||||
return -EAGAIN;
|
||||
dout(" page %p snapc %p not current or oldest\n", page, snapc);
|
||||
return ceph_get_snap_context(snapc);
|
||||
}
|
||||
ceph_put_snap_context(oldest);
|
||||
|
||||
/* yay, writeable, do it now (without dropping page lock) */
|
||||
dout(" page %p snapc %p not current, but oldest\n",
|
||||
page, snapc);
|
||||
if (!clear_page_dirty_for_io(page))
|
||||
goto retry_locked;
|
||||
r = writepage_nounlock(page, NULL);
|
||||
if (r < 0)
|
||||
goto fail_unlock;
|
||||
goto retry_locked;
|
||||
dout(" page %p snapc %p not current, but oldest\n", page, snapc);
|
||||
if (clear_page_dirty_for_io(page)) {
|
||||
int r = writepage_nounlock(page, NULL);
|
||||
if (r < 0)
|
||||
return ERR_PTR(r);
|
||||
}
|
||||
}
|
||||
|
||||
if (PageUptodate(page)) {
|
||||
dout(" page %p already uptodate\n", page);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* full page? */
|
||||
if (pos_in_page == 0 && len == PAGE_SIZE)
|
||||
return 0;
|
||||
|
||||
/* past end of file? */
|
||||
i_size = i_size_read(inode);
|
||||
|
||||
if (page_off >= i_size ||
|
||||
(pos_in_page == 0 && (pos+len) >= i_size &&
|
||||
end_in_page - pos_in_page != PAGE_SIZE)) {
|
||||
dout(" zeroing %p 0 - %d and %d - %d\n",
|
||||
page, pos_in_page, end_in_page, (int)PAGE_SIZE);
|
||||
zero_user_segments(page,
|
||||
0, pos_in_page,
|
||||
end_in_page, PAGE_SIZE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* we need to read it. */
|
||||
r = ceph_do_readpage(file, page);
|
||||
if (r < 0) {
|
||||
if (r == -EINPROGRESS)
|
||||
return -EAGAIN;
|
||||
goto fail_unlock;
|
||||
}
|
||||
goto retry_locked;
|
||||
fail_unlock:
|
||||
unlock_page(page);
|
||||
return r;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1414,26 +1311,78 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
|
|||
struct page **pagep, void **fsdata)
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
struct page *page;
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_snap_context *snapc;
|
||||
struct page *page = NULL;
|
||||
pgoff_t index = pos >> PAGE_SHIFT;
|
||||
int r;
|
||||
int pos_in_page = pos & ~PAGE_MASK;
|
||||
int r = 0;
|
||||
|
||||
do {
|
||||
/* get a page */
|
||||
dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len);
|
||||
|
||||
for (;;) {
|
||||
page = grab_cache_page_write_begin(mapping, index, 0);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
if (!page) {
|
||||
r = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
|
||||
dout("write_begin file %p inode %p page %p %d~%d\n", file,
|
||||
inode, page, (int)pos, (int)len);
|
||||
|
||||
r = ceph_update_writeable_page(file, pos, len, page);
|
||||
if (r < 0)
|
||||
snapc = ceph_find_incompatible(page);
|
||||
if (snapc) {
|
||||
if (IS_ERR(snapc)) {
|
||||
r = PTR_ERR(snapc);
|
||||
break;
|
||||
}
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
else
|
||||
*pagep = page;
|
||||
} while (r == -EAGAIN);
|
||||
page = NULL;
|
||||
ceph_queue_writeback(inode);
|
||||
r = wait_event_killable(ci->i_cap_wq,
|
||||
context_is_writeable_or_written(inode, snapc));
|
||||
ceph_put_snap_context(snapc);
|
||||
if (r != 0)
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (PageUptodate(page)) {
|
||||
dout(" page %p already uptodate\n", page);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* In some cases we don't need to read at all:
|
||||
* - full page write
|
||||
* - write that lies completely beyond EOF
|
||||
* - write that covers the the page from start to EOF or beyond it
|
||||
*/
|
||||
if ((pos_in_page == 0 && len == PAGE_SIZE) ||
|
||||
(pos >= i_size_read(inode)) ||
|
||||
(pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
|
||||
zero_user_segments(page, 0, pos_in_page,
|
||||
pos_in_page + len, PAGE_SIZE);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to read it. If we get back -EINPROGRESS, then the page was
|
||||
* handed off to fscache and it will be unlocked when the read completes.
|
||||
* Refind the page in that case so we can reacquire the page lock. Otherwise
|
||||
* we got a hard error or the read was completed synchronously.
|
||||
*/
|
||||
r = ceph_do_readpage(file, page);
|
||||
if (r != -EINPROGRESS)
|
||||
break;
|
||||
}
|
||||
|
||||
if (r < 0) {
|
||||
if (page) {
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
}
|
||||
} else {
|
||||
*pagep = page;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
|
@ -1522,7 +1471,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
|
|||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_file_info *fi = vma->vm_file->private_data;
|
||||
struct page *pinned_page = NULL;
|
||||
loff_t off = vmf->pgoff << PAGE_SHIFT;
|
||||
loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
|
||||
int want, got, err;
|
||||
sigset_t oldset;
|
||||
vm_fault_t ret = VM_FAULT_SIGBUS;
|
||||
|
@ -1668,6 +1617,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
|
|||
inode_inc_iversion_raw(inode);
|
||||
|
||||
do {
|
||||
struct ceph_snap_context *snapc;
|
||||
|
||||
lock_page(page);
|
||||
|
||||
if (page_mkwrite_check_truncate(page, inode) < 0) {
|
||||
|
@ -1676,13 +1627,26 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
|
|||
break;
|
||||
}
|
||||
|
||||
err = ceph_update_writeable_page(vma->vm_file, off, len, page);
|
||||
if (err >= 0) {
|
||||
snapc = ceph_find_incompatible(page);
|
||||
if (!snapc) {
|
||||
/* success. we'll keep the page locked. */
|
||||
set_page_dirty(page);
|
||||
ret = VM_FAULT_LOCKED;
|
||||
break;
|
||||
}
|
||||
} while (err == -EAGAIN);
|
||||
|
||||
unlock_page(page);
|
||||
|
||||
if (IS_ERR(snapc)) {
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
break;
|
||||
}
|
||||
|
||||
ceph_queue_writeback(inode);
|
||||
err = wait_event_killable(ci->i_cap_wq,
|
||||
context_is_writeable_or_written(inode, snapc));
|
||||
ceph_put_snap_context(snapc);
|
||||
} while (err == 0);
|
||||
|
||||
if (ret == VM_FAULT_LOCKED ||
|
||||
ci->i_inline_version != CEPH_INLINE_NONE) {
|
||||
|
@ -2039,16 +2003,16 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
|
|||
if (err >= 0 || err == -ENOENT)
|
||||
have |= POOL_READ;
|
||||
else if (err != -EPERM) {
|
||||
if (err == -EBLACKLISTED)
|
||||
fsc->blacklisted = true;
|
||||
if (err == -EBLOCKLISTED)
|
||||
fsc->blocklisted = true;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
if (err2 == 0 || err2 == -EEXIST)
|
||||
have |= POOL_WRITE;
|
||||
else if (err2 != -EPERM) {
|
||||
if (err2 == -EBLACKLISTED)
|
||||
fsc->blacklisted = true;
|
||||
if (err2 == -EBLOCKLISTED)
|
||||
fsc->blocklisted = true;
|
||||
err = err2;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
|
128
fs/ceph/caps.c
128
fs/ceph/caps.c
|
@ -1222,36 +1222,27 @@ struct cap_msg_args {
|
|||
};
|
||||
|
||||
/*
|
||||
* Build and send a cap message to the given MDS.
|
||||
*
|
||||
* Caller should be holding s_mutex.
|
||||
* cap struct size + flock buffer size + inline version + inline data size +
|
||||
* osd_epoch_barrier + oldest_flush_tid
|
||||
*/
|
||||
static int send_cap_msg(struct cap_msg_args *arg)
|
||||
#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
|
||||
4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
|
||||
|
||||
/* Marshal up the cap msg to the MDS */
|
||||
static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
|
||||
{
|
||||
struct ceph_mds_caps *fc;
|
||||
struct ceph_msg *msg;
|
||||
void *p;
|
||||
size_t extra_len;
|
||||
struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
|
||||
|
||||
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
|
||||
" seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
|
||||
" xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
|
||||
arg->cid, arg->ino, ceph_cap_string(arg->caps),
|
||||
ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
|
||||
arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
|
||||
arg->mseq, arg->follows, arg->size, arg->max_size,
|
||||
arg->xattr_version,
|
||||
dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
|
||||
__func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
|
||||
ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
|
||||
ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
|
||||
arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
|
||||
arg->size, arg->max_size, arg->xattr_version,
|
||||
arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
|
||||
|
||||
/* flock buffer size + inline version + inline data size +
|
||||
* osd_epoch_barrier + oldest_flush_tid */
|
||||
extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
|
||||
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
|
||||
GFP_NOFS, false);
|
||||
if (!msg)
|
||||
return -ENOMEM;
|
||||
|
||||
msg->hdr.version = cpu_to_le16(10);
|
||||
msg->hdr.tid = cpu_to_le64(arg->flush_tid);
|
||||
|
||||
|
@ -1323,9 +1314,6 @@ static int send_cap_msg(struct cap_msg_args *arg)
|
|||
|
||||
/* Advisory flags (version 10) */
|
||||
ceph_encode_32(&p, arg->flags);
|
||||
|
||||
ceph_con_send(&arg->session->s_con, msg);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1454,25 +1442,25 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
|
|||
*
|
||||
* Caller should hold snap_rwsem (read), s_mutex.
|
||||
*/
|
||||
static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg,
|
||||
struct ceph_inode_info *ci)
|
||||
static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
|
||||
{
|
||||
struct ceph_msg *msg;
|
||||
struct inode *inode = &ci->vfs_inode;
|
||||
int ret;
|
||||
|
||||
ret = send_cap_msg(arg);
|
||||
if (ret < 0) {
|
||||
pr_err("error sending cap msg, ino (%llx.%llx) "
|
||||
"flushing %s tid %llu, requeue\n",
|
||||
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
|
||||
if (!msg) {
|
||||
pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
|
||||
ceph_vinop(inode), ceph_cap_string(arg->dirty),
|
||||
arg->flush_tid);
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
__cap_delay_requeue(mdsc, ci);
|
||||
__cap_delay_requeue(arg->session->s_mdsc, ci);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
encode_cap_msg(msg, arg);
|
||||
ceph_con_send(&arg->session->s_con, msg);
|
||||
ceph_buffer_put(arg->old_xattr_buf);
|
||||
|
||||
if (arg->wake)
|
||||
wake_up_all(&ci->i_cap_wq);
|
||||
}
|
||||
|
@ -1483,6 +1471,11 @@ static inline int __send_flush_snap(struct inode *inode,
|
|||
u32 mseq, u64 oldest_flush_tid)
|
||||
{
|
||||
struct cap_msg_args arg;
|
||||
struct ceph_msg *msg;
|
||||
|
||||
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
|
||||
if (!msg)
|
||||
return -ENOMEM;
|
||||
|
||||
arg.session = session;
|
||||
arg.ino = ceph_vino(inode).ino;
|
||||
|
@ -1521,7 +1514,9 @@ static inline int __send_flush_snap(struct inode *inode,
|
|||
arg.flags = 0;
|
||||
arg.wake = false;
|
||||
|
||||
return send_cap_msg(&arg);
|
||||
encode_cap_msg(msg, &arg);
|
||||
ceph_con_send(&arg.session->s_con, msg);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1906,9 +1901,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
|
|||
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
|
||||
struct ceph_mds_session *session)
|
||||
{
|
||||
struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct inode *inode = &ci->vfs_inode;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||
struct ceph_cap *cap;
|
||||
u64 flush_tid, oldest_flush_tid;
|
||||
int file_wanted, used, cap_used;
|
||||
|
@ -1928,12 +1922,24 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
|
|||
retry:
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
retry_locked:
|
||||
/* Caps wanted by virtue of active open files. */
|
||||
file_wanted = __ceph_caps_file_wanted(ci);
|
||||
|
||||
/* Caps which have active references against them */
|
||||
used = __ceph_caps_used(ci);
|
||||
|
||||
/*
|
||||
* "issued" represents the current caps that the MDS wants us to have.
|
||||
* "implemented" is the set that we have been granted, and includes the
|
||||
* ones that have not yet been returned to the MDS (the "revoking" set,
|
||||
* usually because they have outstanding references).
|
||||
*/
|
||||
issued = __ceph_caps_issued(ci, &implemented);
|
||||
revoking = implemented & ~issued;
|
||||
|
||||
want = file_wanted;
|
||||
|
||||
/* The ones we currently want to retain (may be adjusted below) */
|
||||
retain = file_wanted | used | CEPH_CAP_PIN;
|
||||
if (!mdsc->stopping && inode->i_nlink > 0) {
|
||||
if (file_wanted) {
|
||||
|
@ -2011,6 +2017,10 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
|
|||
|
||||
/* NOTE: no side-effects allowed, until we take s_mutex */
|
||||
|
||||
/*
|
||||
* If we have an auth cap, we don't need to consider any
|
||||
* overlapping caps as used.
|
||||
*/
|
||||
cap_used = used;
|
||||
if (ci->i_auth_cap && cap != ci->i_auth_cap)
|
||||
cap_used &= ~ci->i_auth_cap->issued;
|
||||
|
@ -2148,7 +2158,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
|
|||
want, retain, flushing, flush_tid, oldest_flush_tid);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
|
||||
__send_cap(mdsc, &arg, ci);
|
||||
__send_cap(&arg, ci);
|
||||
|
||||
goto retry; /* retake i_ceph_lock and restart our cap scan. */
|
||||
}
|
||||
|
@ -2222,7 +2232,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
|
|||
flushing, flush_tid, oldest_flush_tid);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
|
||||
__send_cap(mdsc, &arg, ci);
|
||||
__send_cap(&arg, ci);
|
||||
} else {
|
||||
if (!list_empty(&ci->i_cap_flush_list)) {
|
||||
struct ceph_cap_flush *cf =
|
||||
|
@ -2436,7 +2446,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
|
|||
(cap->issued | cap->implemented),
|
||||
cf->caps, cf->tid, oldest_flush_tid);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
__send_cap(mdsc, &arg, ci);
|
||||
__send_cap(&arg, ci);
|
||||
} else {
|
||||
struct ceph_cap_snap *capsnap =
|
||||
container_of(cf, struct ceph_cap_snap,
|
||||
|
@ -4284,13 +4294,30 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
|
|||
|
||||
void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
|
||||
{
|
||||
int i;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
|
||||
int bits = (fmode << 1) | 1;
|
||||
bool is_opened = false;
|
||||
int i;
|
||||
|
||||
if (count == 1)
|
||||
atomic64_inc(&mdsc->metric.opened_files);
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
|
||||
if (bits & (1 << i))
|
||||
ci->i_nr_by_mode[i] += count;
|
||||
|
||||
/*
|
||||
* If any of the mode ref is larger than 1,
|
||||
* that means it has been already opened by
|
||||
* others. Just skip checking the PIN ref.
|
||||
*/
|
||||
if (i && ci->i_nr_by_mode[i] > 1)
|
||||
is_opened = true;
|
||||
}
|
||||
|
||||
if (!is_opened)
|
||||
percpu_counter_inc(&mdsc->metric.opened_inodes);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
}
|
||||
|
||||
|
@ -4301,15 +4328,32 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
|
|||
*/
|
||||
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
|
||||
{
|
||||
int i;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
|
||||
int bits = (fmode << 1) | 1;
|
||||
bool is_closed = true;
|
||||
int i;
|
||||
|
||||
if (count == 1)
|
||||
atomic64_dec(&mdsc->metric.opened_files);
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
|
||||
if (bits & (1 << i)) {
|
||||
BUG_ON(ci->i_nr_by_mode[i] < count);
|
||||
ci->i_nr_by_mode[i] -= count;
|
||||
}
|
||||
|
||||
/*
|
||||
* If any of the mode ref is not 0 after
|
||||
* decreased, that means it is still opened
|
||||
* by others. Just skip checking the PIN ref.
|
||||
*/
|
||||
if (i && ci->i_nr_by_mode[i])
|
||||
is_closed = false;
|
||||
}
|
||||
|
||||
if (is_closed)
|
||||
percpu_counter_dec(&mdsc->metric.opened_inodes);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
}
|
||||
|
||||
|
|
|
@ -148,6 +148,17 @@ static int metric_show(struct seq_file *s, void *p)
|
|||
int nr_caps = 0;
|
||||
s64 total, sum, avg, min, max, sq;
|
||||
|
||||
sum = percpu_counter_sum(&m->total_inodes);
|
||||
seq_printf(s, "item total\n");
|
||||
seq_printf(s, "------------------------------------------\n");
|
||||
seq_printf(s, "%-35s%lld / %lld\n", "opened files / total inodes",
|
||||
atomic64_read(&m->opened_files), sum);
|
||||
seq_printf(s, "%-35s%lld / %lld\n", "pinned i_caps / total inodes",
|
||||
atomic64_read(&m->total_caps), sum);
|
||||
seq_printf(s, "%-35s%lld / %lld\n", "opened inodes / total inodes",
|
||||
percpu_counter_sum(&m->opened_inodes), sum);
|
||||
|
||||
seq_printf(s, "\n");
|
||||
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
|
||||
seq_printf(s, "-----------------------------------------------------------------------------------\n");
|
||||
|
||||
|
@ -202,7 +213,8 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
|
|||
{
|
||||
struct seq_file *s = p;
|
||||
|
||||
seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
|
||||
seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
|
||||
cap->session->s_mds,
|
||||
ceph_cap_string(cap->issued),
|
||||
ceph_cap_string(cap->implemented));
|
||||
return 0;
|
||||
|
@ -222,8 +234,8 @@ static int caps_show(struct seq_file *s, void *p)
|
|||
"reserved\t%d\n"
|
||||
"min\t\t%d\n\n",
|
||||
total, avail, used, reserved, min);
|
||||
seq_printf(s, "ino issued implemented\n");
|
||||
seq_printf(s, "-----------------------------------------------\n");
|
||||
seq_printf(s, "ino mds issued implemented\n");
|
||||
seq_printf(s, "--------------------------------------------------\n");
|
||||
|
||||
mutex_lock(&mdsc->mutex);
|
||||
for (i = 0; i < mdsc->max_sessions; i++) {
|
||||
|
|
|
@ -38,8 +38,7 @@ static int __dir_lease_try_check(const struct dentry *dentry);
|
|||
static int ceph_d_init(struct dentry *dentry)
|
||||
{
|
||||
struct ceph_dentry_info *di;
|
||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
|
||||
|
||||
di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
|
||||
if (!di)
|
||||
|
@ -738,7 +737,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
|
|||
unsigned int flags)
|
||||
{
|
||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
|
||||
struct ceph_mds_request *req;
|
||||
int op;
|
||||
int mask;
|
||||
|
@ -827,8 +826,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
|
|||
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, dev_t rdev)
|
||||
{
|
||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
|
||||
struct ceph_mds_request *req;
|
||||
struct ceph_acl_sec_ctx as_ctx = {};
|
||||
int err;
|
||||
|
@ -889,8 +887,7 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
|
|||
static int ceph_symlink(struct inode *dir, struct dentry *dentry,
|
||||
const char *dest)
|
||||
{
|
||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
|
||||
struct ceph_mds_request *req;
|
||||
struct ceph_acl_sec_ctx as_ctx = {};
|
||||
int err;
|
||||
|
@ -942,8 +939,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
|
|||
|
||||
static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
|
||||
{
|
||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
|
||||
struct ceph_mds_request *req;
|
||||
struct ceph_acl_sec_ctx as_ctx = {};
|
||||
int err = -EROFS;
|
||||
|
@ -1010,8 +1006,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
|
|||
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
|
||||
struct dentry *dentry)
|
||||
{
|
||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
|
||||
struct ceph_mds_request *req;
|
||||
int err;
|
||||
|
||||
|
@ -1192,8 +1187,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
|
|||
struct inode *new_dir, struct dentry *new_dentry,
|
||||
unsigned int flags)
|
||||
{
|
||||
struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
|
||||
struct ceph_mds_request *req;
|
||||
int op = CEPH_MDS_OP_RENAME;
|
||||
int err;
|
||||
|
|
|
@ -182,8 +182,7 @@ static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
|
|||
static struct ceph_mds_request *
|
||||
prepare_open_request(struct super_block *sb, int flags, int create_mode)
|
||||
{
|
||||
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
|
||||
struct ceph_mds_request *req;
|
||||
int want_auth = USE_ANY_MDS;
|
||||
int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
|
||||
|
@ -256,8 +255,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
|
|||
case S_IFDIR:
|
||||
ret = ceph_init_file_info(inode, file, fmode,
|
||||
S_ISDIR(inode->i_mode));
|
||||
if (ret)
|
||||
return ret;
|
||||
break;
|
||||
|
||||
case S_IFLNK:
|
||||
|
@ -285,7 +282,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
|
|||
*/
|
||||
int ceph_renew_caps(struct inode *inode, int fmode)
|
||||
{
|
||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_mds_request *req;
|
||||
int err, flags, wanted;
|
||||
|
@ -865,6 +862,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
|
|||
size_t page_off;
|
||||
u64 i_size;
|
||||
bool more;
|
||||
int idx;
|
||||
size_t left;
|
||||
|
||||
req = ceph_osdc_new_request(osdc, &ci->i_layout,
|
||||
ci->i_vino, off, &len, 0, 1,
|
||||
|
@ -878,29 +877,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
|
|||
|
||||
more = len < iov_iter_count(to);
|
||||
|
||||
if (unlikely(iov_iter_is_pipe(to))) {
|
||||
ret = iov_iter_get_pages_alloc(to, &pages, len,
|
||||
&page_off);
|
||||
if (ret <= 0) {
|
||||
ceph_osdc_put_request(req);
|
||||
ret = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
|
||||
if (ret < len) {
|
||||
len = ret;
|
||||
osd_req_op_extent_update(req, 0, len);
|
||||
more = false;
|
||||
}
|
||||
} else {
|
||||
num_pages = calc_pages_for(off, len);
|
||||
page_off = off & ~PAGE_MASK;
|
||||
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
|
||||
if (IS_ERR(pages)) {
|
||||
ceph_osdc_put_request(req);
|
||||
ret = PTR_ERR(pages);
|
||||
break;
|
||||
}
|
||||
num_pages = calc_pages_for(off, len);
|
||||
page_off = off & ~PAGE_MASK;
|
||||
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
|
||||
if (IS_ERR(pages)) {
|
||||
ceph_osdc_put_request(req);
|
||||
ret = PTR_ERR(pages);
|
||||
break;
|
||||
}
|
||||
|
||||
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
|
||||
|
@ -931,36 +914,27 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
|
|||
ret += zlen;
|
||||
}
|
||||
|
||||
if (unlikely(iov_iter_is_pipe(to))) {
|
||||
if (ret > 0) {
|
||||
iov_iter_advance(to, ret);
|
||||
off += ret;
|
||||
} else {
|
||||
iov_iter_advance(to, 0);
|
||||
idx = 0;
|
||||
left = ret > 0 ? ret : 0;
|
||||
while (left > 0) {
|
||||
size_t len, copied;
|
||||
page_off = off & ~PAGE_MASK;
|
||||
len = min_t(size_t, left, PAGE_SIZE - page_off);
|
||||
SetPageUptodate(pages[idx]);
|
||||
copied = copy_page_to_iter(pages[idx++],
|
||||
page_off, len, to);
|
||||
off += copied;
|
||||
left -= copied;
|
||||
if (copied < len) {
|
||||
ret = -EFAULT;
|
||||
break;
|
||||
}
|
||||
ceph_put_page_vector(pages, num_pages, false);
|
||||
} else {
|
||||
int idx = 0;
|
||||
size_t left = ret > 0 ? ret : 0;
|
||||
while (left > 0) {
|
||||
size_t len, copied;
|
||||
page_off = off & ~PAGE_MASK;
|
||||
len = min_t(size_t, left, PAGE_SIZE - page_off);
|
||||
copied = copy_page_to_iter(pages[idx++],
|
||||
page_off, len, to);
|
||||
off += copied;
|
||||
left -= copied;
|
||||
if (copied < len) {
|
||||
ret = -EFAULT;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ceph_release_page_vector(pages, num_pages);
|
||||
}
|
||||
ceph_release_page_vector(pages, num_pages);
|
||||
|
||||
if (ret < 0) {
|
||||
if (ret == -EBLACKLISTED)
|
||||
fsc->blacklisted = true;
|
||||
if (ret == -EBLOCKLISTED)
|
||||
fsc->blocklisted = true;
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1052,8 +1026,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
|
|||
struct inode *inode = req->r_inode;
|
||||
struct ceph_aio_request *aio_req = req->r_priv;
|
||||
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
|
||||
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
||||
struct ceph_client_metric *metric = &fsc->mdsc->metric;
|
||||
struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
|
||||
|
||||
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
|
||||
BUG_ON(!osd_data->num_bvecs);
|
||||
|
|
|
@ -42,10 +42,13 @@ static void ceph_inode_work(struct work_struct *work);
|
|||
static int ceph_set_ino_cb(struct inode *inode, void *data)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||
|
||||
ci->i_vino = *(struct ceph_vino *)data;
|
||||
inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
|
||||
inode_set_iversion_raw(inode, 0);
|
||||
percpu_counter_inc(&mdsc->metric.total_inodes);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -538,11 +541,14 @@ void ceph_free_inode(struct inode *inode)
|
|||
void ceph_evict_inode(struct inode *inode)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||
struct ceph_inode_frag *frag;
|
||||
struct rb_node *n;
|
||||
|
||||
dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
|
||||
|
||||
percpu_counter_dec(&mdsc->metric.total_inodes);
|
||||
|
||||
truncate_inode_pages_final(&inode->i_data);
|
||||
clear_inode(inode);
|
||||
|
||||
|
@ -558,8 +564,6 @@ void ceph_evict_inode(struct inode *inode)
|
|||
* caps in i_snap_caps.
|
||||
*/
|
||||
if (ci->i_snap_realm) {
|
||||
struct ceph_mds_client *mdsc =
|
||||
ceph_inode_to_client(inode)->mdsc;
|
||||
if (ceph_snap(inode) == CEPH_NOSNAP) {
|
||||
struct ceph_snap_realm *realm = ci->i_snap_realm;
|
||||
dout(" dropping residual ref to snap realm %p\n",
|
||||
|
@ -739,7 +743,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
|
|||
struct ceph_mds_session *session, int cap_fmode,
|
||||
struct ceph_cap_reservation *caps_reservation)
|
||||
{
|
||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||
struct ceph_mds_reply_inode *info = iinfo->in;
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
int issued, new_issued, info_caps;
|
||||
|
|
|
@ -63,7 +63,7 @@ static const struct file_lock_operations ceph_fl_lock_ops = {
|
|||
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
|
||||
int cmd, u8 wait, struct file_lock *fl)
|
||||
{
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||
struct ceph_mds_request *req;
|
||||
int err;
|
||||
u64 length = 0;
|
||||
|
|
|
@ -3303,7 +3303,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
|
|||
}
|
||||
|
||||
static int __decode_session_metadata(void **p, void *end,
|
||||
bool *blacklisted)
|
||||
bool *blocklisted)
|
||||
{
|
||||
/* map<string,string> */
|
||||
u32 n;
|
||||
|
@ -3317,8 +3317,12 @@ static int __decode_session_metadata(void **p, void *end,
|
|||
*p += len;
|
||||
ceph_decode_32_safe(p, end, len, bad);
|
||||
ceph_decode_need(p, end, len, bad);
|
||||
/*
|
||||
* Match "blocklisted (blacklisted)" from newer MDSes,
|
||||
* or "blacklisted" from older MDSes.
|
||||
*/
|
||||
if (err_str && strnstr(*p, "blacklisted", len))
|
||||
*blacklisted = true;
|
||||
*blocklisted = true;
|
||||
*p += len;
|
||||
}
|
||||
return 0;
|
||||
|
@ -3341,7 +3345,7 @@ static void handle_session(struct ceph_mds_session *session,
|
|||
u32 op;
|
||||
u64 seq, features = 0;
|
||||
int wake = 0;
|
||||
bool blacklisted = false;
|
||||
bool blocklisted = false;
|
||||
|
||||
/* decode */
|
||||
ceph_decode_need(&p, end, sizeof(*h), bad);
|
||||
|
@ -3354,7 +3358,7 @@ static void handle_session(struct ceph_mds_session *session,
|
|||
if (msg_version >= 3) {
|
||||
u32 len;
|
||||
/* version >= 2, metadata */
|
||||
if (__decode_session_metadata(&p, end, &blacklisted) < 0)
|
||||
if (__decode_session_metadata(&p, end, &blocklisted) < 0)
|
||||
goto bad;
|
||||
/* version >= 3, feature bits */
|
||||
ceph_decode_32_safe(&p, end, len, bad);
|
||||
|
@ -3445,8 +3449,8 @@ static void handle_session(struct ceph_mds_session *session,
|
|||
session->s_state = CEPH_MDS_SESSION_REJECTED;
|
||||
cleanup_session_requests(mdsc, session);
|
||||
remove_session_caps(session);
|
||||
if (blacklisted)
|
||||
mdsc->fsc->blacklisted = true;
|
||||
if (blocklisted)
|
||||
mdsc->fsc->blocklisted = true;
|
||||
wake = 2; /* for good measure */
|
||||
break;
|
||||
|
||||
|
@ -3612,6 +3616,39 @@ static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
|
|||
return err;
|
||||
}
|
||||
|
||||
static struct dentry* d_find_primary(struct inode *inode)
|
||||
{
|
||||
struct dentry *alias, *dn = NULL;
|
||||
|
||||
if (hlist_empty(&inode->i_dentry))
|
||||
return NULL;
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
if (hlist_empty(&inode->i_dentry))
|
||||
goto out_unlock;
|
||||
|
||||
if (S_ISDIR(inode->i_mode)) {
|
||||
alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
|
||||
if (!IS_ROOT(alias))
|
||||
dn = dget(alias);
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
|
||||
spin_lock(&alias->d_lock);
|
||||
if (!d_unhashed(alias) &&
|
||||
(ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
|
||||
dn = dget_dlock(alias);
|
||||
}
|
||||
spin_unlock(&alias->d_lock);
|
||||
if (dn)
|
||||
break;
|
||||
}
|
||||
out_unlock:
|
||||
spin_unlock(&inode->i_lock);
|
||||
return dn;
|
||||
}
|
||||
|
||||
/*
|
||||
* Encode information about a cap for a reconnect with the MDS.
|
||||
*/
|
||||
|
@ -3625,13 +3662,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
|||
struct ceph_inode_info *ci = cap->ci;
|
||||
struct ceph_reconnect_state *recon_state = arg;
|
||||
struct ceph_pagelist *pagelist = recon_state->pagelist;
|
||||
int err;
|
||||
struct dentry *dentry;
|
||||
char *path;
|
||||
int pathlen, err;
|
||||
u64 pathbase;
|
||||
u64 snap_follows;
|
||||
|
||||
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
|
||||
inode, ceph_vinop(inode), cap, cap->cap_id,
|
||||
ceph_cap_string(cap->issued));
|
||||
|
||||
dentry = d_find_primary(inode);
|
||||
if (dentry) {
|
||||
/* set pathbase to parent dir when msg_version >= 2 */
|
||||
path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
|
||||
recon_state->msg_version >= 2);
|
||||
dput(dentry);
|
||||
if (IS_ERR(path)) {
|
||||
err = PTR_ERR(path);
|
||||
goto out_err;
|
||||
}
|
||||
} else {
|
||||
path = NULL;
|
||||
pathlen = 0;
|
||||
pathbase = 0;
|
||||
}
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
cap->seq = 0; /* reset cap seq */
|
||||
cap->issue_seq = 0; /* and issue_seq */
|
||||
|
@ -3652,7 +3708,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
|||
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
|
||||
rec.v2.issued = cpu_to_le32(cap->issued);
|
||||
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
|
||||
rec.v2.pathbase = 0;
|
||||
rec.v2.pathbase = cpu_to_le64(pathbase);
|
||||
rec.v2.flock_len = (__force __le32)
|
||||
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
|
||||
} else {
|
||||
|
@ -3663,7 +3719,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
|||
ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
|
||||
ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
|
||||
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
|
||||
rec.v1.pathbase = 0;
|
||||
rec.v1.pathbase = cpu_to_le64(pathbase);
|
||||
}
|
||||
|
||||
if (list_empty(&ci->i_cap_snaps)) {
|
||||
|
@ -3725,7 +3781,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
|||
sizeof(struct ceph_filelock);
|
||||
rec.v2.flock_len = cpu_to_le32(struct_len);
|
||||
|
||||
struct_len += sizeof(u32) + sizeof(rec.v2);
|
||||
struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
|
||||
|
||||
if (struct_v >= 2)
|
||||
struct_len += sizeof(u64); /* snap_follows */
|
||||
|
@ -3749,7 +3805,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
|||
ceph_pagelist_encode_8(pagelist, 1);
|
||||
ceph_pagelist_encode_32(pagelist, struct_len);
|
||||
}
|
||||
ceph_pagelist_encode_string(pagelist, NULL, 0);
|
||||
ceph_pagelist_encode_string(pagelist, path, pathlen);
|
||||
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
|
||||
ceph_locks_to_pagelist(flocks, pagelist,
|
||||
num_fcntl_locks, num_flock_locks);
|
||||
|
@ -3758,39 +3814,20 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
|||
out_freeflocks:
|
||||
kfree(flocks);
|
||||
} else {
|
||||
u64 pathbase = 0;
|
||||
int pathlen = 0;
|
||||
char *path = NULL;
|
||||
struct dentry *dentry;
|
||||
|
||||
dentry = d_find_alias(inode);
|
||||
if (dentry) {
|
||||
path = ceph_mdsc_build_path(dentry,
|
||||
&pathlen, &pathbase, 0);
|
||||
dput(dentry);
|
||||
if (IS_ERR(path)) {
|
||||
err = PTR_ERR(path);
|
||||
goto out_err;
|
||||
}
|
||||
rec.v1.pathbase = cpu_to_le64(pathbase);
|
||||
}
|
||||
|
||||
err = ceph_pagelist_reserve(pagelist,
|
||||
sizeof(u64) + sizeof(u32) +
|
||||
pathlen + sizeof(rec.v1));
|
||||
if (err) {
|
||||
goto out_freepath;
|
||||
}
|
||||
if (err)
|
||||
goto out_err;
|
||||
|
||||
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
|
||||
ceph_pagelist_encode_string(pagelist, path, pathlen);
|
||||
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
|
||||
out_freepath:
|
||||
ceph_mdsc_free_path(path, pathlen);
|
||||
}
|
||||
|
||||
out_err:
|
||||
if (err >= 0)
|
||||
ceph_mdsc_free_path(path, pathlen);
|
||||
if (!err)
|
||||
recon_state->nr_caps++;
|
||||
return err;
|
||||
}
|
||||
|
@ -4334,14 +4371,14 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
|
|||
if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
|
||||
return;
|
||||
|
||||
if (!READ_ONCE(fsc->blacklisted))
|
||||
if (!READ_ONCE(fsc->blocklisted))
|
||||
return;
|
||||
|
||||
if (fsc->last_auto_reconnect &&
|
||||
time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
|
||||
return;
|
||||
|
||||
pr_info("auto reconnect after blacklisted\n");
|
||||
pr_info("auto reconnect after blocklisted\n");
|
||||
fsc->last_auto_reconnect = jiffies;
|
||||
ceph_force_reconnect(fsc->sb);
|
||||
}
|
||||
|
|
|
@ -393,7 +393,7 @@ struct ceph_mds_client {
|
|||
|
||||
struct ceph_mds_session **sessions; /* NULL for mds if no session */
|
||||
atomic_t num_sessions;
|
||||
int max_sessions; /* len of s_mds_sessions */
|
||||
int max_sessions; /* len of sessions array */
|
||||
int stopping; /* true if shutting down */
|
||||
|
||||
atomic64_t quotarealms_count; /* # realms with quota */
|
||||
|
|
|
@ -192,11 +192,23 @@ int ceph_metric_init(struct ceph_client_metric *m)
|
|||
m->total_metadatas = 0;
|
||||
m->metadata_latency_sum = 0;
|
||||
|
||||
atomic64_set(&m->opened_files, 0);
|
||||
ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL);
|
||||
if (ret)
|
||||
goto err_opened_inodes;
|
||||
ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL);
|
||||
if (ret)
|
||||
goto err_total_inodes;
|
||||
|
||||
m->session = NULL;
|
||||
INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
|
||||
|
||||
return 0;
|
||||
|
||||
err_total_inodes:
|
||||
percpu_counter_destroy(&m->opened_inodes);
|
||||
err_opened_inodes:
|
||||
percpu_counter_destroy(&m->i_caps_mis);
|
||||
err_i_caps_mis:
|
||||
percpu_counter_destroy(&m->i_caps_hit);
|
||||
err_i_caps_hit:
|
||||
|
@ -212,6 +224,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
|
|||
if (!m)
|
||||
return;
|
||||
|
||||
percpu_counter_destroy(&m->total_inodes);
|
||||
percpu_counter_destroy(&m->opened_inodes);
|
||||
percpu_counter_destroy(&m->i_caps_mis);
|
||||
percpu_counter_destroy(&m->i_caps_hit);
|
||||
percpu_counter_destroy(&m->d_lease_mis);
|
||||
|
|
|
@ -115,6 +115,13 @@ struct ceph_client_metric {
|
|||
ktime_t metadata_latency_min;
|
||||
ktime_t metadata_latency_max;
|
||||
|
||||
/* The total number of directories and files that are opened */
|
||||
atomic64_t opened_files;
|
||||
|
||||
/* The total number of inodes that have opened files or directories */
|
||||
struct percpu_counter opened_inodes;
|
||||
struct percpu_counter total_inodes;
|
||||
|
||||
struct ceph_mds_session *session;
|
||||
struct delayed_work delayed_work; /* delayed work */
|
||||
};
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
|
||||
{
|
||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||
if (inc)
|
||||
atomic64_inc(&mdsc->quotarealms_count);
|
||||
else
|
||||
|
@ -21,8 +21,8 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
|
|||
|
||||
static inline bool ceph_has_realms_with_quotas(struct inode *inode)
|
||||
{
|
||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
||||
struct super_block *sb = mdsc->fsc->sb;
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
|
||||
struct inode *root = d_inode(sb->s_root);
|
||||
|
||||
if (atomic64_read(&mdsc->quotarealms_count) > 0)
|
||||
|
@ -266,7 +266,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
|
|||
|
||||
static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
|
||||
{
|
||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
|
||||
struct ceph_snap_realm *old_realm, *new_realm;
|
||||
bool is_same;
|
||||
|
||||
|
@ -313,7 +313,7 @@ enum quota_check_op {
|
|||
static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
|
||||
loff_t delta)
|
||||
{
|
||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||
struct ceph_inode_info *ci;
|
||||
struct ceph_snap_realm *realm, *next;
|
||||
struct inode *in;
|
||||
|
|
|
@ -602,7 +602,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
|
|||
struct ceph_cap_snap *capsnap)
|
||||
{
|
||||
struct inode *inode = &ci->vfs_inode;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||
|
||||
BUG_ON(capsnap->writing);
|
||||
capsnap->size = inode->i_size;
|
||||
|
|
|
@ -1205,14 +1205,13 @@ static int ceph_init_fs_context(struct fs_context *fc)
|
|||
static void ceph_kill_sb(struct super_block *s)
|
||||
{
|
||||
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
|
||||
dev_t dev = s->s_dev;
|
||||
|
||||
dout("kill_sb %p\n", s);
|
||||
|
||||
ceph_mdsc_pre_umount(fsc->mdsc);
|
||||
flush_fs_workqueues(fsc);
|
||||
|
||||
generic_shutdown_super(s);
|
||||
kill_anon_super(s);
|
||||
|
||||
fsc->client->extra_mon_dispatch = NULL;
|
||||
ceph_fs_debugfs_cleanup(fsc);
|
||||
|
@ -1220,7 +1219,6 @@ static void ceph_kill_sb(struct super_block *s)
|
|||
ceph_fscache_unregister_fs(fsc);
|
||||
|
||||
destroy_fs_client(fsc);
|
||||
free_anon_bdev(dev);
|
||||
}
|
||||
|
||||
static struct file_system_type ceph_fs_type = {
|
||||
|
@ -1243,13 +1241,13 @@ int ceph_force_reconnect(struct super_block *sb)
|
|||
* see remove_session_caps_cb() */
|
||||
flush_workqueue(fsc->inode_wq);
|
||||
|
||||
/* In case that we were blacklisted. This also reset
|
||||
/* In case that we were blocklisted. This also reset
|
||||
* all mon/osd connections */
|
||||
ceph_reset_client_addr(fsc->client);
|
||||
|
||||
ceph_osdc_clear_abort_err(&fsc->client->osdc);
|
||||
|
||||
fsc->blacklisted = false;
|
||||
fsc->blocklisted = false;
|
||||
fsc->mount_state = CEPH_MOUNT_MOUNTED;
|
||||
|
||||
if (sb->s_root) {
|
||||
|
|
|
@ -32,7 +32,7 @@
|
|||
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
|
||||
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
|
||||
|
||||
#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */
|
||||
#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */
|
||||
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
|
||||
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
|
||||
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
|
||||
|
@ -109,7 +109,7 @@ struct ceph_fs_client {
|
|||
unsigned long mount_state;
|
||||
|
||||
unsigned long last_auto_reconnect;
|
||||
bool blacklisted;
|
||||
bool blocklisted;
|
||||
|
||||
bool have_copy_from2;
|
||||
|
||||
|
@ -160,7 +160,8 @@ struct ceph_cap {
|
|||
int issued; /* latest, from the mds */
|
||||
int implemented; /* implemented superset of
|
||||
issued (for revocation) */
|
||||
int mds, mds_wanted;
|
||||
int mds; /* mds index for this cap */
|
||||
int mds_wanted; /* caps wanted from this mds */
|
||||
};
|
||||
/* caps to release */
|
||||
struct {
|
||||
|
@ -451,6 +452,12 @@ ceph_sb_to_client(const struct super_block *sb)
|
|||
return (struct ceph_fs_client *)sb->s_fs_info;
|
||||
}
|
||||
|
||||
static inline struct ceph_mds_client *
|
||||
ceph_sb_to_mdsc(const struct super_block *sb)
|
||||
{
|
||||
return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc;
|
||||
}
|
||||
|
||||
static inline struct ceph_vino
|
||||
ceph_vino(const struct inode *inode)
|
||||
{
|
||||
|
|
|
@ -116,7 +116,8 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
|
|||
* NULL terminates however, so call it on a temporary buffer and then memcpy
|
||||
* the result into place.
|
||||
*/
|
||||
static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
|
||||
static __printf(3, 4)
|
||||
int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
|
||||
{
|
||||
int ret;
|
||||
va_list args;
|
||||
|
|
|
@ -54,7 +54,7 @@ struct ceph_connection_operations {
|
|||
int (*check_message_signature) (struct ceph_msg *msg);
|
||||
};
|
||||
|
||||
/* use format string %s%d */
|
||||
/* use format string %s%lld */
|
||||
#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
|
||||
|
||||
struct ceph_messenger {
|
||||
|
|
|
@ -142,7 +142,7 @@ int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
|
|||
int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
|
||||
ceph_monc_callback_t cb, u64 private_data);
|
||||
|
||||
int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
|
||||
int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
|
||||
struct ceph_entity_addr *client_addr);
|
||||
|
||||
extern int ceph_monc_open_session(struct ceph_mon_client *monc);
|
||||
|
|
|
@ -137,6 +137,17 @@ int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
|
|||
const char *fmt, ...);
|
||||
void ceph_oid_destroy(struct ceph_object_id *oid);
|
||||
|
||||
struct workspace_manager {
|
||||
struct list_head idle_ws;
|
||||
spinlock_t ws_lock;
|
||||
/* Number of free workspaces */
|
||||
int free_ws;
|
||||
/* Total number of allocated workspaces */
|
||||
atomic_t total_ws;
|
||||
/* Waiters for a free workspace */
|
||||
wait_queue_head_t ws_wait;
|
||||
};
|
||||
|
||||
struct ceph_pg_mapping {
|
||||
struct rb_node node;
|
||||
struct ceph_pg pgid;
|
||||
|
@ -184,8 +195,7 @@ struct ceph_osdmap {
|
|||
* the list of osds that store+replicate them. */
|
||||
struct crush_map *crush;
|
||||
|
||||
struct mutex crush_workspace_mutex;
|
||||
void *crush_workspace;
|
||||
struct workspace_manager crush_wsm;
|
||||
};
|
||||
|
||||
static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
|
||||
|
|
|
@ -424,7 +424,7 @@ enum {
|
|||
};
|
||||
|
||||
#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
|
||||
#define EBLACKLISTED ESHUTDOWN /* blacklisted */
|
||||
#define EBLOCKLISTED ESHUTDOWN /* blocklisted */
|
||||
|
||||
/* xattr comparison */
|
||||
enum {
|
||||
|
|
|
@ -346,6 +346,9 @@ struct crush_work_bucket {
|
|||
|
||||
struct crush_work {
|
||||
struct crush_work_bucket **work; /* Per-bucket working store */
|
||||
#ifdef __KERNEL__
|
||||
struct list_head item;
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef __KERNEL__
|
||||
|
|
|
@ -2016,11 +2016,11 @@ static int process_banner(struct ceph_connection *con)
|
|||
sizeof(con->peer_addr)) != 0 &&
|
||||
!(addr_is_blank(&con->actual_peer_addr) &&
|
||||
con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
|
||||
pr_warn("wrong peer, want %s/%d, got %s/%d\n",
|
||||
pr_warn("wrong peer, want %s/%u, got %s/%u\n",
|
||||
ceph_pr_addr(&con->peer_addr),
|
||||
(int)le32_to_cpu(con->peer_addr.nonce),
|
||||
le32_to_cpu(con->peer_addr.nonce),
|
||||
ceph_pr_addr(&con->actual_peer_addr),
|
||||
(int)le32_to_cpu(con->actual_peer_addr.nonce));
|
||||
le32_to_cpu(con->actual_peer_addr.nonce));
|
||||
con->error_msg = "wrong peer at address";
|
||||
return -1;
|
||||
}
|
||||
|
@ -2811,13 +2811,13 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
|
|||
return -ENOENT;
|
||||
}
|
||||
|
||||
dout("%s %p %lu\n", __func__, con, delay);
|
||||
if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
|
||||
dout("%s %p - already queued\n", __func__, con);
|
||||
con->ops->put(con);
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
dout("%s %p %lu\n", __func__, con, delay);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -2998,6 +2998,11 @@ static void con_fault(struct ceph_connection *con)
|
|||
ceph_msg_put(con->in_msg);
|
||||
con->in_msg = NULL;
|
||||
}
|
||||
if (con->out_msg) {
|
||||
BUG_ON(con->out_msg->con != con);
|
||||
ceph_msg_put(con->out_msg);
|
||||
con->out_msg = NULL;
|
||||
}
|
||||
|
||||
/* Requeue anything that hasn't been acked */
|
||||
list_splice_init(&con->out_sent, &con->out_queue);
|
||||
|
|
|
@ -896,8 +896,9 @@ static void handle_command_ack(struct ceph_mon_client *monc,
|
|||
ceph_msg_dump(msg);
|
||||
}
|
||||
|
||||
int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
|
||||
struct ceph_entity_addr *client_addr)
|
||||
static __printf(2, 0)
|
||||
int do_mon_command_vargs(struct ceph_mon_client *monc, const char *fmt,
|
||||
va_list ap)
|
||||
{
|
||||
struct ceph_mon_generic_request *req;
|
||||
struct ceph_mon_command *h;
|
||||
|
@ -925,29 +926,65 @@ int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
|
|||
h->monhdr.session_mon_tid = 0;
|
||||
h->fsid = monc->monmap->fsid;
|
||||
h->num_strs = cpu_to_le32(1);
|
||||
len = sprintf(h->str, "{ \"prefix\": \"osd blacklist\", \
|
||||
\"blacklistop\": \"add\", \
|
||||
\"addr\": \"%pISpc/%u\" }",
|
||||
&client_addr->in_addr, le32_to_cpu(client_addr->nonce));
|
||||
len = vsprintf(h->str, fmt, ap);
|
||||
h->str_len = cpu_to_le32(len);
|
||||
send_generic_request(monc, req);
|
||||
mutex_unlock(&monc->mutex);
|
||||
|
||||
ret = wait_generic_request(req);
|
||||
if (!ret)
|
||||
/*
|
||||
* Make sure we have the osdmap that includes the blacklist
|
||||
* entry. This is needed to ensure that the OSDs pick up the
|
||||
* new blacklist before processing any future requests from
|
||||
* this client.
|
||||
*/
|
||||
ret = ceph_wait_for_latest_osdmap(monc->client, 0);
|
||||
|
||||
out:
|
||||
put_generic_request(req);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(ceph_monc_blacklist_add);
|
||||
|
||||
static __printf(2, 3)
|
||||
int do_mon_command(struct ceph_mon_client *monc, const char *fmt, ...)
|
||||
{
|
||||
va_list ap;
|
||||
int ret;
|
||||
|
||||
va_start(ap, fmt);
|
||||
ret = do_mon_command_vargs(monc, fmt, ap);
|
||||
va_end(ap);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
|
||||
struct ceph_entity_addr *client_addr)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = do_mon_command(monc,
|
||||
"{ \"prefix\": \"osd blocklist\", \
|
||||
\"blocklistop\": \"add\", \
|
||||
\"addr\": \"%pISpc/%u\" }",
|
||||
&client_addr->in_addr,
|
||||
le32_to_cpu(client_addr->nonce));
|
||||
if (ret == -EINVAL) {
|
||||
/*
|
||||
* The monitor returns EINVAL on an unrecognized command.
|
||||
* Try the legacy command -- it is exactly the same except
|
||||
* for the name.
|
||||
*/
|
||||
ret = do_mon_command(monc,
|
||||
"{ \"prefix\": \"osd blacklist\", \
|
||||
\"blacklistop\": \"add\", \
|
||||
\"addr\": \"%pISpc/%u\" }",
|
||||
&client_addr->in_addr,
|
||||
le32_to_cpu(client_addr->nonce));
|
||||
}
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* Make sure we have the osdmap that includes the blocklist
|
||||
* entry. This is needed to ensure that the OSDs pick up the
|
||||
* new blocklist before processing any future requests from
|
||||
* this client.
|
||||
*/
|
||||
return ceph_wait_for_latest_osdmap(monc->client, 0);
|
||||
}
|
||||
EXPORT_SYMBOL(ceph_monc_blocklist_add);
|
||||
|
||||
/*
|
||||
* Resend pending generic requests.
|
||||
|
|
|
@ -964,6 +964,143 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* CRUSH workspaces
|
||||
*
|
||||
* workspace_manager framework borrowed from fs/btrfs/compression.c.
|
||||
* Two simplifications: there is only one type of workspace and there
|
||||
* is always at least one workspace.
|
||||
*/
|
||||
static struct crush_work *alloc_workspace(const struct crush_map *c)
|
||||
{
|
||||
struct crush_work *work;
|
||||
size_t work_size;
|
||||
|
||||
WARN_ON(!c->working_size);
|
||||
work_size = crush_work_size(c, CEPH_PG_MAX_SIZE);
|
||||
dout("%s work_size %zu bytes\n", __func__, work_size);
|
||||
|
||||
work = ceph_kvmalloc(work_size, GFP_NOIO);
|
||||
if (!work)
|
||||
return NULL;
|
||||
|
||||
INIT_LIST_HEAD(&work->item);
|
||||
crush_init_workspace(c, work);
|
||||
return work;
|
||||
}
|
||||
|
||||
static void free_workspace(struct crush_work *work)
|
||||
{
|
||||
WARN_ON(!list_empty(&work->item));
|
||||
kvfree(work);
|
||||
}
|
||||
|
||||
static void init_workspace_manager(struct workspace_manager *wsm)
|
||||
{
|
||||
INIT_LIST_HEAD(&wsm->idle_ws);
|
||||
spin_lock_init(&wsm->ws_lock);
|
||||
atomic_set(&wsm->total_ws, 0);
|
||||
wsm->free_ws = 0;
|
||||
init_waitqueue_head(&wsm->ws_wait);
|
||||
}
|
||||
|
||||
static void add_initial_workspace(struct workspace_manager *wsm,
|
||||
struct crush_work *work)
|
||||
{
|
||||
WARN_ON(!list_empty(&wsm->idle_ws));
|
||||
|
||||
list_add(&work->item, &wsm->idle_ws);
|
||||
atomic_set(&wsm->total_ws, 1);
|
||||
wsm->free_ws = 1;
|
||||
}
|
||||
|
||||
static void cleanup_workspace_manager(struct workspace_manager *wsm)
|
||||
{
|
||||
struct crush_work *work;
|
||||
|
||||
while (!list_empty(&wsm->idle_ws)) {
|
||||
work = list_first_entry(&wsm->idle_ws, struct crush_work,
|
||||
item);
|
||||
list_del_init(&work->item);
|
||||
free_workspace(work);
|
||||
}
|
||||
atomic_set(&wsm->total_ws, 0);
|
||||
wsm->free_ws = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Finds an available workspace or allocates a new one. If it's not
|
||||
* possible to allocate a new one, waits until there is one.
|
||||
*/
|
||||
static struct crush_work *get_workspace(struct workspace_manager *wsm,
|
||||
const struct crush_map *c)
|
||||
{
|
||||
struct crush_work *work;
|
||||
int cpus = num_online_cpus();
|
||||
|
||||
again:
|
||||
spin_lock(&wsm->ws_lock);
|
||||
if (!list_empty(&wsm->idle_ws)) {
|
||||
work = list_first_entry(&wsm->idle_ws, struct crush_work,
|
||||
item);
|
||||
list_del_init(&work->item);
|
||||
wsm->free_ws--;
|
||||
spin_unlock(&wsm->ws_lock);
|
||||
return work;
|
||||
|
||||
}
|
||||
if (atomic_read(&wsm->total_ws) > cpus) {
|
||||
DEFINE_WAIT(wait);
|
||||
|
||||
spin_unlock(&wsm->ws_lock);
|
||||
prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE);
|
||||
if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws)
|
||||
schedule();
|
||||
finish_wait(&wsm->ws_wait, &wait);
|
||||
goto again;
|
||||
}
|
||||
atomic_inc(&wsm->total_ws);
|
||||
spin_unlock(&wsm->ws_lock);
|
||||
|
||||
work = alloc_workspace(c);
|
||||
if (!work) {
|
||||
atomic_dec(&wsm->total_ws);
|
||||
wake_up(&wsm->ws_wait);
|
||||
|
||||
/*
|
||||
* Do not return the error but go back to waiting. We
|
||||
* have the inital workspace and the CRUSH computation
|
||||
* time is bounded so we will get it eventually.
|
||||
*/
|
||||
WARN_ON(atomic_read(&wsm->total_ws) < 1);
|
||||
goto again;
|
||||
}
|
||||
return work;
|
||||
}
|
||||
|
||||
/*
|
||||
* Puts a workspace back on the list or frees it if we have enough
|
||||
* idle ones sitting around.
|
||||
*/
|
||||
static void put_workspace(struct workspace_manager *wsm,
|
||||
struct crush_work *work)
|
||||
{
|
||||
spin_lock(&wsm->ws_lock);
|
||||
if (wsm->free_ws <= num_online_cpus()) {
|
||||
list_add(&work->item, &wsm->idle_ws);
|
||||
wsm->free_ws++;
|
||||
spin_unlock(&wsm->ws_lock);
|
||||
goto wake;
|
||||
}
|
||||
spin_unlock(&wsm->ws_lock);
|
||||
|
||||
free_workspace(work);
|
||||
atomic_dec(&wsm->total_ws);
|
||||
wake:
|
||||
if (wq_has_sleeper(&wsm->ws_wait))
|
||||
wake_up(&wsm->ws_wait);
|
||||
}
|
||||
|
||||
/*
|
||||
* osd map
|
||||
*/
|
||||
|
@ -981,7 +1118,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
|
|||
map->primary_temp = RB_ROOT;
|
||||
map->pg_upmap = RB_ROOT;
|
||||
map->pg_upmap_items = RB_ROOT;
|
||||
mutex_init(&map->crush_workspace_mutex);
|
||||
|
||||
init_workspace_manager(&map->crush_wsm);
|
||||
|
||||
return map;
|
||||
}
|
||||
|
@ -989,8 +1127,11 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
|
|||
void ceph_osdmap_destroy(struct ceph_osdmap *map)
|
||||
{
|
||||
dout("osdmap_destroy %p\n", map);
|
||||
|
||||
if (map->crush)
|
||||
crush_destroy(map->crush);
|
||||
cleanup_workspace_manager(&map->crush_wsm);
|
||||
|
||||
while (!RB_EMPTY_ROOT(&map->pg_temp)) {
|
||||
struct ceph_pg_mapping *pg =
|
||||
rb_entry(rb_first(&map->pg_temp),
|
||||
|
@ -1029,7 +1170,6 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
|
|||
kvfree(map->osd_weight);
|
||||
kvfree(map->osd_addr);
|
||||
kvfree(map->osd_primary_affinity);
|
||||
kvfree(map->crush_workspace);
|
||||
kfree(map);
|
||||
}
|
||||
|
||||
|
@ -1104,26 +1244,22 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
|
|||
|
||||
static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
|
||||
{
|
||||
void *workspace;
|
||||
size_t work_size;
|
||||
struct crush_work *work;
|
||||
|
||||
if (IS_ERR(crush))
|
||||
return PTR_ERR(crush);
|
||||
|
||||
work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
|
||||
dout("%s work_size %zu bytes\n", __func__, work_size);
|
||||
workspace = ceph_kvmalloc(work_size, GFP_NOIO);
|
||||
if (!workspace) {
|
||||
work = alloc_workspace(crush);
|
||||
if (!work) {
|
||||
crush_destroy(crush);
|
||||
return -ENOMEM;
|
||||
}
|
||||
crush_init_workspace(crush, workspace);
|
||||
|
||||
if (map->crush)
|
||||
crush_destroy(map->crush);
|
||||
kvfree(map->crush_workspace);
|
||||
cleanup_workspace_manager(&map->crush_wsm);
|
||||
map->crush = crush;
|
||||
map->crush_workspace = workspace;
|
||||
add_initial_workspace(&map->crush_wsm, work);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -2322,6 +2458,7 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
|
|||
s64 choose_args_index)
|
||||
{
|
||||
struct crush_choose_arg_map *arg_map;
|
||||
struct crush_work *work;
|
||||
int r;
|
||||
|
||||
BUG_ON(result_max > CEPH_PG_MAX_SIZE);
|
||||
|
@ -2332,12 +2469,11 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
|
|||
arg_map = lookup_choose_arg_map(&map->crush->choose_args,
|
||||
CEPH_DEFAULT_CHOOSE_ARGS);
|
||||
|
||||
mutex_lock(&map->crush_workspace_mutex);
|
||||
work = get_workspace(&map->crush_wsm, map->crush);
|
||||
r = crush_do_rule(map->crush, ruleno, x, result, result_max,
|
||||
weight, weight_max, map->crush_workspace,
|
||||
weight, weight_max, work,
|
||||
arg_map ? arg_map->args : NULL);
|
||||
mutex_unlock(&map->crush_workspace_mutex);
|
||||
|
||||
put_workspace(&map->crush_wsm, work);
|
||||
return r;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user