gfs2: iomap buffered write support

With the traditional page-based writes, blocks are allocated separately
for each page written to.  With iomap writes, we can allocate a lot more
blocks at once, with a fraction of the allocation overhead for each
page.

Split calculating the number of blocks that can be allocated at a given
position (gfs2_alloc_size) off from gfs2_iomap_alloc: that size
determines the number of blocks to allocate and reserve in the journal.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Reviewed-by: Bob Peterson <rpeterso@redhat.com>
This commit is contained in:
Andreas Gruenbacher 2018-06-24 15:04:04 +01:00
parent d505a96a3b
commit 64bc06bb32
4 changed files with 338 additions and 43 deletions

View File

@ -22,6 +22,7 @@
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <trace/events/writeback.h>
#include <linux/sched/signal.h>
#include "gfs2.h"
#include "incore.h"
@ -36,9 +37,10 @@
#include "super.h"
#include "util.h"
#include "glops.h"
#include "aops.h"
static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
unsigned int from, unsigned int len)
{
struct buffer_head *head = page_buffers(page);
@ -462,7 +464,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
* Returns: errno
*/
static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
{
struct buffer_head *dibh;
u64 dsize = i_size_read(&ip->i_inode);
@ -776,7 +778,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
* adjust_fs_space - Adjusts the free space available due to gfs2_grow
* @inode: the rindex inode
*/
static void adjust_fs_space(struct inode *inode)
void adjust_fs_space(struct inode *inode)
{
struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@ -822,9 +824,9 @@ static void adjust_fs_space(struct inode *inode)
* This copies the data from the page into the inode block after
* the inode data structure itself.
*
* Returns: errno
* Returns: copied bytes or errno
*/
static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
loff_t pos, unsigned copied,
struct page *page)
{
@ -865,7 +867,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
* The main write_end function for GFS2. We just put our locking around the VFS
* provided functions.
*
* Returns: errno
* Returns: copied bytes or errno
*/
static int gfs2_write_end(struct file *file, struct address_space *mapping,

19
fs/gfs2/aops.h Normal file
View File

@ -0,0 +1,19 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2018 Red Hat, Inc. All rights reserved.
*/
/*
 * Declarations of address-space helpers that are shared between GFS2
 * source files.
 */
#ifndef __AOPS_DOT_H__
#define __AOPS_DOT_H__
#include "incore.h"
/*
 * Page read helper for @ip; returns an errno value.
 * NOTE(review): presumably fills @page from the inline (stuffed) data of
 * the dinode -- confirm against the definition.
 */
extern int stuffed_readpage(struct gfs2_inode *ip, struct page *page);
/*
 * Copies the data from @page into the inode block (@dibh) after the inode
 * data structure itself.  Returns the number of copied bytes or an errno.
 */
extern int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
loff_t pos, unsigned copied,
struct page *page);
/*
 * Adjusts the free space available due to gfs2_grow; @inode is the rindex
 * inode.
 */
extern void adjust_fs_space(struct inode *inode);
/*
 * Called for journaled-data writes with the offset within @page (@from)
 * and the number of bytes (@len).
 * NOTE(review): presumably adds the buffers covering that range to the
 * current transaction -- confirm against the definition.
 */
extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
unsigned int from, unsigned int len);
#endif /* __AOPS_DOT_H__ */

View File

@ -28,6 +28,7 @@
#include "trans.h"
#include "dir.h"
#include "util.h"
#include "aops.h"
#include "trace_gfs2.h"
/* This doesn't need to be that large as max 64 bit pointers in a 4k
@ -41,6 +42,8 @@ struct metapath {
int mp_aheight; /* actual height (lookup height) */
};
static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
/**
* gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
* @ip: the inode
@ -389,7 +392,7 @@ static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
return mp->mp_aheight - x - 1;
}
static inline void release_metapath(struct metapath *mp)
static void release_metapath(struct metapath *mp)
{
int i;
@ -397,6 +400,7 @@ static inline void release_metapath(struct metapath *mp)
if (mp->mp_bh[i] == NULL)
break;
brelse(mp->mp_bh[i]);
mp->mp_bh[i] = NULL;
}
}
@ -609,11 +613,13 @@ enum alloc_state {
* ii) Indirect blocks to fill in lower part of the metadata tree
* iii) Data blocks
*
* The function is in two parts. The first part works out the total
* number of blocks which we need. The second part does the actual
* allocation asking for an extent at a time (if enough contiguous free
* blocks are available, there will only be one request per bmap call)
* and uses the state machine to initialise the blocks in order.
* This function is called after gfs2_iomap_get, which works out the
* total number of blocks which we need via gfs2_alloc_size.
*
* We then do the actual allocation, asking for an extent at a time (if
* enough contiguous free blocks are available, there will only be one
* allocation request per call), and use the state machine to initialise
* the blocks in order.
*
* Right now, this function will allocate at most one indirect block
* worth of data -- with a default block size of 4K, that's slightly
@ -633,39 +639,26 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
struct buffer_head *dibh = mp->mp_bh[0];
u64 bn;
unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
unsigned dblks = 0;
unsigned ptrs_per_blk;
size_t dblks = iomap->length >> inode->i_blkbits;
const unsigned end_of_metadata = mp->mp_fheight - 1;
int ret;
enum alloc_state state;
__be64 *ptr;
__be64 zero_bn = 0;
size_t maxlen = iomap->length >> inode->i_blkbits;
BUG_ON(mp->mp_aheight < 1);
BUG_ON(dibh == NULL);
BUG_ON(dblks < 1);
gfs2_trans_add_meta(ip->i_gl, dibh);
down_write(&ip->i_rw_mutex);
if (mp->mp_fheight == mp->mp_aheight) {
struct buffer_head *bh;
int eob;
/* Bottom indirect block exists, find unalloced extent size */
ptr = metapointer(end_of_metadata, mp);
bh = mp->mp_bh[end_of_metadata];
dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr,
maxlen, &eob);
BUG_ON(dblks < 1);
/* Bottom indirect block exists */
state = ALLOC_DATA;
} else {
/* Need to allocate indirect blocks */
ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs :
sdp->sd_diptrs;
dblks = min(maxlen, (size_t)(ptrs_per_blk -
mp->mp_list[end_of_metadata]));
if (mp->mp_fheight == ip->i_height) {
/* Writing into existing tree, extend tree down */
iblks = mp->mp_fheight - mp->mp_aheight;
@ -762,6 +755,50 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
/**
 * gfs2_alloc_size - Compute the maximum allocation size
 * @inode: The inode
 * @mp: The metapath
 * @size: Requested size in blocks
 *
 * Work out how many blocks the next allocation at @mp may cover: never more
 * than @size, and never past the end of the pointer block @mp points into.
 *
 * Returns: size in blocks
 */
static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	const __be64 *start, *limit, *cur;

	/*
	 * For writes to stuffed files, this function is called twice via
	 * gfs2_iomap_get, before and after unstuffing.  The first result
	 * only has to be large enough to get the reservation and allocation
	 * sizes right; the second result must be exact or else
	 * gfs2_iomap_alloc won't do the right thing.
	 */
	if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
		unsigned int room;

		/* Pointers per block: indirect block vs. dinode. */
		if (mp->mp_fheight > 1)
			room = sdp->sd_inptrs;
		else
			room = sdp->sd_diptrs;

		/* Pointers left from the current slot to the end of the block. */
		room -= mp->mp_list[mp->mp_fheight - 1];

		return size > room ? room : size;
	}

	/*
	 * The bottom pointer block exists: count the run of unallocated
	 * (zero) pointers starting at the current slot, capped at @size.
	 */
	start = metapointer(ip->i_height - 1, mp);
	limit = metaend(ip->i_height - 1, mp);
	if (limit - start > size)
		limit = start + size;

	cur = start;
	while (cur < limit && !*cur)
		cur++;

	return cur - start;
}
/**
* gfs2_iomap_get - Map blocks from an inode to disk blocks
* @inode: The inode
@ -797,6 +834,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
ret = gfs2_meta_inode_buffer(ip, &dibh);
if (ret)
goto unlock;
iomap->private = dibh;
if (gfs2_is_stuffed(ip)) {
if (flags & IOMAP_WRITE) {
@ -822,6 +860,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
sizeof(struct gfs2_dinode);
iomap->type = IOMAP_INLINE;
iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
goto out;
}
@ -867,7 +906,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
iomap->bdev = inode->i_sb->s_bdev;
unlock:
up_read(&ip->i_rw_mutex);
if (dibh)
if (ret && dibh)
brelse(dibh);
return ret;
@ -881,13 +920,168 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
else
iomap->length = size - pos;
} else if (!(flags & IOMAP_WRITE)) {
} else if (flags & IOMAP_WRITE) {
u64 alloc_size;
len = gfs2_alloc_size(inode, mp, len);
alloc_size = len << inode->i_blkbits;
if (alloc_size < iomap->length)
iomap->length = alloc_size;
} else {
if (pos < size && height == ip->i_height)
ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
}
goto out;
}
static int gfs2_write_lock(struct inode *inode)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
int error;
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
error = gfs2_glock_nq(&ip->i_gh);
if (error)
goto out_uninit;
if (&ip->i_inode == sdp->sd_rindex) {
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
GL_NOCACHE, &m_ip->i_gh);
if (error)
goto out_unlock;
}
return 0;
out_unlock:
gfs2_glock_dq(&ip->i_gh);
out_uninit:
gfs2_holder_uninit(&ip->i_gh);
return error;
}
static void gfs2_write_unlock(struct inode *inode)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
if (&ip->i_inode == sdp->sd_rindex) {
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
gfs2_glock_dq_uninit(&m_ip->i_gh);
}
gfs2_glock_dq_uninit(&ip->i_gh);
}
/*
 * page_done hook installed by gfs2_iomap_begin_write for journaled-data
 * inodes: hands the written range of @page (offset within the page and
 * number of bytes copied) to gfs2_page_add_databufs.  @iomap is unused.
 */
static void gfs2_iomap_journaled_page_done(struct inode *inode, loff_t pos,
					   unsigned copied, struct page *page,
					   struct iomap *iomap)
{
	unsigned int from = offset_in_page(pos);

	gfs2_page_add_databufs(GFS2_I(inode), page, from, copied);
}
/**
 * gfs2_iomap_begin_write - Prepare a range of a file for a buffered write
 * @inode: The inode
 * @pos: Starting position in bytes
 * @length: Length of the write in bytes
 * @flags: iomap flags, passed on to gfs2_iomap_get and gfs2_iomap_alloc
 * @iomap: The iomap structure to fill in
 *
 * Takes the write glock(s), maps the range, and -- when the range is a hole
 * or the inode has to be unstuffed -- takes the quota lock, reserves blocks,
 * starts a transaction and allocates.
 *
 * On success, the glock(s), the transaction and, if an allocation was
 * required, the quota lock and block reservation remain held, and
 * @iomap->private references the dinode buffer; gfs2_iomap_end releases
 * them.  On failure, everything acquired here is released again.
 *
 * Returns: 0 on success, or errno
 */
static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
				  loff_t length, unsigned flags,
				  struct iomap *iomap)
{
	struct metapath mp = { .mp_aheight = 1, };
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
	bool unstuff, alloc_required;
	int ret;

	ret = gfs2_write_lock(inode);
	if (ret)
		return ret;

	/* A stuffed inode must be unstuffed if the write won't fit inline. */
	unstuff = gfs2_is_stuffed(ip) &&
		  pos + length > gfs2_max_stuffed_size(ip);

	ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
	if (ret) {
		/*
		 * gfs2_iomap_get has already dropped its reference to the
		 * dinode buffer on failure; clear the stale pointer so that
		 * out_release does not release it a second time.
		 */
		iomap->private = NULL;
		goto out_release;
	}

	alloc_required = unstuff || iomap->type == IOMAP_HOLE;

	if (alloc_required || gfs2_is_jdata(ip))
		gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
				       &ind_blocks);

	if (alloc_required) {
		struct gfs2_alloc_parms ap = {
			.target = data_blocks + ind_blocks
		};

		ret = gfs2_quota_lock_check(ip, &ap);
		if (ret)
			goto out_release;

		ret = gfs2_inplace_reserve(ip, &ap);
		if (ret)
			goto out_qunlock;
	}

	/* Work out how many journal blocks the transaction may touch. */
	rblocks = RES_DINODE + ind_blocks;
	if (gfs2_is_jdata(ip))
		rblocks += data_blocks;
	if (ind_blocks || data_blocks)
		rblocks += RES_STATFS + RES_QUOTA;
	if (inode == sdp->sd_rindex)
		rblocks += 2 * RES_STATFS;
	if (alloc_required)
		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);

	ret = gfs2_trans_begin(sdp, rblocks, iomap->length >> inode->i_blkbits);
	if (ret)
		goto out_trans_fail;

	if (unstuff) {
		ret = gfs2_unstuff_dinode(ip, NULL);
		if (ret)
			goto out_trans_end;

		/* The mapping changed: drop the old lookup state and redo it. */
		release_metapath(&mp);
		brelse(iomap->private);
		iomap->private = NULL;
		ret = gfs2_iomap_get(inode, iomap->offset, iomap->length,
				     flags, iomap, &mp);
		if (ret) {
			/* See above: the reference is already gone. */
			iomap->private = NULL;
			goto out_trans_end;
		}
	}

	if (iomap->type == IOMAP_HOLE) {
		ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
		if (ret) {
			/*
			 * End the transaction and release the reservation
			 * before punching out whatever was allocated.
			 */
			gfs2_trans_end(sdp);
			gfs2_inplace_release(ip);
			punch_hole(ip, iomap->offset, iomap->length);
			goto out_qunlock;
		}
	}
	release_metapath(&mp);
	if (gfs2_is_jdata(ip))
		iomap->page_done = gfs2_iomap_journaled_page_done;
	return 0;

out_trans_end:
	gfs2_trans_end(sdp);
out_trans_fail:
	if (alloc_required)
		gfs2_inplace_release(ip);
out_qunlock:
	if (alloc_required)
		gfs2_quota_unlock(ip);
out_release:
	if (iomap->private)
		brelse(iomap->private);
	release_metapath(&mp);
	gfs2_write_unlock(inode);
	return ret;
}
static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, struct iomap *iomap)
{
@ -897,10 +1091,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
trace_gfs2_iomap_start(ip, pos, length, flags);
if (flags & IOMAP_WRITE) {
ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
if (!ret && iomap->type == IOMAP_HOLE)
ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
release_metapath(&mp);
ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap);
} else {
ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
release_metapath(&mp);
@ -909,8 +1100,59 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
return ret;
}
/**
 * gfs2_iomap_end - Finish an iomap operation
 * @inode: The inode
 * @pos: Starting position of the operation in bytes
 * @length: Length of the mapped range in bytes
 * @written: Number of bytes actually written
 * @flags: iomap flags
 * @iomap: The iomap filled in by gfs2_iomap_begin
 *
 * For writes, ends the transaction, releases the block reservation,
 * deallocates newly allocated blocks that ended up unwritten, and drops the
 * quota lock and write glock(s).  For all operations, the dinode buffer in
 * @iomap->private is released if set.
 *
 * Returns: 0
 */
static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
			  ssize_t written, unsigned flags, struct iomap *iomap)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct gfs2_trans *tr = current->journal_info;
	struct buffer_head *dibh = iomap->private;

	if (flags & IOMAP_WRITE) {
		if (iomap->type != IOMAP_INLINE) {
			gfs2_ordered_add_inode(ip);

			if (!tr->tr_num_buf_new)
				gfs2_trans_add_meta(ip->i_gl, dibh);
			else
				__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
		}

		if (inode == sdp->sd_rindex) {
			adjust_fs_space(inode);
			sdp->sd_rindex_uptodate = 0;
		}

		gfs2_trans_end(sdp);
		gfs2_inplace_release(ip);

		if (length != written && (iomap->flags & IOMAP_F_NEW)) {
			/*
			 * Deallocate blocks that were just allocated: only
			 * whole blocks lying entirely beyond the written
			 * data and within the mapped range.
			 */
			const loff_t mask = i_blocksize(inode) - 1;
			loff_t hole_end = (pos + length) & ~mask;
			loff_t hole_start = (pos + written + mask) & ~mask;

			if (hole_start < hole_end) {
				truncate_pagecache_range(inode, hole_start,
							 hole_end - 1);
				punch_hole(ip, hole_start,
					   hole_end - hole_start);
			}
		}

		if (ip->i_qadata && ip->i_qadata->qa_qd_num)
			gfs2_quota_unlock(ip);
		gfs2_write_unlock(inode);
	}

	if (dibh)
		brelse(dibh);
	return 0;
}
/*
 * iomap operations for GFS2: gfs2_iomap_begin maps a file range (and, for
 * writes, prepares it via gfs2_iomap_begin_write); gfs2_iomap_end completes
 * the operation.  Passed to iomap_file_buffered_write for buffered writes.
 */
const struct iomap_ops gfs2_iomap_ops = {
.iomap_begin = gfs2_iomap_begin,
.iomap_end = gfs2_iomap_end,
};
/**

View File

@ -26,10 +26,12 @@
#include <linux/dlm.h>
#include <linux/dlm_plock.h>
#include <linux/delay.h>
#include <linux/backing-dev.h>
#include "gfs2.h"
#include "incore.h"
#include "bmap.h"
#include "aops.h"
#include "dir.h"
#include "glock.h"
#include "glops.h"
@ -691,9 +693,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
/**
* gfs2_file_write_iter - Perform a write to a file
* @iocb: The io context
* @iov: The data to write
* @nr_segs: Number of @iov segments
* @pos: The file position
* @from: The data to write
*
* We have to do a lock/unlock here to refresh the inode size for
* O_APPEND writes, otherwise we can land up writing at the wrong
@ -705,8 +705,9 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct gfs2_inode *ip = GFS2_I(file_inode(file));
int ret;
struct inode *inode = file_inode(file);
struct gfs2_inode *ip = GFS2_I(inode);
ssize_t ret;
ret = gfs2_rsqa_alloc(ip);
if (ret)
@ -723,7 +724,38 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
gfs2_glock_dq_uninit(&gh);
}
if (iocb->ki_flags & IOCB_DIRECT)
return generic_file_write_iter(iocb, from);
inode_lock(inode);
ret = generic_write_checks(iocb, from);
if (ret <= 0)
goto out;
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
ret = file_remove_privs(file);
if (ret)
goto out2;
ret = file_update_time(file);
if (ret)
goto out2;
ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
out2:
current->backing_dev_info = NULL;
out:
inode_unlock(inode);
if (likely(ret > 0)) {
iocb->ki_pos += ret;
/* Handle various SYNC-type writes */
ret = generic_write_sync(iocb, ret);
}
return ret;
}
static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,