kernel_optimize_test/fs/xfs/xfs_fsops.c

655 lines
18 KiB
C
Raw Normal View History

/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_fsops.h"
#include "xfs_itable.h"
#include "xfs_trans_space.h"
#include "xfs_rtalloc.h"
#include "xfs_rw.h"
[XFS] Concurrent Multi-File Data Streams In media spaces, video is often stored in a frame-per-file format. When dealing with uncompressed realtime HD video streams in this format, it is crucial that files do not get fragmented and that multiple files a placed contiguously on disk. When multiple streams are being ingested and played out at the same time, it is critical that the filesystem does not cross the streams and interleave them together as this creates seek and readahead cache miss latency and prevents both ingest and playout from meeting frame rate targets. This patch set creates a "stream of files" concept into the allocator to place all the data from a single stream contiguously on disk so that RAID array readahead can be used effectively. Each additional stream gets placed in different allocation groups within the filesystem, thereby ensuring that we don't cross any streams. When an AG fills up, we select a new AG for the stream that is not in use. The core of the functionality is the stream tracking - each inode that we create in a directory needs to be associated with the directories' stream. Hence every time we create a file, we look up the directories' stream object and associate the new file with that object. Once we have a stream object for a file, we use the AG that the stream object point to for allocations. If we can't allocate in that AG (e.g. it is full) we move the entire stream to another AG. Other inodes in the same stream are moved to the new AG on their next allocation (i.e. lazy update). Stream objects are kept in a cache and hold a reference on the inode. Hence the inode cannot be reclaimed while there is an outstanding stream reference. This means that on unlink we need to remove the stream association and we also need to flush all the associations on certain events that want to reclaim all unreferenced inodes (e.g. filesystem freeze). SGI-PV: 964469 SGI-Modid: xfs-linux-melb:xfs-kern:29096a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Barry Naujok <bnaujok@sgi.com> Signed-off-by: Donald Douwsma <donaldd@sgi.com> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Tim Shimmin <tes@sgi.com> Signed-off-by: Vlad Apostolov <vapo@sgi.com>
2007-07-11 09:09:12 +08:00
#include "xfs_filestream.h"
/*
* File system operations
*/
int
xfs_fs_geometry(
xfs_mount_t *mp,
xfs_fsop_geom_t *geo,
int new_version)
{
geo->blocksize = mp->m_sb.sb_blocksize;
geo->rtextsize = mp->m_sb.sb_rextsize;
geo->agblocks = mp->m_sb.sb_agblocks;
geo->agcount = mp->m_sb.sb_agcount;
geo->logblocks = mp->m_sb.sb_logblocks;
geo->sectsize = mp->m_sb.sb_sectsize;
geo->inodesize = mp->m_sb.sb_inodesize;
geo->imaxpct = mp->m_sb.sb_imax_pct;
geo->datablocks = mp->m_sb.sb_dblocks;
geo->rtblocks = mp->m_sb.sb_rblocks;
geo->rtextents = mp->m_sb.sb_rextents;
geo->logstart = mp->m_sb.sb_logstart;
ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid));
memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid));
if (new_version >= 2) {
geo->sunit = mp->m_sb.sb_unit;
geo->swidth = mp->m_sb.sb_width;
}
if (new_version >= 3) {
geo->version = XFS_FSOP_GEOM_VERSION;
geo->flags =
(xfs_sb_version_hasattr(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
(xfs_sb_version_hasnlink(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_NLINK : 0) |
(xfs_sb_version_hasquota(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
(xfs_sb_version_hasalign(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
(xfs_sb_version_hasdalign(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
(xfs_sb_version_hasshared(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_SHARED : 0) |
(xfs_sb_version_hasextflgbit(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
(xfs_sb_version_hasdirv2(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) |
(xfs_sb_version_hassector(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_SECTOR : 0) |
(xfs_sb_version_hasasciici(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) |
[XFS] Lazy Superblock Counters When we have a couple of hundred transactions on the fly at once, they all typically modify the on disk superblock in some way. create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify free block counts. When these counts are modified in a transaction, they must eventually lock the superblock buffer and apply the mods. The buffer then remains locked until the transaction is committed into the incore log buffer. The result of this is that with enough transactions on the fly the incore superblock buffer becomes a bottleneck. The result of contention on the incore superblock buffer is that transaction rates fall - the more pressure that is put on the superblock buffer, the slower things go. The key to removing the contention is to not require the superblock fields in question to be locked. We do that by not marking the superblock dirty in the transaction. IOWs, we modify the incore superblock but do not modify the cached superblock buffer. In short, we do not log superblock modifications to critical fields in the superblock on every transaction. In fact we only do it just before we write the superblock to disk every sync period or just before unmount. This creates an interesting problem - if we don't log or write out the fields in every transaction, then how do the values get recovered after a crash? the answer is simple - we keep enough duplicate, logged information in other structures that we can reconstruct the correct count after log recovery has been performed. It is the AGF and AGI structures that contain the duplicate information; after recovery, we walk every AGI and AGF and sum their individual counters to get the correct value, and we do a transaction into the log to correct them. An optimisation of this is that if we have a clean unmount record, we know the value in the superblock is correct, so we can avoid the summation walk under normal conditions and so mount/recovery times do not change under normal operation. One wrinkle that was discovered during development was that the blocks used in the freespace btrees are never accounted for in the AGF counters. This was once a valid optimisation to make; when the filesystem is full, the free space btrees are empty and consume no space. Hence when it matters, the "accounting" is correct. But that means the when we do the AGF summations, we would not have a correct count and xfs_check would complain. Hence a new counter was added to track the number of blocks used by the free space btrees. This is an *on-disk format change*. As a result of this, lazy superblock counters are a mkfs option and at the moment on linux there is no way to convert an old filesystem. This is possible - xfs_db can be used to twiddle the right bits and then xfs_repair will do the format conversion for you. Similarly, you can convert backwards as well. At some point we'll add functionality to xfs_admin to do the bit twiddling easily.... SGI-PV: 964999 SGI-Modid: xfs-linux-melb:xfs-kern:28652a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 13:26:31 +08:00
(xfs_sb_version_haslazysbcount(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
(xfs_sb_version_hasattr2(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_ATTR2 : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
geo->dirblocksize = mp->m_dirblksize;
}
if (new_version >= 4) {
geo->flags |=
(xfs_sb_version_haslogv2(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_LOGV2 : 0);
geo->logsunit = mp->m_sb.sb_logsunit;
}
return 0;
}
static int
xfs_growfs_data_private(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_growfs_data_t *in) /* growfs data input struct */
{
xfs_agf_t *agf;
xfs_agi_t *agi;
xfs_agnumber_t agno;
xfs_extlen_t agsize;
xfs_extlen_t tmpsize;
xfs_alloc_rec_t *arec;
struct xfs_btree_block *block;
xfs_buf_t *bp;
int bucket;
int dpct;
int error;
xfs_agnumber_t nagcount;
xfs_agnumber_t nagimax = 0;
xfs_rfsblock_t nb, nb_mod;
xfs_rfsblock_t new;
xfs_rfsblock_t nfree;
xfs_agnumber_t oagcount;
int pct;
xfs_trans_t *tp;
nb = in->newblocks;
pct = in->imaxpct;
if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
return XFS_ERROR(EINVAL);
if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
return error;
dpct = pct - mp->m_sb.sb_imax_pct;
error = xfs_read_buf(mp, mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
XFS_FSS_TO_BB(mp, 1), 0, &bp);
if (error)
return error;
ASSERT(bp);
xfs_buf_relse(bp);
new = nb; /* use new as a temporary here */
nb_mod = do_div(new, mp->m_sb.sb_agblocks);
nagcount = new + (nb_mod != 0);
if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
nagcount--;
nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
if (nb < mp->m_sb.sb_dblocks)
return XFS_ERROR(EINVAL);
}
new = nb - mp->m_sb.sb_dblocks;
oagcount = mp->m_sb.sb_agcount;
if (nagcount > oagcount) {
[XFS] Concurrent Multi-File Data Streams In media spaces, video is often stored in a frame-per-file format. When dealing with uncompressed realtime HD video streams in this format, it is crucial that files do not get fragmented and that multiple files a placed contiguously on disk. When multiple streams are being ingested and played out at the same time, it is critical that the filesystem does not cross the streams and interleave them together as this creates seek and readahead cache miss latency and prevents both ingest and playout from meeting frame rate targets. This patch set creates a "stream of files" concept into the allocator to place all the data from a single stream contiguously on disk so that RAID array readahead can be used effectively. Each additional stream gets placed in different allocation groups within the filesystem, thereby ensuring that we don't cross any streams. When an AG fills up, we select a new AG for the stream that is not in use. The core of the functionality is the stream tracking - each inode that we create in a directory needs to be associated with the directories' stream. Hence every time we create a file, we look up the directories' stream object and associate the new file with that object. Once we have a stream object for a file, we use the AG that the stream object point to for allocations. If we can't allocate in that AG (e.g. it is full) we move the entire stream to another AG. Other inodes in the same stream are moved to the new AG on their next allocation (i.e. lazy update). Stream objects are kept in a cache and hold a reference on the inode. Hence the inode cannot be reclaimed while there is an outstanding stream reference. This means that on unlink we need to remove the stream association and we also need to flush all the associations on certain events that want to reclaim all unreferenced inodes (e.g. filesystem freeze). SGI-PV: 964469 SGI-Modid: xfs-linux-melb:xfs-kern:29096a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Barry Naujok <bnaujok@sgi.com> Signed-off-by: Donald Douwsma <donaldd@sgi.com> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Tim Shimmin <tes@sgi.com> Signed-off-by: Vlad Apostolov <vapo@sgi.com>
2007-07-11 09:09:12 +08:00
xfs_filestream_flush(mp);
down_write(&mp->m_peraglock);
mp->m_perag = kmem_realloc(mp->m_perag,
sizeof(xfs_perag_t) * nagcount,
sizeof(xfs_perag_t) * oagcount,
KM_SLEEP);
memset(&mp->m_perag[oagcount], 0,
(nagcount - oagcount) * sizeof(xfs_perag_t));
mp->m_flags |= XFS_MOUNT_32BITINODES;
nagimax = xfs_initialize_perag(mp, nagcount);
up_write(&mp->m_peraglock);
}
tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
tp->t_flags |= XFS_TRANS_RESERVE;
if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
xfs_trans_cancel(tp, 0);
return error;
}
nfree = 0;
for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
/*
* AG freelist header block
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0);
agf = XFS_BUF_TO_AGF(bp);
memset(agf, 0, mp->m_sb.sb_sectsize);
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
agf->agf_seqno = cpu_to_be32(agno);
if (agno == nagcount - 1)
agsize =
nb -
(agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
else
agsize = mp->m_sb.sb_agblocks;
agf->agf_length = cpu_to_be32(agsize);
agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
agf->agf_flfirst = 0;
agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
agf->agf_flcount = 0;
tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
agf->agf_freeblks = cpu_to_be32(tmpsize);
agf->agf_longest = cpu_to_be32(tmpsize);
error = xfs_bwrite(mp, bp);
if (error) {
goto error0;
}
/*
* AG inode header block
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0);
agi = XFS_BUF_TO_AGI(bp);
memset(agi, 0, mp->m_sb.sb_sectsize);
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
agi->agi_seqno = cpu_to_be32(agno);
agi->agi_length = cpu_to_be32(agsize);
agi->agi_count = 0;
agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
agi->agi_level = cpu_to_be32(1);
agi->agi_freecount = 0;
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
error = xfs_bwrite(mp, bp);
if (error) {
goto error0;
}
/*
* BNO btree root block
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
BTOBB(mp->m_sb.sb_blocksize), 0);
block = XFS_BUF_TO_BLOCK(bp);
memset(block, 0, mp->m_sb.sb_blocksize);
block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
block->bb_level = 0;
block->bb_numrecs = cpu_to_be16(1);
block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
error = xfs_bwrite(mp, bp);
if (error) {
goto error0;
}
/*
* CNT btree root block
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
BTOBB(mp->m_sb.sb_blocksize), 0);
block = XFS_BUF_TO_BLOCK(bp);
memset(block, 0, mp->m_sb.sb_blocksize);
block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
block->bb_level = 0;
block->bb_numrecs = cpu_to_be16(1);
block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
nfree += be32_to_cpu(arec->ar_blockcount);
error = xfs_bwrite(mp, bp);
if (error) {
goto error0;
}
/*
* INO btree root block
*/
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
BTOBB(mp->m_sb.sb_blocksize), 0);
block = XFS_BUF_TO_BLOCK(bp);
memset(block, 0, mp->m_sb.sb_blocksize);
block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
block->bb_level = 0;
block->bb_numrecs = 0;
block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
error = xfs_bwrite(mp, bp);
if (error) {
goto error0;
}
}
xfs_trans_agblocks_delta(tp, nfree);
/*
* There are new blocks in the old last a.g.
*/
if (new) {
/*
* Change the agi length.
*/
error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
if (error) {
goto error0;
}
ASSERT(bp);
agi = XFS_BUF_TO_AGI(bp);
be32_add_cpu(&agi->agi_length, new);
ASSERT(nagcount == oagcount ||
be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
/*
* Change agf length.
*/
error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
if (error) {
goto error0;
}
ASSERT(bp);
agf = XFS_BUF_TO_AGF(bp);
be32_add_cpu(&agf->agf_length, new);
ASSERT(be32_to_cpu(agf->agf_length) ==
be32_to_cpu(agi->agi_length));
xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
/*
* Free the new space.
*/
error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno,
be32_to_cpu(agf->agf_length) - new), new);
if (error) {
goto error0;
}
}
if (nagcount > oagcount)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
if (nb > mp->m_sb.sb_dblocks)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
nb - mp->m_sb.sb_dblocks);
if (nfree)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
if (dpct)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
error = xfs_trans_commit(tp, 0);
if (error) {
return error;
}
/* New allocation groups fully initialized, so update mount struct */
if (nagimax)
mp->m_maxagi = nagimax;
if (mp->m_sb.sb_imax_pct) {
__uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
do_div(icount, 100);
mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
} else
mp->m_maxicount = 0;
for (agno = 1; agno < nagcount; agno++) {
error = xfs_read_buf(mp, mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &bp);
if (error) {
xfs_fs_cmn_err(CE_WARN, mp,
"error %d reading secondary superblock for ag %d",
error, agno);
break;
}
xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
/*
* If we get an error writing out the alternate superblocks,
* just issue a warning and continue. The real work is
* already done and committed.
*/
if (!(error = xfs_bwrite(mp, bp))) {
continue;
} else {
xfs_fs_cmn_err(CE_WARN, mp,
"write error %d updating secondary superblock for ag %d",
error, agno);
break; /* no point in continuing */
}
}
return 0;
error0:
xfs_trans_cancel(tp, XFS_TRANS_ABORT);
return error;
}
static int
xfs_growfs_log_private(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_growfs_log_t *in) /* growfs log input struct */
{
xfs_extlen_t nb;
nb = in->newblocks;
if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
return XFS_ERROR(EINVAL);
if (nb == mp->m_sb.sb_logblocks &&
in->isint == (mp->m_sb.sb_logstart != 0))
return XFS_ERROR(EINVAL);
/*
* Moving the log is hard, need new interfaces to sync
* the log first, hold off all activity while moving it.
* Can have shorter or longer log in the same space,
* or transform internal to external log or vice versa.
*/
return XFS_ERROR(ENOSYS);
}
/*
* protected versions of growfs function acquire and release locks on the mount
* point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
* XFS_IOC_FSGROWFSRT
*/
int
xfs_growfs_data(
xfs_mount_t *mp,
xfs_growfs_data_t *in)
{
int error;
if (!capable(CAP_SYS_ADMIN))
return XFS_ERROR(EPERM);
if (!mutex_trylock(&mp->m_growlock))
return XFS_ERROR(EWOULDBLOCK);
error = xfs_growfs_data_private(mp, in);
mutex_unlock(&mp->m_growlock);
return error;
}
int
xfs_growfs_log(
xfs_mount_t *mp,
xfs_growfs_log_t *in)
{
int error;
if (!capable(CAP_SYS_ADMIN))
return XFS_ERROR(EPERM);
if (!mutex_trylock(&mp->m_growlock))
return XFS_ERROR(EWOULDBLOCK);
error = xfs_growfs_log_private(mp, in);
mutex_unlock(&mp->m_growlock);
return error;
}
/*
* exported through ioctl XFS_IOC_FSCOUNTS
*/
int
xfs_fs_counts(
xfs_mount_t *mp,
xfs_fsop_counts_t *cnt)
{
xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
spin_lock(&mp->m_sb_lock);
cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
cnt->freertx = mp->m_sb.sb_frextents;
cnt->freeino = mp->m_sb.sb_ifree;
cnt->allocino = mp->m_sb.sb_icount;
spin_unlock(&mp->m_sb_lock);
return 0;
}
/*
* exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
*
* xfs_reserve_blocks is called to set m_resblks
* in the in-core mount table. The number of unused reserved blocks
* is kept in m_resblks_avail.
*
* Reserve the requested number of blocks if available. Otherwise return
* as many as possible to satisfy the request. The actual number
* reserved are returned in outval
*
* A null inval pointer indicates that only the current reserved blocks
* available should be returned no settings are changed.
*/
int
xfs_reserve_blocks(
xfs_mount_t *mp,
__uint64_t *inval,
xfs_fsop_resblks_t *outval)
{
__int64_t lcounter, delta, fdblks_delta;
__uint64_t request;
/* If inval is null, report current values and return */
if (inval == (__uint64_t *)NULL) {
if (!outval)
return EINVAL;
outval->resblks = mp->m_resblks;
outval->resblks_avail = mp->m_resblks_avail;
return 0;
}
request = *inval;
/*
* With per-cpu counters, this becomes an interesting
* problem. we needto work out if we are freeing or allocation
* blocks first, then we can do the modification as necessary.
*
* We do this under the m_sb_lock so that if we are near
* ENOSPC, we will hold out any changes while we work out
* what to do. This means that the amount of free space can
* change while we do this, so we need to retry if we end up
* trying to reserve more space than is available.
*
* We also use the xfs_mod_incore_sb() interface so that we
* don't have to care about whether per cpu counter are
* enabled, disabled or even compiled in....
*/
retry:
spin_lock(&mp->m_sb_lock);
xfs_icsb_sync_counters_locked(mp, 0);
/*
* If our previous reservation was larger than the current value,
* then move any unused blocks back to the free pool.
*/
fdblks_delta = 0;
if (mp->m_resblks > request) {
lcounter = mp->m_resblks_avail - request;
if (lcounter > 0) { /* release unused blocks */
fdblks_delta = lcounter;
mp->m_resblks_avail -= lcounter;
}
mp->m_resblks = request;
} else {
__int64_t free;
free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
if (!free)
goto out; /* ENOSPC and fdblks_delta = 0 */
delta = request - mp->m_resblks;
lcounter = free - delta;
if (lcounter < 0) {
/* We can't satisfy the request, just get what we can */
mp->m_resblks += free;
mp->m_resblks_avail += free;
fdblks_delta = -free;
} else {
fdblks_delta = -delta;
mp->m_resblks = request;
mp->m_resblks_avail += delta;
}
}
out:
if (outval) {
outval->resblks = mp->m_resblks;
outval->resblks_avail = mp->m_resblks_avail;
}
spin_unlock(&mp->m_sb_lock);
if (fdblks_delta) {
/*
* If we are putting blocks back here, m_resblks_avail is
* already at its max so this will put it in the free pool.
*
* If we need space, we'll either succeed in getting it
* from the free block count or we'll get an enospc. If
* we get a ENOSPC, it means things changed while we were
* calculating fdblks_delta and so we should try again to
* see if there is anything left to reserve.
*
* Don't set the reserved flag here - we don't want to reserve
* the extra reserve blocks from the reserve.....
*/
int error;
error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, fdblks_delta, 0);
if (error == ENOSPC)
goto retry;
}
return 0;
}
filesystem freeze: add error handling of write_super_lockfs/unlockfs Currently, ext3 in mainline Linux doesn't have the freeze feature which suspends write requests. So, we cannot take a backup which keeps the filesystem's consistency with the storage device's features (snapshot and replication) while it is mounted. In many case, a commercial filesystem (e.g. VxFS) has the freeze feature and it would be used to get the consistent backup. If Linux's standard filesystem ext3 has the freeze feature, we can do it without a commercial filesystem. So I have implemented the ioctls of the freeze feature. I think we can take the consistent backup with the following steps. 1. Freeze the filesystem with the freeze ioctl. 2. Separate the replication volume or create the snapshot with the storage device's feature. 3. Unfreeze the filesystem with the unfreeze ioctl. 4. Take the backup from the separated replication volume or the snapshot. This patch: VFS: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they can return an error. Rename write_super_lockfs and unlockfs of the super block operation freeze_fs and unfreeze_fs to avoid a confusion. ext3, ext4, xfs, gfs2, jfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that write_super_lockfs returns an error if needed, and unlockfs always returns 0. reiserfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they always return 0 (success) to keep a current behavior. Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com> Signed-off-by: Masayuki Hamaguchi <m-hamaguchi@ys.jp.nec.com> Cc: <xfs-masters@oss.sgi.com> Cc: <linux-ext4@vger.kernel.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Dave Kleikamp <shaggy@austin.ibm.com> Cc: Dave Chinner <david@fromorbit.com> Cc: Alasdair G Kergon <agk@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-10 08:40:58 +08:00
int
xfs_fs_log_dummy(
xfs_mount_t *mp)
{
xfs_trans_t *tp;
xfs_inode_t *ip;
filesystem freeze: add error handling of write_super_lockfs/unlockfs Currently, ext3 in mainline Linux doesn't have the freeze feature which suspends write requests. So, we cannot take a backup which keeps the filesystem's consistency with the storage device's features (snapshot and replication) while it is mounted. In many case, a commercial filesystem (e.g. VxFS) has the freeze feature and it would be used to get the consistent backup. If Linux's standard filesystem ext3 has the freeze feature, we can do it without a commercial filesystem. So I have implemented the ioctls of the freeze feature. I think we can take the consistent backup with the following steps. 1. Freeze the filesystem with the freeze ioctl. 2. Separate the replication volume or create the snapshot with the storage device's feature. 3. Unfreeze the filesystem with the unfreeze ioctl. 4. Take the backup from the separated replication volume or the snapshot. This patch: VFS: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they can return an error. Rename write_super_lockfs and unlockfs of the super block operation freeze_fs and unfreeze_fs to avoid a confusion. ext3, ext4, xfs, gfs2, jfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that write_super_lockfs returns an error if needed, and unlockfs always returns 0. reiserfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they always return 0 (success) to keep a current behavior. Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com> Signed-off-by: Masayuki Hamaguchi <m-hamaguchi@ys.jp.nec.com> Cc: <xfs-masters@oss.sgi.com> Cc: <linux-ext4@vger.kernel.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Dave Kleikamp <shaggy@austin.ibm.com> Cc: Dave Chinner <david@fromorbit.com> Cc: Alasdair G Kergon <agk@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-10 08:40:58 +08:00
int error;
tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
filesystem freeze: add error handling of write_super_lockfs/unlockfs Currently, ext3 in mainline Linux doesn't have the freeze feature which suspends write requests. So, we cannot take a backup which keeps the filesystem's consistency with the storage device's features (snapshot and replication) while it is mounted. In many case, a commercial filesystem (e.g. VxFS) has the freeze feature and it would be used to get the consistent backup. If Linux's standard filesystem ext3 has the freeze feature, we can do it without a commercial filesystem. So I have implemented the ioctls of the freeze feature. I think we can take the consistent backup with the following steps. 1. Freeze the filesystem with the freeze ioctl. 2. Separate the replication volume or create the snapshot with the storage device's feature. 3. Unfreeze the filesystem with the unfreeze ioctl. 4. Take the backup from the separated replication volume or the snapshot. This patch: VFS: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they can return an error. Rename write_super_lockfs and unlockfs of the super block operation freeze_fs and unfreeze_fs to avoid a confusion. ext3, ext4, xfs, gfs2, jfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that write_super_lockfs returns an error if needed, and unlockfs always returns 0. reiserfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they always return 0 (success) to keep a current behavior. Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com> Signed-off-by: Masayuki Hamaguchi <m-hamaguchi@ys.jp.nec.com> Cc: <xfs-masters@oss.sgi.com> Cc: <linux-ext4@vger.kernel.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Dave Kleikamp <shaggy@austin.ibm.com> Cc: Dave Chinner <david@fromorbit.com> Cc: Alasdair G Kergon <agk@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-10 08:40:58 +08:00
error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
if (error) {
xfs_trans_cancel(tp, 0);
filesystem freeze: add error handling of write_super_lockfs/unlockfs Currently, ext3 in mainline Linux doesn't have the freeze feature which suspends write requests. So, we cannot take a backup which keeps the filesystem's consistency with the storage device's features (snapshot and replication) while it is mounted. In many case, a commercial filesystem (e.g. VxFS) has the freeze feature and it would be used to get the consistent backup. If Linux's standard filesystem ext3 has the freeze feature, we can do it without a commercial filesystem. So I have implemented the ioctls of the freeze feature. I think we can take the consistent backup with the following steps. 1. Freeze the filesystem with the freeze ioctl. 2. Separate the replication volume or create the snapshot with the storage device's feature. 3. Unfreeze the filesystem with the unfreeze ioctl. 4. Take the backup from the separated replication volume or the snapshot. This patch: VFS: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they can return an error. Rename write_super_lockfs and unlockfs of the super block operation freeze_fs and unfreeze_fs to avoid a confusion. ext3, ext4, xfs, gfs2, jfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that write_super_lockfs returns an error if needed, and unlockfs always returns 0. reiserfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they always return 0 (success) to keep a current behavior. Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com> Signed-off-by: Masayuki Hamaguchi <m-hamaguchi@ys.jp.nec.com> Cc: <xfs-masters@oss.sgi.com> Cc: <linux-ext4@vger.kernel.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Dave Kleikamp <shaggy@austin.ibm.com> Cc: Dave Chinner <david@fromorbit.com> Cc: Alasdair G Kergon <agk@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-10 08:40:58 +08:00
return error;
}
ip = mp->m_rootip;
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_ihold(tp, ip);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
xfs_trans_set_sync(tp);
filesystem freeze: add error handling of write_super_lockfs/unlockfs Currently, ext3 in mainline Linux doesn't have the freeze feature which suspends write requests. So, we cannot take a backup which keeps the filesystem's consistency with the storage device's features (snapshot and replication) while it is mounted. In many case, a commercial filesystem (e.g. VxFS) has the freeze feature and it would be used to get the consistent backup. If Linux's standard filesystem ext3 has the freeze feature, we can do it without a commercial filesystem. So I have implemented the ioctls of the freeze feature. I think we can take the consistent backup with the following steps. 1. Freeze the filesystem with the freeze ioctl. 2. Separate the replication volume or create the snapshot with the storage device's feature. 3. Unfreeze the filesystem with the unfreeze ioctl. 4. Take the backup from the separated replication volume or the snapshot. This patch: VFS: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they can return an error. Rename write_super_lockfs and unlockfs of the super block operation freeze_fs and unfreeze_fs to avoid a confusion. ext3, ext4, xfs, gfs2, jfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that write_super_lockfs returns an error if needed, and unlockfs always returns 0. reiserfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they always return 0 (success) to keep a current behavior. Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com> Signed-off-by: Masayuki Hamaguchi <m-hamaguchi@ys.jp.nec.com> Cc: <xfs-masters@oss.sgi.com> Cc: <linux-ext4@vger.kernel.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Dave Kleikamp <shaggy@austin.ibm.com> Cc: Dave Chinner <david@fromorbit.com> Cc: Alasdair G Kergon <agk@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-10 08:40:58 +08:00
error = xfs_trans_commit(tp, 0);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
filesystem freeze: add error handling of write_super_lockfs/unlockfs Currently, ext3 in mainline Linux doesn't have the freeze feature which suspends write requests. So, we cannot take a backup which keeps the filesystem's consistency with the storage device's features (snapshot and replication) while it is mounted. In many case, a commercial filesystem (e.g. VxFS) has the freeze feature and it would be used to get the consistent backup. If Linux's standard filesystem ext3 has the freeze feature, we can do it without a commercial filesystem. So I have implemented the ioctls of the freeze feature. I think we can take the consistent backup with the following steps. 1. Freeze the filesystem with the freeze ioctl. 2. Separate the replication volume or create the snapshot with the storage device's feature. 3. Unfreeze the filesystem with the unfreeze ioctl. 4. Take the backup from the separated replication volume or the snapshot. This patch: VFS: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they can return an error. Rename write_super_lockfs and unlockfs of the super block operation freeze_fs and unfreeze_fs to avoid a confusion. ext3, ext4, xfs, gfs2, jfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that write_super_lockfs returns an error if needed, and unlockfs always returns 0. reiserfs: Changed the type of write_super_lockfs and unlockfs from "void" to "int" so that they always return 0 (success) to keep a current behavior. Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com> Signed-off-by: Masayuki Hamaguchi <m-hamaguchi@ys.jp.nec.com> Cc: <xfs-masters@oss.sgi.com> Cc: <linux-ext4@vger.kernel.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Dave Kleikamp <shaggy@austin.ibm.com> Cc: Dave Chinner <david@fromorbit.com> Cc: Alasdair G Kergon <agk@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-10 08:40:58 +08:00
return error;
}
int
xfs_fs_goingdown(
xfs_mount_t *mp,
__uint32_t inflags)
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
if (sb && !IS_ERR(sb)) {
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
thaw_bdev(sb->s_bdev, sb);
}
break;
}
case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
break;
case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
xfs_force_shutdown(mp,
SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
break;
default:
return XFS_ERROR(EINVAL);
}
return 0;
}