From 472d8ea1951b7fde0b3f0fb1f770be4ece4cff7b Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Thu, 10 May 2018 11:50:04 -0400 Subject: [PATCH 01/23] =?UTF-8?q?ext4:=20make=20function=20=E2=80=98ext4?= =?UTF-8?q?=5Fgetfsmap=5Ffind=5Ffixed=5Fmetadata=E2=80=99=20static?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since function ‘ext4_getfsmap_find_fixed_metadata’ can be made static, make it so. Remove the following gcc warning (W=1): fs/ext4/fsmap.c:405:5: warning: no previous prototype for ‘ext4_getfsmap_find_fixed_metadata’ [-Wmissing-prototypes] Signed-off-by: Mathieu Malaterre Signed-off-by: Theodore Ts'o --- fs/ext4/fsmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index e871c4bf18e9..4b99e2db95b8 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -402,8 +402,8 @@ static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list) } /* Find all the fixed metadata in the filesystem. */ -int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, - struct list_head *meta_list) +static int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, + struct list_head *meta_list) { struct ext4_group_desc *gdp; ext4_group_t agno; From 3f706c8c9257e0a90d95e8a1650139aba33d0906 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Thu, 10 May 2018 11:52:14 -0400 Subject: [PATCH 02/23] ext4: use XATTR_CREATE in ext4_initxattrs() I hit ENOSPC error when creating new file in a newly created ext4 with ea_inode feature enabled, if selinux is enabled and ext4 is mounted without any selinux context. e.g. mkfs -t ext4 -O ea_inode -F /dev/sda5 mount /dev/sda5 /mnt/ext4 touch /mnt/ext4/testfile # got ENOSPC here It turns out that we run out of journal credits in ext4_xattr_set_handle() when creating new selinux label for the newly created inode. 
This is because that in __ext4_new_inode() we use __ext4_xattr_set_credits() to calculate the reserved credits for new xattr, with the 'is_create' argument being true, which implies less credits in the ea_inode case. But we calculate the required credits in ext4_xattr_set_handle() with 'is_create' being false, which means we need more credits if ea_inode feature is enabled. So we don't have enough credits and error out with ENOSPC. Fix it by simply calling ext4_xattr_set_handle() with XATTR_CREATE flag in ext4_initxattrs(), so we end up with requiring less credits than reserved. The semantic of XATTR_CREATE is "Perform a pure create, which fails if the named attribute exists already." (from setxattr(2)), which is fine in this case, because we only call ext4_initxattrs() on newly created inode. Fixes: af65207c76ce ("ext4: fix __ext4_new_inode() journal credits calculation") Cc: Tahsin Erdogan Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr_security.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 629001b28632..197a9d8a15ef 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -43,7 +43,7 @@ ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, xattr->name, xattr->value, - xattr->value_len, 0); + xattr->value_len, XATTR_CREATE); if (err < 0) break; } From e254d1afac83fd441e4051771b3d8f5eaf49fd3a Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Thu, 10 May 2018 11:55:31 -0400 Subject: [PATCH 03/23] ext4: use raw i_version value for ea_inode Currently, creating large xattr (e.g. 2k) in ea_inode would cause ea_inode refcount corruption, e.g. Pass 4: Checking reference counts Extended attribute inode 13 ref count is 0, should be 1. Fix? no This is because that we save the lower 32bit of refcount in inode->i_version and store it in raw_inode->i_disk_version on disk. 
But since commit ee73f9a52a34 ("ext4: convert to new i_version API"), we load/store modified i_disk_version from/to disk instead of raw value, which causes on-disk ea_inode refcount corruption. Fix it by loading/storing raw i_version/i_disk_version, because it's a self-managed value in this case. Fixes: ee73f9a52a34 ("ext4: convert to new i_version API") Cc: Tahsin Erdogan Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1e50c5efae67..0eb64e8f9602 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4724,6 +4724,26 @@ int ext4_get_projid(struct inode *inode, kprojid_t *projid) return 0; } +/* + * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of + * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag + * set. + */ +static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + inode_set_iversion_raw(inode, val); + else + inode_set_iversion_queried(inode, val); +} +static inline u64 ext4_inode_peek_iversion(const struct inode *inode) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return inode_peek_iversion_raw(inode); + else + return inode_peek_iversion(inode); +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -4910,7 +4930,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } - inode_set_iversion_queried(inode, ivers); + ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; @@ -5196,7 +5216,7 @@ static int ext4_do_update_inode(handle_t *handle, } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { - u64 ivers = inode_peek_iversion(inode); + u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = 
cpu_to_le32(ivers); if (ei->i_extra_isize) { From 0db9fdeb347c10f64b679577f2640c9e35ea5a30 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sat, 12 May 2018 11:35:01 -0400 Subject: [PATCH 04/23] ext4: fix wrong return value in ext4_read_inode_bitmap() The only reason that sb_getblk() could fail is out of memory, ext4 codes have returned -ENOMEM for all other places except this one, let's fix it here too. Signed-off-by: Wang Shilong Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df92e3ec9913..33a2c98ce1ff 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -143,7 +143,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); - return ERR_PTR(-EIO); + return ERR_PTR(-ENOMEM); } if (bitmap_uptodate(bh)) goto verify; From db79e6d1fb1f715c961bd880101362e07369de23 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sat, 12 May 2018 11:39:40 -0400 Subject: [PATCH 05/23] ext4: add new ext4_mark_group_bitmap_corrupted() helper Since there are many places to set inode/block bitmap corrupt bit, add a new helper for it, which will make the code clearer.
Signed-off-by: Wang Shilong Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/balloc.c | 29 +++++++---------------------- fs/ext4/ext4.h | 7 +++++++ fs/ext4/ialloc.c | 20 ++++---------------- fs/ext4/mballoc.c | 14 ++++---------- fs/ext4/super.c | 30 ++++++++++++++++++++++++++++++ 5 files changed, 52 insertions(+), 48 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 508b905d744d..009dea942e34 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -185,25 +185,15 @@ static int ext4_init_block_bitmap(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; int flex_bg = 0; - struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, gdp); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT | + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); @@ -375,7 +365,6 @@ static int ext4_validate_block_bitmap(struct super_block *sb, { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); - struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return 0; @@ -387,10 +376,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); - if 
(!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSBADCRC; } blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); @@ -398,10 +385,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } set_buffer_verified(bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a42e71203e53..fa52b7dd4542 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2530,6 +2530,9 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); +extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t block_group, + unsigned int flags); extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, @@ -2857,6 +2860,10 @@ struct ext4_group_info { #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 33a2c98ce1ff..95611cf9f552 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -83,7 +83,6 @@ static int 
ext4_validate_inode_bitmap(struct super_block *sb, { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); - struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return 0; @@ -97,14 +96,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, blk); - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, desc); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } set_buffer_verified(bh); @@ -337,13 +330,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); - if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, gdp); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); } error_return: diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 769a62708b1c..bc2d1eb9fd5d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -747,10 +747,8 @@ void ext4_mb_generate_buddy(struct super_block *sb, * corrupt and update bb_free using bitmap value */ grp->bb_free = free; - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); @@ -1454,12 +1452,8 @@ static void mb_free_blocks(struct inode *inode, struct 
ext4_buddy *e4b, "freeing already freed block " "(bit %u); block bitmap corrupt.", block); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - e4b->bd_info->bb_free); - /* Mark the block group as corrupt. */ - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, - &e4b->bd_info->bb_state); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); mb_regenerate_buddy(e4b); goto done; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eb104e8476f0..d6d6e7db73d6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -763,6 +763,36 @@ __acquires(bitlock) return; } +void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t group, + unsigned int flags) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + + if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) && + !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) { + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &grp->bb_state); + } + + if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) && + !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (gdp) { + int count; + + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, + &grp->bb_state); + } +} + void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; From 206f6d552d0c0596cbc076a3249f7182f08a35d9 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sat, 12 May 2018 12:15:21 -0400 Subject: [PATCH 06/23] ext4: mark inode bitmap corrupted when found There are still some cases that we missed to set block bitmaps corrupted bit properly: 1)inode bitmap number is wrong. 2)failed to read block bitmap due to disk errors. 
3)double allocations from bitmap. Also remove a duplicated call to ext4_error() after ext4_read_inode_bitmap(), as ext4_error() has already been called inside ext4_read_inode_bitmap() properly. Signed-off-by: Wang Shilong Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/ialloc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 95611cf9f552..4d6e007f3569 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -129,6 +129,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid inode bitmap blk %llu in " "block_group %u", bitmap_blk, block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); @@ -183,6 +185,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EIO); } @@ -902,6 +906,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto next_group; } @@ -1093,6 +1099,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, err = -EIO; ext4_error(sb, "failed to insert inode %lu: doubly allocated?", inode->i_ino); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } inode->i_generation = prandom_u32(); @@ -1194,11 +1202,8 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) %
EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (IS_ERR(bitmap_bh)) { - ext4_error(sb, "inode bitmap error %ld for orphan %lu", - ino, PTR_ERR(bitmap_bh)); + if (IS_ERR(bitmap_bh)) return (struct inode *) bitmap_bh; - } /* Having the inode bit set should be a 100% indicator that this * is a valid orphan (no e2fsck run on fs). Orphans also include From 736dedbb1a7dcb669e4990884b5dd9d6aef0fc77 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sat, 12 May 2018 12:37:58 -0400 Subject: [PATCH 07/23] ext4: mark block bitmap corrupted when found There are still some cases that we missed to set block bitmaps corrupted bit properly: 1) block bitmap number is wrong. 2) failed to read block bitmap due to disk errors. 3) double free block bitmaps.. 4) some mismatch check with bitmaps vs buddy information. Signed-off-by: Theodore Ts'o Signed-off-by: Liu Bo Signed-off-by: Wang Shilong Reviewed-by: Liu Bo Reviewed-by: Andreas Dilger --- fs/ext4/balloc.c | 4 ++++ fs/ext4/mballoc.c | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 009dea942e34..b00481c475cb 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -421,6 +421,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid block bitmap block %llu in " "block_group %u", bitmap_blk, block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); @@ -499,6 +501,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; } clear_buffer_new(bh); diff --git a/fs/ext4/mballoc.c 
b/fs/ext4/mballoc.c index bc2d1eb9fd5d..0d473991eebd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -470,6 +470,8 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, "freeing block already freed " "(bit %u)", first + i); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -1950,6 +1952,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, "%d free clusters as per " "group info. But bitmap says 0", free); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } @@ -1960,6 +1964,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit From 2ee3ee06a8fd792765fa3267ddf928997797eec5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 12 May 2018 19:55:00 -0400 Subject: [PATCH 08/23] ext4: fix hole length detection in ext4_ind_map_blocks() When ext4_ind_map_blocks() computes a length of a hole, it doesn't count with the fact that mapped offset may be somewhere in the middle of the completely empty subtree. In such case it will return too large length of the hole which then results in lseek(SEEK_DATA) to end up returning an incorrect offset beyond the end of the hole. Fix the problem by correctly taking offset within a subtree into account when computing a length of a hole. 
Fixes: facab4d9711e7aa3532cb82643803e8f1b9518e8 CC: stable@vger.kernel.org Reported-by: Jeff Mahoney Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/indirect.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index c32802c956d5..bf7fa1507e81 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -561,10 +561,16 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); int i; - /* Count number blocks in a subtree under 'partial' */ - count = 1; - for (i = 0; partial + i != chain + depth - 1; i++) - count *= epb; + /* + * Count number blocks in a subtree under 'partial'. At each + * level we count number of complete empty subtrees beyond + * current offset and then descend into the subtree only + * partially beyond current offset. + */ + count = 0; + for (i = partial - chain + 1; i < depth; i++) + count = count * epb + (epb - offsets[i] - 1); + count++; /* Fill in size of a hole we found */ map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, count); From 71fe989961374ae55863c5da527bcf45be020834 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Sun, 13 May 2018 16:01:49 -0400 Subject: [PATCH 09/23] fs: ext4: add new return type vm_fault_t Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. 
commit 1c8f422059ae ("mm: change return type to vm_fault_t") Signed-off-by: Souptick Joarder Signed-off-by: Theodore Ts'o Reviewed-by: Matthew Wilcox --- fs/ext4/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index fb6f023622fe..655d1c7bc614 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -277,10 +277,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } #ifdef CONFIG_FS_DAX -static int ext4_dax_huge_fault(struct vm_fault *vmf, +static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { - int result, error = 0; + int error = 0; + vm_fault_t result; int retries = 0; handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -335,7 +336,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, return result; } -static int ext4_dax_fault(struct vm_fault *vmf) +static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); } From 6390d33bf5d9b24fd4f96e415b6888f59c8494f9 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Sun, 13 May 2018 16:45:56 -0400 Subject: [PATCH 10/23] ext4: add verifier check for symlink with append/immutable flags The Linux VFS does not allow a way to set append/immutable attributes on symlinks; this is just not possible. If this is detected inform the user as the filesystem must be corrupted. Signed-off-by: Luis R.
Rodriguez Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0eb64e8f9602..9d512fa80d28 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4965,6 +4965,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { + /* VFS does not allow setting these so must be corruption */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + EXT4_ERROR_INODE(inode, + "immutable or append flags not allowed on symlinks"); + ret = -EFSCORRUPTED; + goto bad_inode; + } if (ext4_encrypted_inode(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; ext4_set_aops(inode); From eee597ac931305eff3d3fd1d61d6aae553bc0984 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 13 May 2018 19:28:35 -0400 Subject: [PATCH 11/23] ext4: update mtime in ext4_punch_hole even if no blocks are released Currently in ext4_punch_hole we're going to skip the mtime update if there are no actual blocks to release. However we've actually modified the file by zeroing the partial block so the mtime should be updated. Moreover the sync and datasync handling is skipped as well, which is also wrong. Fix it. 
Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o Reported-by: Joe Habermann Cc: --- fs/ext4/inode.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9d512fa80d28..58301a697379 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4298,28 +4298,28 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) EXT4_BLOCK_SIZE_BITS(sb); stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - /* If there are no blocks to remove, return now */ - if (first_block >= stop_block) - goto out_stop; + /* If there are blocks to remove, do it */ + if (stop_block > first_block) { - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_remove_space(inode, first_block, + stop_block - 1); + else + ret = ext4_ind_remove_space(handle, inode, first_block, + stop_block); - ret = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - if (ret) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; } - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_remove_space(inode, first_block, - stop_block - 1); - else - ret = ext4_ind_remove_space(handle, inode, first_block, - stop_block); - - up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); From 0c8e3fe35db9b66ae0030849545030ec7c0fc45c Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 13 May 2018 22:40:30 -0400 Subject: [PATCH 12/23] vfs: add the sb_start_intwrite_trylock() helper Needed by ext4 to test frozen fs before updating s_last_mounted. 
Signed-off-by: Amir Goldstein Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 760d8da1b6c7..cac41f1bad05 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1597,6 +1597,11 @@ static inline void sb_start_intwrite(struct super_block *sb) __sb_start_write(sb, SB_FREEZE_FS, true); } +static inline int sb_start_intwrite_trylock(struct super_block *sb) +{ + return __sb_start_write(sb, SB_FREEZE_FS, false); +} + extern bool inode_owner_or_capable(const struct inode *inode); From 833a950882d33a7dfc319d5e152fdf35028936eb Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 13 May 2018 22:44:23 -0400 Subject: [PATCH 13/23] ext4: factor out helper ext4_sample_last_mounted() Signed-off-by: Amir Goldstein Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/file.c | 82 ++++++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 36 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 655d1c7bc614..c48ea76b63e4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -381,50 +381,60 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int ext4_file_open(struct inode * inode, struct file * filp) +static int ext4_sample_last_mounted(struct super_block *sb, + struct vfsmount *mnt) { - struct super_block *sb = inode->i_sb; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct vfsmount *mnt = filp->f_path.mnt; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct path path; char buf[64], *cp; + handle_t *handle; + int err; + + if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED)) + return 0; + + if (sb_rdonly(sb)) + return 0; + + sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or 
filesystem images. + */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt; + path.dentry = mnt->mnt_root; + cp = d_path(&path, buf, sizeof(buf)); + if (IS_ERR(cp)) + return 0; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out; + strlcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); + ext4_handle_dirty_super(handle, sb); +out: + ext4_journal_stop(handle); + return err; +} + +static int ext4_file_open(struct inode * inode, struct file * filp) +{ int ret; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && - !sb_rdonly(sb))) { - sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; - /* - * Sample where the filesystem has been mounted and - * store it in the superblock for sysadmin convenience - * when trying to sort through large numbers of block - * devices or filesystem images. 
- */ - memset(buf, 0, sizeof(buf)); - path.mnt = mnt; - path.dentry = mnt->mnt_root; - cp = d_path(&path, buf, sizeof(buf)); - if (!IS_ERR(cp)) { - handle_t *handle; - int err; - - handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) { - ext4_journal_stop(handle); - return err; - } - strlcpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); - ext4_handle_dirty_super(handle, sb); - ext4_journal_stop(handle); - } - } + ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); + if (ret) + return ret; ret = fscrypt_file_open(inode, filp); if (ret) From db6516a5e7ddb6dc72d167b920f2f272596ea22d Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 13 May 2018 22:54:44 -0400 Subject: [PATCH 14/23] ext4: do not update s_last_mounted of a frozen fs If fs is frozen after mount and before the first file open, the update of s_last_mounted bypasses freeze protection and prints out a WARNING splat: $ mount /vdf $ fsfreeze -f /vdf $ cat /vdf/foo [ 31.578555] WARNING: CPU: 1 PID: 1415 at fs/ext4/ext4_jbd2.c:53 ext4_journal_check_start+0x48/0x82 [ 31.614016] Call Trace: [ 31.614997] __ext4_journal_start_sb+0xe4/0x1a4 [ 31.616771] ? ext4_file_open+0xb6/0x189 [ 31.618094] ext4_file_open+0xb6/0x189 If fs is frozen, skip s_last_mounted update. 
[backport hint: to apply to stable tree, need to apply also patches vfs: add the sb_start_intwrite_trylock() helper ext4: factor out helper ext4_sample_last_mounted()] Cc: stable@vger.kernel.org Fixes: bc0b0d6d69ee ("ext4: update the s_last_mounted field in the superblock") Signed-off-by: Amir Goldstein Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/file.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index c48ea76b63e4..7f8023340eb8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -393,7 +393,7 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED)) return 0; - if (sb_rdonly(sb)) + if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) return 0; sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; @@ -407,21 +407,25 @@ static int ext4_sample_last_mounted(struct super_block *sb, path.mnt = mnt; path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); + err = 0; if (IS_ERR(cp)) - return 0; + goto out; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + err = PTR_ERR(handle); if (IS_ERR(handle)) - return PTR_ERR(handle); + goto out; BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) - goto out; + goto out_journal; strlcpy(sbi->s_es->s_last_mounted, cp, sizeof(sbi->s_es->s_last_mounted)); ext4_handle_dirty_super(handle, sb); -out: +out_journal: ext4_journal_stop(handle); +out: + sb_end_intwrite(sb); return err; } From c89128a008381478121a76537762eb3a62052971 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 13 May 2018 23:02:19 -0400 Subject: [PATCH 15/23] ext4: handle errors on ext4_commit_super When remounting ext4 from ro to rw, currently it allows its transition, even if ext4_commit_super() returns EIO. 
Even worse thing is, after that, fs/buffer complains buffer dirty bits like: Call trace: [] mark_buffer_dirty+0x184/0x1a4 [] __ext4_handle_dirty_super+0x4c/0xfc [] ext4_file_open+0x154/0x1c0 [] do_dentry_open+0x114/0x2d0 [] vfs_open+0x5c/0x94 [] path_openat+0x668/0xfe8 [] do_filp_open+0x74/0x120 [] do_sys_open+0x148/0x254 [] SyS_openat+0x10/0x18 [] el0_svc_naked+0x24/0x28 EXT4-fs (dm-1): previous I/O error to superblock detected Buffer I/O error on dev dm-1, logical block 0, lost sync page write EXT4-fs (dm-1): re-mounted. Opts: (null) Buffer I/O error on dev dm-1, logical block 80, lost async page write Signed-off-by: Jaegeuk Kim Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d6d6e7db73d6..1388e56bb3f5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2146,12 +2146,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int res = 0; + int err = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); - res = SB_RDONLY; + err = -EROFS; } if (read_only) goto done; @@ -2184,7 +2184,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (sbi->s_journal) ext4_set_feature_journal_needs_recovery(sb); - ext4_commit_super(sb, 1); + err = ext4_commit_super(sb, 1); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " @@ -2196,7 +2196,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, sbi->s_mount_opt, sbi->s_mount_opt2); cleancache_init_fs(sb); - return res; + return err; } int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) @@ -4254,8 +4254,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } - if 
(ext4_setup_super(sb, es, sb_rdonly(sb))) + ret = ext4_setup_super(sb, es, sb_rdonly(sb)); + if (ret == -EROFS) { sb->s_flags |= SB_RDONLY; + ret = 0; + } else if (ret) + goto failed_mount4a; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && @@ -4790,11 +4794,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); - if (error) - return error; - - error = buffer_write_io_error(sbh); - if (error) { + if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); @@ -5195,8 +5195,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); - if (!ext4_setup_super(sb, es, 0)) - sb->s_flags &= ~SB_RDONLY; + + err = ext4_setup_super(sb, es, 0); + if (err) + goto restore_opts; + + sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) { @@ -5220,8 +5224,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } ext4_setup_system_zone(sb); - if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) - ext4_commit_super(sb, 1); + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { + err = ext4_commit_super(sb, 1); + if (err) + goto restore_opts; + } #ifdef CONFIG_QUOTA /* Release old quota file names */ From 9196f57151ffe817582efdde4f0b18eaa6637243 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sun, 20 May 2018 22:14:29 -0400 Subject: [PATCH 16/23] jbd2: remove bunch of empty lines with jbd2 debug See following dmesg output with jbd2 debug enabled: ...(start_this_handle, 313): New handle 00000000c88d6ceb going live. 
...(start_this_handle, 383): Handle 00000000c88d6ceb given 53 credits (total 53, free 32681) ...(do_get_write_access, 838): journal_head 0000000002856fc0, force_copy 0 ...(jbd2_journal_cancel_revoke, 421): journal_head 0000000002856fc0, cancelling revoke We have an extra line with every message, which is a waste of buffer. We can fix it by removing "\n" in the callers or by removing it in __jbd2_debug(); I checked that every jbd2_debug() caller passes '\n' explicitly. To avoid more lines, let's remove it inside __jbd2_debug(). Signed-off-by: Wang Shilong Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/jbd2/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index dfb057900e79..e94270192cad 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -114,7 +114,7 @@ void __jbd2_debug(int level, const char *file, const char *func, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf); va_end(args); } EXPORT_SYMBOL(__jbd2_debug); From 8bdd5b60e0273e6682d5c269c70e6287ae60eb55 Mon Sep 17 00:00:00 2001 From: Wang Long Date: Sun, 20 May 2018 22:38:26 -0400 Subject: [PATCH 17/23] jbd2: remove NULL check before calling kmem_cache_destroy() The kmem_cache_destroy() function already checks for null pointers, so we can remove the check at the call site. This patch also sets jbd2_handle_cache and jbd2_inode_cache to be NULL after freeing them in jbd2_journal_destroy_handle_cache().
Signed-off-by: Wang Long Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/jbd2/journal.c | 18 +++++++----------- fs/jbd2/revoke.c | 12 ++++-------- fs/jbd2/transaction.c | 6 ++---- 3 files changed, 13 insertions(+), 23 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e94270192cad..8ef6b6daaa7a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2302,8 +2302,7 @@ static void jbd2_journal_destroy_slabs(void) int i; for (i = 0; i < JBD2_MAX_SLABS; i++) { - if (jbd2_slab[i]) - kmem_cache_destroy(jbd2_slab[i]); + kmem_cache_destroy(jbd2_slab[i]); jbd2_slab[i] = NULL; } } @@ -2404,10 +2403,8 @@ static int jbd2_journal_init_journal_head_cache(void) static void jbd2_journal_destroy_journal_head_cache(void) { - if (jbd2_journal_head_cache) { - kmem_cache_destroy(jbd2_journal_head_cache); - jbd2_journal_head_cache = NULL; - } + kmem_cache_destroy(jbd2_journal_head_cache); + jbd2_journal_head_cache = NULL; } /* @@ -2665,11 +2662,10 @@ static int __init jbd2_journal_init_handle_cache(void) static void jbd2_journal_destroy_handle_cache(void) { - if (jbd2_handle_cache) - kmem_cache_destroy(jbd2_handle_cache); - if (jbd2_inode_cache) - kmem_cache_destroy(jbd2_inode_cache); - + kmem_cache_destroy(jbd2_handle_cache); + jbd2_handle_cache = NULL; + kmem_cache_destroy(jbd2_inode_cache); + jbd2_inode_cache = NULL; } /* diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 696ef15ec942..240779e4689c 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -180,14 +180,10 @@ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, void jbd2_journal_destroy_revoke_caches(void) { - if (jbd2_revoke_record_cache) { - kmem_cache_destroy(jbd2_revoke_record_cache); - jbd2_revoke_record_cache = NULL; - } - if (jbd2_revoke_table_cache) { - kmem_cache_destroy(jbd2_revoke_table_cache); - jbd2_revoke_table_cache = NULL; - } + kmem_cache_destroy(jbd2_revoke_record_cache); + jbd2_revoke_record_cache = NULL; + 
kmem_cache_destroy(jbd2_revoke_table_cache); + jbd2_revoke_table_cache = NULL; } int __init jbd2_journal_init_revoke_caches(void) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 8aa453784402..51dd68e67b0f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -49,10 +49,8 @@ int __init jbd2_journal_init_transaction_cache(void) void jbd2_journal_destroy_transaction_cache(void) { - if (transaction_cache) { - kmem_cache_destroy(transaction_cache); - transaction_cache = NULL; - } + kmem_cache_destroy(transaction_cache); + transaction_cache = NULL; } void jbd2_journal_free_transaction(transaction_t *transaction) From 21c580d88e2abd62b58ce34872cceb5c0d056330 Mon Sep 17 00:00:00 2001 From: Sean Fu Date: Sun, 20 May 2018 22:44:13 -0400 Subject: [PATCH 18/23] ext4: remove NULL check before calling kmem_cache_destroy() Signed-off-by: Sean Fu Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 3 +-- fs/ext4/mballoc.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 763ef185dd17..c4e6fb15101b 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -162,8 +162,7 @@ int __init ext4_init_es(void) void ext4_exit_es(void) { - if (ext4_es_cachep) - kmem_cache_destroy(ext4_es_cachep); + kmem_cache_destroy(ext4_es_cachep); } void ext4_es_init_tree(struct ext4_es_tree *tree) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0d473991eebd..243c42fdc155 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2537,8 +2537,7 @@ static void ext4_groupinfo_destroy_slabs(void) int i; for (i = 0; i < NR_GRPINFO_CACHES; i++) { - if (ext4_groupinfo_caches[i]) - kmem_cache_destroy(ext4_groupinfo_caches[i]); + kmem_cache_destroy(ext4_groupinfo_caches[i]); ext4_groupinfo_caches[i] = NULL; } } From f06925c7394236d769c788940c9a7b85dbe4c40c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 20 May 2018 22:49:54 -0400 Subject: [PATCH 19/23] ext4: 
report delalloc reserve as non-free in statfs for project quota This reserved space isn't committed yet but cannot be used for allocations. For userspace it has no difference from used space. XFS already does this. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Fixes: 689c958cbe6b ("ext4: add project quota support") --- fs/ext4/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1388e56bb3f5..c1c5c8775ae7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5289,7 +5289,8 @@ static int ext4_statfs_project(struct super_block *sb, dquot->dq_dqb.dqb_bsoftlimit : dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { - curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + curblock = (dquot->dq_dqb.dqb_curspace + + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; buf->f_blocks = limit; buf->f_bfree = buf->f_bavail = (buf->f_blocks > curblock) ? From 117166efb1ee8f13c38f9e96b258f16d4923f888 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 22 May 2018 16:15:24 -0400 Subject: [PATCH 20/23] ext4: do not allow external inodes for inline data The inline data feature was implemented before we added support for external inodes for xattrs. It makes no sense to support that combination, but the problem is that there are a number of extended attribute checks that are skipped if e_value_inum is non-zero. Unfortunately, the inline data code is completely e_value_inum unaware, and attempts to interpret the xattr fields as if it were an inline xattr --- at which point, Hilarity Ensues. This addresses CVE-2018-11412.
https://bugzilla.kernel.org/show_bug.cgi?id=199803 Reported-by: Jann Horn Reviewed-by: Andreas Dilger Signed-off-by: Theodore Ts'o Fixes: e50e5129f384 ("ext4: xattr-in-inode support") Cc: stable@kernel.org --- fs/ext4/inline.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 70cf4c7b268a..44b4fcdc3755 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -144,6 +144,12 @@ int ext4_find_inline_data_nolock(struct inode *inode) goto out; if (!is.s.not_found) { + if (is.s.here->e_value_inum) { + EXT4_ERROR_INODE(inode, "inline data xattr refers " + "to an external xattr inode"); + error = -EFSCORRUPTED; + goto out; + } EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + From eb9b5f01c33adebc31cbc236c02695f605b0e417 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 22 May 2018 17:14:07 -0400 Subject: [PATCH 21/23] ext4: bubble errors from ext4_find_inline_data_nolock() up to ext4_iget() If ext4_find_inline_data_nolock() returns an error it needs to get reflected up to ext4_iget(). In order to fix this, ext4_iget_extra_inode() needs to return an error (and not return void). This is related to "ext4: do not allow external inodes for inline data" (which fixes CVE-2018-11412) in that in the errors=continue case, it would be useful for userspace to receive an error indicating that the file system is corrupted.
Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@kernel.org --- fs/ext4/inode.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 58301a697379..2ea07efbe016 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4701,19 +4701,21 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, } } -static inline void ext4_iget_extra_inode(struct inode *inode, +static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= EXT4_INODE_SIZE(inode->i_sb) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); - ext4_find_inline_data_nolock(inode); + return ext4_find_inline_data_nolock(inode); } else EXT4_I(inode)->i_inline_off = 0; + return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) @@ -4913,7 +4915,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { - ext4_iget_extra_inode(inode, raw_inode, ei); + ret = ext4_iget_extra_inode(inode, raw_inode, ei); + if (ret) + goto bad_inode; } } From 8a2b307c21d4b290e3cbe33f768f194286d07c23 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 23 May 2018 11:31:03 -0400 Subject: [PATCH 22/23] ext4: correctly handle a zero-length xattr with a non-zero e_value_offs Ext4 will always create ext4 extended attributes which do not have a value (where e_value_size is zero) with e_value_offs set to zero. In most places e_value_offs will not be used in a substantive way if e_value_size is zero. 
There was one exception to this, which is in ext4_xattr_set_entry(), where, if a maliciously crafted file system contains an extended attribute whose e_value_offs is non-zero and whose e_value_size is 0, the attempt to remove this xattr will result in a negative value getting passed to memmove, leading to the following sadness: [ 41.225365] EXT4-fs (loop0): mounted filesystem with ordered data mode. Opts: (null) [ 44.538641] BUG: unable to handle kernel paging request at ffff9ec9a3000000 [ 44.538733] IP: __memmove+0x81/0x1a0 [ 44.538755] PGD 1249bd067 P4D 1249bd067 PUD 1249c1067 PMD 80000001230000e1 [ 44.538793] Oops: 0003 [#1] SMP PTI [ 44.539074] CPU: 0 PID: 1470 Comm: poc Not tainted 4.16.0-rc1+ #1 ... [ 44.539475] Call Trace: [ 44.539832] ext4_xattr_set_entry+0x9e7/0xf80 ... [ 44.539972] ext4_xattr_block_set+0x212/0xea0 ... [ 44.540041] ext4_xattr_set_handle+0x514/0x610 [ 44.540065] ext4_xattr_set+0x7f/0x120 [ 44.540090] __vfs_removexattr+0x4d/0x60 [ 44.540112] vfs_removexattr+0x75/0xe0 [ 44.540132] removexattr+0x4d/0x80 ... [ 44.540279] path_removexattr+0x91/0xb0 [ 44.540300] SyS_removexattr+0xf/0x20 [ 44.540322] do_syscall_64+0x71/0x120 [ 44.540344] entry_SYSCALL_64_after_hwframe+0x21/0x86 https://bugzilla.kernel.org/show_bug.cgi?id=199347 This addresses CVE-2018-10840. Reported-by: "Xu, Wen" Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@kernel.org Fixes: dec214d00e0d7 ("ext4: xattr inode deduplication") --- fs/ext4/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 499cb4b1fbd2..fc4ced59c565 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1688,7 +1688,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, /* No failures allowed past this point. */ - if (!s->not_found && here->e_value_offs) { + if (!s->not_found && here->e_value_size && here->e_value_offs) { /* Remove the old value.
*/ void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(here->e_value_offs); From 4f2f76f751433908364ccff82f437a57d0e6e9b7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 25 May 2018 12:51:25 -0400 Subject: [PATCH 23/23] ext4: fix fencepost error in check for inode count overflow during resize ext4_resize_fs() has an off-by-one bug when checking whether growing of a filesystem will not overflow inode count. As a result it allows a filesystem with 8192 inodes per group to grow to 64TB which overflows inode count to 0 and makes filesystem unusable. Fix it. Cc: stable@vger.kernel.org Fixes: 3f8a6411fbada1fa482276591e037f3b1adcf55b Reported-by: Jaco Kroon Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger --- fs/ext4/resize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b6bec270a8e4..d792b7689d92 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1933,7 +1933,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) return 0; n_group = ext4_get_group_number(sb, n_blocks_count - 1); - if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { + if (n_group >= (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { ext4_warning(sb, "resize would cause inodes_count overflow"); return -EINVAL; }