From f84891289e62a74e9b4942eaad80617368b2d778 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:21:10 -0400 Subject: [PATCH 01/44] ext4: create a new BH_Verified flag to avoid unnecessary metadata validation Create a new BH_Verified flag to indicate that we've verified all the data in a buffer_head for correctness. This allows us to bypass expensive verification steps when they are not necessary without missing them when they are. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 35 ++++++++++++++++++++++++++--------- include/linux/jbd_common.h | 2 ++ 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index abcdeab67f52..8c1334ee8c7f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -412,6 +412,26 @@ int ext4_ext_check_inode(struct inode *inode) return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); } +static int __ext4_ext_check_block(const char *function, unsigned int line, + struct inode *inode, + struct ext4_extent_header *eh, + int depth, + struct buffer_head *bh) +{ + int ret; + + if (buffer_verified(bh)) + return 0; + ret = ext4_ext_check(inode, eh, depth); + if (ret) + return ret; + set_buffer_verified(bh); + return ret; +} + +#define ext4_ext_check_block(inode, eh, depth, bh) \ + __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh) + #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { @@ -668,8 +688,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, i = depth; /* walk through the tree */ while (i) { - int need_to_validate = 0; - ext_debug("depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -688,8 +706,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, put_bh(bh); goto err; } - /* validate the extent entries */ - need_to_validate = 1; } eh = ext_block_hdr(bh); ppos++; @@ -703,7 +719,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_hdr = eh; i--; - if (need_to_validate && ext4_ext_check(inode, eh, i)) + if (ext4_ext_check_block(inode, eh, i, bh)) goto err; } @@ -1344,7 +1360,8 @@ static int ext4_ext_search_right(struct inode *inode, return -EIO; eh = ext_block_hdr(bh); /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check_block(inode, eh, + path->p_depth - depth, bh)) { put_bh(bh); return -EIO; } @@ -1357,7 +1374,7 @@ static int ext4_ext_search_right(struct inode *inode, if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) { put_bh(bh); return -EIO; } @@ -2644,8 +2661,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, err = -EIO; break; } - if (ext4_ext_check(inode, ext_block_hdr(bh), - depth - i - 1)) { + if (ext4_ext_check_block(inode, ext_block_hdr(bh), + depth - i - 1, bh)) { err = -EIO; break; } diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h index 6230f8556a4e..6133679bc4c0 100644 --- a/include/linux/jbd_common.h +++ b/include/linux/jbd_common.h @@ -12,6 +12,7 @@ enum jbd_state_bits { BH_State, /* Pins most journal_head state */ BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ + BH_Verified, /* Metadata block has been verified ok */ BH_JBDPrivateStart, /* First bit available for private use by FS */ }; @@ -24,6 +25,7 @@ TAS_BUFFER_FNS(Revoked, revoked) BUFFER_FNS(RevokeValid, revokevalid) TAS_BUFFER_FNS(RevokeValid, revokevalid) BUFFER_FNS(Freed, freed) +BUFFER_FNS(Verified, verified) static inline struct buffer_head *jh2bh(struct journal_head *jh) { From e615391896064eb5a0c760d086b8e1c6ecfffeab Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:23:10 -0400 Subject: [PATCH 02/44] ext4: change on-disk layout to support extended metadata checksumming Define flags and change structure definitions to allow checksumming of ext4 metadata. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 47 +++++++++++++++++++++++++++++++++++------- fs/ext4/ext4_extents.h | 13 ++++++++++++ fs/ext4/namei.c | 8 +++++++ fs/ext4/xattr.h | 4 +++- 4 files changed, 63 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0e01e90add8b..61dd739b64f0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -298,7 +298,9 @@ struct ext4_group_desc __le16 bg_free_inodes_count_lo;/* Free inodes count */ __le16 bg_used_dirs_count_lo; /* Directories count */ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ - __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ __le16 bg_itable_unused_lo; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ @@ -308,7 +310,10 @@ struct ext4_group_desc __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ __le16 bg_used_dirs_count_hi; /* Directories count MSB */ __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ - __u32 bg_reserved2[3]; + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; }; /* @@ -650,7 +655,8 @@ struct ext4_inode { __le16 l_i_file_acl_high; __le16 l_i_uid_high; /* these 2 fields */ __le16 l_i_gid_high; /* were reserved2[0] */ - __u32 l_i_reserved2; + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; } linux2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ @@ -666,7 +672,7 @@ struct ext4_inode { } masix2; } osd2; /* OS dependent 2 */ __le16 i_extra_isize; - __le16 i_pad1; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ @@ -768,7 +774,7 @@ do { \ #define i_gid_low i_gid #define i_uid_high osd2.linux2.l_i_uid_high #define i_gid_high osd2.linux2.l_i_gid_high -#define i_reserved2 osd2.linux2.l_i_reserved2 +#define i_checksum_lo osd2.linux2.l_i_checksum_lo #elif defined(__GNU__) @@ -1001,6 +1007,9 @@ extern void ext4_set_bits(void *bm, int cur, int len); #define EXT4_ERRORS_PANIC 3 /* Panic */ #define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + /* * Structure of the super block */ @@ -1087,7 +1096,7 @@ struct ext4_super_block { __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ - __u8 s_reserved_char_pad; + __u8 s_checksum_type; /* metadata checksum algorithm used */ __le16 s_reserved_pad; __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ __le32 s_snapshot_inum; /* Inode number of active snapshot */ @@ -1113,7 +1122,8 @@ struct ext4_super_block { __le32 s_usr_quota_inum; /* inode for tracking user quota */ __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ - __le32 s_reserved[109]; /* Padding to the end of the block */ + __le32 s_reserved[108]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) @@ -1414,6 +1424,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. + */ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 @@ -1526,6 +1542,18 @@ struct ext4_dir_entry_2 { char name[EXT4_NAME_LEN]; /* File name */ }; +/* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + /* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. @@ -1541,6 +1569,8 @@ struct ext4_dir_entry_2 { #define EXT4_FT_MAX 8 +#define EXT4_FT_DIR_CSUM 0xDE + /* * EXT4_DIR_PAD defines the directory entries boundaries * @@ -1741,7 +1771,8 @@ struct mmp_struct { __le16 mmp_check_interval; __le16 mmp_pad1; - __le32 mmp_pad2[227]; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ }; /* arguments passed to the mmp thread */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 0f58b86e3a02..94822e74ef73 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -63,8 +63,21 @@ * ext4_inode has i_block array (60 bytes total). * The first 12 bytes store ext4_extent_header; * the remainder stores an array of ext4_extent. + * For non-inode extent blocks, ext4_extent_tail + * follows the array. */ +/* + * This is the extent tail on-disk structure. + * All other extent structures are 12 bytes long. It turns out that + * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which + * covers all valid ext4 block sizes. Therefore, this tail structure can be + * crammed into the end of the block without having to rebalance the tree. + */ +struct ext4_extent_tail { + __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */ +}; + /* * This is the extent on-disk structure. * It's used at the bottom of the tree. diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 349d7b3671c8..b58bd5c8ffe7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -145,6 +145,14 @@ struct dx_map_entry u16 size; }; +/* + * This goes at the end of each htree block. + */ +struct dx_tail { + u32 dt_reserved; + __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); static inline unsigned dx_get_hash(struct dx_entry *entry); diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 25b7387ff183..91f31ca7d9af 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -27,7 +27,9 @@ struct ext4_xattr_header { __le32 h_refcount; /* reference count */ __le32 h_blocks; /* number of disk blocks used */ __le32 h_hash; /* hash value of all attributes */ - __u32 h_reserved[4]; /* zero right now */ + __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */ + /* id = inum if refcount=1, blknum otherwise */ + __u32 h_reserved[3]; /* zero right now */ }; struct ext4_xattr_ibody_header { From d25425f8e0ed01fc0167c043aee7e619fc3f6ab2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:25:10 -0400 Subject: [PATCH 03/44] ext4: record the checksum algorithm in use in the superblock Record the type of checksum algorithm we're using for metadata in the superblock, in case we ever want/need to change the algorithm. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e1fb1d5de58e..f489ffb3605d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -112,6 +112,16 @@ static struct file_system_type ext3_fs_type = { #define IS_EXT3_SB(sb) (0) #endif +static int ext4_verify_csum_type(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return es->s_checksum_type == EXT4_CRC32C_CHKSUM; +} + void *ext4_kvmalloc(size_t size, gfp_t flags) { void *ret; @@ -3025,6 +3035,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto cantfind_ext4; sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + /* Check for a known checksum algorithm */ + if (!ext4_verify_csum_type(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "unknown checksum algorithm."); + silent = 1; + goto cantfind_ext4; + } + /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); From 0441984a3398970ab4820410b9cf4ff85bf3a6b0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:27:10 -0400 Subject: [PATCH 04/44] ext4: load the crc32c driver if necessary Obtain a reference to the cryptoapi and crc32c if we mount a filesystem with metadata checksumming enabled. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/Kconfig | 2 ++ fs/ext4/ext4.h | 23 +++++++++++++++++++++++ fs/ext4/super.c | 16 ++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 9ed1bb1f319f..c22f17021b6e 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -2,6 +2,8 @@ config EXT4_FS tristate "The Extended 4 (ext4) filesystem" select JBD2 select CRC16 + select CRYPTO + select CRYPTO_CRC32C help This is the next generation of the ext3 filesystem. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 61dd739b64f0..f19192bde094 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef __KERNEL__ #include #endif @@ -1276,6 +1277,9 @@ struct ext4_sb_info { /* record the last minlen when FITRIM is called. */ atomic_t s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1639,6 +1643,25 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 +static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(sbi->s_chksum_driver)]; + } desc; + int err; + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + #ifdef __KERNEL__ /* hash info structure used by the directory hash */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f489ffb3605d..11a0a7078da7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -915,6 +915,8 @@ static void ext4_put_super(struct super_block *sb) unlock_super(sb); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); kfree(sbi); } @@ -3043,6 +3045,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto cantfind_ext4; } + /* Load the checksum driver */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); + ret = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto failed_mount; + } + } + /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); @@ -3728,6 +3742,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { remove_proc_entry("options", sbi->s_proc); remove_proc_entry(sb->s_id, ext4_proc_root); From a9c4731780544d52b243bf46e4dd635c67fa9f84 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:29:10 -0400 Subject: [PATCH 05/44] ext4: calculate and verify superblock checksum Calculate and verify the superblock checksum. Since the UUID and block group number are embedded in each copy of the superblock, we need only checksum the entire block. Refactor some of the code to eliminate open-coding of the checksum update call. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 10 ++++++++++ fs/ext4/ext4_jbd2.c | 9 ++++++++- fs/ext4/ext4_jbd2.h | 7 +++++-- fs/ext4/inode.c | 2 +- fs/ext4/namei.c | 4 ++-- fs/ext4/resize.c | 4 +++- fs/ext4/super.c | 47 +++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 76 insertions(+), 7 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f19192bde094..5adbce519b66 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1280,6 +1280,9 @@ struct ext4_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -2004,6 +2007,10 @@ extern int ext4_group_extend(struct super_block *sb, extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern int ext4_superblock_csum_verify(struct super_block *sb, + struct ext4_super_block *es); +extern void ext4_superblock_csum_set(struct super_block *sb, + struct ext4_super_block *es); extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern void ext4_kvfree(void *ptr); @@ -2279,6 +2286,9 @@ static inline void ext4_unlock_group(struct super_block *sb, static inline void ext4_mark_super_dirty(struct super_block *sb) { + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + ext4_superblock_csum_set(sb, es); if (EXT4_SB(sb)->s_journal == NULL) sb->s_dirt =1; } diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index aca179017582..90f7c2e84db1 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -138,16 +138,23 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, } int __ext4_handle_dirty_super(const char *where, unsigned int line, - handle_t *handle, struct super_block *sb) + handle_t *handle, struct super_block *sb, + int now) { struct buffer_head *bh = EXT4_SB(sb)->s_sbh; int err = 0; if (ext4_handle_valid(handle)) { + ext4_superblock_csum_set(sb, + (struct ext4_super_block *)bh->b_data); err = jbd2_journal_dirty_metadata(handle, bh); if (err) ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + } else if (now) { + ext4_superblock_csum_set(sb, + (struct ext4_super_block *)bh->b_data); + mark_buffer_dirty(bh); } else sb->s_dirt = 1; return err; diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 83b20fcf9400..f440e8f1841f 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -213,7 +213,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, struct buffer_head *bh); int __ext4_handle_dirty_super(const char *where, unsigned int line, - handle_t *handle, struct super_block *sb); + handle_t *handle, struct super_block *sb, + int now); #define ext4_journal_get_write_access(handle, bh) \ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) @@ -225,8 +226,10 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) +#define ext4_handle_dirty_super_now(handle, sb) \ + __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 1) #define ext4_handle_dirty_super(handle, sb) \ - __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) + __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 0) handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c77b0bd2c711..8bc21ecc1df5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3936,7 +3936,7 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_SET_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE); ext4_handle_sync(handle); - err = ext4_handle_dirty_super(handle, sb); + err = ext4_handle_dirty_super_now(handle, sb); } } raw_inode->i_generation = cpu_to_le32(inode->i_generation); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b58bd5c8ffe7..625125172d05 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2021,7 +2021,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* Insert this inode at the head of the on-disk orphan list... */ NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_super_now(handle, sb); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; @@ -2094,7 +2094,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (err) goto out_brelse; sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + err = ext4_handle_dirty_super_now(handle, inode->i_sb); } else { struct ext4_iloc iloc2; struct inode *i_prev = diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 59fa0be27251..e0374757a94b 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -796,7 +796,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, ext4_kvfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_super_now(handle, sb); if (err) ext4_std_error(sb, err); @@ -968,6 +968,8 @@ static void update_backups(struct super_block *sb, goto exit_err; } + ext4_superblock_csum_set(sb, (struct ext4_super_block *)data); + while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { struct buffer_head *bh; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 11a0a7078da7..f80c7e612829 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -122,6 +122,38 @@ static int ext4_verify_csum_type(struct super_block *sb, return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } +static __le32 ext4_superblock_csum(struct super_block *sb, + struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int offset = offsetof(struct ext4_super_block, s_checksum); + __u32 csum; + + csum = ext4_chksum(sbi, ~0, (char *)es, offset); + + return cpu_to_le32(csum); +} + +int ext4_superblock_csum_verify(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return es->s_checksum == ext4_superblock_csum(sb, es); +} + +void ext4_superblock_csum_set(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + es->s_checksum = ext4_superblock_csum(sb, es); +} + void *ext4_kvmalloc(size_t size, gfp_t flags) { void *ret; @@ -3057,6 +3089,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } + /* Check superblock checksum */ + if (!ext4_superblock_csum_verify(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "invalid superblock checksum. Run e2fsck?"); + silent = 1; + goto cantfind_ext4; + } + + /* Precompute checksum seed for all metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + sizeof(es->s_uuid)); + /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); @@ -4059,6 +4105,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) &EXT4_SB(sb)->s_freeinodes_counter)); sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); + ext4_superblock_csum_set(sb, es); mark_buffer_dirty(sbh); if (sync) { error = sync_dirty_buffer(sbh); From 814525f4df50a196464ce2c7abe91f693203060f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:31:10 -0400 Subject: [PATCH 06/44] ext4: calculate and verify inode checksums This patch introduces to ext4 the ability to calculate and verify inode checksums. This requires the use of a new ro compatibility flag and some accompanying e2fsprogs patches to provide the relevant features in tune2fs and e2fsck. The inode generation changes have been integrated into this patch. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++ fs/ext4/ialloc.c | 13 ++++++ fs/ext4/inode.c | 111 +++++++++++++++++++++++++++++++++++++++++++---- fs/ext4/ioctl.c | 7 +++ 4 files changed, 126 insertions(+), 8 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5adbce519b66..ca96401b4b81 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -915,6 +915,9 @@ struct ext4_inode_info { */ tid_t i_sync_tid; tid_t i_datasync_tid; + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; }; /* diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 409c2ee7750a..8207dfab2682 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -850,6 +850,19 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); + /* Precompute checksum seed for inode metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, + sizeof(gen)); + } + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8bc21ecc1df5..16a67359b4c7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,6 +47,73 @@ #define MPAGE_DA_EXTENT_TAIL 0x01 +static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u16 csum_lo; + __u16 csum_hi = 0; + __u32 csum; + + csum_lo = raw->i_checksum_lo; + raw->i_checksum_lo = 0; + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { + csum_hi = raw->i_checksum_hi; + raw->i_checksum_hi = 0; + } + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, + EXT4_INODE_SIZE(inode->i_sb)); + + raw->i_checksum_lo = csum_lo; + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + raw->i_checksum_hi = csum_hi; + + return csum; +} + +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + __u32 provided, calculated; + + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_LINUX) || + !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + provided = le16_to_cpu(raw->i_checksum_lo); + calculated = ext4_inode_csum(inode, raw, ei); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; + else + calculated &= 0xFFFF; + + return provided == calculated; +} + +static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + __u32 csum; + + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_LINUX) || + !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + csum = ext4_inode_csum(inode, raw, ei); + raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + raw->i_checksum_hi = cpu_to_le16(csum >> 16); +} + static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { @@ -3644,6 +3711,39 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT4_INODE_SIZE(inode->i_sb)) { + EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); + ret = -EIO; + goto bad_inode; + } + } else + ei->i_extra_isize = 0; + + /* Precompute checksum seed for inode metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = raw_inode->i_generation; + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, + sizeof(gen)); + } + + if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { + EXT4_ERROR_INODE(inode, "checksum invalid"); + ret = -EIO; + goto bad_inode; + } + inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); @@ -3721,12 +3821,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); - if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT4_INODE_SIZE(inode->i_sb)) { - ret = -EIO; - goto bad_inode; - } if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ ei->i_extra_isize = sizeof(struct ext4_inode) - @@ -3738,8 +3832,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) ext4_set_inode_state(inode, EXT4_STATE_XATTR); } - } else - ei->i_extra_isize = 0; + } EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); @@ -3963,6 +4056,8 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } + ext4_inode_csum_set(inode, raw_inode, ei); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); rc = ext4_handle_dirty_metadata(handle, NULL, bh); if (!err) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 6eee25591b81..feba55a225a6 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -152,6 +152,13 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!inode_owner_or_capable(inode)) return -EPERM; + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + ext4_warning(sb, "Setting inode version is not " + "supported with metadata_csum enabled."); + return -ENOTTY; + } + err = mnt_want_write_file(filp); if (err) return err; From 41a246d1ff75a95d2be3191ca6e6db139dc0f430 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:33:10 -0400 Subject: [PATCH 07/44] ext4: calculate and verify checksums for inode bitmaps Compute and verify the checksum of the inode bitmap; the checkum is stored in the block group descriptor. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/bitmap.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 20 ++++++++++++++++++++ fs/ext4/ialloc.c | 34 +++++++++++++++++++++++++++++----- fs/ext4/resize.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 5 deletions(-) diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index fa3af81ac565..5813e7f2e2c6 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -29,3 +29,42 @@ unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) #endif /* EXT4FS_DEBUG */ +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); + calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; + + return provided == calculated; +} + +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) +{ + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) + gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); +} diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ca96401b4b81..7f0ff88f32ba 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -317,6 +317,13 @@ struct ext4_group_desc __u32 bg_reserved; }; +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + /* * Structure of a flex block group info */ @@ -1844,6 +1851,12 @@ struct mmpd_data { /* bitmap.c */ extern unsigned int ext4_count_free(struct buffer_head *, unsigned); +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); /* balloc.c */ extern unsigned int ext4_block_group(struct super_block *sb, @@ -2094,6 +2107,13 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, struct ext4_group_desc *gdp); +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM | + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM); +} + static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8207dfab2682..fb897ec183c8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -82,12 +82,17 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); return 0; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); + gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); return EXT4_INODES_PER_GROUP(sb); } @@ -128,12 +133,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) return NULL; } if (bitmap_uptodate(bh)) - return bh; + goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); - return bh; + goto verify; } ext4_lock_group(sb, block_group); @@ -141,6 +146,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_init_inode_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); + set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; @@ -154,7 +160,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) */ set_bitmap_uptodate(bh); unlock_buffer(bh); - return bh; + goto verify; } /* * submit the buffer_head for reading @@ -171,6 +177,20 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) block_group, bitmap_blk); return NULL; } + +verify: + ext4_lock_group(sb, block_group); + if (!buffer_verified(bh) && + !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, + EXT4_INODES_PER_GROUP(sb) / 8)) { + ext4_unlock_group(sb, block_group); + put_bh(bh); + ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " + "inode_bitmap = %llu", block_group, bitmap_blk); + return NULL; + } + ext4_unlock_group(sb, block_group); + set_buffer_verified(bh); return bh; } @@ -276,6 +296,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ext4_used_dirs_set(sb, gdp, count); percpu_counter_dec(&sbi->s_dirs_counter); } + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); ext4_unlock_group(sb, block_group); @@ -751,7 +773,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, goto fail; /* Update the relevant bg descriptor fields */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + if (ext4_has_group_desc_csum(sb)) { int free; struct ext4_group_info *grp = ext4_get_group_info(sb, group); @@ -782,7 +804,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, atomic_inc(&sbi->s_flex_groups[f].used_dirs); } } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + if (ext4_has_group_desc_csum(sb)) { + ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); ext4_unlock_group(sb, group); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e0374757a94b..7810cac241e1 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1069,6 +1069,47 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, return err; } +static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) +{ + struct buffer_head *bh = sb_getblk(sb, block); + if (!bh) + return NULL; + + if (bitmap_uptodate(bh)) + return bh; + + lock_buffer(bh); + if (bh_submit_read(bh) < 0) { + unlock_buffer(bh); + brelse(bh); + return NULL; + } + unlock_buffer(bh); + + return bh; +} + +static int ext4_set_bitmap_checksums(struct super_block *sb, + ext4_group_t group, + struct ext4_group_desc *gdp, + struct ext4_new_group_data *group_data) +{ + struct buffer_head *bh; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 0; + + bh = ext4_get_bitmap(sb, group_data->inode_bitmap); + if (!bh) + return -EIO; + ext4_inode_bitmap_csum_set(sb, group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); + brelse(bh); + + return 0; +} + /* * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg */ @@ -1101,6 +1142,12 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, memset(gdp, 0, EXT4_DESC_SIZE(sb)); ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); + err = ext4_set_bitmap_checksums(sb, group, gdp, group_data); + if (err) { + ext4_std_error(sb, err); + break; + } + ext4_inode_table_set(sb, gdp, group_data->inode_table); ext4_free_group_clusters_set(sb, gdp, EXT4_B2C(sbi, group_data->free_blocks_count)); From fa77dcfafeaa6bc73293c646bfc3d5192dcf0be2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:35:10 -0400 Subject: [PATCH 08/44] ext4: calculate and verify block bitmap checksum Compute and verify the checksum of the block bitmap; this checksum is stored in the block group descriptor. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 39 ++++++++++++++++++++++++++++++++------- fs/ext4/bitmap.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 10 ++++++++++ fs/ext4/ialloc.c | 4 ++++ fs/ext4/mballoc.c | 8 +++++++- fs/ext4/resize.c | 7 +++++++ 6 files changed, 104 insertions(+), 8 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 4bbd07a6fa18..0a64e4d5dbad 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -174,6 +174,8 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_free_inodes_set(sb, gdp, 0); ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8); return; } memset(bh->b_data, 0, sb->s_blocksize); @@ -210,6 +212,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8); + gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); } /* Return the number of free blocks in a block group. It is used when @@ -276,9 +281,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, } static int ext4_valid_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, - unsigned int block_group, - struct buffer_head *bh) + struct ext4_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) { ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; @@ -325,6 +330,23 @@ static int ext4_valid_block_bitmap(struct super_block *sb, block_group, bitmap_blk); return 0; } + +void ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh) +{ + if (buffer_verified(bh)) + return; + + ext4_lock_group(sb, block_group); + if (ext4_valid_block_bitmap(sb, desc, block_group, bh) && + ext4_block_bitmap_csum_verify(sb, block_group, desc, bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8)) + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); +} + /** * ext4_read_block_bitmap() * @sb: super block @@ -355,12 +377,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) } if (bitmap_uptodate(bh)) - return bh; + goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); - return bh; + goto verify; } ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -379,7 +401,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) */ set_bitmap_uptodate(bh); unlock_buffer(bh); - return bh; + goto verify; } /* * submit the buffer_head for reading @@ -390,6 +412,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) get_bh(bh); submit_bh(READ, bh); return bh; +verify: + ext4_validate_block_bitmap(sb, desc, block_group, bh); + return bh; } /* Returns 0 on success, 1 on error */ @@ -412,7 +437,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, } clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ - ext4_valid_block_bitmap(sb, desc, block_group, bh); + ext4_validate_block_bitmap(sb, desc, block_group, bh); return 0; } diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 5813e7f2e2c6..b319721da26a 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -68,3 +68,47 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); } + +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); + calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; + + if (provided == calculated) + return 1; + + ext4_error(sb, "Bad block bitmap checksum: block_group = %u", group); + return 0; +} + +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) +{ + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) + gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); +} diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7f0ff88f32ba..7a4f16075257 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1857,8 +1857,18 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); /* balloc.c */ +extern void ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + unsigned int block_group, + struct buffer_head *bh); extern unsigned int ext4_block_group(struct super_block *sb, ext4_fsblk_t blocknr); extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index fb897ec183c8..a6383fcb714b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -753,6 +753,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); + ext4_block_bitmap_csum_set(sb, group, gdp, + block_bitmap_bh, + EXT4_BLOCKS_PER_GROUP(sb) / + 8); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 99ab428bcfa0..95a388fef397 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -788,7 +788,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) int first_block; struct super_block *sb; struct buffer_head *bhs; - struct buffer_head **bh; + struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; @@ -2797,6 +2797,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; ext4_free_group_clusters_set(sb, gdp, len); + ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); @@ -4659,6 +4661,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, ret = ext4_free_group_clusters(sb, gdp) + count_clusters; ext4_free_group_clusters_set(sb, gdp, ret); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); @@ -4803,6 +4807,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, mb_free_blocks(NULL, &e4b, bit, count); blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); ext4_free_group_clusters_set(sb, desc, blk_free_count); + ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 7810cac241e1..dc597e066308 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1107,6 +1107,13 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, EXT4_INODES_PER_GROUP(sb) / 8); brelse(bh); + bh = ext4_get_bitmap(sb, group_data->block_bitmap); + if (!bh) + return -EIO; + ext4_block_bitmap_csum_set(sb, group, gdp, bh, + EXT4_BLOCKS_PER_GROUP(sb) / 8); + brelse(bh); + return 0; } From 7ac5990d5a3e2e465162880819cc46c6427d3b6f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:37:10 -0400 Subject: [PATCH 09/44] ext4: verify and calculate checksums for extent tree blocks Calculate and verify the checksum for each extent tree block. The checksum is located in the space immediately after the last possible ext4_extent in the block. The space is is typically the last 4-8 bytes in the block. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_extents.h | 11 ++++++++++ fs/ext4/extents.c | 50 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 94822e74ef73..cb1b2c919963 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -114,6 +114,17 @@ struct ext4_extent_header { #define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) +#define EXT4_EXTENT_TAIL_OFFSET(hdr) \ + (sizeof(struct ext4_extent_header) + \ + (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max))) + +static inline struct ext4_extent_tail * +find_ext4_extent_tail(struct ext4_extent_header *eh) +{ + return (struct ext4_extent_tail *)(((void *)eh) + + EXT4_EXTENT_TAIL_OFFSET(eh)); +} + /* * Array of ext4_ext_path contains path to some extent. * Creation/lookup routines use it for traversal/splitting/etc. diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8c1334ee8c7f..6f3b49bd34c7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -52,6 +52,46 @@ #define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ #define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ +static __le32 ext4_extent_block_csum(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, + EXT4_EXTENT_TAIL_OFFSET(eh)); + return cpu_to_le32(csum); +} + +static int ext4_extent_block_csum_verify(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent_tail *et; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + et = find_ext4_extent_tail(eh); + if (et->et_checksum != ext4_extent_block_csum(inode, eh)) + return 0; + return 1; +} + +static void ext4_extent_block_csum_set(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent_tail *et; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + et = find_ext4_extent_tail(eh); + et->et_checksum = ext4_extent_block_csum(inode, eh); +} + static int ext4_split_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, @@ -117,6 +157,7 @@ static int __ext4_ext_dirty(const char *where, unsigned int line, { int err; if (path->p_bh) { + ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ err = __ext4_handle_dirty_metadata(where, line, handle, inode, path->p_bh); @@ -391,6 +432,12 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid extent entries"; goto corrupted; } + /* Verify checksum on non-root extent tree nodes */ + if (ext_depth(inode) != depth && + !ext4_extent_block_csum_verify(inode, eh)) { + error_msg = "extent tree corrupted"; + goto corrupted; + } return 0; corrupted: @@ -930,6 +977,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, le16_add_cpu(&neh->eh_entries, m); } + ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -1008,6 +1056,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } + ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -1105,6 +1154,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, else neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; + ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); From dbe89444042ab6540bc304343cfdcbc8b95d003d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:39:10 -0400 Subject: [PATCH 10/44] ext4: Calculate and verify checksums for htree nodes Calculate and verify the checksum for directory index tree (htree) node blocks. The checksum is stored in the last 4 bytes of the htree block and requires the dx_entry array to stop 1 dx_entry short of the end of the block. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 156 insertions(+), 4 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 625125172d05..f8efedde7593 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -188,6 +188,121 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); +/* checksumming functions */ +static struct dx_countlimit *get_dx_countlimit(struct inode *inode, + struct ext4_dir_entry *dirent, + int *offset) +{ + struct ext4_dir_entry *dp; + struct dx_root_info *root; + int count_offset; + + if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + count_offset = 8; + else if (le16_to_cpu(dirent->rec_len) == 12) { + dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); + if (le16_to_cpu(dp->rec_len) != + EXT4_BLOCK_SIZE(inode->i_sb) - 12) + return NULL; + root = (struct dx_root_info *)(((void *)dp + 12)); + if (root->reserved_zero || + root->info_length != sizeof(struct dx_root_info)) + return NULL; + count_offset = 32; + } else + return NULL; + + if (offset) + *offset = count_offset; + return (struct dx_countlimit *)(((void *)dirent) + count_offset); +} + +static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, + int count_offset, int count, struct dx_tail *t) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum, old_csum; + int size; + + size = count_offset + (count * sizeof(struct dx_entry)); + old_csum = t->dt_checksum; + t->dt_checksum = 0; + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); + t->dt_checksum = old_csum; + + return cpu_to_le32(csum); +} + +static int ext4_dx_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent) +{ + struct dx_countlimit *c; + struct dx_tail *t; + int count_offset, limit, count; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + c = get_dx_countlimit(inode, dirent, &count_offset); + if (!c) { + EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); + return 1; + } + limit = le16_to_cpu(c->limit); + count = le16_to_cpu(c->count); + if (count_offset + (limit * sizeof(struct dx_entry)) > + EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { + EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " + "tree checksum found. Run e2fsck -D."); + return 1; + } + t = (struct dx_tail *)(((struct dx_entry *)c) + limit); + + if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset, + count, t)) + return 0; + return 1; +} + +static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) +{ + struct dx_countlimit *c; + struct dx_tail *t; + int count_offset, limit, count; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + c = get_dx_countlimit(inode, dirent, &count_offset); + if (!c) { + EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); + return; + } + limit = le16_to_cpu(c->limit); + count = le16_to_cpu(c->count); + if (count_offset + (limit * sizeof(struct dx_entry)) > + EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { + EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " + "tree checksum. Run e2fsck -D."); + return; + } + t = (struct dx_tail *)(((struct dx_entry *)c) + limit); + + t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t); +} + +static inline int ext4_handle_dirty_dx_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + /* * p is at least 6 bytes before the end of page */ @@ -247,12 +362,20 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - EXT4_DIR_REC_LEN(2) - infosize; + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } static inline unsigned dx_node_limit(struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -398,6 +521,15 @@ dx_probe(const struct qstr *d_name, struct inode *dir, goto fail; } + if (!buffer_verified(bh) && + !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) { + ext4_warning(dir->i_sb, "Root failed checksum"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + set_buffer_verified(bh); + entries = (struct dx_entry *) (((char *)&root->info) + root->info.info_length); @@ -458,6 +590,17 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) goto fail2; at = entries = ((struct dx_node *) bh->b_data)->entries; + + if (!buffer_verified(bh) && + !ext4_dx_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) { + ext4_warning(dir->i_sb, "Node failed checksum"); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + } + set_buffer_verified(bh); + if (dx_get_limit(entries) != dx_node_limit (dir)) { ext4_warning(dir->i_sb, "dx entry: limit != node limit"); @@ -557,6 +700,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), 0, &err))) return err; /* Failure */ + + if (!buffer_verified(bh) && + !ext4_dx_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) { + ext4_warning(dir->i_sb, "Node failed checksum"); + return -EIO; + } + set_buffer_verified(bh); + p++; brelse(p->bh); p->bh = bh; @@ -1232,7 +1384,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; - err = ext4_handle_dirty_metadata(handle, dir, frame->bh); + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (err) goto journal_error; brelse(bh2); @@ -1419,7 +1571,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->bh = bh; bh = bh2; - ext4_handle_dirty_metadata(handle, dir, frame->bh); + ext4_handle_dirty_dx_node(handle, dir, frame->bh); ext4_handle_dirty_metadata(handle, dir, bh); de = do_split(handle,dir, &bh, frame, &hinfo, &retval); @@ -1594,7 +1746,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_handle_dirty_metadata(handle, dir, bh2); + err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); @@ -1620,7 +1772,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); + err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); if (err) { ext4_std_error(inode->i_sb, err); goto cleanup; From b0336e8d2108e6302aecaadd17c6c0bd686da22d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:41:10 -0400 Subject: [PATCH 11/44] ext4: calculate and verify checksums of directory leaf blocks Calculate and verify the checksums for directory leaf blocks (i.e. blocks that only contain actual directory entries). The checksum lives in what looks to be an unused directory entry with a 0 name_len at the end of the block. This scheme is not used for internal htree nodes because the mechanism in place there only costs one dx_entry, whereas the "empty" directory entry would cost two dx_entries. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 12 +++ fs/ext4/ext4.h | 2 + fs/ext4/namei.c | 260 +++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 259 insertions(+), 15 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index b86786202643..aa39e600d159 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -179,6 +179,18 @@ static int ext4_readdir(struct file *filp, continue; } + /* Check the checksum */ + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(inode, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_FILE(filp, 0, "directory fails checksum " + "at offset %llu", + (unsigned long long)filp->f_pos); + filp->f_pos += sb->s_blocksize - offset; + continue; + } + set_buffer_verified(bh); + revalidate: /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7a4f16075257..57d7c356eaab 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2019,6 +2019,8 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); extern int ext4_ext_migrate(struct inode *); /* namei.c */ +extern int ext4_dirent_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f8efedde7593..5861d64929f6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -189,6 +189,115 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); /* checksumming functions */ +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +static void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize) +{ + memset(t, 0, sizeof(struct ext4_dir_entry_tail)); + t->det_rec_len = ext4_rec_len_to_disk( + sizeof(struct ext4_dir_entry_tail), blocksize); + t->det_reserved_ft = EXT4_FT_DIR_CSUM; +} + +/* Walk through a dirent block to find a checksum "dirent" at the tail */ +static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, + struct ext4_dir_entry *de) +{ + struct ext4_dir_entry_tail *t; + +#ifdef PARANOID + struct ext4_dir_entry *d, *top; + + d = de; + top = (struct ext4_dir_entry *)(((void *)de) + + (EXT4_BLOCK_SIZE(inode->i_sb) - + sizeof(struct ext4_dir_entry_tail))); + while (d < top && d->rec_len) + d = (struct ext4_dir_entry *)(((void *)d) + + le16_to_cpu(d->rec_len)); + + if (d != top) + return NULL; + + t = (struct ext4_dir_entry_tail *)d; +#else + t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb)); +#endif + + if (t->det_reserved_zero1 || + le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) || + t->det_reserved_zero2 || + t->det_reserved_ft != EXT4_FT_DIR_CSUM) + return NULL; + + return t; +} + +static __le32 ext4_dirent_csum(struct inode *inode, + struct ext4_dir_entry *dirent, int size) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum; + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + return cpu_to_le32(csum); +} + +int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) +{ + struct ext4_dir_entry_tail *t; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + t = get_dirent_tail(inode, dirent); + if (!t) { + EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " + "leaf for checksum. Please run e2fsck -D."); + return 0; + } + + if (t->det_checksum != ext4_dirent_csum(inode, dirent, + (void *)t - (void *)dirent)) + return 0; + + return 1; +} + +static void ext4_dirent_csum_set(struct inode *inode, + struct ext4_dir_entry *dirent) +{ + struct ext4_dir_entry_tail *t; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + t = get_dirent_tail(inode, dirent); + if (!t) { + EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " + "leaf for checksum. Please run e2fsck -D."); + return; + } + + t->det_checksum = ext4_dirent_csum(inode, dirent, + (void *)t - (void *)dirent); +} + +static inline int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + static struct dx_countlimit *get_dx_countlimit(struct inode *inode, struct ext4_dir_entry *dirent, int *offset) @@ -737,6 +846,11 @@ static int htree_dirblock_to_tree(struct file *dir_file, if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) return err; + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) + return -EIO; + set_buffer_verified(bh); + de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - @@ -1096,6 +1210,15 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, brelse(bh); goto next; } + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_INODE(dir, "checksumming directory " + "block %lu", (unsigned long)block); + brelse(bh); + goto next; + } + set_buffer_verified(bh); i = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { @@ -1147,6 +1270,16 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q if (!(bh = ext4_bread(NULL, dir, block, 0, err))) goto errout; + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_INODE(dir, "checksumming directory " + "block %lu", (unsigned long)block); + brelse(bh); + *err = -EIO; + goto errout; + } + set_buffer_verified(bh); retval = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); @@ -1319,8 +1452,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, char *data1 = (*bh)->b_data, *data2; unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; + struct ext4_dir_entry_tail *t; + int csum_size = 0; int err = 0, i; + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + bh2 = ext4_append (handle, dir, &newblock, &err); if (!(bh2)) { brelse(*bh); @@ -1367,10 +1506,20 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* Fancy dance to stay within two buffers */ de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); de = dx_pack_dirents(data1, blocksize); - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, blocksize); - de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2, + de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - + (char *) de2, blocksize); + if (csum_size) { + t = EXT4_DIRENT_TAIL(data2, blocksize); + initialize_dirent_tail(t, blocksize); + + t = EXT4_DIRENT_TAIL(data1, blocksize); + initialize_dirent_tail(t, blocksize); + } + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); @@ -1381,7 +1530,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, de = de2; } dx_insert_block(frame, hash2 + continued, newblock); - err = ext4_handle_dirty_metadata(handle, dir, bh2); + err = ext4_handle_dirty_dirent_node(handle, dir, bh2); if (err) goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); @@ -1421,11 +1570,16 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, unsigned short reclen; int nlen, rlen, err; char *top; + int csum_size = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + blocksize - reclen; + top = bh->b_data + (blocksize - csum_size) - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) return -EIO; @@ -1481,7 +1635,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, dir->i_version++; ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, bh); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); return 0; @@ -1502,6 +1656,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct dx_frame frames[2], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; + struct ext4_dir_entry_tail *t; char *data1, *top; unsigned len; int retval; @@ -1509,6 +1664,11 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct dx_hash_info hinfo; ext4_lblk_t block; struct fake_dirent *fde; + int csum_size = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); @@ -1529,7 +1689,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, brelse(bh); return -EIO; } - len = ((char *) root) + blocksize - (char *) de; + len = ((char *) root) + (blocksize - csum_size) - (char *) de; /* Allocate new block for the 0th block's dirents */ bh2 = ext4_append(handle, dir, &block, &retval); @@ -1545,8 +1705,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, top = data1 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) de = de2; - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, blocksize); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(data1, blocksize); + initialize_dirent_tail(t, blocksize); + } + /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), @@ -1572,7 +1739,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, bh = bh2; ext4_handle_dirty_dx_node(handle, dir, frame->bh); - ext4_handle_dirty_metadata(handle, dir, bh); + ext4_handle_dirty_dirent_node(handle, dir, bh); de = do_split(handle,dir, &bh, frame, &hinfo, &retval); if (!de) { @@ -1608,11 +1775,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; struct buffer_head *bh; struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; struct super_block *sb; int retval; int dx_fallback=0; unsigned blocksize; ext4_lblk_t block, blocks; + int csum_size = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; blocksize = sb->s_blocksize; @@ -1631,6 +1804,11 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, bh = ext4_bread(handle, dir, block, 0, &retval); if(!bh) return retval; + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) + return -EIO; + set_buffer_verified(bh); retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); if (retval != -ENOSPC) { brelse(bh); @@ -1647,7 +1825,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; - de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); + de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(bh->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); brelse(bh); if (retval == 0) @@ -1679,6 +1863,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) goto cleanup; + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) + goto journal_error; + set_buffer_verified(bh); + BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) @@ -1804,12 +1993,17 @@ static int ext4_delete_entry(handle_t *handle, { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; int i, err; + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + i = 0; pde = NULL; de = (struct ext4_dir_entry_2 *) bh->b_data; - while (i < bh->b_size) { + while (i < bh->b_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, i)) return -EIO; if (de == de_del) { @@ -1830,7 +2024,7 @@ static int ext4_delete_entry(handle_t *handle, de->inode = 0; dir->i_version++; BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, dir, bh); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); if (unlikely(err)) { ext4_std_error(dir->i_sb, err); return err; @@ -1972,9 +2166,15 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; int err, retries = 0; + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; @@ -2015,16 +2215,24 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ext4_set_de_type(dir->i_sb, de, S_IFDIR); de = ext4_next_entry(de, blocksize); de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1), + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + EXT4_DIR_REC_LEN(1)), blocksize); de->name_len = 2; strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); set_nlink(inode, 2); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, dir_block); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); if (err) goto out_clear_inode; + set_buffer_verified(dir_block); err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); @@ -2074,6 +2282,14 @@ static int empty_dir(struct inode *inode) inode->i_ino); return 1; } + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(inode, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_INODE(inode, "checksum error reading directory " + "lblock 0"); + return -EIO; + } + set_buffer_verified(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; de1 = ext4_next_entry(de, sb->s_blocksize); if (le32_to_cpu(de->inode) != inode->i_ino || @@ -2105,6 +2321,14 @@ static int empty_dir(struct inode *inode) offset += sb->s_blocksize; continue; } + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(inode, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_INODE(inode, "checksum error " + "reading directory lblock 0"); + return -EIO; + } + set_buffer_verified(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; } if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { @@ -2605,6 +2829,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); if (!dir_bh) goto end_rename; + if (!buffer_verified(dir_bh) && + !ext4_dirent_csum_verify(old_inode, + (struct ext4_dir_entry *)dir_bh->b_data)) + goto end_rename; + set_buffer_verified(dir_bh); if (le32_to_cpu(PARENT_INO(dir_bh->b_data, old_dir->i_sb->s_blocksize)) != old_dir->i_ino) goto end_rename; @@ -2635,7 +2864,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh); + retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh); if (unlikely(retval)) { ext4_std_error(new_dir->i_sb, retval); goto end_rename; @@ -2689,7 +2918,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); + retval = ext4_handle_dirty_dirent_node(handle, old_inode, + dir_bh); if (retval) { ext4_std_error(old_dir->i_sb, retval); goto end_rename; From cc8e94fd126ab2d2e4bcb1b65c7316196f0cec8c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:43:10 -0400 Subject: [PATCH 12/44] ext4: Calculate and verify checksums of extended attribute blocks Calculate and verify the checksums of extended attribute blocks. This only applies to separate EA blocks that are pointed to by inode->i_file_acl (i.e. external EA blocks); the checksum lives in the EA header. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/xattr.c | 92 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 15 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e88748e55c0f..e56c9ed7d6e3 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -122,6 +122,58 @@ const struct xattr_handler *ext4_xattr_handlers[] = { NULL }; +static __le32 ext4_xattr_block_csum(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum, old; + + old = hdr->h_checksum; + hdr->h_checksum = 0; + if (le32_to_cpu(hdr->h_refcount) != 1) { + block_nr = cpu_to_le64(block_nr); + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr, + sizeof(block_nr)); + } else + csum = ei->i_csum_seed; + csum = ext4_chksum(sbi, csum, (__u8 *)hdr, + EXT4_BLOCK_SIZE(inode->i_sb)); + hdr->h_checksum = old; + return cpu_to_le32(csum); +} + +static int ext4_xattr_block_csum_verify(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) + return 0; + return 1; +} + +static void ext4_xattr_block_csum_set(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); +} + +static inline int ext4_handle_dirty_xattr_block(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_xattr_block_csum_set(inode, bh->b_blocknr, BHDR(bh)); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + static inline const struct xattr_handler * ext4_xattr_handler(int name_index) { @@ -156,12 +208,22 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) } static inline int -ext4_xattr_check_block(struct buffer_head *bh) +ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) { + int error; + + if (buffer_verified(bh)) + return 0; + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) return -EIO; - return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) + return -EIO; + error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + if (!error) + set_buffer_verified(bh); + return error; } static inline int @@ -224,7 +286,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(bh)) { + if (ext4_xattr_check_block(inode, bh)) { bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); @@ -369,7 +431,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(bh)) { + if (ext4_xattr_check_block(inode, bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EIO; @@ -492,7 +554,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ce) mb_cache_entry_release(ce); unlock_buffer(bh); - error = ext4_handle_dirty_metadata(handle, inode, bh); + error = ext4_handle_dirty_xattr_block(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, 1); @@ -662,7 +724,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); - if (ext4_xattr_check_block(bs->bh)) { + if (ext4_xattr_check_block(inode, bs->bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EIO; @@ -725,9 +787,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (error == -EIO) goto bad_block; if (!error) - error = ext4_handle_dirty_metadata(handle, - inode, - bs->bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, + bs->bh); if (error) goto cleanup; goto inserted; @@ -796,9 +858,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ea_bdebug(new_bh, "reusing; refcount now=%d", le32_to_cpu(BHDR(new_bh)->h_refcount)); unlock_buffer(new_bh); - error = ext4_handle_dirty_metadata(handle, - inode, - new_bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, + new_bh); if (error) goto cleanup_dquot; } @@ -855,8 +917,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_cache_insert(new_bh); - error = ext4_handle_dirty_metadata(handle, - inode, new_bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, new_bh); if (error) goto cleanup; } @@ -1193,7 +1255,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, error = -EIO; if (!bh) goto cleanup; - if (ext4_xattr_check_block(bh)) { + if (ext4_xattr_check_block(inode, bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EIO; From feb0ab32a57e4e6c8b24f6fb68f0ce08efe4603c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:45:10 -0400 Subject: [PATCH 13/44] ext4: make block group checksums use metadata_csum algorithm metadata_csum supersedes uninit_bg. Convert the ROCOMPAT uninit_bg flag check to a helper function that covers both, and make the checksum calculation algorithm use either crc16 or the metadata_csum chosen algorithm depending on which flag is set. Print a warning if we try to mount a filesystem with both feature flags set. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 4 +-- fs/ext4/ext4.h | 6 ++-- fs/ext4/ialloc.c | 17 +++++----- fs/ext4/inode.c | 3 +- fs/ext4/mballoc.c | 6 ++-- fs/ext4/resize.c | 9 ++---- fs/ext4/super.c | 79 +++++++++++++++++++++++++++++++++-------------- 7 files changed, 75 insertions(+), 49 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0a64e4d5dbad..5745d53c67e8 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -168,7 +168,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); ext4_free_group_clusters_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); @@ -214,7 +214,7 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, sb->s_blocksize * 8, bh->b_data); ext4_block_bitmap_csum_set(sb, block_group, gdp, bh, EXT4_BLOCKS_PER_GROUP(sb) / 8); - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_group_desc_csum_set(sb, block_group, gdp); } /* Return the number of free blocks in a block group. It is used when diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 57d7c356eaab..84e76bb96070 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2114,10 +2114,10 @@ extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); -extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); static inline int ext4_has_group_desc_csum(struct super_block *sb) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index a6383fcb714b..a044a9b77491 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -70,13 +70,11 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); ext4_free_group_clusters_set(sb, gdp, 0); ext4_free_inodes_set(sb, gdp, 0); @@ -92,7 +90,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, bh->b_data); ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, EXT4_INODES_PER_GROUP(sb) / 8); - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_group_desc_csum_set(sb, block_group, gdp); return EXT4_INODES_PER_GROUP(sb); } @@ -298,7 +296,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) } ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); percpu_counter_inc(&sbi->s_freeinodes_counter); @@ -731,7 +729,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, got: /* We may have to initialize the block bitmap if it isn't already */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && + if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; @@ -757,8 +755,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, block_bitmap_bh, EXT4_BLOCKS_PER_GROUP(sb) / 8); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, - gdp); + ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); @@ -811,7 +808,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, if (ext4_has_group_desc_csum(sb)) { ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); } @@ -1181,7 +1178,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, skip_zeroout: ext4_lock_group(sb, group); gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 16a67359b4c7..7bccdf32a32c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3584,8 +3584,7 @@ static int __ext4_get_inode_loc(struct inode *inode, b = table; end = b + EXT4_SB(sb)->s_inode_readahead_blks; num = EXT4_INODES_PER_GROUP(sb); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 95a388fef397..ccb9c2f527d5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2799,7 +2799,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_free_group_clusters_set(sb, gdp, len); ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh, EXT4_BLOCKS_PER_GROUP(sb) / 8); - gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); + ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); @@ -4663,7 +4663,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_free_group_clusters_set(sb, gdp, ret); ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, EXT4_BLOCKS_PER_GROUP(sb) / 8); - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); @@ -4809,7 +4809,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_free_group_clusters_set(sb, desc, blk_free_count); ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh, EXT4_BLOCKS_PER_GROUP(sb) / 8); - desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); + ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, EXT4_B2C(sbi, blocks_freed)); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index dc597e066308..a58ce7c507ae 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1160,7 +1160,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, EXT4_B2C(sbi, group_data->free_blocks_count)); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); gdp->bg_flags = cpu_to_le16(*bg_flags); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_group_desc_csum_set(sb, group, gdp); err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { @@ -1399,17 +1399,14 @@ static int ext4_setup_next_flex_gd(struct super_block *sb, (1 + ext4_bg_num_gdb(sb, group + i) + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; group_data[i].free_blocks_count = blocks_per_group - overhead; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + if (ext4_has_group_desc_csum(sb)) flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | EXT4_BG_INODE_UNINIT; else flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; } - if (last_group == n_group && - EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + if (last_group == n_group && ext4_has_group_desc_csum(sb)) /* We need to initialize block bitmap of last group. */ flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f80c7e612829..c18c5968cd96 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1952,43 +1952,69 @@ static int ext4_fill_flex_info(struct super_block *sb) return 0; } -__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, - struct ext4_group_desc *gdp) +static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) { + int offset; __u16 crc = 0; + __le32 le_group = cpu_to_le32(block_group); - if (sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - int offset = offsetof(struct ext4_group_desc, bg_checksum); - __le32 le_group = cpu_to_le32(block_group); + if ((sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { + /* Use new metadata_csum algorithm */ + __u16 old_csum; + __u32 csum32; - crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); - crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); - crc = crc16(crc, (__u8 *)gdp, offset); - offset += sizeof(gdp->bg_checksum); /* skip checksum */ - /* for checksum of struct ext4_group_desc do the rest...*/ - if ((sbi->s_es->s_feature_incompat & - cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && - offset < le16_to_cpu(sbi->s_es->s_desc_size)) - crc = crc16(crc, (__u8 *)gdp + offset, - le16_to_cpu(sbi->s_es->s_desc_size) - - offset); + old_csum = gdp->bg_checksum; + gdp->bg_checksum = 0; + csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, + sizeof(le_group)); + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, + sbi->s_desc_size); + gdp->bg_checksum = old_csum; + + crc = csum32 & 0xFFFF; + goto out; } + /* old crc16 code */ + offset = offsetof(struct ext4_group_desc, bg_checksum); + + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); + crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); + crc = crc16(crc, (__u8 *)gdp, offset); + offset += sizeof(gdp->bg_checksum); /* skip checksum */ + /* for checksum of struct ext4_group_desc do the rest...*/ + if ((sbi->s_es->s_feature_incompat & + cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && + offset < le16_to_cpu(sbi->s_es->s_desc_size)) + crc = crc16(crc, (__u8 *)gdp + offset, + le16_to_cpu(sbi->s_es->s_desc_size) - + offset); + +out: return cpu_to_le16(crc); } -int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, +int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { - if ((sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) && - (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp))) + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb), + block_group, gdp))) return 0; return 1; } +void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, + struct ext4_group_desc *gdp) +{ + if (!ext4_has_group_desc_csum(sb)) + return; + gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp); +} + /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, ext4_group_t *first_not_zeroed) @@ -2043,7 +2069,7 @@ static int ext4_check_descriptors(struct super_block *sb, return 0; } ext4_lock_group(sb, i); - if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { + if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, @@ -3069,6 +3095,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto cantfind_ext4; sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + /* Warn if metadata_csum and gdt_csum are both set. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are " + "redundant flags; please run fsck."); + /* Check for a known checksum algorithm */ if (!ext4_verify_csum_type(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " @@ -4400,7 +4433,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) struct ext4_group_desc *gdp = ext4_get_group_desc(sb, g, NULL); - if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { + if (!ext4_group_desc_csum_verify(sb, g, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_remount: Checksum for group %u failed (%u!=%u)", g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), From 5c359a47e7d999a0ea7f397da2c15590d0a82815 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 29 Apr 2012 18:47:10 -0400 Subject: [PATCH 14/44] ext4: add checksums to the MMP block Compute and verify a checksum for the MMP block. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 +++ fs/ext4/mmp.c | 44 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 84e76bb96070..0ee9669394f5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2413,6 +2413,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); +extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp); +extern int ext4_mmp_csum_verify(struct super_block *sb, + struct mmp_struct *mmp); /* BH_Uninit flag: blocks are allocated but uninitialized on disk */ enum ext4_state_bits { diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index ed6548d89165..f99a1311e847 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -6,12 +6,45 @@ #include "ext4.h" +/* Checksumming functions */ +static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int offset = offsetof(struct mmp_struct, mmp_checksum); + __u32 csum; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset); + + return cpu_to_le32(csum); +} + +int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); +} + +void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); +} + /* * Write the MMP block using WRITE_SYNC to try to get the block on-disk * faster. */ -static int write_mmp_block(struct buffer_head *bh) +static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) { + struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); + + ext4_mmp_csum_set(sb, mmp); mark_buffer_dirty(bh); lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; @@ -59,7 +92,8 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, } mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC || + !ext4_mmp_csum_verify(sb, mmp)) return -EINVAL; return 0; @@ -120,7 +154,7 @@ static int kmmpd(void *data) mmp->mmp_time = cpu_to_le64(get_seconds()); last_update_time = jiffies; - retval = write_mmp_block(bh); + retval = write_mmp_block(sb, bh); /* * Don't spew too many error messages. Print one every * (s_mmp_update_interval * 60) seconds. @@ -200,7 +234,7 @@ static int kmmpd(void *data) mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); mmp->mmp_time = cpu_to_le64(get_seconds()); - retval = write_mmp_block(bh); + retval = write_mmp_block(sb, bh); failed: kfree(data); @@ -299,7 +333,7 @@ int ext4_multi_mount_protect(struct super_block *sb, seq = mmp_new_seq(); mmp->mmp_seq = cpu_to_le32(seq); - retval = write_mmp_block(bh); + retval = write_mmp_block(sb, bh); if (retval) goto failed; From b09de7fa5217bbcb4caf3d19bd5e94816947ff7a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 30 Apr 2012 07:40:00 -0400 Subject: [PATCH 15/44] ext4: remove unnecessary check in add_dirent_to_buf() None of this function callers ever pass in a NULL inode pointer, so this check is unnecessary, and the else clause is dead code. (This change should make the code coverage people a little happier. :-) Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5861d64929f6..a9fd5f48f3bf 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1612,11 +1612,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, de = de1; } de->file_type = EXT4_FT_UNKNOWN; - if (inode) { - de->inode = cpu_to_le32(inode->i_ino); - ext4_set_de_type(dir->i_sb, de, inode->i_mode); - } else - de->inode = 0; + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(dir->i_sb, de, inode->i_mode); de->name_len = namelen; memcpy(de->name, name, namelen); /* From f32aaf2d2b996b14e993da1c23f7ee22ea7333a3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 21 May 2012 11:42:02 -0400 Subject: [PATCH 16/44] ext4: enable the 64-bit jbd2 feature based on the 64-bit ext4 feature Previously we were only enabling the 64-bit jbd2 feature if the number of blocks in the file system was greater 2**32-1. The problem with this is that it makes it harder to test the 64-bit journal code paths with small file systems, since a small test file system would with the 64-bit ext4 feature enable would use a 64-bit file system on-disk data structures, but use a 32-bit journal. This would also cause problems when trying to do an online resize to grow the filesystem above the 2**32-1 boundary. Fortunately the patch to support online resize for 64-bit file systems hasn't been merged yet, so this problem hasn't arisen in practice. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c18c5968cd96..88c2054fb938 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3603,7 +3603,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto no_journal; } - if (ext4_blocks_count(es) > 0xffffffffULL && + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); From 8f888ef846d4481e24c74b4a91ece771d2bcbcb5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 22 May 2012 22:43:41 -0400 Subject: [PATCH 17/44] jbd2: change disk layout for metadata checksumming Define flags and allocate space in on-disk journal structures to support checksumming of journal metadata. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 4 ++-- fs/jbd2/recovery.c | 6 +++--- include/linux/jbd2.h | 30 ++++++++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 840f70f50792..69d780310aea 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -627,7 +627,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) tag = (journal_block_tag_t *) tagp; write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); - tag->t_flags = cpu_to_be32(tag_flag); + tag->t_flags = cpu_to_be16(tag_flag); tagp += tag_bytes; space_left -= tag_bytes; @@ -651,7 +651,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) submitting the IOs. "tag" still points to the last tag we set up. */ - tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG); + tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); start_journal_io: for (i = 0; i < bufs; i++) { diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index c1a03354a22f..980f3d6b5f88 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -193,10 +193,10 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) nr++; tagp += tag_bytes; - if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID))) + if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) tagp += 16; - if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG)) + if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) break; } @@ -485,7 +485,7 @@ static int do_one_pass(journal_t *journal, unsigned long io_block; tag = (journal_block_tag_t *) tagp; - flags = be32_to_cpu(tag->t_flags); + flags = be16_to_cpu(tag->t_flags); io_block = next_log_block++; wrap(journal, next_log_block); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 912c30a8ddb1..809c439066c5 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -147,12 +147,24 @@ typedef struct journal_header_s #define JBD2_CRC32_CHKSUM 1 #define JBD2_MD5_CHKSUM 2 #define JBD2_SHA1_CHKSUM 3 +#define JBD2_CRC32C_CHKSUM 4 #define JBD2_CRC32_CHKSUM_SIZE 4 #define JBD2_CHECKSUM_BYTES (32 / sizeof(u32)) /* * Commit block header for storing transactional checksums: + * + * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum* + * fields are used to store a checksum of the descriptor and data blocks. + * + * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum + * field is used to store crc32c(uuid+commit_block). Each journal metadata + * block gets its own checksum, and data block checksums are stored in + * journal_block_tag (in the descriptor). The other h_chksum* fields are + * not used. + * + * Checksum v1 and v2 are mutually exclusive features. */ struct commit_header { __be32 h_magic; @@ -175,13 +187,19 @@ struct commit_header { typedef struct journal_block_tag_s { __be32 t_blocknr; /* The on-disk block number */ - __be32 t_flags; /* See below */ + __be16 t_checksum; /* truncated crc32c(uuid+seq+block) */ + __be16 t_flags; /* See below */ __be32 t_blocknr_high; /* most-significant high 32bits. */ } journal_block_tag_t; #define JBD2_TAG_SIZE32 (offsetof(journal_block_tag_t, t_blocknr_high)) #define JBD2_TAG_SIZE64 (sizeof(journal_block_tag_t)) +/* Tail of descriptor block, for checksumming */ +struct jbd2_journal_block_tail { + __be32 t_checksum; /* crc32c(uuid+descr_block) */ +}; + /* * The revoke descriptor: used on disk to describe a series of blocks to * be revoked from the log @@ -192,6 +210,10 @@ typedef struct jbd2_journal_revoke_header_s __be32 r_count; /* Count of bytes used in the block */ } jbd2_journal_revoke_header_t; +/* Tail of revoke block, for checksumming */ +struct jbd2_journal_revoke_tail { + __be32 r_checksum; /* crc32c(uuid+revoke_block) */ +}; /* Definitions for the journal tag flags word: */ #define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ @@ -241,7 +263,10 @@ typedef struct journal_superblock_s __be32 s_max_trans_data; /* Limit of data blocks per trans. */ /* 0x0050 */ - __u32 s_padding[44]; + __u8 s_checksum_type; /* checksum type */ + __u8 s_padding2[3]; + __u32 s_padding[42]; + __be32 s_checksum; /* crc32c(superblock) */ /* 0x0100 */ __u8 s_users[16*48]; /* ids of all fs'es sharing the log */ @@ -263,6 +288,7 @@ typedef struct journal_superblock_s #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 +#define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008 /* Features known to this kernel version: */ #define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM From 25ed6e8a54df904c875365eebedbd19138a47328 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 27 May 2012 07:48:56 -0400 Subject: [PATCH 18/44] jbd2: enable journal clients to enable v2 checksumming Add in the necessary code so that journal clients can enable the new journal checksumming features. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 55 ++++++++++++++++++++++++++++++++++++----------- fs/jbd2/journal.c | 50 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 13 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 88c2054fb938..8dfb42e380ea 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3013,6 +3013,44 @@ static void ext4_destroy_lazyinit_thread(void) kthread_stop(ext4_lazyinit_task); } +static int set_journal_csum_feature_set(struct super_block *sb) +{ + int ret = 1; + int compat, incompat; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + /* journal checksum v2 */ + compat = 0; + incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2; + } else { + /* journal checksum v1 */ + compat = JBD2_FEATURE_COMPAT_CHECKSUM; + incompat = 0; + } + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ret = jbd2_journal_set_features(sbi->s_journal, + compat, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + incompat); + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + ret = jbd2_journal_set_features(sbi->s_journal, + compat, 0, + incompat); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + JBD2_FEATURE_INCOMPAT_CSUM_V2); + } + + return ret; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) { char *orig_data = kstrdup(data, GFP_KERNEL); @@ -3610,19 +3648,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else if (test_opt(sb, JOURNAL_CHECKSUM)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); - jbd2_journal_clear_features(sbi->s_journal, 0, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else { - jbd2_journal_clear_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + if (!set_journal_csum_feature_set(sb)) { + ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " + "feature set"); + goto failed_mount_wq; } /* We have now updated the journal if required, so we can diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 1afb701622b0..63175f9391ab 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -97,6 +97,15 @@ EXPORT_SYMBOL(jbd2_inode_cache); static void __journal_abort_soft (journal_t *journal, int errno); static int jbd2_journal_create_slab(size_t slab_size); +/* Checksumming functions */ +int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) +{ + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return 1; + + return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; +} + /* * Helper function used to manage commit timeouts */ @@ -1376,6 +1385,9 @@ static int journal_get_superblock(journal_t *journal) } } + if (buffer_verified(bh)) + return 0; + sb = journal->j_superblock; err = -EINVAL; @@ -1413,6 +1425,21 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && + JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { + /* Can't have checksum v1 and v2 on at the same time! */ + printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 " + "at the same time!\n"); + goto out; + } + + if (!jbd2_verify_csum_type(journal, sb)) { + printk(KERN_ERR "JBD: Unknown checksum type\n"); + goto out; + } + + set_buffer_verified(bh); + return 0; out: @@ -1653,6 +1680,10 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com int jbd2_journal_set_features (journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { +#define INCOMPAT_FEATURE_ON(f) \ + ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f))) +#define COMPAT_FEATURE_ON(f) \ + ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f))) journal_superblock_t *sb; if (jbd2_journal_check_used_features(journal, compat, ro, incompat)) @@ -1661,16 +1692,35 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) return 0; + /* Asking for checksumming v2 and v1? Only give them v2. */ + if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 && + compat & JBD2_FEATURE_COMPAT_CHECKSUM) + compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; + jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", compat, ro, incompat); sb = journal->j_superblock; + /* If enabling v2 checksums, update superblock */ + if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) { + sb->s_checksum_type = JBD2_CRC32C_CHKSUM; + sb->s_feature_compat &= + ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); + } + + /* If enabling v1 checksums, downgrade superblock */ + if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) + sb->s_feature_incompat &= + ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2); + sb->s_feature_compat |= cpu_to_be32(compat); sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); return 1; +#undef COMPAT_FEATURE_ON +#undef INCOMPAT_FEATURE_ON } /* From 01b5adcebb977bc61b64167adce6d8260c9da33c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 27 May 2012 07:50:56 -0400 Subject: [PATCH 19/44] jbd2: Grab a reference to the crc32c driver if necessary Obtain a reference to the crc32c driver if needed for the v2 checksum. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/jbd2/Kconfig | 2 ++ fs/jbd2/journal.c | 25 +++++++++++++++++++++++++ include/linux/jbd2.h | 23 +++++++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig index f32f346f4b0a..69a48c2944da 100644 --- a/fs/jbd2/Kconfig +++ b/fs/jbd2/Kconfig @@ -1,6 +1,8 @@ config JBD2 tristate select CRC32 + select CRYPTO + select CRYPTO_CRC32C help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 63175f9391ab..f04ab6c4b428 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1438,6 +1438,17 @@ static int journal_get_superblock(journal_t *journal) goto out; } + /* Load the checksum driver */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD: Cannot load crc32c driver.\n"); + err = PTR_ERR(journal->j_chksum_driver); + journal->j_chksum_driver = NULL; + goto out; + } + } + set_buffer_verified(bh); return 0; @@ -1591,6 +1602,8 @@ int jbd2_journal_destroy(journal_t *journal) iput(journal->j_inode); if (journal->j_revoke) jbd2_journal_destroy_revoke(journal); + if (journal->j_chksum_driver) + crypto_free_shash(journal->j_chksum_driver); kfree(journal->j_wbuf); kfree(journal); @@ -1707,6 +1720,18 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); + + /* Load the checksum driver */ + if (journal->j_chksum_driver == NULL) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", + 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD: Cannot load crc32c " + "driver.\n"); + journal->j_chksum_driver = NULL; + return 0; + } + } } /* If enabling v1 checksums, downgrade superblock */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 809c439066c5..71e77dddebf1 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -31,6 +31,7 @@ #include #include #include +#include #endif #define journal_oom_retry 1 @@ -965,6 +966,9 @@ struct journal_s * superblock pointer here */ void *j_private; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *j_chksum_driver; }; /* @@ -1294,6 +1298,25 @@ static inline int jbd_space_needed(journal_t *journal) extern int jbd_blocks_per_page(struct inode *inode); +static inline u32 jbd2_chksum(journal_t *journal, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(journal->j_chksum_driver)]; + } desc; + int err; + + desc.shash.tfm = journal->j_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + #ifdef __KERNEL__ #define buffer_trace_init(bh) do {} while (0) From 4fd5ea43bc11602bfabe2c8f5378586d34bd2b0a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 27 May 2012 08:08:22 -0400 Subject: [PATCH 20/44] jbd2: checksum journal superblock Calculate and verify a checksum covering the journal superblock. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 47 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/jbd2.h | 3 +++ 2 files changed, 50 insertions(+) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f04ab6c4b428..9072f03d30b5 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -106,6 +106,34 @@ int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; } +static __u32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) +{ + __u32 csum, old_csum; + + old_csum = sb->s_checksum; + sb->s_checksum = 0; + csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t)); + sb->s_checksum = old_csum; + + return cpu_to_be32(csum); +} + +int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) +{ + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return 1; + + return sb->s_checksum == jbd2_superblock_csum(j, sb); +} + +void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) +{ + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return; + + sb->s_checksum = jbd2_superblock_csum(j, sb); +} + /* * Helper function used to manage commit timeouts */ @@ -1357,6 +1385,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal) jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", journal->j_errno); sb->s_errno = cpu_to_be32(journal->j_errno); + jbd2_superblock_csum_set(journal, sb); read_unlock(&journal->j_state_lock); jbd2_write_superblock(journal, WRITE_SYNC); @@ -1449,6 +1478,17 @@ static int journal_get_superblock(journal_t *journal) } } + /* Check superblock checksum */ + if (!jbd2_superblock_csum_verify(journal, sb)) { + printk(KERN_ERR "JBD: journal checksum error\n"); + goto out; + } + + /* Precompute checksum seed for all metadata */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + set_buffer_verified(bh); return 0; @@ -1732,6 +1772,13 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, return 0; } } + + /* Precompute checksum seed for all metadata */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_CSUM_V2)) + journal->j_csum_seed = jbd2_chksum(journal, ~0, + sb->s_uuid, + sizeof(sb->s_uuid)); } /* If enabling v1 checksums, downgrade superblock */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 71e77dddebf1..a9632bc55d97 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -969,6 +969,9 @@ struct journal_s /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *j_chksum_driver; + + /* Precomputed journal UUID checksum for seeding other checksums */ + __u32 j_csum_seed; }; /* From 42a7106de636ebf9c0b93d25b4230e14f5f2682e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 27 May 2012 08:08:24 -0400 Subject: [PATCH 21/44] jbd2: checksum revocation blocks Compute and verify revoke blocks inside the journal. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/jbd2/recovery.c | 22 ++++++++++++++++++++++ fs/jbd2/revoke.c | 27 ++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 980f3d6b5f88..728ecd0705d9 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -706,6 +706,25 @@ static int do_one_pass(journal_t *journal, return err; } +static int jbd2_revoke_block_csum_verify(journal_t *j, + void *buf) +{ + struct jbd2_journal_revoke_tail *tail; + __u32 provided, calculated; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return 1; + + tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - + sizeof(struct jbd2_journal_revoke_tail)); + provided = tail->r_checksum; + tail->r_checksum = 0; + calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + tail->r_checksum = provided; + + provided = be32_to_cpu(provided); + return provided == calculated; +} /* Scan a revoke record, marking all blocks mentioned as revoked. */ @@ -720,6 +739,9 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, offset = sizeof(jbd2_journal_revoke_header_t); max = be32_to_cpu(header->r_count); + if (!jbd2_revoke_block_csum_verify(journal, header)) + return -EINVAL; + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) record_len = 8; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 6973705d6a3d..f30b80b4ce8b 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -578,6 +578,7 @@ static void write_one_revoke_record(journal_t *journal, struct jbd2_revoke_record_s *record, int write_op) { + int csum_size = 0; struct journal_head *descriptor; int offset; journal_header_t *header; @@ -592,9 +593,13 @@ static void write_one_revoke_record(journal_t *journal, descriptor = *descriptorp; offset = *offsetp; + /* Do we need to leave space at the end for a checksum? */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + csum_size = sizeof(struct jbd2_journal_revoke_tail); + /* Make sure we have a descriptor with space left for the record */ if (descriptor) { - if (offset == journal->j_blocksize) { + if (offset >= journal->j_blocksize - csum_size) { flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } @@ -631,6 +636,24 @@ static void write_one_revoke_record(journal_t *journal, *offsetp = offset; } +static void jbd2_revoke_csum_set(journal_t *j, + struct journal_head *descriptor) +{ + struct jbd2_journal_revoke_tail *tail; + __u32 csum; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return; + + tail = (struct jbd2_journal_revoke_tail *) + (jh2bh(descriptor)->b_data + j->j_blocksize - + sizeof(struct jbd2_journal_revoke_tail)); + tail->r_checksum = 0; + csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, + j->j_blocksize); + tail->r_checksum = cpu_to_be32(csum); +} + /* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to @@ -652,6 +675,8 @@ static void flush_descriptor(journal_t *journal, header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; header->r_count = cpu_to_be32(offset); + jbd2_revoke_csum_set(journal, descriptor); + set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); From 3caa487f53f65fd1e3950a6b6ae1709e6c43b334 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 27 May 2012 08:10:22 -0400 Subject: [PATCH 22/44] jbd2: checksum descriptor blocks Calculate and verify a checksum of each descriptor block. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 25 ++++++++++++++++++++++++- fs/jbd2/recovery.c | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 69d780310aea..5d505cfd21c2 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -301,6 +301,24 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); } +static void jbd2_descr_block_csum_set(journal_t *j, + struct journal_head *descriptor) +{ + struct jbd2_journal_block_tail *tail; + __u32 csum; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return; + + tail = (struct jbd2_journal_block_tail *) + (jh2bh(descriptor)->b_data + j->j_blocksize - + sizeof(struct jbd2_journal_block_tail)); + tail->t_checksum = 0; + csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, + j->j_blocksize); + tail->t_checksum = cpu_to_be32(csum); +} + /* * jbd2_journal_commit_transaction * @@ -334,6 +352,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) unsigned long first_block; tid_t first_tid; int update_tail; + int csum_size = 0; + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + csum_size = sizeof(struct jbd2_journal_block_tail); /* * First job: lock down the current transaction and wait for @@ -643,7 +665,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (bufs == journal->j_wbufsize || commit_transaction->t_buffers == NULL || - space_left < tag_bytes + 16) { + space_left < tag_bytes + 16 + csum_size) { jbd_debug(4, "JBD2: Submit %d IOs\n", bufs); @@ -653,6 +675,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); + jbd2_descr_block_csum_set(journal, descriptor); start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 728ecd0705d9..066ee603db50 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -174,6 +174,25 @@ static int jread(struct buffer_head **bhp, journal_t *journal, return 0; } +static int jbd2_descr_block_csum_verify(journal_t *j, + void *buf) +{ + struct jbd2_journal_block_tail *tail; + __u32 provided, calculated; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return 1; + + tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - + sizeof(struct jbd2_journal_block_tail)); + provided = tail->t_checksum; + tail->t_checksum = 0; + calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + tail->t_checksum = provided; + + provided = be32_to_cpu(provided); + return provided == calculated; +} /* * Count the number of in-use tags in a journal descriptor block. @@ -186,6 +205,9 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) int nr = 0, size = journal->j_blocksize; int tag_bytes = journal_tag_bytes(journal); + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + size -= sizeof(struct jbd2_journal_block_tail); + tagp = &bh->b_data[sizeof(journal_header_t)]; while ((tagp - bh->b_data + tag_bytes) <= size) { @@ -366,6 +388,7 @@ static int do_one_pass(journal_t *journal, int blocktype; int tag_bytes = journal_tag_bytes(journal); __u32 crc32_sum = ~0; /* Transactional Checksums */ + int descr_csum_size = 0; /* * First thing is to establish what we expect to find in the log @@ -451,6 +474,18 @@ static int do_one_pass(journal_t *journal, switch(blocktype) { case JBD2_DESCRIPTOR_BLOCK: + /* Verify checksum first */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_CSUM_V2)) + descr_csum_size = + sizeof(struct jbd2_journal_block_tail); + if (descr_csum_size > 0 && + !jbd2_descr_block_csum_verify(journal, + bh->b_data)) { + err = -EIO; + goto failed; + } + /* If it is a valid descriptor block, replay it * in pass REPLAY; if journal_checksums enabled, then * calculate checksums in PASS_SCAN, otherwise, @@ -481,7 +516,7 @@ static int do_one_pass(journal_t *journal, tagp = &bh->b_data[sizeof(journal_header_t)]; while ((tagp - bh->b_data + tag_bytes) - <= journal->j_blocksize) { + <= journal->j_blocksize - descr_csum_size) { unsigned long io_block; tag = (journal_block_tag_t *) tagp; From 1f56c5890e3e815c6f4eabfc87a8a81f439b6f3d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 27 May 2012 08:10:25 -0400 Subject: [PATCH 23/44] jbd2: checksum commit blocks Calculate and verify the checksum of commit blocks. In checksum v2, deprecate most of the checksum v1 commit block checksum fields, since each block has its own checksum. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 19 +++++++++++++++++++ fs/jbd2/recovery.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 5d505cfd21c2..16e1fe2a769f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -85,6 +85,24 @@ static void release_buffer_page(struct buffer_head *bh) __brelse(bh); } +static void jbd2_commit_block_csum_set(journal_t *j, + struct journal_head *descriptor) +{ + struct commit_header *h; + __u32 csum; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return; + + h = (struct commit_header *)(jh2bh(descriptor)->b_data); + h->h_chksum_type = 0; + h->h_chksum_size = 0; + h->h_chksum[0] = 0; + csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, + j->j_blocksize); + h->h_chksum[0] = cpu_to_be32(csum); +} + /* * Done it all: now submit the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort @@ -128,6 +146,7 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; tmp->h_chksum[0] = cpu_to_be32(crc32_sum); } + jbd2_commit_block_csum_set(journal, descriptor); JBUFFER_TRACE(descriptor, "submit commit block"); lock_buffer(bh); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 066ee603db50..df4a20e7d854 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -375,6 +375,24 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh, return 0; } +static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) +{ + struct commit_header *h; + __u32 provided, calculated; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return 1; + + h = buf; + provided = h->h_chksum[0]; + h->h_chksum[0] = 0; + calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + h->h_chksum[0] = provided; + + provided = be32_to_cpu(provided); + return provided == calculated; +} + static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { @@ -685,6 +703,19 @@ static int do_one_pass(journal_t *journal, } crc32_sum = ~0; } + if (pass == PASS_SCAN && + !jbd2_commit_block_csum_verify(journal, + bh->b_data)) { + info->end_transaction = next_commit_ID; + + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + journal->j_failed_commit = + next_commit_ID; + brelse(bh); + break; + } + } brelse(bh); next_commit_ID++; continue; From c390087591dcbecd244c31d979ccdad49ae83364 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 27 May 2012 08:12:12 -0400 Subject: [PATCH 24/44] jbd2: checksum data blocks that are stored in the journal Calculate and verify checksums of each data block being stored in the journal. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 22 ++++++++++++++++++++++ fs/jbd2/journal.c | 10 ++++++++-- fs/jbd2/recovery.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 16e1fe2a769f..216f4299f65e 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -338,6 +338,26 @@ static void jbd2_descr_block_csum_set(journal_t *j, tail->t_checksum = cpu_to_be32(csum); } +static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, + struct buffer_head *bh, __u32 sequence) +{ + struct page *page = bh->b_page; + __u8 *addr; + __u32 csum; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return; + + sequence = cpu_to_be32(sequence); + addr = kmap_atomic(page, KM_USER0); + csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, + sizeof(sequence)); + csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data), + bh->b_size); + kunmap_atomic(addr, KM_USER0); + + tag->t_checksum = cpu_to_be32(csum); +} /* * jbd2_journal_commit_transaction * @@ -669,6 +689,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) tag = (journal_block_tag_t *) tagp; write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); tag->t_flags = cpu_to_be16(tag_flag); + jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh), + commit_transaction->t_tid); tagp += tag_bytes; space_left -= tag_bytes; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9072f03d30b5..e9a3c4c85594 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2097,10 +2097,16 @@ int jbd2_journal_blocks_per_page(struct inode *inode) */ size_t journal_tag_bytes(journal_t *journal) { + journal_block_tag_t tag; + size_t x = 0; + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + x += sizeof(tag.t_checksum); + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) - return JBD2_TAG_SIZE64; + return x + JBD2_TAG_SIZE64; else - return JBD2_TAG_SIZE32; + return x + JBD2_TAG_SIZE32; } /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index df4a20e7d854..0131e4362534 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -393,6 +393,23 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) return provided == calculated; } +static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, + void *buf, __u32 sequence) +{ + __u32 provided, calculated; + + if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + return 1; + + sequence = cpu_to_be32(sequence); + calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, + sizeof(sequence)); + calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize); + provided = be32_to_cpu(tag->t_checksum); + + return provided == cpu_to_be32(calculated); +} + static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { @@ -569,6 +586,19 @@ static int do_one_pass(journal_t *journal, goto skip_write; } + /* Look for block corruption */ + if (!jbd2_block_tag_csum_verify( + journal, tag, obh->b_data, + be32_to_cpu(tmp->h_sequence))) { + brelse(obh); + success = -EIO; + printk(KERN_ERR "JBD: Invalid " + "checksum recovering " + "block %llu in log\n", + blocknr); + continue; + } + /* Find a buffer for the new * data being restored */ nbh = __getblk(journal->j_fs_dev, From e93376c20b70d1e62bb3246acd1bbe21fe58859f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 27 May 2012 08:12:42 -0400 Subject: [PATCH 25/44] ext4/jbd2: add metadata checksumming to the list of supported features Activate the metadata checksumming feature by adding it to ext4 and jbd2's lists of supported features. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 ++- include/linux/jbd2.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0ee9669394f5..d1e2735ac22f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1494,7 +1494,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ - EXT4_FEATURE_RO_COMPAT_BIGALLOC) + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) /* * Default values for user and/or group using reserved blocks diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a9632bc55d97..f334c7fab967 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -296,7 +296,8 @@ typedef struct journal_superblock_s #define JBD2_KNOWN_ROCOMPAT_FEATURES 0 #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ JBD2_FEATURE_INCOMPAT_64BIT | \ - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \ + JBD2_FEATURE_INCOMPAT_CSUM_V2) #ifdef __KERNEL__ From bb3d132a24cd8bf5e7773b2d9f9baa58b07a7dae Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 28 May 2012 14:16:57 -0400 Subject: [PATCH 26/44] ext4: fix potential NULL dereference in ext4_free_inodes_counts() The ext4_get_group_desc() function returns NULL on error, and ext4_free_inodes_count() function dereferences it without checking. There is a check on the next line, but it's too late. Reviewed-by: Jan Kara Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ialloc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index a044a9b77491..ea32d7e1d6e3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -508,10 +508,12 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - grp_free = ext4_free_inodes_count(sb, desc); - if (desc && grp_free && grp_free >= avefreei) { - *group = grp; - return 0; + if (desc) { + grp_free = ext4_free_inodes_count(sb, desc); + if (grp_free && grp_free >= avefreei) { + *group = grp; + return 0; + } } } From 7e84b6216467b84cd332c8e567bf5aa113fd2f38 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 28 May 2012 14:17:25 -0400 Subject: [PATCH 27/44] ext4: force ro mount if ext4_setup_super() fails If ext4_setup_super() fails i.e. due to a too-high revision, the error is logged in dmesg but the fs is not mounted RO as indicated. Tested by: # mkfs.ext4 -r 4 /dev/sdb6 # mount /dev/sdb6 /mnt/test # dmesg | grep "too high" [164919.759248] EXT4-fs (sdb6): revision level too high, forcing read-only mode # grep sdb6 /proc/mounts /dev/sdb6 /mnt/test2 ext4 rw,seclabel,relatime,data=ordered 0 0 Reviewed-by: Andreas Dilger Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8dfb42e380ea..d9466a508414 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3732,7 +3732,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4; } - ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY); + if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { From 9d99012ff26380a09092a9fddbb6e5f996dc631f Mon Sep 17 00:00:00 2001 From: Akira Fujita Date: Mon, 28 May 2012 14:19:25 -0400 Subject: [PATCH 28/44] ext4: remove needs_recovery in ext4_mb_init() needs_recovery in ext4_mb_init() is not used, remove it. Signed-off-by: Akira Fujita Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- fs/ext4/mballoc.c | 2 +- fs/ext4/super.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d1e2735ac22f..511011a1e85d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1948,7 +1948,7 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* mballoc.c */ extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; -extern int ext4_mb_init(struct super_block *, int); +extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ccb9c2f527d5..45cb31e8dd27 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2375,7 +2375,7 @@ static int ext4_groupinfo_create_slab(size_t size) return 0; } -int ext4_mb_init(struct super_block *sb, int needs_recovery) +int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d9466a508414..dbc9544c7087 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3768,7 +3768,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } ext4_ext_init(sb); - err = ext4_mb_init(sb, needs_recovery); + err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); From 967ac8af4475ce45474800709b12137aa7634c77 Mon Sep 17 00:00:00 2001 From: Haogang Chen Date: Mon, 28 May 2012 14:21:55 -0400 Subject: [PATCH 29/44] ext4: fix potential integer overflow in alloc_flex_gd() In alloc_flex_gd(), when flexbg_size is large, kmalloc size would overflow and flex_gd->groups would point to a buffer smaller than expected, causing OOB accesses when it is used. Note that in ext4_resize_fs(), flexbg_size is calculated using sbi->s_log_groups_per_flex, which is read from the disk and only bounded to [1, 31]. The patch returns NULL for too large flexbg_size. Reviewed-by: Eric Sandeen Signed-off-by: Haogang Chen Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/resize.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a58ce7c507ae..05c7979226e5 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -161,6 +161,8 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) if (flex_gd == NULL) goto out3; + if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data)) + goto out2; flex_gd->count = flexbg_size; flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * From 7e936b737211e6b54e34b71a827e56b872e958d8 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Mon, 28 May 2012 17:02:25 -0400 Subject: [PATCH 30/44] ext4: disallow hard-linked directory in ext4_lookup A hard-linked directory to its parent can cause the VFS to deadlock, and is a sign of a corrupted file system. So detect this case in ext4_lookup(), before the rmdir() lockup scenario can take place. Signed-off-by: Andreas Dilger Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/namei.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a9fd5f48f3bf..daf8260f0b03 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1330,6 +1330,12 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); return ERR_PTR(-EIO); } + if (unlikely(ino == dir->i_ino)) { + EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir", + dentry->d_name.len, + dentry->d_name.name); + return ERR_PTR(-EIO); + } inode = ext4_iget(dir->i_sb, ino); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, From 2716b80284c5ca415b8ce93ca178b0ca28482568 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 28 May 2012 17:47:52 -0400 Subject: [PATCH 31/44] ext4: remove redundundant "(char *) bh->b_data" casts The b_data field of the buffer_head is already a char *, so there's no point casting it to a char *. Signed-off-by: "Theodore Ts'o" --- fs/ext4/resize.c | 2 +- fs/ext4/super.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 05c7979226e5..7ea6cbb44121 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1145,7 +1145,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, */ gdb_bh = sbi->s_group_desc[gdb_num]; /* Update group descriptor block for new group */ - gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data + + gdp = (struct ext4_group_desc *)(gdb_bh->b_data + gdb_off * EXT4_DESC_SIZE(sb)); memset(gdp, 0, EXT4_DESC_SIZE(sb)); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dbc9544c7087..545932c0a290 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3126,7 +3126,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Note: s_es must be initialized as soon as possible because * some ext4 macro-instructions depend on its value */ - es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + es = (struct ext4_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) @@ -3335,7 +3335,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "Can't read superblock on 2nd try"); goto failed_mount; } - es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); + es = (struct ext4_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { ext4_msg(sb, KERN_ERR, @@ -3976,7 +3976,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } - es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + es = (struct ext4_super_block *) (bh->b_data + offset); if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { From 2cde417de013b2e5ae3007374d937e8c4c426bd4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 28 May 2012 17:49:54 -0400 Subject: [PATCH 32/44] ext4: return ENOMEM when mounts fail due to lack of memory This is a port of the ext3 commit: 4569cd1b0d9 Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 545932c0a290..628cfcdc0823 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3527,6 +3527,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) GFP_KERNEL); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); + ret = -ENOMEM; goto failed_mount; } @@ -3584,6 +3585,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); + ret = err; goto failed_mount3; } From 400db9d30146dc062aaba97a6301b425eb6015bc Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 28 May 2012 17:53:53 -0400 Subject: [PATCH 33/44] ext4: cleanup in ext4_discard_allocated_blocks() remove 'len' variable in ext4_discard_allocated_blocks() because it is useless. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 45cb31e8dd27..e4b4ac1ec684 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3073,13 +3073,9 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; - int len; - - if (pa && pa->pa_type == MB_INODE_PA) { - len = ac->ac_b_ex.fe_len; - pa->pa_free += len; - } + if (pa && pa->pa_type == MB_INODE_PA) + pa->pa_free += ac->ac_b_ex.fe_len; } /* From 4a3c3a5120bff77cc8c3a70c7cb681f458f47bfd Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 28 May 2012 17:55:16 -0400 Subject: [PATCH 34/44] ext4: fix format flag in ext4_ext_binsearch_idx() fix ext_debug format flag in ext4_ext_binsearch_idx(). Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6f3b49bd34c7..496151f893eb 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -603,7 +603,7 @@ ext4_ext_binsearch_idx(struct inode *inode, } path->p_idx = l - 1; - ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), + ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH From 8563000d3bf3ccfccce335c092a7c43078be8ffd Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 28 May 2012 18:06:51 -0400 Subject: [PATCH 35/44] ext4: use consistent ssize_t type in ext4_file_write() The generic_file_aio_write() function returns ssize_t, and ext4_file_write() returns a ssize_t, so use a ssize_t to collect the return value from generic_file_aio_write(). It shouldn't matter since the VFS read/write paths shouldn't allow a read greater than MAX_INT, but there was previously a bug in the AIO code paths, and it's best if we use a consistent type so that the return value from generic_file_aio_write() can't get truncated. Reported-by: Jouni Siren Reviewed-by: Jan Kara Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index cb70f1812a70..8c7642a00054 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -95,7 +95,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; int unaligned_aio = 0; - int ret; + ssize_t ret; /* * If we have encountered a bitmap-format file, the size limit From 6f2e9f0e7d795214b9cf5a47724a273b705fd113 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 28 May 2012 18:20:59 -0400 Subject: [PATCH 36/44] ext4: protect group inode free counting with group lock Now when we set the group inode free count, we don't have a proper group lock so that multiple threads may decrease the inode free count at the same time. And e2fsck will complain something like: Free inodes count wrong for group #1 (1, counted=0). Fix? no Free inodes count wrong for group #2 (3, counted=0). Fix? no Directories count wrong for group #2 (780, counted=779). Fix? no Free inodes count wrong for group #3 (2272, counted=2273). Fix? no So this patch try to protect it with the ext4_lock_group. btw, it is found by xfstests test case 269 and the volume is mkfsed with the parameter "-O ^resize_inode,^uninit_bg,extent,meta_bg,flex_bg,ext_attr" and I have run it 100 times and the error in e2fsck doesn't show up again. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ea32d7e1d6e3..03583ab52e0e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -797,7 +797,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - ino)); up_read(&grp->alloc_sem); + } else { + ext4_lock_group(sb, group); } + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (S_ISDIR(mode)) { ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); @@ -811,8 +814,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); - ext4_unlock_group(sb, group); } + ext4_unlock_group(sb, group); BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); From 2c0544b23568674efba22532e1f25fb62ce10163 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 30 May 2012 22:56:46 -0400 Subject: [PATCH 37/44] ext4: add debugging trigger for ext4_error() Make it easy to test whether or not the error handling subsystem in ext4 is working correctly. This allows us to simulate an ext4_error() by echoing a string to /sys/fs/ext4//trigger_fs_error. Signed-off-by: "Theodore Ts'o" Cc: ksumrall@google.com --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 511011a1e85d..decc15d1b4b6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1197,6 +1197,7 @@ struct ext4_sb_info { struct proc_dir_entry *s_proc; struct kobject s_kobj; struct completion s_kobj_unregister; + struct super_block *s_sb; /* Journaling */ struct journal_s *s_journal; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 628cfcdc0823..9f1ae6b36502 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2473,6 +2473,23 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, return count; } +static ssize_t trigger_test_error(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + int len = count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (len && buf[len-1] == '\n') + len--; + + if (len) + ext4_error(sbi->s_sb, "%.*s", len, buf); + return count; +} + #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -2503,6 +2520,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); +EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), @@ -2517,6 +2535,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(trigger_fs_error), NULL, }; @@ -3087,6 +3106,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto out_free_orig; } sb->s_fs_info = sbi; + sbi->s_sb = sb; sbi->s_mount_opt = 0; sbi->s_resuid = EXT4_DEF_RESUID; sbi->s_resgid = EXT4_DEF_RESGID; From f3fc0210c0fc91900766c995f089c39170e68305 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 30 May 2012 23:00:16 -0400 Subject: [PATCH 38/44] ext4: add missing save_error_info() to ext4_error() The ext4_error() function is missing a call to save_error_info(). Since this is the function which marks the file system as containing an error, this oversight (which was introduced in 2.6.36) is quite significant, and should be backported to older stable kernels with high urgency. Reported-by: Ken Sumrall Signed-off-by: "Theodore Ts'o" Cc: ksumrall@google.com Cc: stable@kernel.org --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9f1ae6b36502..b8d5fc10d8c0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -539,6 +539,7 @@ void __ext4_error(struct super_block *sb, const char *function, printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", sb->s_id, function, line, current->comm, &vaf); va_end(args); + save_error_info(sb, function, line); ext4_handle_error(sb); } From 9660755100ae7677d65772a28e16d475a2ee9eab Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 31 May 2012 22:54:16 -0400 Subject: [PATCH 39/44] ext4: let getattr report the right blocks in delalloc+bigalloc In delayed allocation, i_reserved_data_blocks now indicates clusters, not blocks. So report it in the right number. This can be easily exposed by the following command: echo foo > blah; du -hc blah; sync; du -hc blah Reported-by: Eric Sandeen Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7bccdf32a32c..fc083e87c231 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4301,7 +4301,8 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, * will return the blocks that include the delayed allocation * blocks for this file. */ - delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; + delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), + EXT4_I(inode)->i_reserved_data_blocks); stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; return 0; From 79906964a187c405db72a3abc60eb9b50d804fbc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 31 May 2012 23:46:01 -0400 Subject: [PATCH 40/44] ext4: don't trash state flags in EXT4_IOC_SETFLAGS In commit 353eb83c we removed i_state_flags with 64-bit longs, But when handling the EXT4_IOC_SETFLAGS ioctl, we replace i_flags directly, which trashes the state flags which are stored in the high 32-bits of i_flags on 64-bit platforms. So use the the ext4_{set,clear}_inode_flags() functions which use atomic bit manipulation functions instead. Reported-by: Tao Ma Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ioctl.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index feba55a225a6..8ad112ae0ade 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) handle_t *handle = NULL; int err, migrate = 0; struct ext4_iloc iloc; - unsigned int oldflags; + unsigned int oldflags, mask, i; unsigned int jflag; if (!inode_owner_or_capable(inode)) @@ -115,8 +115,14 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) goto flags_err; - flags = flags & EXT4_FL_USER_MODIFIABLE; - flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE; + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } ei->i_flags = flags; ext4_set_inode_flags(inode); From 02b7831019ea4e7994968c84b5826fa8b248ffc8 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Thu, 31 May 2012 23:51:27 -0400 Subject: [PATCH 41/44] ext4: add ext4_mb_unload_buddy in the error path ext4_free_blocks fails to pair an ext4_mb_load_buddy with a matching ext4_mb_unload_buddy when it fails a memory allocation. Signed-off-by: Salman Qazi Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/mballoc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e4b4ac1ec684..3d9277c48bc8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4634,6 +4634,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, */ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); if (!new_entry) { + ext4_mb_unload_buddy(&e4b); err = -ENOMEM; goto error_return; } From 95599968d19db175829fb580baa6b68939b320fb Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Thu, 31 May 2012 23:52:14 -0400 Subject: [PATCH 42/44] ext4: remove mb_groups before tearing down the buddy_cache We can't have references held on pages in the s_buddy_cache while we are trying to truncate its pages and put the inode. All the pages must be gone before we reach clear_inode. This can only be gauranteed if we can prevent new users from grabbing references to s_buddy_cache's pages. The original bug can be reproduced and the bug fix can be verified by: while true; do mount -t ext4 /dev/ram0 /export/hda3/ram0; \ umount /export/hda3/ram0; done & while true; do cat /proc/fs/ext4/ram0/mb_groups; done Signed-off-by: Salman Qazi Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/mballoc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3d9277c48bc8..1cd6994fc446 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2517,6 +2517,9 @@ int ext4_mb_release(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); + if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2564,8 +2567,6 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - if (sbi->s_proc) - remove_proc_entry("mb_groups", sbi->s_proc); return 0; } From b2f4edb335f23626225a792e0669d1becf68312f Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Fri, 1 Jun 2012 00:10:32 -0400 Subject: [PATCH 43/44] jbd2: use kmem_cache_zalloc wrapper instead of flag Use kmem_cache_zalloc wrapper instead of flag __GFP_ZERO. Signed-off-by: Wanlong Gao Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ddcd3549c6c2..fb1ab9533b67 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -162,8 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle, alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kmem_cache_alloc(transaction_cache, - gfp_mask | __GFP_ZERO); + new_transaction = kmem_cache_zalloc(transaction_cache, + gfp_mask); if (!new_transaction) { /* * If __GFP_FS is not present, then we may be From 5e44f8c374dc4f8eadf61cd18b2c0d46bc87c1b7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 1 Jun 2012 00:15:28 -0400 Subject: [PATCH 44/44] ext4: hole-punch use truncate_pagecache_range When truncating a file, we unmap pages from userspace first, as that's usually more efficient than relying, page by page, on the fallback in truncate_inode_page() - particularly if the file is mapped many times. Do the same when punching a hole: 3.4 added truncate_pagecache_range() to do the unmap and trunc, so use it in ext4_ext_punch_hole(), instead of calling truncate_inode_pages_range() directly. Signed-off-by: Hugh Dickins Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 496151f893eb..91341ec6e06a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4789,8 +4789,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) /* Now release the pages */ if (last_page_offset > first_page_offset) { - truncate_inode_pages_range(mapping, first_page_offset, - last_page_offset-1); + truncate_pagecache_range(inode, first_page_offset, + last_page_offset - 1); } /* finish any pending end_io work */