diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 8bd00da6d2a4..2f46ef3800aa 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -414,7 +414,16 @@ struct xfs_log_dinode { /* start of the extended dinode, writable fields */ uint32_t di_crc; /* CRC of the inode */ uint64_t di_changecount; /* number of attribute changes */ - xfs_lsn_t di_lsn; /* flush sequence */ + + /* + * The LSN we write to this field during formatting is not a reflection + * of the current on-disk LSN. It should never be used for recovery + * sequencing, nor should it be recovered into the on-disk inode at all. + * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk() + * for details. + */ + xfs_lsn_t di_lsn; + uint64_t di_flags2; /* more random flags */ uint32_t di_cowextsize; /* basic cow extent size for file */ uint8_t di_pad2[12]; /* more padding for future expansion */ diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index cb44f7653f03..538724f9f85c 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -145,7 +145,8 @@ xfs_log_dinode_to_disk_ts( STATIC void xfs_log_dinode_to_disk( struct xfs_log_dinode *from, - struct xfs_dinode *to) + struct xfs_dinode *to, + xfs_lsn_t lsn) { to->di_magic = cpu_to_be16(from->di_magic); to->di_mode = cpu_to_be16(from->di_mode); @@ -182,7 +183,7 @@ xfs_log_dinode_to_disk( to->di_flags2 = cpu_to_be64(from->di_flags2); to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); - to->di_lsn = cpu_to_be64(from->di_lsn); + to->di_lsn = cpu_to_be64(lsn); memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &from->di_uuid); to->di_flushiter = 0; @@ -261,16 +262,25 @@ xlog_recover_inode_commit_pass2( } /* - * If the inode has an LSN in it, recover the inode only if it's less - * than the lsn of the transaction we are replaying. Note: we still - * need to replay an owner change even though the inode is more recent - * than the transaction as there is no guarantee that all the btree - * blocks are more recent than this transaction, too. + * If the inode has an LSN in it, recover the inode only if the on-disk + * inode's LSN is older than the lsn of the transaction we are + * replaying. We can have multiple checkpoints with the same start LSN, + * so the current LSN being equal to the on-disk LSN doesn't necessarily + * mean that the on-disk inode is more recent than the change being + * replayed. + * + * We must check the current_lsn against the on-disk inode + * here because the we can't trust the log dinode to contain a valid LSN + * (see comment below before replaying the log dinode for details). + * + * Note: we still need to replay an owner change even though the inode + * is more recent than the transaction as there is no guarantee that all + * the btree blocks are more recent than this transaction, too. */ if (dip->di_version >= 3) { xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) { trace_xfs_log_recover_inode_skip(log, in_f); error = 0; goto out_owner_change; @@ -368,8 +378,17 @@ xlog_recover_inode_commit_pass2( goto out_release; } - /* recover the log dinode inode into the on disk inode */ - xfs_log_dinode_to_disk(ldip, dip); + /* + * Recover the log dinode inode into the on disk inode. + * + * The LSN in the log dinode is garbage - it can be zero or reflect + * stale in-memory runtime state that isn't coherent with the changes + * logged in this transaction or the changes written to the on-disk + * inode. Hence we write the current lSN into the inode because that + * matches what xfs_iflush() would write inode the inode when flushing + * the changes in this transaction. + */ + xfs_log_dinode_to_disk(ldip, dip, current_lsn); fields = in_f->ilf_fields; if (fields & XFS_ILOG_DEV)