From 6c9468aad215a198742c8375b0415e42521c905c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:54 -0700 Subject: [PATCH 01/20] fscrypt: replace raw loads of info pointer with helper function Add and use a helper function fscrypt_get_inode_info_raw(). It loads an inode's fscrypt info pointer using a raw dereference, which is appropriate when the caller knows the key setup already happened. This eliminates most occurrences of inode::i_crypt_info in the source, in preparation for replacing that with a filesystem-specific field. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-2-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/crypto/bio.c | 2 +- fs/crypto/crypto.c | 14 ++++++++------ fs/crypto/fname.c | 11 ++++++----- fs/crypto/hooks.c | 2 +- fs/crypto/inline_crypt.c | 12 +++++++----- fs/crypto/policy.c | 7 ++++--- include/linux/fscrypt.h | 16 ++++++++++++++++ 7 files changed, 43 insertions(+), 21 deletions(-) diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 486fcb2ecf13..0d746de4cd10 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -113,7 +113,7 @@ out: int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index b6ccab524fde..07f9cbfe3ea4 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -173,7 +173,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs, gfp_t gfp_flags) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = 
fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; @@ -232,8 +232,9 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_ENCRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); @@ -255,7 +256,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + @@ -305,8 +306,9 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_DECRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index f9f6713e144f..fb77ad1ca74a 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -94,7 +94,7 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct 
crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -138,7 +138,7 @@ static int fname_decrypt(const struct inode *inode, const struct fscrypt_str *iname, struct fscrypt_str *oname) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -274,8 +274,9 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { - return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, - orig_len, max_len, + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); + + return __fscrypt_fname_encrypted_size(&ci->ci_policy, orig_len, max_len, encrypted_len_ret); } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); @@ -543,7 +544,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name); */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { - const struct fscrypt_inode_info *ci = dir->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(dir); WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index e0b32ac841f7..7a5d4c168c49 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -199,7 +199,7 @@ int fscrypt_prepare_setflags(struct inode *inode, err = fscrypt_require_key(inode); if (err) return err; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; mk = ci->ci_master_key; diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index caaff809765b..5dee7c498bc8 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -263,7 +263,7 @@ int fscrypt_derive_sw_secret(struct super_block 
*sb, bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { - return inode->i_crypt_info->ci_inlinecrypt; + return fscrypt_get_inode_info_raw(inode)->ci_inlinecrypt; } EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto); @@ -307,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, if (!fscrypt_inode_uses_inline_crypto(inode)) return; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); fscrypt_generate_dun(ci, first_lblk, dun); bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); @@ -385,22 +385,24 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { const struct bio_crypt_ctx *bc = bio->bi_crypt_context; + const struct fscrypt_inode_info *ci; u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; if (!!bc != fscrypt_inode_uses_inline_crypto(inode)) return false; if (!bc) return true; + ci = fscrypt_get_inode_info_raw(inode); /* * Comparing the key pointers is good enough, as all I/O for each key * uses the same pointer. I.e., there's currently no need to support * merging requests where the keys are the same but the pointers differ. 
*/ - if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key) + if (bc->bc_key != ci->ci_enc_key.blk_key) return false; - fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun); + fscrypt_generate_dun(ci, next_lblk, next_dun); return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun); } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio); @@ -502,7 +504,7 @@ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) if (nr_blocks <= 1) return nr_blocks; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (!(fscrypt_policy_flags(&ci->ci_policy) & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) return nr_blocks; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6ad30ae07c06..9d51f3500de3 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -727,7 +727,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) err = fscrypt_require_key(dir); if (err) return ERR_PTR(err); - return &dir->i_crypt_info->ci_policy; + return &fscrypt_get_inode_info_raw(dir)->ci_policy; } return fscrypt_get_dummy_policy(dir->i_sb); @@ -746,7 +746,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) */ int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); BUILD_BUG_ON(sizeof(union fscrypt_context) != FSCRYPT_SET_CONTEXT_MAX_SIZE); @@ -771,7 +771,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); */ int fscrypt_set_context(struct inode *inode, void *fs_data) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci; union fscrypt_context ctx; int ctxsize; @@ -783,6 +783,7 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) * This may be the first time the inode number is available, so do any * delayed key setup that requires the inode number. 
*/ + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) fscrypt_hash_inode_number(ci, ci->ci_master_key); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 10dd161690a2..23c5198612d1 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -195,6 +195,22 @@ struct fscrypt_operations { int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); +/* + * Load the inode's fscrypt info pointer, using a raw dereference. Since this + * uses a raw dereference with no memory barrier, it is appropriate to use only + * when the caller knows the inode's key setup already happened, resulting in + * non-NULL fscrypt info. E.g., the file contents en/decryption functions use + * this, since fscrypt_file_open() set up the key. + */ +static inline struct fscrypt_inode_info * +fscrypt_get_inode_info_raw(const struct inode *inode) +{ + struct fscrypt_inode_info *ci = inode->i_crypt_info; + + VFS_WARN_ON_ONCE(ci == NULL); + return ci; +} + static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { From 93221de31a8df6710e02328f82dc68d7ab4ad9e6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:55 -0700 Subject: [PATCH 02/20] fscrypt: add support for info in fs-specific part of inode Add an inode_info_offs field to struct fscrypt_operations, and update fs/crypto/ to support it. When set to a nonzero value, it specifies the offset to the fscrypt_inode_info pointer within the filesystem-specific part of the inode structure, to be used instead of inode::i_crypt_info. Since this makes inode::i_crypt_info no longer necessarily used, update comments that mentioned it. This is a prerequisite for a later commit that removes inode::i_crypt_info, saving memory and improving cache efficiency with filesystems that don't support fscrypt. 
Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-3-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/crypto/fscrypt_private.h | 4 ++-- fs/crypto/keysetup.c | 43 ++++++++++++++++++++++--------------- include/linux/fscrypt.h | 22 +++++++++++++++---- 3 files changed, 46 insertions(+), 23 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d8b485b9881c..245e6b84aa17 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -249,8 +249,8 @@ struct fscrypt_prepared_key { * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is - * allocated and stored in ->i_crypt_info. Once created, it remains until the - * inode is evicted. + * allocated and a pointer to it is stored in the file's in-memory inode. Once + * created, it remains until the inode is evicted. */ struct fscrypt_inode_info { diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 4f3b9ecbfe4e..c1f85715c276 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -642,15 +642,16 @@ fscrypt_setup_encryption_info(struct inode *inode, goto out; /* - * For existing inodes, multiple tasks may race to set ->i_crypt_info. - * So use cmpxchg_release(). This pairs with the smp_load_acquire() in - * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with - * a RELEASE barrier so that other tasks can ACQUIRE it. + * For existing inodes, multiple tasks may race to set the inode's + * fscrypt info pointer. So use cmpxchg_release(). This pairs with the + * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the + * pointer with a RELEASE barrier so that other tasks can ACQUIRE it. 
*/ - if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { + if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) == + NULL) { /* - * We won the race and set ->i_crypt_info to our crypt_info. - * Now link it into the master key's inode list. + * We won the race and set the inode's fscrypt info to our + * crypt_info. Now link it into the master key's inode list. */ if (mk) { crypt_info->ci_master_key = mk; @@ -681,13 +682,13 @@ out: * %false unless the operation being performed is needed in * order for files (or directories) to be deleted. * - * Set up ->i_crypt_info, if it hasn't already been done. + * Set up the inode's encryption key, if it hasn't already been done. * - * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So + * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So * generally this shouldn't be called from within a filesystem transaction. * - * Return: 0 if ->i_crypt_info was set or was already set, *or* if the - * encryption key is unavailable. (Use fscrypt_has_encryption_key() to + * Return: 0 if the key is now set up, *or* if it couldn't be set up because the + * needed master key is absent. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. */ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) @@ -741,9 +742,9 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * ->i_ino doesn't need to be set yet. * @encrypt_ret: (output) set to %true if the new inode will be encrypted * - * If the directory is encrypted, set up its ->i_crypt_info in preparation for + * If the directory is encrypted, set up its encryption key in preparation for * encrypting the name of the new file. Also, if the new inode will be - * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. + * encrypted, set up its encryption key too and set *encrypt_ret=true. 
* * This isn't %GFP_NOFS-safe, and therefore it should be called before starting * any filesystem transaction to create the inode. For this reason, ->i_ino @@ -752,8 +753,8 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * This doesn't persist the new inode's encryption context. That still needs to * be done later by calling fscrypt_set_context(). * - * Return: 0 on success, -ENOKEY if the encryption key is missing, or another - * -errno code + * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode + * but the needed master key is absent, or another -errno code */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) @@ -800,8 +801,16 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); */ void fscrypt_put_encryption_info(struct inode *inode) { - put_crypt_info(inode->i_crypt_info); - inode->i_crypt_info = NULL; + /* + * Ideally we'd start with a lightweight IS_ENCRYPTED() check here + * before proceeding to retrieve and check the pointer. However, during + * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If + * an error occurs, it needs to be cleaned up regardless. + */ + struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode); + + put_crypt_info(*ci_addr); + *ci_addr = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 23c5198612d1..d7ff53accbfe 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -61,6 +61,12 @@ struct fscrypt_name { /* Crypto operations for filesystems */ struct fscrypt_operations { + /* + * The offset of the pointer to struct fscrypt_inode_info in the + * filesystem-specific part of the inode, relative to the beginning of + * the common part of the inode (the 'struct inode'). 
+ */ + ptrdiff_t inode_info_offs; /* * If set, then fs/crypto/ will allocate a global bounce page pool the @@ -195,6 +201,14 @@ struct fscrypt_operations { int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); +static inline struct fscrypt_inode_info ** +fscrypt_inode_info_addr(const struct inode *inode) +{ + if (inode->i_sb->s_cop->inode_info_offs == 0) + return (struct fscrypt_inode_info **)&inode->i_crypt_info; + return (void *)inode + inode->i_sb->s_cop->inode_info_offs; +} + /* * Load the inode's fscrypt info pointer, using a raw dereference. Since this * uses a raw dereference with no memory barrier, it is appropriate to use only @@ -205,7 +219,7 @@ int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, static inline struct fscrypt_inode_info * fscrypt_get_inode_info_raw(const struct inode *inode) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = *fscrypt_inode_info_addr(inode); VFS_WARN_ON_ONCE(ci == NULL); return ci; @@ -216,11 +230,11 @@ fscrypt_get_inode_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). - * I.e., another task may publish ->i_crypt_info concurrently, executing - * a RELEASE barrier. We need to use smp_load_acquire() here to safely + * I.e., another task may publish the fscrypt info concurrently, + * executing a RELEASE barrier. Use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. 
*/ - return smp_load_acquire(&inode->i_crypt_info); + return smp_load_acquire(fscrypt_inode_info_addr(inode)); } /** From 80e07df424e583d4124be6059be54080e4c4cb64 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:56 -0700 Subject: [PATCH 03/20] ext4: move crypt info pointer to fs-specific part of inode Move the fscrypt_inode_info pointer into the filesystem-specific part of the inode by adding the field ext4_inode_info::i_crypt_info and configuring fscrypt_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_crypt_info, saving memory and improving cache efficiency with filesystems that don't support fscrypt. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-4-ebiggers@kernel.org Signed-off-by: Theodore Ts'o Signed-off-by: Christian Brauner --- fs/ext4/crypto.c | 2 ++ fs/ext4/ext4.h | 4 ++++ fs/ext4/super.c | 3 +++ 3 files changed, 9 insertions(+) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 0a056d97e640..cf0a0970c095 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -227,6 +227,8 @@ static bool ext4_has_stable_inodes(struct super_block *sb) } const struct fscrypt_operations ext4_cryptops = { + .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_crypt_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 01a6e2de7fc3..c897109dadb1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1182,6 +1182,10 @@ struct ext4_inode_info { __u32 i_csum_seed; kprojid_t i_projid; + +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; +#endif }; /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c7d39da7e733..0c3059ecce37 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1470,6 +1470,9 @@ static void init_once(void *foo) init_rwsem(&ei->i_data_sem); 
inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + ei->i_crypt_info = NULL; +#endif } static int __init init_inodecache(void) From 7afb71ee92de72f54877b06a31d46614edcaad24 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:57 -0700 Subject: [PATCH 04/20] f2fs: move crypt info pointer to fs-specific part of inode Move the fscrypt_inode_info pointer into the filesystem-specific part of the inode by adding the field f2fs_inode_info::i_crypt_info and configuring fscrypt_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_crypt_info, saving memory and improving cache efficiency with filesystems that don't support fscrypt. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-5-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/super.c | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 46be7560548c..2f5c30c069c3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -907,6 +907,9 @@ struct f2fs_inode_info { unsigned int atomic_write_cnt; loff_t original_i_size; /* original i_size before atomic write */ +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */ +#endif }; static inline void get_read_extent_info(struct extent_info *ext, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e16c4e2830c2..b42b55280d9e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -480,6 +480,9 @@ static void init_once(void *foo) struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; inode_init_once(&fi->vfs_inode); +#ifdef CONFIG_FS_ENCRYPTION + fi->i_crypt_info = NULL; +#endif } #ifdef CONFIG_QUOTA @@ -3570,6 +3573,8 @@ static struct block_device **f2fs_get_devices(struct super_block *sb, } static const struct fscrypt_operations f2fs_cryptops = { + 
.inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_crypt_info) - + (int)offsetof(struct f2fs_inode_info, vfs_inode), .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, @@ -3581,7 +3586,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .has_stable_inodes = f2fs_has_stable_inodes, .get_devices = f2fs_get_devices, }; -#endif +#endif /* CONFIG_FS_ENCRYPTION */ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) From e1add70aaa5ea469980a60a1747ab9863fec2124 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:58 -0700 Subject: [PATCH 05/20] ubifs: move crypt info pointer to fs-specific part of inode Move the fscrypt_inode_info pointer into the filesystem-specific part of the inode by adding the field ubifs_inode::i_crypt_info and configuring fscrypt_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_crypt_info, saving memory and improving cache efficiency with filesystems that don't support fscrypt. Note that the initialization of ubifs_inode::i_crypt_info to NULL on inode allocation is handled by the memset() in ubifs_alloc_inode(). 
Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-6-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/ubifs/crypto.c | 2 ++ fs/ubifs/ubifs.h | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index fb5ac358077b..0b14d004a095 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -88,6 +88,8 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, } const struct fscrypt_operations ubifs_crypt_operations = { + .inode_info_offs = (int)offsetof(struct ubifs_inode, i_crypt_info) - + (int)offsetof(struct ubifs_inode, vfs_inode), .legacy_key_prefix = "ubifs:", .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 5db45c9e26ee..49e50431741c 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -365,6 +365,7 @@ struct ubifs_gced_idx_leb { * @read_in_a_row: number of consecutive pages read in a row (for bulk read) * @data_len: length of the data attached to the inode * @data: inode's data + * @i_crypt_info: inode's fscrypt information * * @ui_mutex exists for two main reasons. At first it prevents inodes from * being written back while UBIFS changing them, being in the middle of an VFS @@ -416,6 +417,9 @@ struct ubifs_inode { pgoff_t read_in_a_row; int data_len; void *data; +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; +#endif }; /** From bbe395ded3ef2a2aecfc90372bda2b3e3ed8f2ee Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:59 -0700 Subject: [PATCH 06/20] ceph: move crypt info pointer to fs-specific part of inode Move the fscrypt_inode_info pointer into the filesystem-specific part of the inode by adding the field ceph_inode_info::i_crypt_info and configuring fscrypt_operations::inode_info_offs accordingly. 
This is a prerequisite for a later commit that removes inode::i_crypt_info, saving memory and improving cache efficiency with filesystems that don't support fscrypt. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-7-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/ceph/crypto.c | 2 ++ fs/ceph/inode.c | 1 + fs/ceph/super.h | 1 + 3 files changed, 4 insertions(+) diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index cab722619207..7026e794813c 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -133,6 +133,8 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) } static struct fscrypt_operations ceph_fscrypt_ops = { + .inode_info_offs = (int)offsetof(struct ceph_inode_info, i_crypt_info) - + (int)offsetof(struct ceph_inode_info, netfs.inode), .needs_bounce_pages = 1, .get_context = ceph_crypt_get_context, .set_context = ceph_crypt_set_context, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index fc543075b827..480cb3a1d639 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -665,6 +665,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); #ifdef CONFIG_FS_ENCRYPTION + ci->i_crypt_info = NULL; ci->fscrypt_auth = NULL; ci->fscrypt_auth_len = 0; #endif diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cf176aab0f82..25d8bacbcf44 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -463,6 +463,7 @@ struct ceph_inode_info { unsigned long i_work_mask; #ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; u32 fscrypt_auth_len; u32 fscrypt_file_len; u8 *fscrypt_auth; From ab90c2d2476c4dd6deddd089c7e83b858d135783 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:00 -0700 Subject: [PATCH 07/20] fs: remove inode::i_crypt_info Now that all fscrypt-capable filesystems store the pointer to fscrypt_inode_info in the filesystem-specific part of the inode 
structure, inode::i_crypt_info is no longer needed. Update fscrypt_inode_info_addr() to no longer support the fallback to inode::i_crypt_info. Finally, remove inode::i_crypt_info itself along with the now-unnecessary forward declaration of fscrypt_inode_info. The end result of the migration to the filesystem-specific pointer is memory savings on CONFIG_FS_ENCRYPTION=y kernels for all filesystems that don't support fscrypt. Specifically, their in-memory inodes are now smaller by the size of a pointer: either 4 or 8 bytes. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-8-ebiggers@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 ----- include/linux/fscrypt.h | 8 ++++++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..1dafa18169be 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -72,7 +72,6 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; -struct fscrypt_inode_info; struct fscrypt_operations; struct fsverity_info; struct fsverity_operations; @@ -780,10 +779,6 @@ struct inode { struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif -#ifdef CONFIG_FS_ENCRYPTION - struct fscrypt_inode_info *i_crypt_info; -#endif - #ifdef CONFIG_FS_VERITY struct fsverity_info *i_verity_info; #endif diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index d7ff53accbfe..516aba5b858b 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -201,11 +201,15 @@ struct fscrypt_operations { int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); +/* + * Returns the address of the fscrypt info pointer within the + * filesystem-specific part of the inode. (To save memory on filesystems that + * don't support fscrypt, a field in 'struct inode' itself is no longer used.) 
+ */ static inline struct fscrypt_inode_info ** fscrypt_inode_info_addr(const struct inode *inode) { - if (inode->i_sb->s_cop->inode_info_offs == 0) - return (struct fscrypt_inode_info **)&inode->i_crypt_info; + VFS_WARN_ON_ONCE(inode->i_sb->s_cop->inode_info_offs == 0); return (void *)inode + inode->i_sb->s_cop->inode_info_offs; } From 2a7349add18e5915cd87251af5f98db1772b6131 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:01 -0700 Subject: [PATCH 08/20] fsverity: add support for info in fs-specific part of inode Add an inode_info_offs field to struct fsverity_operations, and update fs/verity/ to support it. When set to a nonzero value, it specifies the offset to the fsverity_info pointer within the filesystem-specific part of the inode structure, to be used instead of inode::i_verity_info. Since this makes inode::i_verity_info no longer necessarily used, update comments that mentioned it. This is a prerequisite for a later commit that removes inode::i_verity_info, saving memory and improving cache efficiency on filesystems that don't support fsverity. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-9-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/verity/enable.c | 6 ++--- fs/verity/fsverity_private.h | 9 ++++---- fs/verity/open.c | 23 ++++++++++--------- fs/verity/verify.c | 2 +- include/linux/fsverity.h | 44 ++++++++++++++++++++++++++++-------- 5 files changed, 55 insertions(+), 29 deletions(-) diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 503268cf4296..89eccc4becf9 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -284,9 +284,9 @@ static int enable_verity(struct file *filp, /* Successfully enabled verity */ /* - * Readers can start using ->i_verity_info immediately, so it - * can't be rolled back once set. So don't set it until just - * after the filesystem has successfully enabled verity. 
+ * Readers can start using the inode's verity info immediately, + * so it can't be rolled back once set. So don't set it until + * just after the filesystem has successfully enabled verity. */ fsverity_set_info(inode, vi); } diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 5fe854a5b9ad..bc1d887c532e 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -63,10 +63,11 @@ struct merkle_tree_params { * fsverity_info - cached verity metadata for an inode * * When a verity file is first opened, an instance of this struct is allocated - * and stored in ->i_verity_info; it remains until the inode is evicted. It - * caches information about the Merkle tree that's needed to efficiently verify - * data read from the file. It also caches the file digest. The Merkle tree - * pages themselves are not cached here, but the filesystem may cache them. + * and a pointer to it is stored in the file's in-memory inode. It remains + * until the inode is evicted. It caches information about the Merkle tree + * that's needed to efficiently verify data read from the file. It also caches + * the file digest. The Merkle tree pages themselves are not cached here, but + * the filesystem may cache them. */ struct fsverity_info { struct merkle_tree_params tree_params; diff --git a/fs/verity/open.c b/fs/verity/open.c index c561e130cd0c..77b1c977af02 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -244,17 +244,17 @@ fail: void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) { /* - * Multiple tasks may race to set ->i_verity_info, so use - * cmpxchg_release(). This pairs with the smp_load_acquire() in - * fsverity_get_info(). I.e., here we publish ->i_verity_info with a - * RELEASE barrier so that other tasks can ACQUIRE it. + * Multiple tasks may race to set the inode's verity info pointer, so + * use cmpxchg_release(). This pairs with the smp_load_acquire() in + * fsverity_get_info(). 
I.e., publish the pointer with a RELEASE + * barrier so that other tasks can ACQUIRE it. */ - if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) { - /* Lost the race, so free the fsverity_info we allocated. */ + if (cmpxchg_release(fsverity_info_addr(inode), NULL, vi) != NULL) { + /* Lost the race, so free the verity info we allocated. */ fsverity_free_info(vi); /* - * Afterwards, the caller may access ->i_verity_info directly, - * so make sure to ACQUIRE the winning fsverity_info. + * Afterwards, the caller may access the inode's verity info + * directly, so make sure to ACQUIRE the winning verity info. */ (void)fsverity_get_info(inode); } @@ -350,7 +350,6 @@ int fsverity_get_descriptor(struct inode *inode, return 0; } -/* Ensure the inode has an ->i_verity_info */ static int ensure_verity_info(struct inode *inode) { struct fsverity_info *vi = fsverity_get_info(inode); @@ -395,8 +394,10 @@ EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr); void __fsverity_cleanup_inode(struct inode *inode) { - fsverity_free_info(inode->i_verity_info); - inode->i_verity_info = NULL; + struct fsverity_info **vi_addr = fsverity_info_addr(inode); + + fsverity_free_info(*vi_addr); + *vi_addr = NULL; } EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); diff --git a/fs/verity/verify.c b/fs/verity/verify.c index a1f00c3fd3b2..affc307eb6a6 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -245,7 +245,7 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, unsigned long max_ra_pages) { struct inode *inode = data_folio->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; + struct fsverity_info *vi = *fsverity_info_addr(inode); const unsigned int block_size = vi->tree_params.block_size; u64 pos = (u64)data_folio->index << PAGE_SHIFT; diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 1eb7eae580be..e0f132cb7839 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -28,6 +28,12 @@ /* Verity operations for 
filesystems */ struct fsverity_operations { + /** + * The offset of the pointer to struct fsverity_info in the + * filesystem-specific part of the inode, relative to the beginning of + * the common part of the inode (the 'struct inode'). + */ + ptrdiff_t inode_info_offs; /** * Begin enabling verity on the given file. @@ -124,15 +130,33 @@ struct fsverity_operations { #ifdef CONFIG_FS_VERITY +static inline struct fsverity_info ** +fsverity_info_addr(const struct inode *inode) +{ + if (inode->i_sb->s_vop->inode_info_offs == 0) + return (struct fsverity_info **)&inode->i_verity_info; + return (void *)inode + inode->i_sb->s_vop->inode_info_offs; +} + static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { /* - * Pairs with the cmpxchg_release() in fsverity_set_info(). - * I.e., another task may publish ->i_verity_info concurrently, - * executing a RELEASE barrier. We need to use smp_load_acquire() here - * to safely ACQUIRE the memory the other task published. + * Since this function can be called on inodes belonging to filesystems + * that don't support fsverity at all, and fsverity_info_addr() doesn't + * work on such filesystems, we have to start with an IS_VERITY() check. + * Checking IS_VERITY() here is also useful to minimize the overhead of + * fsverity_active() on non-verity files. */ - return smp_load_acquire(&inode->i_verity_info); + if (!IS_VERITY(inode)) + return NULL; + + /* + * Pairs with the cmpxchg_release() in fsverity_set_info(). I.e., + * another task may publish the inode's verity info concurrently, + * executing a RELEASE barrier. Use smp_load_acquire() here to safely + * ACQUIRE the memory the other task published. 
+ */ + return smp_load_acquire(fsverity_info_addr(inode)); } /* enable.c */ @@ -156,11 +180,11 @@ void __fsverity_cleanup_inode(struct inode *inode); * fsverity_cleanup_inode() - free the inode's verity info, if present * @inode: an inode being evicted * - * Filesystems must call this on inode eviction to free ->i_verity_info. + * Filesystems must call this on inode eviction to free the inode's verity info. */ static inline void fsverity_cleanup_inode(struct inode *inode) { - if (inode->i_verity_info) + if (*fsverity_info_addr(inode)) __fsverity_cleanup_inode(inode); } @@ -267,12 +291,12 @@ static inline bool fsverity_verify_page(struct page *page) * fsverity_active() - do reads from the inode need to go through fs-verity? * @inode: inode to check * - * This checks whether ->i_verity_info has been set. + * This checks whether the inode's verity info has been set. * * Filesystems call this from ->readahead() to check whether the pages need to * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with - * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) + * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before the verity info.) * * Return: true if reads need to go through fs-verity, otherwise false */ @@ -287,7 +311,7 @@ static inline bool fsverity_active(const struct inode *inode) * @filp: the struct file being set up * * When opening a verity file, deny the open if it is for writing. Otherwise, - * set up the inode's ->i_verity_info if not already done. + * set up the inode's verity info if not already done. * * When combined with fscrypt, this must be called after fscrypt_file_open(). * Otherwise, we won't have the key set up to decrypt the verity metadata. 
From c9fff804b59c5495db944ddf84e1f963967cc361 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:02 -0700 Subject: [PATCH 09/20] ext4: move verity info pointer to fs-specific part of inode Move the fsverity_info pointer into the filesystem-specific part of the inode by adding the field ext4_inode_info::i_verity_info and configuring fsverity_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_verity_info, saving memory and improving cache efficiency on filesystems that don't support fsverity. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-10-ebiggers@kernel.org Acked-by: Theodore Ts'o Signed-off-by: Christian Brauner --- fs/ext4/ext4.h | 4 ++++ fs/ext4/super.c | 3 +++ fs/ext4/verity.c | 2 ++ 3 files changed, 9 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c897109dadb1..6cb784a56b3b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1186,6 +1186,10 @@ struct ext4_inode_info { #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; #endif + +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; +#endif }; /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0c3059ecce37..46138a6cb32a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1473,6 +1473,9 @@ static void init_once(void *foo) #ifdef CONFIG_FS_ENCRYPTION ei->i_crypt_info = NULL; #endif +#ifdef CONFIG_FS_VERITY + ei->i_verity_info = NULL; +#endif } static int __init init_inodecache(void) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index d9203228ce97..b0acb0c50313 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -389,6 +389,8 @@ static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations ext4_verityops = { + .inode_info_offs = (int)offsetof(struct ext4_inode_info, i_verity_info) - + (int)offsetof(struct ext4_inode_info, vfs_inode), .begin_enable_verity = 
ext4_begin_enable_verity, .end_enable_verity = ext4_end_enable_verity, .get_verity_descriptor = ext4_get_verity_descriptor, From 1f66cef4a9a3033b76db08de25eb017ddc6967e6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:03 -0700 Subject: [PATCH 10/20] f2fs: move verity info pointer to fs-specific part of inode Move the fsverity_info pointer into the filesystem-specific part of the inode by adding the field f2fs_inode_info::i_verity_info and configuring fsverity_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_verity_info, saving memory and improving cache efficiency on filesystems that don't support fsverity. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-11-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/super.c | 3 +++ fs/f2fs/verity.c | 2 ++ 3 files changed, 8 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2f5c30c069c3..6e465bbc85ee 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -910,6 +910,9 @@ struct f2fs_inode_info { #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; /* filesystem encryption info */ #endif +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; /* filesystem verity info */ +#endif }; static inline void get_read_extent_info(struct extent_info *ext, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b42b55280d9e..1db024b20e29 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -483,6 +483,9 @@ static void init_once(void *foo) #ifdef CONFIG_FS_ENCRYPTION fi->i_crypt_info = NULL; #endif +#ifdef CONFIG_FS_VERITY + fi->i_verity_info = NULL; +#endif } #ifdef CONFIG_QUOTA diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 2287f238ae09..f0ab9a3c7a82 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -287,6 +287,8 @@ static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct 
fsverity_operations f2fs_verityops = { + .inode_info_offs = (int)offsetof(struct f2fs_inode_info, i_verity_info) - + (int)offsetof(struct f2fs_inode_info, vfs_inode), .begin_enable_verity = f2fs_begin_enable_verity, .end_enable_verity = f2fs_end_enable_verity, .get_verity_descriptor = f2fs_get_verity_descriptor, From fcafdd4210658986470208230253ba5cdc6107a0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:04 -0700 Subject: [PATCH 11/20] btrfs: move verity info pointer to fs-specific part of inode Move the fsverity_info pointer into the filesystem-specific part of the inode by adding the field btrfs_inode::i_verity_info and configuring fsverity_operations::inode_info_offs accordingly. This is a prerequisite for a later commit that removes inode::i_verity_info, saving memory and improving cache efficiency on filesystems that don't support fsverity. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-12-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/btrfs/btrfs_inode.h | 5 +++++ fs/btrfs/inode.c | 3 +++ fs/btrfs/verity.c | 2 ++ 3 files changed, 10 insertions(+) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b99fb0273292..2c9489497cbe 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -338,6 +338,11 @@ struct btrfs_inode { struct list_head delayed_iput; struct rw_semaphore i_mmap_lock; + +#ifdef CONFIG_FS_VERITY + struct fsverity_info *i_verity_info; +#endif + struct inode vfs_inode; }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b77dd22b8cdb..de722b232ec1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7961,6 +7961,9 @@ static void init_once(void *foo) struct btrfs_inode *ei = foo; inode_init_once(&ei->vfs_inode); +#ifdef CONFIG_FS_VERITY + ei->i_verity_info = NULL; +#endif } void __cold btrfs_destroy_cachep(void) diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index b7a96a005487..4633cbcfcdb9 100644 --- 
a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -802,6 +802,8 @@ static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf, } const struct fsverity_operations btrfs_verityops = { + .inode_info_offs = (int)offsetof(struct btrfs_inode, i_verity_info) - + (int)offsetof(struct btrfs_inode, vfs_inode), .begin_enable_verity = btrfs_begin_enable_verity, .end_enable_verity = btrfs_end_enable_verity, .get_verity_descriptor = btrfs_get_verity_descriptor, From 818c659ac164e4e4639ceaedaccbdfebb1ef63b5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:05 -0700 Subject: [PATCH 12/20] fs: remove inode::i_verity_info Now that all fsverity-capable filesystems store the pointer to fsverity_info in the filesystem-specific part of the inode structure, inode::i_verity_info is no longer needed. Update fsverity_info_addr() to no longer support the fallback to inode::i_verity_info. Finally, remove inode::i_verity_info itself, and move the forward declaration of struct fsverity_info from fs.h (which no longer needs it) to fsverity.h. The end result of the migration to the filesystem-specific pointer is memory savings on CONFIG_FS_VERITY=y kernels for all filesystems that don't support fsverity. Specifically, their in-memory inodes are now smaller by the size of a pointer: either 4 or 8 bytes. 
Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-13-ebiggers@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 ----- include/linux/fsverity.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 1dafa18169be..12ecc6b0e6f9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -73,7 +73,6 @@ struct seq_file; struct workqueue_struct; struct iov_iter; struct fscrypt_operations; -struct fsverity_info; struct fsverity_operations; struct fsnotify_mark_connector; struct fsnotify_sb_info; @@ -779,10 +778,6 @@ struct inode { struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif -#ifdef CONFIG_FS_VERITY - struct fsverity_info *i_verity_info; -#endif - void *i_private; /* fs or device private pointer */ } __randomize_layout; diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index e0f132cb7839..844f7b8b56bb 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -26,6 +26,8 @@ /* Arbitrary limit to bound the kmalloc() size. Can be changed. */ #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 +struct fsverity_info; + /* Verity operations for filesystems */ struct fsverity_operations { /** @@ -130,11 +132,15 @@ struct fsverity_operations { #ifdef CONFIG_FS_VERITY +/* + * Returns the address of the verity info pointer within the filesystem-specific + * part of the inode. (To save memory on filesystems that don't support + * fsverity, a field in 'struct inode' itself is no longer used.) 
+ */ static inline struct fsverity_info ** fsverity_info_addr(const struct inode *inode) { - if (inode->i_sb->s_vop->inode_info_offs == 0) - return (struct fsverity_info **)&inode->i_verity_info; + VFS_WARN_ON_ONCE(inode->i_sb->s_vop->inode_info_offs == 0); return (void *)inode + inode->i_sb->s_vop->inode_info_offs; } From 8a3d00dde63a339d31d1fdeead24ddfd4d459c70 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:57:06 -0700 Subject: [PATCH 13/20] fsverity: check IS_VERITY() in fsverity_cleanup_inode() Since getting the address of the fsverity_info has gotten a bit more expensive, make fsverity_cleanup_inode() check for IS_VERITY() instead. This avoids adding more overhead to non-verity files. This assumes that verity info is never set when !IS_VERITY(), which is currently true, but add a VFS_WARN_ON_ONCE() that asserts that. (This of course defeats the optimization, but only when CONFIG_VFS_DEBUG=y.) Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-14-ebiggers@kernel.org Signed-off-by: Christian Brauner --- include/linux/fsverity.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 844f7b8b56bb..5bc7280425a7 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -190,8 +190,15 @@ void __fsverity_cleanup_inode(struct inode *inode); */ static inline void fsverity_cleanup_inode(struct inode *inode) { - if (*fsverity_info_addr(inode)) + /* + * Only IS_VERITY() inodes can have verity info, so start by checking + * for IS_VERITY() (which is faster than retrieving the pointer to the + * verity info). This minimizes overhead for non-verity inodes. 
+ */ + if (IS_VERITY(inode)) __fsverity_cleanup_inode(inode); + else + VFS_WARN_ON_ONCE(*fsverity_info_addr(inode) != NULL); } /* read_metadata.c */ From bc986b1d756482a5ec2d7d9625229d9b9df95ae1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 21 Aug 2025 16:18:21 -0400 Subject: [PATCH 14/20] fs: stop accessing ->i_count directly in f2fs and gfs2 Instead of accessing ->i_count directly in these file systems, use the appropriate __iget and iput helpers. Signed-off-by: Josef Bacik Link: https://lore.kernel.org/b8e6eb8a3e690ce082828d3580415bf70dfa93aa.1755806649.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- fs/f2fs/super.c | 4 ++-- fs/gfs2/ops_fstype.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1db024b20e29..2045642cfe3b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1750,7 +1750,7 @@ static int f2fs_drop_inode(struct inode *inode) if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ - atomic_inc(&inode->i_count); + __iget(inode); spin_unlock(&inode->i_lock); /* should remain fi->extent_tree for writepage */ @@ -1769,7 +1769,7 @@ static int f2fs_drop_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); spin_lock(&inode->i_lock); - atomic_dec(&inode->i_count); + iput(inode); } trace_f2fs_drop_inode(inode, 0); return 0; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index efe99b732551..c770006f8889 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1754,7 +1754,7 @@ static void gfs2_evict_inodes(struct super_block *sb) spin_unlock(&inode->i_lock); continue; } - atomic_inc(&inode->i_count); + __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); From 9a98f9e84cfbeaa51af42ba2b8bbbde046c709a7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Aug 2025 11:39:01 -0400 Subject: [PATCH 15/20] fs: make the i_state flags an 
enum Adjusting i_state flags always means updating the values manually. Bring these forward into the 2020's and make a nice clean macro for defining the i_state values as an enum, providing __ variants for the cases where we need the bit position instead of the actual value, and leaving the actual NAME as the 1U << bit value. Reviewed-by: Christian Brauner Signed-off-by: Josef Bacik Link: https://lore.kernel.org/0da9348da6ece0dce12fccec07b1dd2b8e4cfdab.1756222464.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 231 +++++++++++++++++++++++---------------------- 1 file changed, 119 insertions(+), 112 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 12ecc6b0e6f9..c34554d8c4fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -664,6 +664,124 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 +/* + * Inode state bits. Protected by inode->i_lock + * + * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, + * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. + * + * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, + * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at + * various stages of removing an inode. + * + * Two bits are used for locking and completion notification, I_NEW and I_SYNC. + * + * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on + * fdatasync() (unless I_DIRTY_DATASYNC is also set). + * Timestamp updates are the usual cause. + * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of + * these changes separately from I_DIRTY_SYNC so that we + * don't have to write inode on fdatasync() when only + * e.g. the timestamps have changed. + * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. + * I_DIRTY_TIME The inode itself has dirty timestamps, and the + * lazytime mount option is enabled. 
We keep track of this + * separately from I_DIRTY_SYNC in order to implement + * lazytime. This gets cleared if I_DIRTY_INODE + * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But + * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already + * in place because writeback might already be in progress + * and we don't want to lose the time update. + * I_NEW Serves as both a mutex and completion notification. + * New inodes set I_NEW. If two processes both create + * the same inode, one of them will release its inode and + * wait for I_NEW to be released before returning. + * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can + * also cause waiting on I_NEW, without I_NEW actually + * being set. find_inode() uses this to prevent returning + * nearly-dead inodes. + * I_WILL_FREE Must be set when calling write_inode_now() if i_count + * is zero. I_FREEING must be set when I_WILL_FREE is + * cleared. + * I_FREEING Set when inode is about to be freed but still has dirty + * pages or buffers attached or the inode itself is still + * dirty. + * I_CLEAR Added by clear_inode(). In this state the inode is + * clean and can be destroyed. Inode keeps I_FREEING. + * + * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are + * prohibited for many purposes. iget() must wait for + * the inode to be completely released, then create it + * anew. Other functions will just ignore such inodes, + * if appropriate. I_NEW is used for waiting. + * + * I_SYNC Writeback of inode is running. The bit is set during + * data writeback, and cleared with a wakeup on the bit + * address once it is done. The bit is also used to pin + * the inode in memory for flusher thread. + * + * I_REFERENCED Marks the inode as recently referenced on the LRU list. + * + * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to + * synchronize competing switching instances and to tell + * wb stat updates to grab the i_pages lock. See + * inode_switch_wbs_work_fn() for details.
+ * + * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper + * and work dirs among overlayfs mounts. + * + * I_CREATING New object's inode in the middle of setting up. + * + * I_DONTCACHE Evict inode as soon as it is not used anymore. + * + * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. + * Used to detect that mark_inode_dirty() should not move + * inode between dirty lists. + * + * I_PINNING_NETFS_WB Inode is pinning an fscache object for writeback. + * + * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding + * i_count. + * + * Q: What is the difference between I_WILL_FREE and I_FREEING? + * + * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait + * upon. There's one free address left. + */ + +enum inode_state_bits { + __I_NEW = 0U, + __I_SYNC = 1U, + __I_LRU_ISOLATING = 2U + /* reserved wait address bit 3 */ +}; + +enum inode_state_flags_t { + I_NEW = (1U << __I_NEW), + I_SYNC = (1U << __I_SYNC), + I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING), + /* reserved flag bit 3 */ + I_DIRTY_SYNC = (1U << 4), + I_DIRTY_DATASYNC = (1U << 5), + I_DIRTY_PAGES = (1U << 6), + I_WILL_FREE = (1U << 7), + I_FREEING = (1U << 8), + I_CLEAR = (1U << 9), + I_REFERENCED = (1U << 10), + I_LINKABLE = (1U << 11), + I_DIRTY_TIME = (1U << 12), + I_WB_SWITCH = (1U << 13), + I_OVL_INUSE = (1U << 14), + I_CREATING = (1U << 15), + I_DONTCACHE = (1U << 16), + I_SYNC_QUEUED = (1U << 17), + I_PINNING_NETFS_WB = (1U << 18) +}; + +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) +#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) +#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) + /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning @@ -722,7 +840,7 @@ struct inode { #endif /* Misc */ - u32 i_state; + enum inode_state_flags_t i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; @@ -2482,117 +2600,6 @@ static inline void kiocb_clone(struct kiocb
*kiocb, struct kiocb *kiocb_src, }; } -/* - * Inode state bits. Protected by inode->i_lock - * - * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, - * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. - * - * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, - * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at - * various stages of removing an inode. - * - * Two bits are used for locking and completion notification, I_NEW and I_SYNC. - * - * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on - * fdatasync() (unless I_DIRTY_DATASYNC is also set). - * Timestamp updates are the usual cause. - * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of - * these changes separately from I_DIRTY_SYNC so that we - * don't have to write inode on fdatasync() when only - * e.g. the timestamps have changed. - * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. - * I_DIRTY_TIME The inode itself has dirty timestamps, and the - * lazytime mount option is enabled. We keep track of this - * separately from I_DIRTY_SYNC in order to implement - * lazytime. This gets cleared if I_DIRTY_INODE - * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But - * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already - * in place because writeback might already be in progress - * and we don't want to lose the time update - * I_NEW Serves as both a mutex and completion notification. - * New inodes set I_NEW. If two processes both create - * the same inode, one of them will release its inode and - * wait for I_NEW to be released before returning. - * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can - * also cause waiting on I_NEW, without I_NEW actually - * being set. find_inode() uses this to prevent returning - * nearly-dead inodes. - * I_WILL_FREE Must be set when calling write_inode_now() if i_count - * is zero. I_FREEING must be set when I_WILL_FREE is - * cleared. 
- * I_FREEING Set when inode is about to be freed but still has dirty - * pages or buffers attached or the inode itself is still - * dirty. - * I_CLEAR Added by clear_inode(). In this state the inode is - * clean and can be destroyed. Inode keeps I_FREEING. - * - * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are - * prohibited for many purposes. iget() must wait for - * the inode to be completely released, then create it - * anew. Other functions will just ignore such inodes, - * if appropriate. I_NEW is used for waiting. - * - * I_SYNC Writeback of inode is running. The bit is set during - * data writeback, and cleared with a wakeup on the bit - * address once it is done. The bit is also used to pin - * the inode in memory for flusher thread. - * - * I_REFERENCED Marks the inode as recently references on the LRU list. - * - * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to - * synchronize competing switching instances and to tell - * wb stat updates to grab the i_pages lock. See - * inode_switch_wbs_work_fn() for details. - * - * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper - * and work dirs among overlayfs mounts. - * - * I_CREATING New object's inode in the middle of setting up. - * - * I_DONTCACHE Evict inode as soon as it is not used anymore. - * - * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. - * Used to detect that mark_inode_dirty() should not move - * inode between dirty lists. - * - * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. - * - * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding - * i_count. - * - * Q: What is the difference between I_WILL_FREE and I_FREEING? - * - * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait - * upon. There's one free address left. 
- */ -#define __I_NEW 0 -#define I_NEW (1 << __I_NEW) -#define __I_SYNC 1 -#define I_SYNC (1 << __I_SYNC) -#define __I_LRU_ISOLATING 2 -#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) - -#define I_DIRTY_SYNC (1 << 3) -#define I_DIRTY_DATASYNC (1 << 4) -#define I_DIRTY_PAGES (1 << 5) -#define I_WILL_FREE (1 << 6) -#define I_FREEING (1 << 7) -#define I_CLEAR (1 << 8) -#define I_REFERENCED (1 << 9) -#define I_LINKABLE (1 << 10) -#define I_DIRTY_TIME (1 << 11) -#define I_WB_SWITCH (1 << 12) -#define I_OVL_INUSE (1 << 13) -#define I_CREATING (1 << 14) -#define I_DONTCACHE (1 << 15) -#define I_SYNC_QUEUED (1 << 16) -#define I_PINNING_NETFS_WB (1 << 17) - -#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) -#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) -#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) - extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { From 9e70e985bdc2c6fe7a160e4d59ddd7c0a39bc077 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Aug 2025 11:39:03 -0400 Subject: [PATCH 16/20] fs: rework iput logic Currently, if we are the last iput, and we have the I_DIRTY_TIME bit set, we will grab a reference on the inode again and then mark it dirty and then redo the put. This is to make sure we delay the time update for as long as possible. We can rework this logic to simply dec i_count if it is not 1, and if it is do the time update while still holding the i_count reference. Then we can replace the atomic_dec_and_lock with locking the ->i_lock and doing atomic_dec_and_test, since we did the atomic_add_unless above. 
Co-developed-by: Mateusz Guzik Signed-off-by: Mateusz Guzik Signed-off-by: Josef Bacik Link: https://lore.kernel.org/be208b89bdb650202e712ce2bcfc407ac7044c7a.1756222464.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- fs/inode.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 01ebdc40021e..01a554e11279 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1908,20 +1908,44 @@ static void iput_final(struct inode *inode) */ void iput(struct inode *inode) { - if (!inode) + if (unlikely(!inode)) return; - BUG_ON(inode->i_state & I_CLEAR); + retry: - if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { - if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { - atomic_inc(&inode->i_count); - spin_unlock(&inode->i_lock); - trace_writeback_lazytime_iput(inode); - mark_inode_dirty_sync(inode); - goto retry; - } - iput_final(inode); + lockdep_assert_not_held(&inode->i_lock); + VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode); + /* + * Note this assert is technically racy as if the count is bogusly + * equal to one, then two CPUs racing to further drop it can both + * conclude it's fine. + */ + VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode); + + if (atomic_add_unless(&inode->i_count, -1, 1)) + return; + + if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) { + trace_writeback_lazytime_iput(inode); + mark_inode_dirty_sync(inode); + goto retry; } + + spin_lock(&inode->i_lock); + if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) { + spin_unlock(&inode->i_lock); + goto retry; + } + + if (!atomic_dec_and_test(&inode->i_count)) { + spin_unlock(&inode->i_lock); + return; + } + + /* + * iput_final() drops ->i_lock, we can't assert on it as the inode may + * be deallocated by the time the call returns. 
+ */ + iput_final(inode); } EXPORT_SYMBOL(iput); From 37b27bd5d6217b75d315f28b4399aad0a336f299 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Aug 2025 11:39:02 -0400 Subject: [PATCH 17/20] fs: add an icount_read helper Instead of doing direct access to ->i_count, add a helper to handle this. This will make it easier to convert i_count to a refcount later. Signed-off-by: Josef Bacik Link: https://lore.kernel.org/9bc62a84c6b9d6337781203f60837bd98fbc4a96.1756222464.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- arch/powerpc/platforms/cell/spufs/file.c | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/mds_client.c | 2 +- fs/ext4/ialloc.c | 4 ++-- fs/fs-writeback.c | 2 +- fs/hpfs/inode.c | 2 +- fs/inode.c | 8 ++++---- fs/nfs/inode.c | 4 ++-- fs/notify/fsnotify.c | 2 +- fs/smb/client/inode.c | 2 +- fs/ubifs/super.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_trace.h | 2 +- include/linux/fs.h | 5 +++++ include/trace/events/filelock.h | 2 +- security/landlock/fs.c | 2 +- 16 files changed, 25 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index d5a2c77bc908..ce839783c0df 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1430,7 +1430,7 @@ static int spufs_mfc_open(struct inode *inode, struct file *file) if (ctx->owner != current->mm) return -EINVAL; - if (atomic_read(&inode->i_count) != 1) + if (icount_read(inode) != 1) return -EBUSY; mutex_lock(&ctx->mapping_lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index de722b232ec1..5bcd8e25fa78 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4538,7 +4538,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) inode = btrfs_find_first_inode(root, min_ino); while (inode) { - if (atomic_read(&inode->vfs_inode.i_count) > 1) + if (icount_read(&inode->vfs_inode) > 1) d_prune_aliases(&inode->vfs_inode); min_ino = btrfs_ino(inode) + 1; diff --git 
a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0f497c39ff82..62dba710504d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2221,7 +2221,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) int count; dput(dentry); d_prune_aliases(inode); - count = atomic_read(&inode->i_count); + count = icount_read(inode); if (count == 1) (*remaining)--; doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df4051613b29..ba4fd9aba1c1 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) "nonexistent device\n", __func__, __LINE__); return; } - if (atomic_read(&inode->i_count) > 1) { + if (icount_read(inode) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", __func__, __LINE__, inode->i_ino, - atomic_read(&inode->i_count)); + icount_read(inode)); return; } if (inode->i_nlink) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cc57367fb641..6088a67b2aae 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1767,7 +1767,7 @@ static int writeback_single_inode(struct inode *inode, int ret = 0; spin_lock(&inode->i_lock); - if (!atomic_read(&inode->i_count)) + if (!icount_read(inode)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else WARN_ON(inode->i_state & I_WILL_FREE); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index a59e8fa630db..34008442ee26 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i) struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct inode *parent; if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; - if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { + if (hpfs_inode->i_rddir_off && !icount_read(i)) { if (*hpfs_inode->i_rddir_off) pr_err("write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); diff --git a/fs/inode.c b/fs/inode.c index 01a554e11279..fe4868e2a954 100644 --- 
a/fs/inode.c +++ b/fs/inode.c @@ -534,7 +534,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate) { if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) return; if (!(inode->i_sb->s_flags & SB_ACTIVE)) return; @@ -871,11 +871,11 @@ void evict_inodes(struct super_block *sb) again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) continue; spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_count)) { + if (icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } @@ -937,7 +937,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * unreclaimable for a while. Remove them lazily here; iput, * sync, or the last page cache deletion will requeue them. */ - if (atomic_read(&inode->i_count) || + if (icount_read(inode) || (inode->i_state & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 338ef77ae423..b52805951856 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -608,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), - atomic_read(&inode->i_count)); + icount_read(inode)); out: return inode; @@ -2229,7 +2229,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), - atomic_read(&inode->i_count), fattr->valid); + icount_read(inode), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? 
Just exit */ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 079b868552c2..46bfc543f946 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -66,7 +66,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * removed all zero refcount inodes, in any case. Test to * be sure. */ - if (!atomic_read(&inode->i_count)) { + if (!icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 75be4b46bc6f..211d5b8b42f4 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2749,7 +2749,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) } cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", - full_path, inode, inode->i_count.counter, + full_path, inode, icount_read(inode), dentry, cifs_get_time(dentry), jiffies); again: diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f3e3b2068608..a0269ba96e3d 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode) goto out; dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); - ubifs_assert(c, !atomic_read(&inode->i_count)); + ubifs_assert(c, !icount_read(inode)); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9c39251961a3..df8eab11dc48 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1035,7 +1035,7 @@ xfs_itruncate_extents_flags( int error = 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - if (atomic_read(&VFS_I(ip)->i_count)) + if (icount_read(VFS_I(ip))) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e1794e3e3156..34001503fc8b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1151,7 +1151,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; 
__entry->ino = ip->i_ino; - __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->count = icount_read(VFS_I(ip)); __entry->pincount = atomic_read(&ip->i_pincount); __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; diff --git a/include/linux/fs.h b/include/linux/fs.h index c34554d8c4fe..c4fd010cf5bf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2611,6 +2611,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +static inline int icount_read(const struct inode *inode) +{ + return atomic_read(&inode->i_count); +} + /* * Returns true if the given inode itself only has dirty timestamps (its pages * may still be dirty) and isn't currently being allocated or freed. diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h index b8d1e00a7982..fdd36b1daa25 100644 --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -189,7 +189,7 @@ TRACE_EVENT(generic_add_lease, __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); - __entry->icount = atomic_read(&inode->i_count); + __entry->icount = icount_read(inode); __entry->owner = fl->c.flc_owner; __entry->flags = fl->c.flc_flags; __entry->type = fl->c.flc_type; diff --git a/security/landlock/fs.c b/security/landlock/fs.c index c04f8879ad03..0bade2c5aa1d 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1281,7 +1281,7 @@ static void hook_sb_delete(struct super_block *const sb) struct landlock_object *object; /* Only handles referenced inodes. */ - if (!atomic_read(&inode->i_count)) + if (!icount_read(inode)) continue; /* From 90ccf10de527c0c9b117beddd09ee7ac38efaa5b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 1 Sep 2025 12:40:38 +0200 Subject: [PATCH 18/20] inode: fix whitespace issues Fix two minor whitespace issues. 
Signed-off-by: Christian Brauner --- fs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index fe4868e2a954..833de5457a06 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -550,11 +550,11 @@ static void __inode_add_lru(struct inode *inode, bool rotate) struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit) { - void *bit_address; + void *bit_address; - bit_address = inode_state_wait_address(inode, bit); - init_wait_var_entry(wqe, bit_address, 0); - return __var_waitqueue(bit_address); + bit_address = inode_state_wait_address(inode, bit); + init_wait_var_entry(wqe, bit_address, 0); + return __var_waitqueue(bit_address); } EXPORT_SYMBOL(inode_bit_waitqueue); @@ -2938,7 +2938,7 @@ EXPORT_SYMBOL(mode_strip_sgid); */ void dump_inode(struct inode *inode, const char *reason) { - pr_warn("%s encountered for inode %px", reason, inode); + pr_warn("%s encountered for inode %px", reason, inode); } EXPORT_SYMBOL(dump_inode); From cde560f98a9b6e64dd675f6bd10137cc8243a32a Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 11 Sep 2025 08:56:41 +0200 Subject: [PATCH 19/20] fs: expand dump_inode() This adds the fs name and a few fields from struct inode: i_mode, i_opflags, i_flags, i_state and i_count. All values are printed raw, with no attempt to pretty-print anything. Compile tested on i386 and runtime tested on amd64. Sample output: [ 23.121281] VFS_WARN_ON_INODE("crap") encountered for inode ffff9a1a83ce3660 fs pipefs mode 10600 opflags 0x4 flags 0x0 state 0x38 count 0 Signed-off-by: Mateusz Guzik --- fs/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 833de5457a06..bf7503760206 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2935,10 +2935,18 @@ EXPORT_SYMBOL(mode_strip_sgid); * * TODO: add a proper inode dumping routine, this is a stub to get debug off the * ground.
+ * + * TODO: handle getting to fs type with get_kernel_nofault()? + * See dump_mapping() above. */ void dump_inode(struct inode *inode, const char *reason) { - pr_warn("%s encountered for inode %px", reason, inode); + struct super_block *sb = inode->i_sb; + + pr_warn("%s encountered for inode %px\n" + "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", + reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, + inode->i_flags, inode->i_state, atomic_read(&inode->i_count)); } EXPORT_SYMBOL(dump_inode); From 2ef435a872abc347dc0a92f1c213bb0af3cbf195 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Wed, 17 Sep 2025 17:36:31 +0200 Subject: [PATCH 20/20] fs: add might_sleep() annotation to iput() and more When iput() drops the reference counter to zero, it may sleep via inode_wait_for_writeback(). This happens rarely because it's usually the dcache which evicts inodes, but really iput() should only ever be called in contexts where sleeping is allowed. This annotation allows finding buggy callers. Additionally, this patch annotates a few low-level functions that can call iput() conditionally. 
Cc: Mateusz Guzik Signed-off-by: Max Kellermann Link: https://lore.kernel.org/20250917153632.2228828-1-max.kellermann@ionos.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index bf7503760206..1a9d4fa5e0cb 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1279,6 +1279,8 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; + might_sleep(); + again: spin_lock(&inode_hash_lock); old = find_inode(inode->i_sb, head, test, data, true); @@ -1382,6 +1384,8 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode, *new; + might_sleep(); + again: inode = find_inode(sb, head, test, data, false); if (inode) { @@ -1422,6 +1426,9 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + + might_sleep(); + again: inode = find_inode_fast(sb, head, ino, false); if (inode) { @@ -1605,6 +1612,9 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; + + might_sleep(); + again: inode = ilookup5_nowait(sb, hashval, test, data); if (inode) { @@ -1630,6 +1640,9 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + + might_sleep(); + again: inode = find_inode_fast(sb, head, ino, false); @@ -1780,6 +1793,8 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); + might_sleep(); + while (1) { struct inode *old = NULL; spin_lock(&inode_hash_lock); @@ -1826,6 +1841,8 @@ int insert_inode_locked4(struct inode 
*inode, unsigned long hashval, { struct inode *old; + might_sleep(); + inode->i_state |= I_CREATING; old = inode_insert5(inode, hashval, test, NULL, data); @@ -1908,6 +1925,7 @@ static void iput_final(struct inode *inode) */ void iput(struct inode *inode) { + might_sleep(); if (unlikely(!inode)) return;