From a6773e6932cbfbc0e123bc0529ea27f2a8abb833 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 23 Oct 2025 10:38:50 -0700 Subject: [PATCH 01/35] jfs: Rename _inline to avoid conflict with clang's '-fms-extensions' Building fs/jfs with clang and '-fms-extensions' errors with: In file included from fs/jfs/jfs_unicode.c:8: fs/jfs/jfs_incore.h:86:13: error: type name does not allow function specifier to be specified 86 | unchar _inline[128]; | ^ fs/jfs/jfs_incore.h:86:20: error: expected member name or ';' after declaration specifiers 86 | unchar _inline[128]; | ~~~~~~~~~~~~~~^ '-fms-extensions' in clang enables several other Microsoft specific keywords such as _inline [1], presumably for compatibility with MSVC, as Microsoft's documentation [2] mentions: For compatibility with previous versions, _inline and _forceinline are synonyms for __inline and __forceinline, respectively Rename the _inline array in 'struct jfs_inode_info' to _inline_sym to avoid this conflict, which is not a large workaround as this member is only ever referred to via the i_inline macro. Link: https://github.com/llvm/llvm-project/blob/249883d0c5883996bed038cd82a8999f342994c9/clang/include/clang/Basic/TokenKinds.def#L744-L79 [1] Link: https://learn.microsoft.com/en-us/cpp/c-language/inline-functions [2] Acked-by: Dave Kleikamp Link: https://patch.msgid.link/20251023-jfs-fix-conflict-with-clang-ms-ext-v1-1-e219d59a1e68@kernel.org Signed-off-by: Nathan Chancellor --- fs/jfs/jfs_incore.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 10934f9a11be..5aaafedb8fbc 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -76,14 +76,14 @@ struct jfs_inode_info { struct { unchar _unused[16]; /* 16: */ dxd_t _dxd; /* 16: */ - /* _inline may overflow into _inline_ea when needed */ + /* _inline_sym may overflow into _inline_ea when needed */ /* _inline_ea may overlay the last part of * file._xtroot if maxentry = XTROOTINITSLOT */ union { struct { /* 128: inline symlink */ - unchar _inline[128]; + unchar _inline_sym[128]; /* 128: inline extended attr */ unchar _inline_ea[128]; }; @@ -101,7 +101,7 @@ struct jfs_inode_info { #define i_imap u.file._imap #define i_dirtable u.dir._table #define i_dtroot u.dir._dtroot -#define i_inline u.link._inline +#define i_inline u.link._inline_sym #define i_inline_ea u.link._inline_ea #define i_inline_all u.link._inline_all From c4781dc3d1cf0e017e1f290607ddc56cfe187afc Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 20 Oct 2025 16:22:27 +0200 Subject: [PATCH 02/35] Kbuild: enable -fms-extensions Once in a while, it turns out that enabling -fms-extensions could allow some slightly prettier code. But every time it has come up, the code that had to be used instead has been deemed "not too awful" and not worth introducing another compiler flag for. That's probably true for each individual case, but then it's somewhat of a chicken/egg situation. If we just "bite the bullet" as Linus says and enable it once and for all, it is available whenever a use case turns up, and no individual case has to justify it. A lore.kernel.org search provides these examples: - https://lore.kernel.org/lkml/200706301813.58435.agruen@suse.de/ - https://lore.kernel.org/lkml/20180419152817.GD25406@bombadil.infradead.org/ - https://lore.kernel.org/lkml/170622208395.21664.2510213291504081000@noble.neil.brown.name/ - https://lore.kernel.org/lkml/87h6475w9q.fsf@prevas.dk/ - https://lore.kernel.org/lkml/CAHk-=wjeZwww6Zswn6F_iZTpUihTSNKYppLqj36iQDDhfntuEw@mail.gmail.com/ Undoubtedly, there are more places in the code where this could also be used but where -fms-extensions just didn't come up in any discussion. Signed-off-by: Rasmus Villemoes Acked-by: David Sterba Link: https://patch.msgid.link/20251020142228.1819871-2-linux@rasmusvillemoes.dk [nathan: Move disabled clang warning to scripts/Makefile.extrawarn and adjust comment] Signed-off-by: Nathan Chancellor --- Makefile | 3 +++ scripts/Makefile.extrawarn | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 17cfa11ca716..1dc8873d4968 100644 --- a/Makefile +++ b/Makefile @@ -1061,6 +1061,9 @@ NOSTDINC_FLAGS += -nostdinc # perform bounds checking. KBUILD_CFLAGS += $(call cc-option, -fstrict-flex-arrays=3) +# Allow including a tagged struct or union anonymously in another struct/union. +KBUILD_CFLAGS += -fms-extensions + # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += -fno-strict-overflow diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index 6af392f9cd02..68e6fafcb80c 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -28,8 +28,10 @@ endif KBUILD_CFLAGS-$(CONFIG_CC_NO_ARRAY_BOUNDS) += -Wno-array-bounds ifdef CONFIG_CC_IS_CLANG -# The kernel builds with '-std=gnu11' so use of GNU extensions is acceptable. +# The kernel builds with '-std=gnu11' and '-fms-extensions' so use of GNU and +# Microsoft extensions is acceptable. KBUILD_CFLAGS += -Wno-gnu +KBUILD_CFLAGS += -Wno-microsoft-anon-tag # Clang checks for overflow/truncation with '%p', while GCC does not: # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111219 From 5ff8ad3909524ad55297a434f87c238224825bf4 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 30 Oct 2025 21:26:28 -0400 Subject: [PATCH 03/35] kbuild: Add '-fms-extensions' to areas with dedicated CFLAGS This is a follow up to commit c4781dc3d1cf ("Kbuild: enable -fms-extensions") but in a separate change due to being substantially different from the initial submission. There are many places within the kernel that use their own CFLAGS instead of the main KBUILD_CFLAGS, meaning code written with the main kernel's use of '-fms-extensions' in mind that may be tangentially included in these areas will result in "error: declaration does not declare anything" messages from the compiler. Add '-fms-extensions' to all these areas to ensure consistency, along with -Wno-microsoft-anon-tag to silence clang's warning about use of the extension that the kernel cares about using. parisc does not build with clang so it does not need this warning flag. LoongArch does not need it either because -W flags from KBUILD_FLAGS are pulled into cflags-vdso. Reported-by: Christian Brauner Closes: https://lore.kernel.org/20251030-meerjungfrau-getrocknet-7b46eacc215d@brauner/ Reviewed-by: Christian Brauner Acked-by: Ard Biesheuvel Signed-off-by: Nathan Chancellor --- arch/arm64/kernel/vdso32/Makefile | 3 ++- arch/loongarch/vdso/Makefile | 2 +- arch/parisc/boot/compressed/Makefile | 2 +- arch/powerpc/boot/Makefile | 3 ++- arch/s390/Makefile | 3 ++- arch/s390/purgatory/Makefile | 3 ++- arch/x86/Makefile | 4 +++- arch/x86/boot/compressed/Makefile | 7 +++++-- drivers/firmware/efi/libstub/Makefile | 4 ++-- 9 files changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index ffa3536581f6..9d0efed91414 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -63,7 +63,7 @@ VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ $(filter -Werror,$(KBUILD_CPPFLAGS)) \ -Werror-implicit-function-declaration \ -Wno-format-security \ - -std=gnu11 + -std=gnu11 -fms-extensions VDSO_CFLAGS += -O2 # Some useful compiler-dependent flags from top-level Makefile VDSO_CFLAGS += $(call cc32-option,-Wno-pointer-sign) @@ -71,6 +71,7 @@ VDSO_CFLAGS += -fno-strict-overflow VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes) VDSO_CFLAGS += -Werror=date-time VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types) +VDSO_CFLAGS += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag) # Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is # unreliable. diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile index d8316f993482..c0cc3ca5da9f 100644 --- a/arch/loongarch/vdso/Makefile +++ b/arch/loongarch/vdso/Makefile @@ -19,7 +19,7 @@ ccflags-vdso := \ cflags-vdso := $(ccflags-vdso) \ -isystem $(shell $(CC) -print-file-name=include) \ $(filter -W%,$(filter-out -Wa$(comma)%,$(KBUILD_CFLAGS))) \ - -std=gnu11 -O2 -g -fno-strict-aliasing -fno-common -fno-builtin \ + -std=gnu11 -fms-extensions -O2 -g -fno-strict-aliasing -fno-common -fno-builtin \ -fno-stack-protector -fno-jump-tables -DDISABLE_BRANCH_PROFILING \ $(call cc-option, -fno-asynchronous-unwind-tables) \ $(call cc-option, -fno-stack-protector) diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile index 17c42d718eb3..f8481e4e9d21 100644 --- a/arch/parisc/boot/compressed/Makefile +++ b/arch/parisc/boot/compressed/Makefile @@ -18,7 +18,7 @@ KBUILD_CFLAGS += -fno-PIE -mno-space-regs -mdisable-fpregs -Os ifndef CONFIG_64BIT KBUILD_CFLAGS += -mfast-indirect-calls endif -KBUILD_CFLAGS += -std=gnu11 +KBUILD_CFLAGS += -std=gnu11 -fms-extensions LDFLAGS_vmlinux := -X -e startup --as-needed -T $(obj)/vmlinux: $(obj)/vmlinux.lds $(addprefix $(obj)/, $(OBJECTS)) $(LIBGCC) FORCE diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index c47b78c1d3e7..f1a4761ebd44 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -70,7 +70,7 @@ BOOTCPPFLAGS := -nostdinc $(LINUXINCLUDE) BOOTCPPFLAGS += -isystem $(shell $(BOOTCC) -print-file-name=include) BOOTCFLAGS := $(BOOTTARGETFLAGS) \ - -std=gnu11 \ + -std=gnu11 -fms-extensions \ -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -O2 \ -msoft-float -mno-altivec -mno-vsx \ @@ -86,6 +86,7 @@ BOOTARFLAGS := -crD ifdef CONFIG_CC_IS_CLANG BOOTCFLAGS += $(CLANG_FLAGS) +BOOTCFLAGS += -Wno-microsoft-anon-tag BOOTAFLAGS += $(CLANG_FLAGS) endif diff --git a/arch/s390/Makefile b/arch/s390/Makefile index b4769241332b..8578361133a4 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -22,7 +22,7 @@ KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__ ifndef CONFIG_AS_IS_LLVM KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf)) endif -KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack -std=gnu11 +KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack -std=gnu11 -fms-extensions KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY KBUILD_CFLAGS_DECOMPRESSOR += -D__DECOMPRESSOR KBUILD_CFLAGS_DECOMPRESSOR += -Wno-pointer-sign @@ -35,6 +35,7 @@ KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-membe KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,)) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds) +KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag) UTS_MACHINE := s390x STACK_SIZE := $(if $(CONFIG_KASAN),65536,$(if $(CONFIG_KMSAN),65536,16384)) diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index bd39b36e7bd6..0c196a5b194a 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -13,7 +13,7 @@ CFLAGS_sha256.o := -D__NO_FORTIFY $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(call if_changed_rule,as_o_S) -KBUILD_CFLAGS := -std=gnu11 -fno-strict-aliasing -Wall -Wstrict-prototypes +KBUILD_CFLAGS := -std=gnu11 -fms-extensions -fno-strict-aliasing -Wall -Wstrict-prototypes KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common @@ -21,6 +21,7 @@ KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING KBUILD_CFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += $(CLANG_FLAGS) +KBUILD_CFLAGS += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) KBUILD_AFLAGS += -D__DISABLE_EXPORTS diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4db7e4bf69f5..e20e25b8b16c 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -48,7 +48,8 @@ endif # How to compile the 16-bit code. Note we always compile for -march=i386; # that way we can complain to the user if the CPU is insufficient. -REALMODE_CFLAGS := -std=gnu11 -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \ +REALMODE_CFLAGS := -std=gnu11 -fms-extensions -m16 -g -Os \ + -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none) @@ -60,6 +61,7 @@ REALMODE_CFLAGS += $(cc_stack_align4) REALMODE_CFLAGS += $(CLANG_FLAGS) ifdef CONFIG_CC_IS_CLANG REALMODE_CFLAGS += -Wno-gnu +REALMODE_CFLAGS += -Wno-microsoft-anon-tag endif export REALMODE_CFLAGS diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 74657589264d..68f9d7a1683b 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -25,7 +25,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ # avoid errors with '-march=i386', and future flags may depend on the target to # be valid. KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS) -KBUILD_CFLAGS += -std=gnu11 +KBUILD_CFLAGS += -std=gnu11 -fms-extensions KBUILD_CFLAGS += -fno-strict-aliasing -fPIE KBUILD_CFLAGS += -Wundef KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING @@ -36,7 +36,10 @@ KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += -ffreestanding -fshort-wchar KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) -KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +ifdef CONFIG_CC_IS_CLANG +KBUILD_CFLAGS += -Wno-gnu +KBUILD_CFLAGS += -Wno-microsoft-anon-tag +endif KBUILD_CFLAGS += -Wno-pointer-sign KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += -D__DISABLE_EXPORTS diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 94b05e4451dd..7d15a85d579f 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -11,12 +11,12 @@ cflags-y := $(KBUILD_CFLAGS) cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small -cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ -std=gnu11 \ +cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ -std=gnu11 -fms-extensions \ -fPIC -fno-strict-aliasing -mno-red-zone \ -mno-mmx -mno-sse -fshort-wchar \ -Wno-pointer-sign \ $(call cc-disable-warning, address-of-packed-member) \ - $(call cc-disable-warning, gnu) \ + $(if $(CONFIG_CC_IS_CLANG),-Wno-gnu -Wno-microsoft-anon-tag) \ -fno-asynchronous-unwind-tables \ $(CLANG_FLAGS) From b2c43efc3c8d9cda34eb8421249d15e6fed4d650 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 18 Sep 2025 18:24:47 +0200 Subject: [PATCH 04/35] initrd: Replace simple_strtol with kstrtoint to improve ramdisk_start_setup Replace simple_strtol() with the recommended kstrtoint() for parsing the 'ramdisk_start=' boot parameter. Unlike simple_strtol(), which returns a a long, kstrtoint() converts the string directly to an integer and avoids implicit casting. Check the return value of kstrtoint() and reject invalid values. This adds error handling while preserving existing behavior for valid values, and removes use of the deprecated simple_strtol() helper. Signed-off-by: Thorsten Blum Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- init/do_mounts_rd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index 19d9f33dcacf..eddbe5cb0413 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -29,8 +29,7 @@ int __initdata rd_image_start; /* starting block # of image */ static int __init ramdisk_start_setup(char *str) { - rd_image_start = simple_strtol(str,NULL,0); - return 1; + return kstrtoint(str, 0, &rd_image_start) == 0; } __setup("ramdisk_start=", ramdisk_start_setup); From 10436adf9df60ae73390754b47a4bdebf728529f Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 14 Aug 2025 16:21:37 +0200 Subject: [PATCH 05/35] iomap: use largest_zero_folio() in iomap_dio_zero() iomap_dio_zero() uses a custom allocated memory of zeroes for padding zeroes. This was a temporary solution until there was a way to request a zero folio that was greater than the PAGE_SIZE. Use largest_zero_folio() function instead of using the custom allocated memory of zeroes. There is no guarantee from largest_zero_folio() function that it will always return a PMD sized folio. Adapt the code so that it can also work if largest_zero_folio() returns a ZERO_PAGE. Signed-off-by: Pankaj Raghav Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 5d5d63efbd57..f8acb41c76d4 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -24,13 +24,6 @@ #define IOMAP_DIO_WRITE (1U << 30) #define IOMAP_DIO_DIRTY (1U << 31) -/* - * Used for sub block zeroing in iomap_dio_zero() - */ -#define IOMAP_ZERO_PAGE_SIZE (SZ_64K) -#define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE)) -static struct page *zero_page; - struct iomap_dio { struct kiocb *iocb; const struct iomap_dio_ops *dops; @@ -285,24 +278,35 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, { struct inode *inode = file_inode(dio->iocb->ki_filp); struct bio *bio; + struct folio *zero_folio = largest_zero_folio(); + int nr_vecs = max(1, i_blocksize(inode) / folio_size(zero_folio)); if (!len) return 0; + /* - * Max block size supported is 64k + * This limit shall never be reached as most filesystems have a + * maximum blocksize of 64k. */ - if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE)) + if (WARN_ON_ONCE(nr_vecs > BIO_MAX_VECS)) return -EINVAL; - bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); + bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, + REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - __bio_add_page(bio, zero_page, len, 0); + while (len > 0) { + unsigned int io_len = min(len, folio_size(zero_folio)); + + bio_add_folio_nofail(bio, zero_folio, io_len, 0); + len -= io_len; + } iomap_dio_submit_bio(iter, dio, bio, pos); + return 0; } @@ -825,15 +829,3 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, return iomap_dio_complete(dio); } EXPORT_SYMBOL_GPL(iomap_dio_rw); - -static int __init iomap_dio_init(void) -{ - zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, - IOMAP_ZERO_PAGE_ORDER); - - if (!zero_page) - return -ENOMEM; - - return 0; -} -fs_initcall(iomap_dio_init); From 0bbb838f385aa7073510b727706a9c0215c8cd9f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 11 Oct 2025 13:00:10 -0700 Subject: [PATCH 06/35] ecryptfs: Use MD5 library instead of crypto_shash eCryptfs uses MD5 for a couple unusual purposes: to "mix" the key into the IVs for file contents encryption (similar to ESSIV), and to prepend some key-dependent bytes to the plaintext when encrypting filenames (which is useless since eCryptfs encrypts the filenames with ECB). Currently, eCryptfs computes these MD5 hashes using the crypto_shash API. Update it to instead use the MD5 library API. This is simpler and faster: the library doesn't require memory allocations, can't fail, and provides direct access to MD5 without overhead such as indirect calls. To preserve the existing behavior of eCryptfs support being disabled when the kernel is booted with "fips=1", make ecryptfs_get_tree() check fips_enabled itself. Previously it relied on crypto_alloc_shash("md5") failing. I don't know for sure that this is actually needed; e.g., it could be argued that eCryptfs's use of MD5 isn't for a security purpose as far as FIPS is concerned. But this preserves the existing behavior. Tested by verifying that an existing eCryptfs can still be mounted with a kernel that has this commit, with all the files matching. Also tested creating a filesystem with this commit and mounting+reading it without. Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20251011200010.193140-1-ebiggers@kernel.org Acked-by: Ard Biesheuvel Signed-off-by: Christian Brauner --- fs/ecryptfs/Kconfig | 2 +- fs/ecryptfs/crypto.c | 90 ++++------------------------------- fs/ecryptfs/ecryptfs_kernel.h | 13 ++--- fs/ecryptfs/inode.c | 7 +-- fs/ecryptfs/keystore.c | 65 +++++-------------------- fs/ecryptfs/main.c | 7 +++ fs/ecryptfs/super.c | 5 +- 7 files changed, 35 insertions(+), 154 deletions(-) diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 1bdeaa6d5790..c2f4fb41b4e6 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -4,7 +4,7 @@ config ECRYPT_FS depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) select CRYPTO_ECB select CRYPTO_CBC - select CRYPTO_MD5 + select CRYPTO_LIB_MD5 help Encrypted filesystem that operates on the VFS layer. See to learn more about diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 69536cacdea8..260f8a4938b0 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -9,7 +9,6 @@ * Michael C. Thompson */ -#include #include #include #include @@ -48,32 +47,6 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size) } } -/** - * ecryptfs_calculate_md5 - calculates the md5 of @src - * @dst: Pointer to 16 bytes of allocated memory - * @crypt_stat: Pointer to crypt_stat struct for the current inode - * @src: Data to be md5'd - * @len: Length of @src - * - * Uses the allocated crypto context that crypt_stat references to - * generate the MD5 sum of the contents of src. - */ -static int ecryptfs_calculate_md5(char *dst, - struct ecryptfs_crypt_stat *crypt_stat, - char *src, int len) -{ - int rc = crypto_shash_tfm_digest(crypt_stat->hash_tfm, src, len, dst); - - if (rc) { - printk(KERN_ERR - "%s: Error computing crypto hash; rc = [%d]\n", - __func__, rc); - goto out; - } -out: - return rc; -} - static int ecryptfs_crypto_api_algify_cipher_name(char **algified_name, char *cipher_name, char *chaining_modifier) @@ -104,13 +77,10 @@ out: * * Generate the initialization vector from the given root IV and page * offset. - * - * Returns zero on success; non-zero on error. */ -int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, - loff_t offset) +void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset) { - int rc = 0; char dst[MD5_DIGEST_SIZE]; char src[ECRYPTFS_MAX_IV_BYTES + 16]; @@ -129,20 +99,12 @@ int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, ecryptfs_printk(KERN_DEBUG, "source:\n"); ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16)); } - rc = ecryptfs_calculate_md5(dst, crypt_stat, src, - (crypt_stat->iv_bytes + 16)); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error attempting to compute " - "MD5 while generating IV for a page\n"); - goto out; - } + md5(src, crypt_stat->iv_bytes + 16, dst); memcpy(iv, dst, crypt_stat->iv_bytes); if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, "derived iv:\n"); ecryptfs_dump_hex(iv, crypt_stat->iv_bytes); } -out: - return rc; } /** @@ -151,29 +113,14 @@ out: * * Initialize the crypt_stat structure. */ -int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) { - struct crypto_shash *tfm; - int rc; - - tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0); - if (IS_ERR(tfm)) { - rc = PTR_ERR(tfm); - ecryptfs_printk(KERN_ERR, "Error attempting to " - "allocate crypto context; rc = [%d]\n", - rc); - return rc; - } - memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); INIT_LIST_HEAD(&crypt_stat->keysig_list); mutex_init(&crypt_stat->keysig_list_mutex); mutex_init(&crypt_stat->cs_mutex); mutex_init(&crypt_stat->cs_tfm_mutex); - crypt_stat->hash_tfm = tfm; crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED; - - return 0; } /** @@ -187,7 +134,6 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) struct ecryptfs_key_sig *key_sig, *key_sig_tmp; crypto_free_skcipher(crypt_stat->tfm); - crypto_free_shash(crypt_stat->hash_tfm); list_for_each_entry_safe(key_sig, key_sig_tmp, &crypt_stat->keysig_list, crypt_stat_list) { list_del(&key_sig->crypt_stat_list); @@ -361,14 +307,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat, int rc; extent_base = (((loff_t)page_index) * (PAGE_SIZE / extent_size)); - rc = ecryptfs_derive_iv(extent_iv, crypt_stat, - (extent_base + extent_offset)); - if (rc) { - ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for " - "extent [0x%.16llx]; rc = [%d]\n", - (unsigned long long)(extent_base + extent_offset), rc); - goto out; - } + ecryptfs_derive_iv(extent_iv, crypt_stat, extent_base + extent_offset); sg_init_table(&src_sg, 1); sg_init_table(&dst_sg, 1); @@ -609,31 +548,20 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) */ int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat) { - int rc = 0; char dst[MD5_DIGEST_SIZE]; BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE); BUG_ON(crypt_stat->iv_bytes <= 0); if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) { - rc = -EINVAL; ecryptfs_printk(KERN_WARNING, "Session key not valid; " "cannot generate root IV\n"); - goto out; - } - rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key, - crypt_stat->key_size); - if (rc) { - ecryptfs_printk(KERN_WARNING, "Error attempting to compute " - "MD5 while generating root IV\n"); - goto out; - } - memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes); -out: - if (rc) { memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes); crypt_stat->flags |= ECRYPTFS_SECURITY_WARNING; + return -EINVAL; } - return rc; + md5(crypt_stat->key, crypt_stat->key_size, dst); + memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes); + return 0; } static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat) diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 9e6ab0b41337..62a2ea7f59ed 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -14,6 +14,7 @@ #ifndef ECRYPTFS_KERNEL_H #define ECRYPTFS_KERNEL_H +#include #include #include #include @@ -137,8 +138,6 @@ ecryptfs_get_key_payload_data(struct key *key) + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES) #define ECRYPTFS_DEFAULT_CIPHER "aes" #define ECRYPTFS_DEFAULT_KEY_BYTES 16 -#define ECRYPTFS_DEFAULT_HASH "md5" -#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED @@ -163,8 +162,6 @@ ecryptfs_get_key_payload_data(struct key *key) * ECRYPTFS_MAX_IV_BYTES */ #define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 #define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ -#define MD5_DIGEST_SIZE 16 -#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE #define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \ + ECRYPTFS_SIG_SIZE + 1 + 1) #define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \ @@ -237,8 +234,6 @@ struct ecryptfs_crypt_stat { unsigned int extent_mask; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct crypto_skcipher *tfm; - struct crypto_shash *hash_tfm; /* Crypto context for generating - * the initialization vectors */ unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; @@ -558,7 +553,7 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, int sg_size); int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_rotate_iv(unsigned char *iv); -int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_destroy_mount_crypt_stat( struct ecryptfs_mount_crypt_stat *mount_crypt_stat); @@ -693,8 +688,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, char *data, size_t max_packet_size); int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, struct ecryptfs_mount_crypt_stat *mount_crypt_stat); -int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, - loff_t offset); +void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset); extern const struct xattr_handler * const ecryptfs_xattr_handlers[]; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index ed1394da8d6b..bae9011fa62f 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -903,11 +903,8 @@ static int ecryptfs_setattr(struct mnt_idmap *idmap, struct ecryptfs_crypt_stat *crypt_stat; crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; - if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) { - rc = ecryptfs_init_crypt_stat(crypt_stat); - if (rc) - return rc; - } + if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) + ecryptfs_init_crypt_stat(crypt_stat); inode = d_inode(dentry); lower_inode = ecryptfs_inode_to_lower(inode); lower_dentry = ecryptfs_dentry_to_lower(dentry); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 7f9f68c00ef6..bbf8603242fa 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -11,7 +11,6 @@ * Trevor S. Highland */ -#include #include #include #include @@ -601,10 +600,7 @@ struct ecryptfs_write_tag_70_packet_silly_stack { struct crypto_skcipher *skcipher_tfm; struct skcipher_request *skcipher_req; char iv[ECRYPTFS_MAX_IV_BYTES]; - char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; - char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; - struct crypto_shash *hash_tfm; - struct shash_desc *hash_desc; + char hash[MD5_DIGEST_SIZE]; }; /* @@ -741,51 +737,15 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "password tokens\n", __func__); goto out_free_unlock; } - s->hash_tfm = crypto_alloc_shash(ECRYPTFS_TAG_70_DIGEST, 0, 0); - if (IS_ERR(s->hash_tfm)) { - rc = PTR_ERR(s->hash_tfm); - printk(KERN_ERR "%s: Error attempting to " - "allocate hash crypto context; rc = [%d]\n", - __func__, rc); - goto out_free_unlock; - } - s->hash_desc = kmalloc(sizeof(*s->hash_desc) + - crypto_shash_descsize(s->hash_tfm), GFP_KERNEL); - if (!s->hash_desc) { - rc = -ENOMEM; - goto out_release_free_unlock; - } - - s->hash_desc->tfm = s->hash_tfm; - - rc = crypto_shash_digest(s->hash_desc, - (u8 *)s->auth_tok->token.password.session_key_encryption_key, - s->auth_tok->token.password.session_key_encryption_key_bytes, - s->hash); - if (rc) { - printk(KERN_ERR - "%s: Error computing crypto hash; rc = [%d]\n", - __func__, rc); - goto out_release_free_unlock; - } + md5(s->auth_tok->token.password.session_key_encryption_key, + s->auth_tok->token.password.session_key_encryption_key_bytes, + s->hash); for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) { s->block_aligned_filename[s->j] = - s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; - if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) - == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { - rc = crypto_shash_digest(s->hash_desc, (u8 *)s->hash, - ECRYPTFS_TAG_70_DIGEST_SIZE, - s->tmp_hash); - if (rc) { - printk(KERN_ERR - "%s: Error computing crypto hash; " - "rc = [%d]\n", __func__, rc); - goto out_release_free_unlock; - } - memcpy(s->hash, s->tmp_hash, - ECRYPTFS_TAG_70_DIGEST_SIZE); - } + s->hash[s->j % MD5_DIGEST_SIZE]; + if ((s->j % MD5_DIGEST_SIZE) == (MD5_DIGEST_SIZE - 1)) + md5(s->hash, MD5_DIGEST_SIZE, s->hash); if (s->block_aligned_filename[s->j] == '\0') s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL; } @@ -798,7 +758,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "convert filename memory to scatterlist; rc = [%d]. " "block_aligned_filename_size = [%zd]\n", __func__, rc, s->block_aligned_filename_size); - goto out_release_free_unlock; + goto out_free_unlock; } rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, s->dst_sg, 2); @@ -807,7 +767,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, "convert encrypted filename memory to scatterlist; " "rc = [%d]. block_aligned_filename_size = [%zd]\n", __func__, rc, s->block_aligned_filename_size); - goto out_release_free_unlock; + goto out_free_unlock; } /* The characters in the first block effectively do the job * of the IV here, so we just use 0's for the IV. Note the @@ -825,7 +785,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, rc, s->auth_tok->token.password.session_key_encryption_key, mount_crypt_stat->global_default_fn_cipher_key_bytes); - goto out_release_free_unlock; + goto out_free_unlock; } skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg, s->block_aligned_filename_size, s->iv); @@ -833,13 +793,11 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt filename; " "rc = [%d]\n", __func__, rc); - goto out_release_free_unlock; + goto out_free_unlock; } s->i += s->block_aligned_filename_size; (*packet_size) = s->i; (*remaining_bytes) -= (*packet_size); -out_release_free_unlock: - crypto_free_shash(s->hash_tfm); out_free_unlock: kfree_sensitive(s->block_aligned_filename); out_unlock: @@ -850,7 +808,6 @@ out: key_put(auth_tok_key); } skcipher_request_free(s->skcipher_req); - kfree_sensitive(s->hash_desc); kfree(s); return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 16ea14dd2c62..c12dc680f8fe 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -454,6 +455,12 @@ static int ecryptfs_get_tree(struct fs_context *fc) goto out; } + if (fips_enabled) { + rc = -EINVAL; + err = "eCryptfs support is disabled due to FIPS"; + goto out; + } + s = sget_fc(fc, NULL, set_anon_super_fc); if (IS_ERR(s)) { rc = PTR_ERR(s); diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index e7b7f426fecf..3bc21d677564 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -41,10 +41,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb) inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL); if (unlikely(!inode_info)) goto out; - if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) { - kmem_cache_free(ecryptfs_inode_info_cache, inode_info); - goto out; - } + ecryptfs_init_crypt_stat(&inode_info->crypt_stat); mutex_init(&inode_info->lower_file_mutex); atomic_set(&inode_info->lower_file_count, 0); inode_info->lower_file = NULL; From 20052f2ef08a74e09a655e70e41abe648b63a9e1 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 29 Oct 2025 14:49:52 +0100 Subject: [PATCH 07/35] fs: touch up predicts in putname() 1. we already expect the refcount is 1. 2. path creation predicts name == iname I verified this straightens out the asm, no functional changes. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251029134952.658450-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 7377020a2cba..1d4d17f24fb2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -282,7 +282,7 @@ void putname(struct filename *name) return; refcnt = atomic_read(&name->refcnt); - if (refcnt != 1) { + if (unlikely(refcnt != 1)) { if (WARN_ON_ONCE(!refcnt)) return; @@ -290,7 +290,7 @@ void putname(struct filename *name) return; } - if (name->name != name->iname) { + if (unlikely(name->name != name->iname)) { __putname(name->name); kfree(name); } else From fbc22c29963668e8d5eac603ab2b90b844df9787 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 3 Nov 2025 14:37:28 +1030 Subject: [PATCH 08/35] fs: do not pass a parameter for sync_inodes_one_sb() The function sync_inodes_one_sb() will always wait for the writeback, and ignore the optional parameter. Explicitly pass NULL as parameter for the call sites inside do_sync_work(). Signed-off-by: Qu Wenruo Link: https://patch.msgid.link/8079af1c4798cb36887022a8c51547a727c353cf.1762142636.git.wqu@suse.com Signed-off-by: Christian Brauner --- fs/sync.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/sync.c b/fs/sync.c index 2955cd4c77a3..c80c2e658b09 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -122,10 +122,10 @@ static void do_sync_work(struct work_struct *work) * Sync twice to reduce the possibility we skipped some inodes / pages * because they were temporarily locked */ - iterate_supers(sync_inodes_one_sb, &nowait); + iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); sync_bdevs(false); - iterate_supers(sync_inodes_one_sb, &nowait); + iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); sync_bdevs(false); printk("Emergency Sync complete\n"); From 2706659d642eb6b2d659aab52c122d6170e59dd0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 3 Nov 2025 14:37:29 +1030 Subject: [PATCH 09/35] fs: fully sync all fses even for an emergency sync [BUG] There is a bug report that during emergency sync, btrfs only write back all the dirty data and metadadta, but no full transaction commit, resulting the super block still pointing to the old trees, thus the end user can only see the old data, not the newer one. [CAUSE] Initially this looks like a btrfs specific bug, since ext4 doesn't get affected by this one. But the root problem here is, a combination of btrfs features and the no wait nature of emergency sync. Firstly do_sync_work() will call sync_inodes_one_sb() for every fs, to writeback all the dirty pages for the fs. Btrfs will properly writeback all dirty pages, including both data and the updated metadata. So far so good. Then sync_fs_one_sb() called with @nowait, in the case of btrfs it means no full transaction commit, thus no super block update. At this stage, btrfs is only one super block update away to be fully committed. I believe it's the more or less the same for other fses too. The problem is the next step, sync_bdevs(). Normally other fses have their super block already updated in the page cache of the block device, but btrfs only updates the super block during full transaction commit. So sync_bdevs() may work for other fses, but not for btrfs, btrfs is still using its older super block, all pointing back to the old metadata and data. Thus if after emergency sync, power loss happened, the end user will only see the old data, not the newer one, despite that everything but the super block is already written back. [FIX] Since the emergency sync is already executing in a workqueue, I didn't see much need to only do a nowait sync. Especially after the fact that sync_inodes_one_sb() always wait for the writeback to finish. Instead for the second iteration of sync_fs_one_sb(), pass wait == 1 into it, so fses like btrfs can properly commit its super blocks. Reported-by: Askar Safin Link: https://lore.kernel.org/linux-btrfs/20251101150429.321537-1-safinaskar@gmail.com/ Signed-off-by: Qu Wenruo Link: https://patch.msgid.link/7b7fd40c5fe440b633b6c0c741d96ce93eb5a89a.1762142636.git.wqu@suse.com Signed-off-by: Christian Brauner --- fs/sync.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/sync.c b/fs/sync.c index c80c2e658b09..d4bb6d25b1de 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -117,6 +117,7 @@ SYSCALL_DEFINE0(sync) static void do_sync_work(struct work_struct *work) { int nowait = 0; + int wait = 1; /* * Sync twice to reduce the possibility we skipped some inodes / pages @@ -126,7 +127,7 @@ static void do_sync_work(struct work_struct *work) iterate_supers(sync_fs_one_sb, &nowait); sync_bdevs(false); iterate_supers(sync_inodes_one_sb, NULL); - iterate_supers(sync_fs_one_sb, &nowait); + iterate_supers(sync_fs_one_sb, &wait); sync_bdevs(false); printk("Emergency Sync complete\n"); kfree(work); From a6446829f841bf108fca8ca1e8f4f67b3a59d86c Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Mon, 3 Nov 2025 13:36:27 +0530 Subject: [PATCH 10/35] init: Replace simple_strtoul() with kstrtouint() in root_delay_setup() Replace deprecated simple_strtoul() with kstrtouint() for better error handling and input validation. Return 0 on parsing failure to indicate invalid parameter, maintaining existing behavior for valid inputs. The simple_strtoul() function is deprecated in favor of kstrtoint() family functions which provide better error handling and are recommended for new code and replacements. Signed-off-by: Kaushlendra Kumar Link: https://patch.msgid.link/20251103080627.1844645-1-kaushlendra.kumar@intel.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- init/do_mounts.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 6af29da8889e..64d5e25a2cb5 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -120,7 +120,8 @@ static int __init fs_names_setup(char *str) static unsigned int __initdata root_delay; static int __init root_delay_setup(char *str) { - root_delay = simple_strtoul(str, NULL, 0); + if (kstrtouint(str, 0, &root_delay)) + return 0; return 1; } From 854e8df2ce6b02c8be40d6f26bd8aa700b375bb2 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 23 Oct 2025 10:21:42 +0200 Subject: [PATCH 11/35] fs/pipe: stop duplicating union pipe_index declaration Now that we build with -fms-extensions, union pipe_index can be included as an anonymous member in struct pipe_inode_info, avoiding the duplication. Signed-off-by: Rasmus Villemoes Link: https://patch.msgid.link/20251023082142.2104456-1-linux@rasmusvillemoes.dk Signed-off-by: Nathan Chancellor Signed-off-by: Christian Brauner --- include/linux/pipe_fs_i.h | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 9d42d473d201..7f6a92ac9704 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -44,11 +44,11 @@ typedef unsigned int pipe_index_t; typedef unsigned short pipe_index_t; #endif -/* - * We have to declare this outside 'struct pipe_inode_info', - * but then we can't use 'union pipe_index' for an anonymous - * union, so we end up having to duplicate this declaration - * below. Annoying. +/** + * struct pipe_index - pipe indeces + * @head: The point of buffer production + * @tail: The point of buffer consumption + * @head_tail: unsigned long union of @head and @tail */ union pipe_index { unsigned long head_tail; @@ -63,9 +63,7 @@ union pipe_index { * @mutex: mutex protecting the whole thing * @rd_wait: reader wait point in case of empty pipe * @wr_wait: writer wait point in case of full pipe - * @head: The point of buffer production - * @tail: The point of buffer consumption - * @head_tail: unsigned long union of @head and @tail + * @pipe_index: the pipe indeces * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) @@ -87,14 +85,7 @@ struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; - /* This has to match the 'union pipe_index' above */ - union { - unsigned long head_tail; - struct { - pipe_index_t head; - pipe_index_t tail; - }; - }; + union pipe_index; unsigned int max_usage; unsigned int ring_size; From e631df89cd5d638a9d7c152dd9b0a92643efab3e Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 7 Nov 2025 15:21:47 +0100 Subject: [PATCH 12/35] fs: speed up path lookup with cheaper handling of MAY_EXEC The generic inode_permission() routine does work which is known to be of no significance for lookup. There are checks for MAY_WRITE, while the requested permission is MAY_EXEC. Additionally devcgroup_inode_permission() is called to check for devices, but it is an invariant the inode is a directory. Absent a ->permission func, execution lands in generic_permission() which checks upfront if the requested permission is granted for everyone. We can elide the branches which are guaranteed to be false and cut straight to the check if everyone happens to be allowed MAY_EXEC on the inode (which holds true most of the time). Moreover, filesystems which provide their own ->permission routine can take advantage of the optimization by setting the IOP_FASTPERM_MAY_EXEC flag on their inodes, which they can legitimately do if their MAY_EXEC handling matches generic_permission(). As a simple benchmark, as part of compilation gcc issues access(2) on numerous long paths, for example /usr/lib/gcc/x86_64-linux-gnu/12/crtendS.o Issuing access(2) on it in a loop on ext4 on Sapphire Rapids (ops/s): before: 3797556 after: 3987789 (+5%) Note: this depends on the not-yet-landed ext4 patch to mark inodes with cache_no_acl() Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251107142149.989998-2-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namei.c | 43 +++++++++++++++++++++++++++++++++++++++++-- include/linux/fs.h | 13 +++++++------ 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 1d4d17f24fb2..94cb52b01022 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -540,6 +540,9 @@ static inline int do_inode_permission(struct mnt_idmap *idmap, * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. + * + * Note: lookup_inode_permission_may_exec() does not call here. If you add + * MAY_EXEC checks, adjust it. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { @@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap, } EXPORT_SYMBOL(inode_permission); +/* + * lookup_inode_permission_may_exec - Check traversal right for given inode + * + * This is a special case routine for may_lookup() making assumptions specific + * to path traversal. Use inode_permission() if you are doing something else. + * + * Work is shaved off compared to inode_permission() as follows: + * - we know for a fact there is no MAY_WRITE to worry about + * - it is an invariant the inode is a directory + * + * Since majority of real-world traversal happens on inodes which grant it for + * everyone, we check it upfront and only resort to more expensive work if it + * fails. + * + * Filesystems which have their own ->permission hook and consequently miss out + * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC + * on their directory inodes. + */ +static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap, + struct inode *inode, int mask) +{ + /* Lookup already checked this to return -ENOTDIR */ + VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode); + VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0); + + mask |= MAY_EXEC; + + if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC)))) + return inode_permission(idmap, inode, mask); + + if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode))) + return inode_permission(idmap, inode, mask); + + return security_inode_permission(inode, mask); +} + /** * path_get - get a reference to a path * @path: path to get the reference to @@ -1855,7 +1894,7 @@ static inline int may_lookup(struct mnt_idmap *idmap, int err, mask; mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; - err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); + err = lookup_inode_permission_may_exec(idmap, nd->inode, mask); if (likely(!err)) return 0; @@ -1870,7 +1909,7 @@ static inline int may_lookup(struct mnt_idmap *idmap, if (err != -ECHILD) // hard error return err; - return inode_permission(idmap, nd->inode, MAY_EXEC); + return lookup_inode_permission_may_exec(idmap, nd->inode, 0); } static int reserve_stack(struct nameidata *nd, struct path *link) diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..ff69734b9fde 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -659,13 +659,14 @@ is_uncached_acl(struct posix_acl *acl) return (long)acl & 1; } -#define IOP_FASTPERM 0x0001 -#define IOP_LOOKUP 0x0002 -#define IOP_NOFOLLOW 0x0004 -#define IOP_XATTR 0x0008 +#define IOP_FASTPERM 0x0001 +#define IOP_LOOKUP 0x0002 +#define IOP_NOFOLLOW 0x0004 +#define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 -#define IOP_MGTIME 0x0020 -#define IOP_CACHED_LINK 0x0040 +#define IOP_MGTIME 0x0020 +#define IOP_CACHED_LINK 0x0040 +#define IOP_FASTPERM_MAY_EXEC 0x0080 /* * Inode state bits. Protected by inode->i_lock From 3e18f6256ecc1a1fce1fd8a7707c5daa162bd6b6 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 7 Nov 2025 15:21:48 +0100 Subject: [PATCH 13/35] btrfs: utilize IOP_FASTPERM_MAY_EXEC Root filesystem was ext4, btrfs was mounted on /testfs. Then issuing access(2) in a loop on /testfs/repos/linux/include/linux/fs.h on Sapphire Rapids (ops/s): before: 3447976 after: 3620879 (+5%) Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251107142149.989998-3-mjguzik@gmail.com Acked-by: David Sterba Signed-off-by: Christian Brauner --- fs/btrfs/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b1b3a0553ee..36c451a9a0bf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5837,6 +5837,8 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (ret) return ERR_PTR(ret); + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC; unlock_new_inode(&inode->vfs_inode); return inode; } @@ -6788,8 +6790,11 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, } ret = btrfs_create_new_inode(trans, &new_inode_args); - if (!ret) + if (!ret) { + if (S_ISDIR(inode->i_mode)) + inode->i_opflags |= IOP_FASTPERM_MAY_EXEC; d_instantiate_new(dentry, inode); + } btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -9169,6 +9174,11 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } +/* + * NOTE: in case you are adding MAY_EXEC check for directories: + * we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to + * elide calls here. + */ static int btrfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { From a0a28c4e41251a85b4b6db987a72ffbc8613e497 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 7 Nov 2025 15:21:49 +0100 Subject: [PATCH 14/35] fs: retire now stale MAY_WRITE predicts in inode_permission() The primary non-MAY_WRITE consumer now uses lookup_inode_permission_may_exec(). Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251107142149.989998-4-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 94cb52b01022..25cc9680e6dc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -546,7 +546,7 @@ static inline int do_inode_permission(struct mnt_idmap *idmap, */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { - if (unlikely(mask & MAY_WRITE)) { + if (mask & MAY_WRITE) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ @@ -577,7 +577,7 @@ int inode_permission(struct mnt_idmap *idmap, if (unlikely(retval)) return retval; - if (unlikely(mask & MAY_WRITE)) { + if (mask & MAY_WRITE) { /* * Nobody gets write access to an immutable file. */ From 50b2a4f19b224694e2bc71a98a7a67aeebacc95e Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 10 Nov 2025 20:47:14 +0800 Subject: [PATCH 15/35] bdev: add hint prints in sb_set_blocksize() for LBS dependency on THP Support for block sizes greater than the page size depends on large folios, which in turn require CONFIG_TRANSPARENT_HUGEPAGE to be enabled. Because the code is wrapped in multiple layers of abstraction, this dependency is rather obscure, so users may not realize it and may be unsure how to enable LBS. As suggested by Theodore, I have added hint messages in sb_set_blocksize so that users can distinguish whether a mount failure with block size larger than page size is due to lack of filesystem support or the absence of CONFIG_TRANSPARENT_HUGEPAGE. Suggested-by: Theodore Ts'o Link: https://patch.msgid.link/20251110043226.GD2988753@mit.edu Signed-off-by: Baokun Li Link: https://patch.msgid.link/20251110124714.1329978-1-libaokun@huaweicloud.com Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/bdev.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/block/bdev.c b/block/bdev.c index 810707cca970..4888831acaf5 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -217,9 +217,26 @@ int set_blocksize(struct file *file, int size) EXPORT_SYMBOL(set_blocksize); +static int sb_validate_large_blocksize(struct super_block *sb, int size) +{ + const char *err_str = NULL; + + if (!(sb->s_type->fs_flags & FS_LBS)) + err_str = "not supported by filesystem"; + else if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + err_str = "is only supported with CONFIG_TRANSPARENT_HUGEPAGE"; + + if (!err_str) + return 0; + + pr_warn_ratelimited("%s: block size(%d) > page size(%lu) %s\n", + sb->s_type->name, size, PAGE_SIZE, err_str); + return -EINVAL; +} + int sb_set_blocksize(struct super_block *sb, int size) { - if (!(sb->s_type->fs_flags & FS_LBS) && size > PAGE_SIZE) + if (size > PAGE_SIZE && sb_validate_large_blocksize(sb, size)) return 0; if (set_blocksize(sb->s_bdev_file, size)) return 0; From e41c1f42919b01649ddea25b95eaa72dce2d591a Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 9 Nov 2025 13:52:54 +0100 Subject: [PATCH 16/35] fs: touch predicts in do_dentry_open() Helps out some of the asm, the routine is still a mess. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251109125254.1288882-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/open.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/open.c b/fs/open.c index 3d64372ecc67..e3c737ede4e4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -940,7 +940,7 @@ static int do_dentry_open(struct file *f, } error = security_file_open(f); - if (error) + if (unlikely(error)) goto cleanup_all; /* @@ -950,11 +950,11 @@ static int do_dentry_open(struct file *f, * pseudo file, this call will not change the mode. */ error = fsnotify_open_perm_and_set_mode(f); - if (error) + if (unlikely(error)) goto cleanup_all; error = break_lease(file_inode(f), f->f_flags); - if (error) + if (unlikely(error)) goto cleanup_all; /* normally all 3 are set; ->open() can clear them if needed */ From 21b561dab1406e63740ebe240c7b69f19e1bcf58 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Nov 2025 16:36:22 +0100 Subject: [PATCH 17/35] fs: hide dentry_cache behind runtime const machinery Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251105153622.758836-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/dcache.c | 6 ++++-- include/asm-generic/vmlinux.lds.h | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 035cccbc9276..5c6282b03ba2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -86,7 +86,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); -static struct kmem_cache *dentry_cache __ro_after_init; +static struct kmem_cache *__dentry_cache __ro_after_init; +#define dentry_cache runtime_const_ptr(__dentry_cache) const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); @@ -3222,9 +3223,10 @@ static void __init dcache_init(void) * but it is probably not worth it because of the cache nature * of the dcache. */ - dentry_cache = KMEM_CACHE_USERCOPY(dentry, + __dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_shortname.string); + runtime_const_init(ptr, __dentry_cache); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8a9a2e732a65..20939d2445e7 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -955,7 +955,8 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) #define RUNTIME_CONST_VARIABLES \ RUNTIME_CONST(shift, d_hash_shift) \ - RUNTIME_CONST(ptr, dentry_hashtable) + RUNTIME_CONST(ptr, dentry_hashtable) \ + RUNTIME_CONST(ptr, __dentry_cache) /* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */ #define KUNIT_TABLE() \ From 9eda581bfe8a1774390dd66f365a2e00a9d27a41 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 10 Nov 2025 10:56:34 +0100 Subject: [PATCH 18/35] fs: move fd_install() slowpath into a dedicated routine and provide commentary On stock kernel gcc 14 emits avoidable register spillage: endbr64 call ffffffff81374630 <__fentry__> push %r13 push %r12 push %rbx sub $0x8,%rsp [snip] Total fast path is 99 bytes. Moving the slowpath out avoids it and shortens the fast path to 74 bytes. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251110095634.1433061-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/file.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/fs/file.c b/fs/file.c index 28743b742e3c..3f56890068aa 100644 --- a/fs/file.c +++ b/fs/file.c @@ -641,6 +641,34 @@ void put_unused_fd(unsigned int fd) EXPORT_SYMBOL(put_unused_fd); +/* + * Install a file pointer in the fd array while it is being resized. + * + * We need to make sure our update to the array does not get lost as the resizing + * thread can be copying the content as we modify it. + * + * We have two ways to do it: + * - go off CPU waiting for resize_in_progress to clear + * - take the spin lock + * + * The latter is trivial to implement and saves us from having to might_sleep() + * for debugging purposes. + * + * This is moved out of line from fd_install() to convince gcc to optimize that + * routine better. + */ +static void noinline fd_install_slowpath(unsigned int fd, struct file *file) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + spin_unlock(&files->file_lock); +} + /** * fd_install - install a file pointer in the fd array * @fd: file descriptor to install the file in @@ -658,14 +686,9 @@ void fd_install(unsigned int fd, struct file *file) return; rcu_read_lock_sched(); - if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); - rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); + fd_install_slowpath(fd, file); return; } /* coupled with smp_wmb() in expand_fdtable() */ From 030e86dfdaa789dd2e2e481d7118979a9d1f8f4e Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Nov 2025 16:06:30 +0100 Subject: [PATCH 19/35] fs: touch up predicts in path lookup Rationale: - ND_ROOT_PRESET is only set in a condition already marked unlikely - LOOKUP_IS_SCOPED already has unlikely on it, but inconsistently applied - set_root() only fails if there is a bug - most names are not empty (see !*s) - most of the time path_init() does not encounter LOOKUP_CACHED without LOOKUP_RCU - LOOKUP_IN_ROOT is a rarely seen flag Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251105150630.756606-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/namei.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 25cc9680e6dc..a5bffc2a29f6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -990,8 +990,8 @@ static int complete_walk(struct nameidata *nd) * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ - if (!(nd->state & ND_ROOT_PRESET)) - if (!(nd->flags & LOOKUP_IS_SCOPED)) + if (likely(!(nd->state & ND_ROOT_PRESET))) + if (likely(!(nd->flags & LOOKUP_IS_SCOPED))) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) @@ -1073,7 +1073,7 @@ static int nd_jump_root(struct nameidata *nd) } if (!nd->root.mnt) { int error = set_root(nd); - if (error) + if (unlikely(error)) return error; } if (nd->flags & LOOKUP_RCU) { @@ -2140,7 +2140,7 @@ static const char *handle_dots(struct nameidata *nd, int type) if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); - if (error) + if (unlikely(error)) return error; } if (nd->flags & LOOKUP_RCU) @@ -2582,10 +2582,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags) const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ - if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) + if (unlikely((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)) return ERR_PTR(-EAGAIN); - if (!*s) + if (unlikely(!*s)) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); @@ -2599,7 +2599,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); - if (nd->state & ND_ROOT_PRESET) { + if (unlikely(nd->state & ND_ROOT_PRESET)) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) @@ -2618,7 +2618,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ - if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { + if (*s == '/' && likely(!(flags & LOOKUP_IN_ROOT))) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); @@ -2671,7 +2671,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) } /* For scoped-lookups we need to set the root to the dirfd as well. */ - if (flags & LOOKUP_IS_SCOPED) { + if (unlikely(flags & LOOKUP_IS_SCOPED)) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; From c29383a874ee86af1c68488f15f88544140414fe Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 18 Nov 2025 13:07:06 -0800 Subject: [PATCH 20/35] watch_queue: Use local kmap in post_one_notification() Replace the now deprecated kmap_atomic() with kmap_local_page(). Optimize for the non-highmem cases and avoid disabling preemption and pagefaults, the caller's context is atomic anyway, but that is irrelevant to kmap. The memcpy itself does not require any such semantics and the mapping would hold valid across context switches anyway. Further, highmem is planned to to be removed[1]. [1] https://lore.kernel.org/all/4ff89b72-03ff-4447-9d21-dd6a5fe1550f@app.fastmail.com/ Signed-off-by: Davidlohr Bueso Link: https://patch.msgid.link/20251118210706.1816303-1-dave@stgolabs.net Signed-off-by: Christian Brauner --- kernel/watch_queue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 7e45559521af..52f89f1137da 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -119,9 +119,9 @@ static bool post_one_notification(struct watch_queue *wqueue, offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; get_page(page); len = n->info & WATCH_INFO_LENGTH; - p = kmap_atomic(page); + p = kmap_local_page(page); memcpy(p + offset, n, len); - kunmap_atomic(p); + kunmap_local(p); buf = pipe_buf(pipe, head); buf->page = page; From 6d228c181ed27957100bb1e77ccbb8078a22a8c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 19 Nov 2025 11:14:15 +0100 Subject: [PATCH 21/35] fs: remove spurious exports in fs/file_attr.c Commit 2f952c9e8fe1 ("fs: split fileattr related helpers into separate file") added various exports without users despite claiming to be a simple refactor. Drop them again. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251119101415.2732320-1-hch@lst.de Signed-off-by: Christian Brauner --- fs/file_attr.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/file_attr.c b/fs/file_attr.c index 1dcec88c0680..4c4916632f11 100644 --- a/fs/file_attr.c +++ b/fs/file_attr.c @@ -316,7 +316,6 @@ int ioctl_getflags(struct file *file, unsigned int __user *argp) err = put_user(fa.flags, argp); return err; } -EXPORT_SYMBOL(ioctl_getflags); int ioctl_setflags(struct file *file, unsigned int __user *argp) { @@ -337,7 +336,6 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp) } return err; } -EXPORT_SYMBOL(ioctl_setflags); int ioctl_fsgetxattr(struct file *file, void __user *argp) { @@ -350,7 +348,6 @@ int ioctl_fsgetxattr(struct file *file, void __user *argp) return err; } -EXPORT_SYMBOL(ioctl_fsgetxattr); int ioctl_fssetxattr(struct file *file, void __user *argp) { @@ -369,7 +366,6 @@ int ioctl_fssetxattr(struct file *file, void __user *argp) } return err; } -EXPORT_SYMBOL(ioctl_fssetxattr); SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename, struct file_attr __user *, ufattr, size_t, usize, From bfef6e1f3488fc09ae966cb4dd2cb09f73cff791 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 14 Nov 2025 21:18:03 +0100 Subject: [PATCH 22/35] fs: move mntput_no_expire() slowpath into a dedicated routine In the stock variant the compiler spills several registers on the stack and employs stack smashing protection, adding even more code + a branch on exit.. The actual fast path is small enough that the compiler inlines it for all callers -- the symbol is no longer emitted. Forcing noinline on it just for code-measurement purposes shows the fast path dropping from 111 to 39 bytes. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251114201803.2183505-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/namespace.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index d82910f33dc4..43191f190ef6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1345,26 +1345,12 @@ static void delayed_mntput(struct work_struct *unused) } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); -static void mntput_no_expire(struct mount *mnt) +static void noinline mntput_no_expire_slowpath(struct mount *mnt) { LIST_HEAD(list); int count; - rcu_read_lock(); - if (likely(READ_ONCE(mnt->mnt_ns))) { - /* - * Since we don't do lock_mount_hash() here, - * ->mnt_ns can change under us. However, if it's - * non-NULL, then there's a reference that won't - * be dropped until after an RCU delay done after - * turning ->mnt_ns NULL. So if we observe it - * non-NULL under rcu_read_lock(), the reference - * we are dropping is not the final one. - */ - mnt_add_count(mnt, -1); - rcu_read_unlock(); - return; - } + VFS_BUG_ON(mnt->mnt_ns); lock_mount_hash(); /* * make sure that if __legitimize_mnt() has not seen us grab @@ -1415,6 +1401,26 @@ static void mntput_no_expire(struct mount *mnt) cleanup_mnt(mnt); } +static void mntput_no_expire(struct mount *mnt) +{ + rcu_read_lock(); + if (likely(READ_ONCE(mnt->mnt_ns))) { + /* + * Since we don't do lock_mount_hash() here, + * ->mnt_ns can change under us. However, if it's + * non-NULL, then there's a reference that won't + * be dropped until after an RCU delay done after + * turning ->mnt_ns NULL. So if we observe it + * non-NULL under rcu_read_lock(), the reference + * we are dropping is not the final one. + */ + mnt_add_count(mnt, -1); + rcu_read_unlock(); + return; + } + mntput_no_expire_slowpath(mnt); +} + void mntput(struct vfsmount *mnt) { if (mnt) { From 7c179096e77eca210caf43abfcf3e556030fea3b Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 19 Nov 2025 15:29:54 +0100 Subject: [PATCH 23/35] fs: add predicts based on nd->depth Stats from nd->depth usage during the venerable kernel build collected like so: bpftrace -e 'kprobe:terminate_walk,kprobe:walk_component,kprobe:legitimize_links { @[probe] = lhist(((struct nameidata *)arg0)->depth, 0, 8, 1); }' @[kprobe:legitimize_links]: [0, 1) 6554906 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [1, 2) 3534 | | @[kprobe:terminate_walk]: [0, 1) 12153664 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| @[kprobe:walk_component]: [0, 1) 53075749 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [1, 2) 971421 | | [2, 3) 84946 | | Additionally a custom probe was added for depth within link_path_walk(): bpftrace -e 'kprobe:link_path_walk_probe { @[probe] = lhist(arg0, 0, 8, 1); }' @[kprobe:link_path_walk_probe]: [0, 1) 7528231 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [1, 2) 407905 |@@ | Given these results: 1. terminate_walk() is called towards the end of the lookup and in this test it never had any links to clean up. 2. legitimize_links() is also called towards the end of lookup and most of the time there s 0 depth. Patch consumers to avoid calling into it in that case. 3. walk_component() is typically called with WALK_MORE and zero depth, checked in that order. Check depth first and predict it is 0. 4. link_path_walk() also does not deal with a symlink most of the time when !*name Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251119142954.2909394-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/namei.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index a5bffc2a29f6..5fbb05fb96b7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -785,7 +785,8 @@ static void leave_rcu(struct nameidata *nd) static void terminate_walk(struct nameidata *nd) { - drop_links(nd); + if (unlikely(nd->depth)) + drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); @@ -882,7 +883,7 @@ static bool try_to_unlazy(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (unlikely(!legitimize_links(nd))) + if (unlikely(nd->depth && !legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; @@ -917,7 +918,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (unlikely(!legitimize_links(nd))) + if (unlikely(nd->depth && !legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { @@ -2179,7 +2180,7 @@ static const char *walk_component(struct nameidata *nd, int flags) * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { - if (!(flags & WALK_MORE) && nd->depth) + if (unlikely(nd->depth) && !(flags & WALK_MORE)) put_link(nd); return handle_dots(nd, nd->last_type); } @@ -2191,7 +2192,7 @@ static const char *walk_component(struct nameidata *nd, int flags) if (IS_ERR(dentry)) return ERR_CAST(dentry); } - if (!(flags & WALK_MORE) && nd->depth) + if (unlikely(nd->depth) && !(flags & WALK_MORE)) put_link(nd); return step_into(nd, flags, dentry); } @@ -2544,7 +2545,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ - if (!depth) { + if (likely(!depth)) { nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; From 8d79ec9e7f634e10c6cdc7f3999023bd988df1ad Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 19 Nov 2025 15:49:30 +0100 Subject: [PATCH 24/35] fs: mark lookup_slow() as noinline Otherwise it gets inlined notably in walk_component(), which convinces the compiler to push/pop additional registers in the fast path to accomodate existence of the inlined version. Shortens the fast path of that routine from 87 to 71 bytes. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251119144930.2911698-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 5fbb05fb96b7..efa592a98155 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1863,7 +1863,7 @@ again: return dentry; } -static struct dentry *lookup_slow(const struct qstr *name, +static noinline struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { From bef0202fb77b9b733054838df1d3111406bd21d4 Mon Sep 17 00:00:00 2001 From: Askar Safin Date: Thu, 20 Nov 2025 21:13:16 +0000 Subject: [PATCH 25/35] fs/splice.c: trivial fix: pipes -> pipe's Trivial fix. Signed-off-by: Askar Safin Link: https://patch.msgid.link/20251120211316.706725-1-safinaskar@gmail.com Signed-off-by: Christian Brauner --- fs/splice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/splice.c b/fs/splice.c index f5094b6d00a0..d338fe56b50b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1498,7 +1498,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, /* * For lack of a better implementation, implement vmsplice() to userspace - * as a simple copy of the pipes pages to the user iov. + * as a simple copy of the pipe's pages to the user iov. */ static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, unsigned int flags) From 54ca9e913e22e364292a484783efc4fcdb6fdc51 Mon Sep 17 00:00:00 2001 From: Askar Safin Date: Thu, 20 Nov 2025 19:51:40 +0000 Subject: [PATCH 26/35] include/linux/fs.h: trivial fix: regualr -> regular Trivial fix. Signed-off-by: Askar Safin Link: https://patch.msgid.link/20251120195140.571608-1-safinaskar@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index ff69734b9fde..e02700b4e36b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3102,7 +3102,7 @@ static inline bool inode_wrong_type(const struct inode *inode, umode_t mode) * file_start_write - get write access to a superblock for regular file io * @file: the file we want to write to * - * This is a variant of sb_start_write() which is a noop on non-regualr file. + * This is a variant of sb_start_write() which is a noop on non-regular file. * Should be matched with a call to file_end_write(). */ static inline void file_start_write(struct file *file) From 3cd9a42f1b5e34d3972237cbf8541af60844cbd4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Nov 2025 07:47:22 +0100 Subject: [PATCH 27/35] fs: refactor file timestamp update logic Currently the two high-level APIs use two helper functions to implement almost all of the logic. Refactor the two helpers and the common logic into a new file_update_time_flags routine that gets the iocb flags or 0 in case of file_update_time passed so that the entire logic is contained in a single function and can be easily understood and modified. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251120064859.2911749-2-hch@lst.de Reviewed-by: Chaitanya Kulkarni Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 54 +++++++++++++++++------------------------------------- 1 file changed, 17 insertions(+), 37 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index ec9339024ac3..4884ffa931e7 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2310,10 +2310,12 @@ out: } EXPORT_SYMBOL(current_time); -static int inode_needs_update_time(struct inode *inode) +static int file_update_time_flags(struct file *file, unsigned int flags) { + struct inode *inode = file_inode(file); struct timespec64 now, ts; - int sync_it = 0; + int sync_mode = 0; + int ret = 0; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) @@ -2323,29 +2325,23 @@ static int inode_needs_update_time(struct inode *inode) ts = inode_get_mtime(inode); if (!timespec64_equal(&ts, &now)) - sync_it |= S_MTIME; - + sync_mode |= S_MTIME; ts = inode_get_ctime(inode); if (!timespec64_equal(&ts, &now)) - sync_it |= S_CTIME; - + sync_mode |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) - sync_it |= S_VERSION; + sync_mode |= S_VERSION; - return sync_it; -} + if (!sync_mode) + return 0; -static int __file_update_time(struct file *file, int sync_mode) -{ - int ret = 0; - struct inode *inode = file_inode(file); - - /* try to update time settings */ - if (!mnt_get_write_access_file(file)) { - ret = inode_update_time(inode, sync_mode); - mnt_put_write_access_file(file); - } + if (flags & IOCB_NOWAIT) + return -EAGAIN; + if (mnt_get_write_access_file(file)) + return 0; + ret = inode_update_time(inode, sync_mode); + mnt_put_write_access_file(file); return ret; } @@ -2365,14 +2361,7 @@ static int __file_update_time(struct file *file, int sync_mode) */ int file_update_time(struct file *file) { - int ret; - struct inode *inode = file_inode(file); - - ret = inode_needs_update_time(inode); - if (ret <= 0) - return ret; - - return __file_update_time(file, ret); + return file_update_time_flags(file, 0); } EXPORT_SYMBOL(file_update_time); @@ -2394,7 +2383,6 @@ EXPORT_SYMBOL(file_update_time); static int file_modified_flags(struct file *file, int flags) { int ret; - struct inode *inode = file_inode(file); /* * Clear the security bits if the process is not being run by root. @@ -2403,17 +2391,9 @@ static int file_modified_flags(struct file *file, int flags) ret = file_remove_privs_flags(file, flags); if (ret) return ret; - if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; - - ret = inode_needs_update_time(inode); - if (ret <= 0) - return ret; - if (flags & IOCB_NOWAIT) - return -EAGAIN; - - return __file_update_time(file, ret); + return file_update_time_flags(file, flags); } /** From 7f30e7a42371af4bba53f9a875a0d320cead9f4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Nov 2025 07:47:23 +0100 Subject: [PATCH 28/35] fs: lift the FMODE_NOCMTIME check into file_update_time_flags FMODE_NOCMTIME used to be just a hack for the legacy XFS handle-based "invisible I/O", but commit e5e9b24ab8fa ("nfsd: freeze c/mtime updates with outstanding WRITE_ATTRS delegation") started using it from generic callers. I'm not sure other file systems are actually read for this in general, so the above commit should get a closer look, but for it to make any sense, file_update_time needs to respect the flag. Lift the check from file_modified_flags to file_update_time so that users of file_update_time inherit the behavior and so that all the checks are done in one place. Fixes: e5e9b24ab8fa ("nfsd: freeze c/mtime updates with outstanding WRITE_ATTRS delegation") Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251120064859.2911749-3-hch@lst.de Reviewed-by: Chaitanya Kulkarni Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 4884ffa931e7..24dab63844db 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2320,6 +2320,8 @@ static int file_update_time_flags(struct file *file, unsigned int flags) /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; + if (unlikely(file->f_mode & FMODE_NOCMTIME)) + return 0; now = current_time(inode); @@ -2391,8 +2393,6 @@ static int file_modified_flags(struct file *file, int flags) ret = file_remove_privs_flags(file, flags); if (ret) return ret; - if (unlikely(file->f_mode & FMODE_NOCMTIME)) - return 0; return file_update_time_flags(file, flags); } From 013983665227b16d0ec2c4fec19b43c3e265ebc5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Nov 2025 07:47:24 +0100 Subject: [PATCH 29/35] fs: export vfs_utimes This will be used to replace an incorrect direct call into generic_update_time in btrfs. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251120064859.2911749-4-hch@lst.de Reviewed-by: Chaitanya Kulkarni Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/utimes.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/utimes.c b/fs/utimes.c index c7c7958e57b2..3e7156396230 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -76,6 +76,7 @@ retry_deleg: out: return error; } +EXPORT_SYMBOL_GPL(vfs_utimes); static int do_utimes_path(int dfd, const char __user *filename, struct timespec64 *times, int flags) From ded99587047c7db44837cbbc56448dcdfdae04ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Nov 2025 07:47:25 +0100 Subject: [PATCH 30/35] btrfs: use vfs_utimes to update file timestamps Btrfs updates the device node timestamps for block device special files when it stop using the device. Commit 8f96a5bfa150 ("btrfs: update the bdev time directly when closing") switch that update from the correct layering to directly call the low-level helper on the bdev inode. This is wrong and got fixed in commit 54fde91f52f5 ("btrfs: update device path inode time instead of bd_inode") by updating the file system inode instead of the bdev inode, but this kept the incorrect bypassing of the VFS interfaces and file system ->update_times method. Fix this by using the propet vfs_utimes interface. Fixes: 8f96a5bfa150 ("btrfs: update the bdev time directly when closing") Fixes: 54fde91f52f5 ("btrfs: update device path inode time instead of bd_inode") Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251120064859.2911749-5-hch@lst.de Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/btrfs/volumes.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2bec544d8ba3..259e8b0496df 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2002,14 +2002,11 @@ out: static void update_dev_time(const char *device_path) { struct path path; - int ret; - ret = kern_path(device_path, LOOKUP_FOLLOW, &path); - if (ret) - return; - - inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); - path_put(&path); + if (!kern_path(device_path, LOOKUP_FOLLOW, &path)) { + vfs_utimes(&path, NULL); + path_put(&path); + } } static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, From f981264ae75e1fee75b7e3f87f4942142e590d01 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Nov 2025 07:47:26 +0100 Subject: [PATCH 31/35] btrfs: fix the comment on btrfs_update_time Since commit e41f941a2311 ("Btrfs: move over to use ->update_time") this is not a copy of the high-level file_update_time helper. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251120064859.2911749-6-hch@lst.de Signed-off-by: Christian Brauner --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 36c451a9a0bf..e74cf1712778 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6291,8 +6291,8 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) } /* - * This is a copy of file_update_time. We need this so we can return error on - * ENOSPC for updating the inode in the case of file write and mmap writes. + * We need our own ->update_time so that we can return error on ENOSPC for + * updating the inode in the case of file write and mmap writes. */ static int btrfs_update_time(struct inode *inode, int flags) { From eff094a58d00acf1c84f729c3715fc4cf7fddcee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Nov 2025 07:47:27 +0100 Subject: [PATCH 32/35] orangefs: use inode_update_timestamps directly Orangefs has no i_version handling and __orangefs_setattr already explicitly marks the inode dirty. So instead of the using the flags return value from generic_update_time, just call the lower level inode_update_timestamps helper directly. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251120064859.2911749-7-hch@lst.de Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/orangefs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index a01400cd41fd..55f6c8026812 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -878,7 +878,9 @@ int orangefs_update_time(struct inode *inode, int flags) gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", get_khandle_from_ino(inode)); - flags = generic_update_time(inode, flags); + + flags = inode_update_timestamps(inode, flags); + memset(&iattr, 0, sizeof iattr); if (flags & S_ATIME) iattr.ia_valid |= ATTR_ATIME; From 9d2a6211a7b972563d20edebccaae42994c429fb Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Nov 2025 01:38:02 +0100 Subject: [PATCH 33/35] fs: tidy up step_into() & friends before inlining Symlink handling is already marked as unlikely and pushing out some of it into pick_link() reduces register spillage on entry to step_into() with gcc 14.2. The compiler needed additional convincing that handle_mounts() is unlikely to fail. At the same time neither clang nor gcc could be convinced to tail-call into pick_link(). While pick_link() takes an address of stack-based object as an argument (which definitely prevents the optimization), splitting it into separate tuple did not help. The issue persists even when compiled without stack protector. As such nothing was done about this for the time being to not grow the diff. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251120003803.2979978-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/namei.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index efa592a98155..5fee8afa510e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1672,13 +1672,15 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; + if (likely(!d_managed(dentry))) + return 0; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; nd->next_seq = seq; - if (!try_to_unlazy_next(nd, dentry)) + if (unlikely(!try_to_unlazy_next(nd, dentry))) return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); @@ -1941,13 +1943,23 @@ static int reserve_stack(struct nameidata *nd, struct path *link) enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; -static const char *pick_link(struct nameidata *nd, struct path *link, +static noinline const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, int flags) { struct saved *last; const char *res; - int error = reserve_stack(nd, link); + int error; + if (nd->flags & LOOKUP_RCU) { + /* make sure that d_is_symlink from step_into() matches the inode */ + if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + } else { + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + } + + error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); @@ -2026,9 +2038,10 @@ static const char *step_into(struct nameidata *nd, int flags, { struct path path; struct inode *inode; - int err = handle_mounts(nd, dentry, &path); + int err; - if (err < 0) + err = handle_mounts(nd, dentry, &path); + if (unlikely(err < 0)) return ERR_PTR(err); inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || @@ -2050,14 +2063,6 @@ static const char *step_into(struct nameidata *nd, int flags, nd->seq = nd->next_seq; return NULL; } - if (nd->flags & LOOKUP_RCU) { - /* make sure that d_is_symlink above matches inode */ - if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) - return ERR_PTR(-ECHILD); - } else { - if (path.mnt == nd->path.mnt) - mntget(path.mnt); - } return pick_link(nd, &path, inode, flags); } From 177fdbae39ecccb441d45e5e5ab146ea35b03d49 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Nov 2025 01:38:03 +0100 Subject: [PATCH 34/35] fs: inline step_into() and walk_component() The primary consumer is link_path_walk(), calling walk_component() every time which in turn calls step_into(). Inlining these saves overhead of 2 function calls per path component, along with allowing the compiler to do better job optimizing them in place. step_into() had absolutely atrocious assembly to facilitate the slowpath. In order to lessen the burden at the callsite all the hard work is moved into step_into_slowpath() and instead an inline-able fastpath is implemented for rcu-walk. The new fastpath is a stripped down step_into() RCU handling with a d_managed() check from handle_mounts(). Benchmarked as follows on Sapphire Rapids: 1. the "before" was a kernel with not-yet-merged optimizations (notably elision of calls to security_inode_permission() and marking ext4 inodes as not having acls as applicable) 2. "after" is the same + the prep patch + this patch 3. benchmark consists of issuing 205 calls to access(2) in a loop with pathnames lifted out of gcc and the linker building real code, most of which have several path components and 118 of which fail with -ENOENT. Result in terms of ops/s: before: 21619 after: 22536 (+4%) profile before: 20.25% [kernel] [k] __d_lookup_rcu 10.54% [kernel] [k] link_path_walk 10.22% [kernel] [k] entry_SYSCALL_64 6.50% libc.so.6 [.] __GI___access 6.35% [kernel] [k] strncpy_from_user 4.87% [kernel] [k] step_into 3.68% [kernel] [k] kmem_cache_alloc_noprof 2.88% [kernel] [k] walk_component 2.86% [kernel] [k] kmem_cache_free 2.14% [kernel] [k] set_root 2.08% [kernel] [k] lookup_fast after: 23.38% [kernel] [k] __d_lookup_rcu 11.27% [kernel] [k] entry_SYSCALL_64 10.89% [kernel] [k] link_path_walk 7.00% libc.so.6 [.] __GI___access 6.88% [kernel] [k] strncpy_from_user 3.50% [kernel] [k] kmem_cache_alloc_noprof 2.01% [kernel] [k] kmem_cache_free 2.00% [kernel] [k] set_root 1.99% [kernel] [k] lookup_fast 1.81% [kernel] [k] do_syscall_64 1.69% [kernel] [k] entry_SYSCALL_64_safe_stack While walk_component() and step_into() of course disappear from the profile, the link_path_walk() barely gets more overhead despite the inlining thanks to the fast path added and while completing more walks per second. I did not investigate why overhead grew a lot on __d_lookup_rcu(). Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251120003803.2979978-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/namei.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 5fee8afa510e..8281dfe5047f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1951,7 +1951,7 @@ static noinline const char *pick_link(struct nameidata *nd, struct path *link, int error; if (nd->flags & LOOKUP_RCU) { - /* make sure that d_is_symlink from step_into() matches the inode */ + /* make sure that d_is_symlink from step_into_slowpath() matches the inode */ if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { @@ -2033,7 +2033,7 @@ all_done: // pure jump * * NOTE: dentry must be what nd->next_seq had been sampled from. */ -static const char *step_into(struct nameidata *nd, int flags, +static noinline const char *step_into_slowpath(struct nameidata *nd, int flags, struct dentry *dentry) { struct path path; @@ -2066,6 +2066,31 @@ static const char *step_into(struct nameidata *nd, int flags, return pick_link(nd, &path, inode, flags); } +static __always_inline const char *step_into(struct nameidata *nd, int flags, + struct dentry *dentry) +{ + /* + * In the common case we are in rcu-walk and traversing over a non-mounted on + * directory (as opposed to e.g., a symlink). + * + * We can handle that and negative entries with the checks below. + */ + if (likely((nd->flags & LOOKUP_RCU) && + !d_managed(dentry) && !d_is_symlink(dentry))) { + struct inode *inode = dentry->d_inode; + if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) + return ERR_PTR(-ECHILD); + if (unlikely(!inode)) + return ERR_PTR(-ENOENT); + nd->path.dentry = dentry; + /* nd->path.mnt is retained on purpose */ + nd->inode = inode; + nd->seq = nd->next_seq; + return NULL; + } + return step_into_slowpath(nd, flags, dentry); +} + static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; @@ -2176,7 +2201,7 @@ static const char *handle_dots(struct nameidata *nd, int type) return NULL; } -static const char *walk_component(struct nameidata *nd, int flags) +static __always_inline const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; /* From ebf8538979101ef879742dcfaf04b684f5461e12 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 26 Nov 2025 16:15:48 -0500 Subject: [PATCH 35/35] MAINTAINERS: add German Maglione as virtiofs co-maintainer German Maglione is a co-maintainer of the virtiofsd userspace device implementation (https://gitlab.com/virtio-fs/virtiofsd) and is currently one of the most active virtiofs developers outside the kernel. I have not worked on virtiofs except to review kernel patches for a few years now and would like German to take over from me gradually. It is healthier to have a kernel maintainer who is actively involved. I expect to remove myself in a few months. Signed-off-by: Stefan Hajnoczi Link: https://patch.msgid.link/20251126211548.598469-1-stefanha@redhat.com Signed-off-by: Christian Brauner --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3da2c26a796b..6172cc42a4ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27156,6 +27156,7 @@ F: arch/s390/include/uapi/asm/virtio-ccw.h F: drivers/s390/virtio/ VIRTIO FILE SYSTEM +M: German Maglione M: Vivek Goyal M: Stefan Hajnoczi M: Miklos Szeredi