linux/fs/ext4/file.c
Greg Kroah-Hartman a1ac3f3093 Merge 5.10.36 into android12-5.10
Changes in 5.10.36
	bus: mhi: core: Fix check for syserr at power_up
	bus: mhi: core: Clear configuration from channel context during reset
	bus: mhi: core: Sanity check values from remote device before use
	nitro_enclaves: Fix stale file descriptors on failed usercopy
	dyndbg: fix parsing file query without a line-range suffix
	s390/disassembler: increase ebpf disasm buffer size
	s390/zcrypt: fix zcard and zqueue hot-unplug memleak
	vhost-vdpa: fix vm_flags for virtqueue doorbell mapping
	tpm: acpi: Check eventlog signature before using it
	ACPI: custom_method: fix potential use-after-free issue
	ACPI: custom_method: fix a possible memory leak
	ftrace: Handle commands when closing set_ftrace_filter file
	ARM: 9056/1: decompressor: fix BSS size calculation for LLVM ld.lld
	arm64: dts: marvell: armada-37xx: add syscon compatible to NB clk node
	arm64: dts: mt8173: fix property typo of 'phys' in dsi node
	ecryptfs: fix kernel panic with null dev_name
	fs/epoll: restore waking from ep_done_scan()
	mtd: spi-nor: core: Fix an issue of releasing resources during read/write
	Revert "mtd: spi-nor: macronix: Add support for mx25l51245g"
	mtd: spinand: core: add missing MODULE_DEVICE_TABLE()
	mtd: rawnand: atmel: Update ecc_stats.corrected counter
	mtd: physmap: physmap-bt1-rom: Fix unintentional stack access
	erofs: add unsupported inode i_format check
	spi: stm32-qspi: fix pm_runtime usage_count counter
	spi: spi-ti-qspi: Free DMA resources
	scsi: qla2xxx: Fix crash in qla2xxx_mqueuecommand()
	scsi: mpt3sas: Block PCI config access from userspace during reset
	mmc: uniphier-sd: Fix an error handling path in uniphier_sd_probe()
	mmc: uniphier-sd: Fix a resource leak in the remove function
	mmc: sdhci: Check for reset prior to DMA address unmap
	mmc: sdhci-pci: Fix initialization of some SD cards for Intel BYT-based controllers
	mmc: sdhci-tegra: Add required callbacks to set/clear CQE_EN bit
	mmc: block: Update ext_csd.cache_ctrl if it was written
	mmc: block: Issue a cache flush only when it's enabled
	mmc: core: Do a power cycle when the CMD11 fails
	mmc: core: Set read only for SD cards with permanent write protect bit
	mmc: core: Fix hanging on I/O during system suspend for removable cards
	irqchip/gic-v3: Do not enable irqs when handling spurious interrups
	cifs: Return correct error code from smb2_get_enc_key
	cifs: fix out-of-bound memory access when calling smb3_notify() at mount point
	cifs: detect dead connections only when echoes are enabled.
	smb2: fix use-after-free in smb2_ioctl_query_info()
	btrfs: handle remount to no compress during compression
	x86/build: Disable HIGHMEM64G selection for M486SX
	btrfs: fix metadata extent leak after failure to create subvolume
	intel_th: pci: Add Rocket Lake CPU support
	btrfs: fix race between transaction aborts and fsyncs leading to use-after-free
	posix-timers: Preserve return value in clock_adjtime32()
	fbdev: zero-fill colormap in fbcmap.c
	cpuidle: tegra: Fix C7 idling state on Tegra114
	bus: ti-sysc: Probe for l4_wkup and l4_cfg interconnect devices first
	staging: wimax/i2400m: fix byte-order issue
	spi: ath79: always call chipselect function
	spi: ath79: remove spi-master setup and cleanup assignment
	bus: mhi: core: Destroy SBL devices when moving to mission mode
	crypto: api - check for ERR pointers in crypto_destroy_tfm()
	crypto: qat - fix unmap invalid dma address
	usb: gadget: uvc: add bInterval checking for HS mode
	usb: webcam: Invalid size of Processing Unit Descriptor
	x86/sev: Do not require Hypervisor CPUID bit for SEV guests
	crypto: hisilicon/sec - fixes a printing error
	genirq/matrix: Prevent allocation counter corruption
	usb: gadget: f_uac2: validate input parameters
	usb: gadget: f_uac1: validate input parameters
	usb: dwc3: gadget: Ignore EP queue requests during bus reset
	usb: xhci: Fix port minor revision
	kselftest/arm64: mte: Fix compilation with native compiler
	ARM: tegra: acer-a500: Rename avdd to vdda of touchscreen node
	PCI: PM: Do not read power state in pci_enable_device_flags()
	kselftest/arm64: mte: Fix MTE feature detection
	ARM: dts: BCM5301X: fix "reg" formatting in /memory node
	ARM: dts: ux500: Fix up TVK R3 sensors
	x86/build: Propagate $(CLANG_FLAGS) to $(REALMODE_FLAGS)
	x86/boot: Add $(CLANG_FLAGS) to compressed KBUILD_CFLAGS
	efi/libstub: Add $(CLANG_FLAGS) to x86 flags
	soc/tegra: pmc: Fix completion of power-gate toggling
	arm64: dts: imx8mq-librem5-r3: Mark buck3 as always on
	tee: optee: do not check memref size on return from Secure World
	soundwire: cadence: only prepare attached devices on clock stop
	perf/arm_pmu_platform: Use dev_err_probe() for IRQ errors
	perf/arm_pmu_platform: Fix error handling
	random: initialize ChaCha20 constants with correct endianness
	usb: xhci-mtk: support quirk to disable usb2 lpm
	fpga: dfl: pci: add DID for D5005 PAC cards
	xhci: check port array allocation was successful before dereferencing it
	xhci: check control context is valid before dereferencing it.
	xhci: fix potential array out of bounds with several interrupters
	bus: mhi: core: Clear context for stopped channels from remove()
	ARM: dts: at91: change the key code of the gpio key
	tools/power/x86/intel-speed-select: Increase string size
	platform/x86: ISST: Account for increased timeout in some cases
	spi: dln2: Fix reference leak to master
	spi: omap-100k: Fix reference leak to master
	spi: qup: fix PM reference leak in spi_qup_remove()
	usb: gadget: tegra-xudc: Fix possible use-after-free in tegra_xudc_remove()
	usb: musb: fix PM reference leak in musb_irq_work()
	usb: core: hub: Fix PM reference leak in usb_port_resume()
	usb: dwc3: gadget: Check for disabled LPM quirk
	tty: n_gsm: check error while registering tty devices
	intel_th: Consistency and off-by-one fix
	phy: phy-twl4030-usb: Fix possible use-after-free in twl4030_usb_remove()
	crypto: sun8i-ss - Fix PM reference leak when pm_runtime_get_sync() fails
	crypto: sun8i-ce - Fix PM reference leak in sun8i_ce_probe()
	crypto: stm32/hash - Fix PM reference leak on stm32-hash.c
	crypto: stm32/cryp - Fix PM reference leak on stm32-cryp.c
	crypto: sa2ul - Fix PM reference leak in sa_ul_probe()
	crypto: omap-aes - Fix PM reference leak on omap-aes.c
	platform/x86: intel_pmc_core: Don't use global pmcdev in quirks
	spi: sync up initial chipselect state
	btrfs: do proper error handling in create_reloc_root
	btrfs: do proper error handling in btrfs_update_reloc_root
	btrfs: convert logic BUG_ON()'s in replace_path to ASSERT()'s
	drm: Added orientation quirk for OneGX1 Pro
	drm/qxl: do not run release if qxl failed to init
	drm/qxl: release shadow on shutdown
	drm/ast: Fix invalid usage of AST_MAX_HWC_WIDTH in cursor atomic_check
	drm/amd/display: changing sr exit latency
	drm/ast: fix memory leak when unload the driver
	drm/amd/display: Check for DSC support instead of ASIC revision
	drm/amd/display: Don't optimize bandwidth before disabling planes
	drm/amdgpu/display: buffer INTERRUPT_LOW_IRQ_CONTEXT interrupt work
	drm/amd/display/dc/dce/dce_aux: Remove duplicate line causing 'field overwritten' issue
	scsi: lpfc: Fix incorrect dbde assignment when building target abts wqe
	scsi: lpfc: Fix pt2pt connection does not recover after LOGO
	drm/amdgpu: Fix some unload driver issues
	sched/pelt: Fix task util_est update filtering
	kvfree_rcu: Use same set of GFP flags as does single-argument
	scsi: target: pscsi: Fix warning in pscsi_complete_cmd()
	media: ite-cir: check for receive overflow
	media: drivers: media: pci: sta2x11: fix Kconfig dependency on GPIOLIB
	media: imx: capture: Return -EPIPE from __capture_legacy_try_fmt()
	atomisp: don't let it go past pipes array
	power: supply: bq27xxx: fix power_avg for newer ICs
	extcon: arizona: Fix some issues when HPDET IRQ fires after the jack has been unplugged
	extcon: arizona: Fix various races on driver unbind
	media: media/saa7164: fix saa7164_encoder_register() memory leak bugs
	media: gspca/sq905.c: fix uninitialized variable
	power: supply: Use IRQF_ONESHOT
	backlight: qcom-wled: Use sink_addr for sync toggle
	backlight: qcom-wled: Fix FSC update issue for WLED5
	drm/amdgpu: mask the xgmi number of hops reported from psp to kfd
	drm/amdkfd: Fix UBSAN shift-out-of-bounds warning
	drm/amdgpu : Fix asic reset regression issue introduce by 8f211fe8ac
	drm/amd/pm: fix workload mismatch on vega10
	drm/amd/display: Fix UBSAN warning for not a valid value for type '_Bool'
	drm/amd/display: DCHUB underflow counter increasing in some scenarios
	drm/amd/display: fix dml prefetch validation
	scsi: qla2xxx: Always check the return value of qla24xx_get_isp_stats()
	drm/vkms: fix misuse of WARN_ON
	scsi: qla2xxx: Fix use after free in bsg
	mmc: sdhci-esdhc-imx: validate pinctrl before use it
	mmc: sdhci-pci: Add PCI IDs for Intel LKF
	mmc: sdhci-brcmstb: Remove CQE quirk
	ata: ahci: Disable SXS for Hisilicon Kunpeng920
	drm/komeda: Fix bit check to import to value of proper type
	nvmet: return proper error code from discovery ctrl
	selftests/resctrl: Enable gcc checks to detect buffer overflows
	selftests/resctrl: Fix compilation issues for global variables
	selftests/resctrl: Fix compilation issues for other global variables
	selftests/resctrl: Clean up resctrl features check
	selftests/resctrl: Fix missing options "-n" and "-p"
	selftests/resctrl: Use resctrl/info for feature detection
	selftests/resctrl: Fix incorrect parsing of iMC counters
	selftests/resctrl: Fix checking for < 0 for unsigned values
	power: supply: cpcap-charger: Add usleep to cpcap charger to avoid usb plug bounce
	scsi: smartpqi: Use host-wide tag space
	scsi: smartpqi: Correct request leakage during reset operations
	scsi: smartpqi: Add new PCI IDs
	scsi: scsi_dh_alua: Remove check for ASC 24h in alua_rtpg()
	media: em28xx: fix memory leak
	media: vivid: update EDID
	drm/msm/dp: Fix incorrect NULL check kbot warnings in DP driver
	clk: socfpga: arria10: Fix memory leak of socfpga_clk on error return
	power: supply: generic-adc-battery: fix possible use-after-free in gab_remove()
	power: supply: s3c_adc_battery: fix possible use-after-free in s3c_adc_bat_remove()
	media: tc358743: fix possible use-after-free in tc358743_remove()
	media: adv7604: fix possible use-after-free in adv76xx_remove()
	media: i2c: adv7511-v4l2: fix possible use-after-free in adv7511_remove()
	media: i2c: tda1997: Fix possible use-after-free in tda1997x_remove()
	media: i2c: adv7842: fix possible use-after-free in adv7842_remove()
	media: platform: sti: Fix runtime PM imbalance in regs_show
	media: sun8i-di: Fix runtime PM imbalance in deinterlace_start_streaming
	media: dvb-usb: fix memory leak in dvb_usb_adapter_init
	media: gscpa/stv06xx: fix memory leak
	sched/fair: Ignore percpu threads for imbalance pulls
	drm/msm/mdp5: Configure PP_SYNC_HEIGHT to double the vtotal
	drm/msm/mdp5: Do not multiply vclk line count by 100
	drm/amdgpu/ttm: Fix memory leak userptr pages
	drm/radeon/ttm: Fix memory leak userptr pages
	drm/amd/display: Fix debugfs link_settings entry
	drm/amd/display: Fix UBSAN: shift-out-of-bounds warning
	drm/amdkfd: Fix cat debugfs hang_hws file causes system crash bug
	amdgpu: avoid incorrect %hu format string
	drm/amd/display: Try YCbCr420 color when YCbCr444 fails
	drm/amdgpu: fix NULL pointer dereference
	scsi: lpfc: Fix crash when a REG_RPI mailbox fails triggering a LOGO response
	scsi: lpfc: Fix error handling for mailboxes completed in MBX_POLL mode
	scsi: lpfc: Remove unsupported mbox PORT_CAPABILITIES logic
	mfd: intel-m10-bmc: Fix the register access range
	mfd: da9063: Support SMBus and I2C mode
	mfd: arizona: Fix rumtime PM imbalance on error
	scsi: libfc: Fix a format specifier
	perf: Rework perf_event_exit_event()
	sched,fair: Alternative sched_slice()
	block/rnbd-clt: Fix missing a memory free when unloading the module
	s390/archrandom: add parameter check for s390_arch_random_generate
	sched,psi: Handle potential task count underflow bugs more gracefully
	power: supply: cpcap-battery: fix invalid usage of list cursor
	ALSA: emu8000: Fix a use after free in snd_emu8000_create_mixer
	ALSA: hda/conexant: Re-order CX5066 quirk table entries
	ALSA: sb: Fix two use after free in snd_sb_qsound_build
	ALSA: usb-audio: Explicitly set up the clock selector
	ALSA: usb-audio: Add dB range mapping for Sennheiser Communications Headset PC 8
	ALSA: hda/realtek: fix mute/micmute LEDs for HP ProBook 445 G7
	ALSA: hda/realtek: GA503 use same quirks as GA401
	ALSA: hda/realtek: fix mic boost on Intel NUC 8
	ALSA: hda/realtek - Headset Mic issue on HP platform
	ALSA: hda/realtek: fix static noise on ALC285 Lenovo laptops
	ALSA: hda/realtek: Add quirk for Intel Clevo PCx0Dx
	tools/power/turbostat: Fix turbostat for AMD Zen CPUs
	btrfs: fix race when picking most recent mod log operation for an old root
	arm64/vdso: Discard .note.gnu.property sections in vDSO
	Makefile: Move -Wno-unused-but-set-variable out of GCC only block
	fs: fix reporting supported extra file attributes for statx()
	virtiofs: fix memory leak in virtio_fs_probe()
	kcsan, debugfs: Move debugfs file creation out of early init
	ubifs: Only check replay with inode type to judge if inode linked
	f2fs: fix error handling in f2fs_end_enable_verity()
	f2fs: fix to avoid out-of-bounds memory access
	mlxsw: spectrum_mr: Update egress RIF list before route's action
	openvswitch: fix stack OOB read while fragmenting IPv4 packets
	ACPI: GTDT: Don't corrupt interrupt mappings on watchdow probe failure
	NFS: fs_context: validate UDP retrans to prevent shift out-of-bounds
	NFS: Don't discard pNFS layout segments that are marked for return
	NFSv4: Don't discard segments marked for return in _pnfs_return_layout()
	Input: ili210x - add missing negation for touch indication on ili210x
	jffs2: Fix kasan slab-out-of-bounds problem
	jffs2: Hook up splice_write callback
	powerpc/powernv: Enable HAIL (HV AIL) for ISA v3.1 processors
	powerpc/eeh: Fix EEH handling for hugepages in ioremap space.
	powerpc/kexec_file: Use current CPU info while setting up FDT
	powerpc/32: Fix boot failure with CONFIG_STACKPROTECTOR
	powerpc: fix EDEADLOCK redefinition error in uapi/asm/errno.h
	intel_th: pci: Add Alder Lake-M support
	tpm: efi: Use local variable for calculating final log size
	tpm: vtpm_proxy: Avoid reading host log when using a virtual device
	crypto: arm/curve25519 - Move '.fpu' after '.arch'
	crypto: rng - fix crypto_rng_reset() refcounting when !CRYPTO_STATS
	md/raid1: properly indicate failure when ending a failed write request
	dm raid: fix inconclusive reshape layout on fast raid4/5/6 table reload sequences
	fuse: fix write deadlock
	exfat: fix erroneous discard when clear cluster bit
	sfc: farch: fix TX queue lookup in TX flush done handling
	sfc: farch: fix TX queue lookup in TX event handling
	security: commoncap: fix -Wstringop-overread warning
	Fix misc new gcc warnings
	jffs2: check the validity of dstlen in jffs2_zlib_compress()
	smb3: when mounting with multichannel include it in requested capabilities
	smb3: do not attempt multichannel to server which does not support it
	Revert 337f13046f ("futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op")
	futex: Do not apply time namespace adjustment on FUTEX_LOCK_PI
	x86/cpu: Initialize MSR_TSC_AUX if RDTSCP *or* RDPID is supported
	kbuild: update config_data.gz only when the content of .config is changed
	ext4: annotate data race in start_this_handle()
	ext4: annotate data race in jbd2_journal_dirty_metadata()
	ext4: fix check to prevent false positive report of incorrect used inodes
	ext4: do not set SB_ACTIVE in ext4_orphan_cleanup()
	ext4: fix error code in ext4_commit_super
	ext4: fix ext4_error_err save negative errno into superblock
	ext4: fix error return code in ext4_fc_perform_commit()
	ext4: allow the dax flag to be set and cleared on inline directories
	ext4: Fix occasional generic/418 failure
	media: dvbdev: Fix memory leak in dvb_media_device_free()
	media: dvb-usb: Fix use-after-free access
	media: dvb-usb: Fix memory leak at error in dvb_usb_device_init()
	media: staging/intel-ipu3: Fix memory leak in imu_fmt
	media: staging/intel-ipu3: Fix set_fmt error handling
	media: staging/intel-ipu3: Fix race condition during set_fmt
	media: v4l2-ctrls: fix reference to freed memory
	media: venus: hfi_parser: Don't initialize parser on v1
	usb: gadget: dummy_hcd: fix gpf in gadget_setup
	usb: gadget: Fix double free of device descriptor pointers
	usb: gadget/function/f_fs string table fix for multiple languages
	usb: dwc3: gadget: Remove FS bInterval_m1 limitation
	usb: dwc3: gadget: Fix START_TRANSFER link state check
	usb: dwc3: core: Do core softreset when switch mode
	usb: dwc2: Fix session request interrupt handler
	tty: fix memory leak in vc_deallocate
	rsi: Use resume_noirq for SDIO
	tools/power turbostat: Fix offset overflow issue in index converting
	tracing: Map all PIDs to command lines
	tracing: Restructure trace_clock_global() to never block
	dm persistent data: packed struct should have an aligned() attribute too
	dm space map common: fix division bug in sm_ll_find_free_block()
	dm integrity: fix missing goto in bitmap_flush_interval error handling
	dm rq: fix double free of blk_mq_tag_set in dev remove after table load fails
	lib/vsprintf.c: remove leftover 'f' and 'F' cases from bstr_printf()
	thermal/drivers/cpufreq_cooling: Fix slab OOB issue
	thermal/core/fair share: Lock the thermal zone while looping over instances
	Linux 5.10.36

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I7b8075de5edd8de69205205cddb9a3273d7d0810
2021-05-13 14:22:11 +02:00

944 lines
24 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* linux/fs/ext4/file.c
*
* Copyright (C) 1992, 1993, 1994, 1995
* Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie (Paris VI)
*
* from
*
* linux/fs/minix/file.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* ext4 fs regular file handling primitives
*
* 64-bit file support on 64-bit platforms by Jakub Jelinek
* (jj@sunsite.ms.mff.cuni.cz)
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"
static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
{
struct inode *inode = file_inode(iocb->ki_filp);
if (!fscrypt_dio_supported(iocb, iter))
return false;
if (fsverity_active(inode))
return false;
if (ext4_should_journal_data(inode))
return false;
if (ext4_has_inline_data(inode))
return false;
return true;
}
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t ret;
struct inode *inode = file_inode(iocb->ki_filp);
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock_shared(inode))
return -EAGAIN;
} else {
inode_lock_shared(inode);
}
if (!ext4_dio_supported(iocb, to)) {
inode_unlock_shared(inode);
/*
* Fallback to buffered I/O if the operation being performed on
* the inode is not supported by direct I/O. The IOCB_DIRECT
* flag needs to be cleared here in order to ensure that the
* direct I/O path within generic_file_read_iter() is not
* taken.
*/
iocb->ki_flags &= ~IOCB_DIRECT;
return generic_file_read_iter(iocb, to);
}
ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
is_sync_kiocb(iocb));
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
return ret;
}
#ifdef CONFIG_FS_DAX
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock_shared(inode))
return -EAGAIN;
} else {
inode_lock_shared(inode);
}
/*
* Recheck under inode lock - at this point we are sure it cannot
* change anymore
*/
if (!IS_DAX(inode)) {
inode_unlock_shared(inode);
/* Fallback to buffered IO in case we cannot support DAX */
return generic_file_read_iter(iocb, to);
}
ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
return ret;
}
#endif
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
if (!iov_iter_count(to))
return 0; /* skip atime */
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
return ext4_dax_read_iter(iocb, to);
#endif
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_read_iter(iocb, to);
return generic_file_read_iter(iocb, to);
}
/*
* Called when an inode is released. Note that this is different
* from ext4_file_open: open gets called at every open, but release
* gets called only when /all/ the files are closed.
*/
static int ext4_release_file(struct inode *inode, struct file *filp)
{
if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
ext4_alloc_da_blocks(inode);
ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
}
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1) &&
!EXT4_I(inode)->i_reserved_data_blocks) {
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
ext4_htree_free_dir_info(filp->private_data);
return 0;
}
/*
* This tests whether the IO in question is block-aligned or not.
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
* are converted to written only after the IO is complete. Until they are
* mapped, these blocks appear as holes, so dio_zero_block() will assume that
* it needs to zero out portions of the start and/or end block. If 2 AIO
* threads are at work on the same unwritten block, they must be synchronized
* or one thread will zero the other's data, causing corruption.
*/
static bool
ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
struct super_block *sb = inode->i_sb;
unsigned long blockmask = sb->s_blocksize - 1;
if ((pos | iov_iter_alignment(from)) & blockmask)
return true;
return false;
}
static bool
ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
{
if (offset + len > i_size_read(inode) ||
offset + len > EXT4_I(inode)->i_disksize)
return true;
return false;
}
/* Is IO overwriting allocated and initialized blocks? */
static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
{
struct ext4_map_blocks map;
unsigned int blkbits = inode->i_blkbits;
int err, blklen;
if (pos + len > i_size_read(inode))
return false;
map.m_lblk = pos >> blkbits;
map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
blklen = map.m_len;
err = ext4_map_blocks(NULL, inode, &map, 0);
/*
* 'err==len' means that all of the blocks have been preallocated,
* regardless of whether they have been initialized or not. To exclude
* unwritten extents, we need to check m_flags.
*/
return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
}
static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
ret = generic_write_checks(iocb, from);
if (ret <= 0)
return ret;
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
*/
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
return -EFBIG;
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
}
return iov_iter_count(from);
}
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t ret, count;
count = ext4_generic_write_checks(iocb, from);
if (count <= 0)
return count;
ret = file_modified(iocb->ki_filp);
if (ret)
return ret;
return count;
}
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
ssize_t ret;
struct inode *inode = file_inode(iocb->ki_filp);
if (iocb->ki_flags & IOCB_NOWAIT)
return -EOPNOTSUPP;
ext4_fc_start_update(inode);
inode_lock(inode);
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
current->backing_dev_info = inode_to_bdi(inode);
ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
current->backing_dev_info = NULL;
out:
inode_unlock(inode);
ext4_fc_stop_update(inode);
if (likely(ret > 0)) {
iocb->ki_pos += ret;
ret = generic_write_sync(iocb, ret);
}
return ret;
}
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
ssize_t written, size_t count)
{
handle_t *handle;
bool truncate = false;
u8 blkbits = inode->i_blkbits;
ext4_lblk_t written_blk, end_blk;
int ret;
/*
* Note that EXT4_I(inode)->i_disksize can get extended up to
* inode->i_size while the I/O was running due to writeback of delalloc
* blocks. But, the code in ext4_iomap_alloc() is careful to use
* zeroed/unwritten extents if this is possible; thus we won't leave
* uninitialized blocks in a file even if we didn't succeed in writing
* as much as we intended.
*/
WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
if (offset + count <= EXT4_I(inode)->i_disksize) {
/*
* We need to ensure that the inode is removed from the orphan
* list if it has been added prematurely, due to writeback of
* delalloc blocks.
*/
if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
ext4_orphan_del(NULL, inode);
return PTR_ERR(handle);
}
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
}
return written;
}
if (written < 0)
goto truncate;
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
written = PTR_ERR(handle);
goto truncate;
}
if (ext4_update_inode_size(inode, offset + written)) {
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret)) {
written = ret;
ext4_journal_stop(handle);
goto truncate;
}
}
/*
* We may need to truncate allocated but not written blocks beyond EOF.
*/
written_blk = ALIGN(offset + written, 1 << blkbits);
end_blk = ALIGN(offset + count, 1 << blkbits);
if (written_blk < end_blk && ext4_can_truncate(inode))
truncate = true;
/*
* Remove the inode from the orphan list if it has been extended and
* everything went OK.
*/
if (!truncate && inode->i_nlink)
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
if (truncate) {
truncate:
ext4_truncate_failed_write(inode);
/*
* If the truncate operation failed early, then the inode may
* still be on the orphan list. In that case, we need to try
* remove the inode from the in-memory linked list.
*/
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
}
return written;
}
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
int error, unsigned int flags)
{
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
if (error)
return error;
if (size && flags & IOMAP_DIO_UNWRITTEN) {
error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
if (error < 0)
return error;
}
/*
* If we are extending the file, we have to update i_size here before
* page cache gets invalidated in iomap_dio_rw(). Otherwise racing
* buffered reads could zero out too much from page cache pages. Update
* of on-disk size will happen later in ext4_dio_write_iter() where
* we have enough information to also perform orphan list handling etc.
* Note that we perform all extending writes synchronously under
* i_rwsem held exclusively so i_size update is safe here in that case.
* If the write was not extending, we cannot see pos > i_size here
* because operations reducing i_size like truncate wait for all
* outstanding DIO before updating i_size.
*/
pos += size;
if (pos > i_size_read(inode))
i_size_write(inode, pos);
return 0;
}
static const struct iomap_dio_ops ext4_dio_write_ops = {
.end_io = ext4_dio_write_end_io,
};
/*
* The intention here is to start with shared lock acquired then see if any
* condition requires an exclusive inode lock. If yes, then we restart the
* whole operation by releasing the shared lock and acquiring exclusive lock.
*
* - For unaligned_io we never take shared lock as it may cause data corruption
* when two unaligned IO tries to modify the same block e.g. while zeroing.
*
* - For extending writes case we don't take the shared lock, since it requires
* updating inode i_disksize and/or orphan handling with exclusive lock.
*
* - shared locking will only be true mostly with overwrites. Otherwise we will
* switch to exclusive i_rwsem lock.
*/
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
bool *ilock_shared, bool *extend)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
loff_t offset;
size_t count;
ssize_t ret;
restart:
ret = ext4_generic_write_checks(iocb, from);
if (ret <= 0)
goto out;
offset = iocb->ki_pos;
count = ret;
if (ext4_extending_io(inode, offset, count))
*extend = true;
/*
* Determine whether the IO operation will overwrite allocated
* and initialized blocks.
* We need exclusive i_rwsem for changing security info
* in file_modified().
*/
if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
!ext4_overwrite_io(inode, offset, count))) {
if (iocb->ki_flags & IOCB_NOWAIT) {
ret = -EAGAIN;
goto out;
}
inode_unlock_shared(inode);
*ilock_shared = false;
inode_lock(inode);
goto restart;
}
ret = file_modified(file);
if (ret < 0)
goto out;
return count;
out:
if (*ilock_shared)
inode_unlock_shared(inode);
else
inode_unlock(inode);
return ret;
}
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t ret;
handle_t *handle;
struct inode *inode = file_inode(iocb->ki_filp);
loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(from);
const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
bool extend = false, unaligned_io = false;
bool ilock_shared = true;
/*
* We initially start with shared inode lock unless it is
* unaligned IO which needs exclusive lock anyways.
*/
if (ext4_unaligned_io(inode, from, offset)) {
unaligned_io = true;
ilock_shared = false;
}
/*
* Quick check here without any i_rwsem lock to see if it is extending
* IO. A more reliable check is done in ext4_dio_write_checks() with
* proper locking in place.
*/
if (offset + count > i_size_read(inode))
ilock_shared = false;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (ilock_shared) {
if (!inode_trylock_shared(inode))
return -EAGAIN;
} else {
if (!inode_trylock(inode))
return -EAGAIN;
}
} else {
if (ilock_shared)
inode_lock_shared(inode);
else
inode_lock(inode);
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
if (!ext4_dio_supported(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
inode_unlock(inode);
return ext4_buffered_write_iter(iocb, from);
}
ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
if (ret <= 0)
return ret;
/* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
ret = -EAGAIN;
goto out;
}
offset = iocb->ki_pos;
count = ret;
/*
* Unaligned direct IO must be serialized among each other as zeroing
* of partial blocks of two competing unaligned IOs can result in data
* corruption.
*
* So we make sure we don't allow any unaligned IO in flight.
* For IOs where we need not wait (like unaligned non-AIO DIO),
* below inode_dio_wait() may anyway become a no-op, since we start
* with exclusive lock.
*/
if (unaligned_io)
inode_dio_wait(inode);
if (extend) {
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
}
ext4_fc_start_update(inode);
ret = ext4_orphan_add(handle, inode);
ext4_fc_stop_update(inode);
if (ret) {
ext4_journal_stop(handle);
goto out;
}
ext4_journal_stop(handle);
}
if (ilock_shared)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
is_sync_kiocb(iocb) || unaligned_io || extend);
if (ret == -ENOTBLK)
ret = 0;
if (extend)
ret = ext4_handle_inode_extension(inode, offset, ret, count);
out:
if (ilock_shared)
inode_unlock_shared(inode);
else
inode_unlock(inode);
if (ret >= 0 && iov_iter_count(from)) {
ssize_t err;
loff_t endbyte;
offset = iocb->ki_pos;
err = ext4_buffered_write_iter(iocb, from);
if (err < 0)
return err;
/*
* We need to ensure that the pages within the page cache for
* the range covered by this I/O are written to disk and
* invalidated. This is in attempt to preserve the expected
* direct I/O semantics in the case we fallback to buffered I/O
* to complete off the I/O request.
*/
ret += err;
endbyte = offset + err - 1;
err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
offset, endbyte);
if (!err)
invalidate_mapping_pages(iocb->ki_filp->f_mapping,
offset >> PAGE_SHIFT,
endbyte >> PAGE_SHIFT);
}
return ret;
}
#ifdef CONFIG_FS_DAX
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t ret;
size_t count;
loff_t offset;
handle_t *handle;
bool extend = false;
struct inode *inode = file_inode(iocb->ki_filp);
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode))
return -EAGAIN;
} else {
inode_lock(inode);
}
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
offset = iocb->ki_pos;
count = iov_iter_count(from);
if (offset + count > EXT4_I(inode)->i_disksize) {
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
}
ret = ext4_orphan_add(handle, inode);
if (ret) {
ext4_journal_stop(handle);
goto out;
}
extend = true;
ext4_journal_stop(handle);
}
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
if (extend)
ret = ext4_handle_inode_extension(inode, offset, ret, count);
out:
inode_unlock(inode);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
return ret;
}
#endif
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_write_iter(iocb, from);
else
return ext4_buffered_write_iter(iocb, from);
}
#ifdef CONFIG_FS_DAX
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size)
{
int error = 0;
vm_fault_t result;
int retries = 0;
handle_t *handle = NULL;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
/*
* We have to distinguish real writes from writes which will result in a
* COW page; COW writes should *not* poke the journal (the file will not
* be changed). Doing so would cause unintended failures when mounted
* read-only.
*
* We check for VM_SHARED rather than vmf->cow_page since the latter is
* unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
* other sizes, dax_iomap_fault will handle splitting / fallback so that
* we eventually come back with a COW page.
*/
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
pfn_t pfn;
if (write) {
sb_start_pagefault(sb);
file_update_time(vmf->vma->vm_file);
down_read(&EXT4_I(inode)->i_mmap_sem);
retry:
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
if (IS_ERR(handle)) {
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
return VM_FAULT_SIGBUS;
}
} else {
down_read(&EXT4_I(inode)->i_mmap_sem);
}
result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
if (write) {
ext4_journal_stop(handle);
if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
ext4_should_retry_alloc(sb, &retries))
goto retry;
/* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
result = dax_finish_sync_fault(vmf, pe_size, pfn);
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
} else {
up_read(&EXT4_I(inode)->i_mmap_sem);
}
return result;
}
static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.huge_fault = ext4_dax_huge_fault,
.page_mkwrite = ext4_dax_fault,
.pfn_mkwrite = ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
#endif
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = ext4_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
.allow_speculation = filemap_allow_speculation,
#endif
};
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_mapping->host;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct dax_device *dax_dev = sbi->s_daxdev;
if (unlikely(ext4_forced_shutdown(sbi)))
return -EIO;
/*
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
if (!daxdev_mapping_supported(vma, dax_dev))
return -EOPNOTSUPP;
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
vma->vm_flags |= VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
return 0;
}
static int ext4_sample_last_mounted(struct super_block *sb,
struct vfsmount *mnt)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct path path;
char buf[64], *cp;
handle_t *handle;
int err;
if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
return 0;
if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
return 0;
ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
/*
* Sample where the filesystem has been mounted and
* store it in the superblock for sysadmin convenience
* when trying to sort through large numbers of block
* devices or filesystem images.
*/
memset(buf, 0, sizeof(buf));
path.mnt = mnt;
path.dentry = mnt->mnt_root;
cp = d_path(&path, buf, sizeof(buf));
err = 0;
if (IS_ERR(cp))
goto out;
handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
err = PTR_ERR(handle);
if (IS_ERR(handle))
goto out;
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err)
goto out_journal;
strncpy(sbi->s_es->s_last_mounted, cp,
sizeof(sbi->s_es->s_last_mounted));
ext4_handle_dirty_super(handle, sb);
out_journal:
ext4_journal_stop(handle);
out:
sb_end_intwrite(sb);
return err;
}
static int ext4_file_open(struct inode *inode, struct file *filp)
{
int ret;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
if (ret)
return ret;
ret = fscrypt_file_open(inode, filp);
if (ret)
return ret;
ret = fsverity_file_open(inode, filp);
if (ret)
return ret;
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
*/
if (filp->f_mode & FMODE_WRITE) {
ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
return ret;
}
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return dquot_file_open(inode, filp);
}
/*
* ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
* by calling generic_file_llseek_size() with the appropriate maxbytes
* value for each.
*/
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
loff_t maxbytes;
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
else
maxbytes = inode->i_sb->s_maxbytes;
switch (whence) {
default:
return generic_file_llseek_size(file, offset, whence,
maxbytes, i_size_read(inode));
case SEEK_HOLE:
inode_lock_shared(inode);
offset = iomap_seek_hole(inode, offset,
&ext4_iomap_report_ops);
inode_unlock_shared(inode);
break;
case SEEK_DATA:
inode_lock_shared(inode);
offset = iomap_seek_data(inode, offset,
&ext4_iomap_report_ops);
inode_unlock_shared(inode);
break;
}
if (offset < 0)
return offset;
return vfs_setpos(file, offset, maxbytes);
}
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
.iopoll = iomap_dio_iopoll,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
.mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_file_getattr,
.listxattr = ext4_listxattr,
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};