mirror of
https://github.com/torvalds/linux.git
synced 2026-06-06 21:45:45 +02:00
-----BEGIN PGP SIGNATURE-----
iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAmGWT+QACgkQONu9yGCS
aT5mYw//ZXKzugaeJjuIaFqr7tcM7x8EefbKd2H4oMr8SW3IFElJIbNPJGMJAG/C
tLZVWZvIum7QzZoxTL+JCCKpDzBERNTo4e5u7UwzAdVqiEX69YkNU0FBOzb4qXJ7
gOZMBhy4UMIKdKD12CSXXf7ZspocsNXfzdmulRQ7CQcPoPrIMKpc4vuagN1Fy/Dz
JgXYvRUAkLxtFHoQ/TeXvR4Gv9+w2ToMdb02mI48QBO+YYrFaGt+Rza2eHTv75H+
Lydz37Nv1Pk32tA1q2jWxCzz16+Kzn+AviKiCfQK0Fb9IqnJksUIWLHSiODlVIcf
kQHejanPn/p1BnBl8puPk1KFtDW45p2GwYhXG7hjGh08DGlR7QLHBS5Aa3xPYfdd
uOy4ctygSVTx5nLjPH5vr3OE0wk/TuSSf/eyk2fmcUCspwAgBOnSYSmnJOem7LTK
VqIgXFdCRplsqN415D35ddruP2BLCKqBu4KjwJ1LGIwgsx/Pmz4hlc5YcpLm8uRg
XMqGTdcieQFOGmZJjJ2q3ecaCjfb0nmTrOylP5b55/74TFwFo042YR1ua0fEtpD4
euoHLfYv3BY1dCp34TOUFGX0l+J1kAtf//vfD/JgJx/nX+ksdFBHhYwdbSi2oQG/
9CceXYJ5duEnG+JmDOWJvcZ3T49K5XaIDNfY2zGpcSu1VZKubWg=
=tQ0m
-----END PGP SIGNATURE-----
Merge 5.10.80 into android12-5.10-lts
Changes in 5.10.80
xhci: Fix USB 3.1 enumeration issues by increasing roothub power-on-good delay
usb: xhci: Enable runtime-pm by default on AMD Yellow Carp platform
binder: use euid from cred instead of using task
binder: use cred instead of task for selinux checks
binder: use cred instead of task for getsecid
Input: iforce - fix control-message timeout
Input: elantench - fix misreporting trackpoint coordinates
Input: i8042 - Add quirk for Fujitsu Lifebook T725
libata: fix read log timeout value
ocfs2: fix data corruption on truncate
scsi: core: Remove command size deduction from scsi_setup_scsi_cmnd()
scsi: qla2xxx: Fix kernel crash when accessing port_speed sysfs file
scsi: qla2xxx: Fix use after free in eh_abort path
mmc: mtk-sd: Add wait dma stop done flow
mmc: dw_mmc: Dont wait for DRTO on Write RSP error
exfat: fix incorrect loading of i_blocks for large files
parisc: Fix set_fixmap() on PA1.x CPUs
parisc: Fix ptrace check on syscall return
tpm: Check for integer overflow in tpm2_map_response_body()
firmware/psci: fix application of sizeof to pointer
crypto: s5p-sss - Add error handling in s5p_aes_probe()
media: rkvdec: Do not override sizeimage for output format
media: ite-cir: IR receiver stop working after receive overflow
media: rkvdec: Support dynamic resolution changes
media: ir-kbd-i2c: improve responsiveness of hauppauge zilog receivers
media: v4l2-ioctl: Fix check_ext_ctrls
ALSA: hda/realtek: Fix mic mute LED for the HP Spectre x360 14
ALSA: hda/realtek: Add a quirk for HP OMEN 15 mute LED
ALSA: hda/realtek: Add quirk for Clevo PC70HS
ALSA: hda/realtek: Headset fixup for Clevo NH77HJQ
ALSA: hda/realtek: Add a quirk for Acer Spin SP513-54N
ALSA: hda/realtek: Add quirk for ASUS UX550VE
ALSA: hda/realtek: Add quirk for HP EliteBook 840 G7 mute LED
ALSA: ua101: fix division by zero at probe
ALSA: 6fire: fix control and bulk message timeouts
ALSA: line6: fix control and interrupt message timeouts
ALSA: usb-audio: Line6 HX-Stomp XL USB_ID for 48k-fixed quirk
ALSA: usb-audio: Add registration quirk for JBL Quantum 400
ALSA: hda: Free card instance properly at probe errors
ALSA: synth: missing check for possible NULL after the call to kstrdup
ALSA: timer: Fix use-after-free problem
ALSA: timer: Unconditionally unlink slave instances, too
ext4: fix lazy initialization next schedule time computation in more granular unit
ext4: ensure enough credits in ext4_ext_shift_path_extents
ext4: refresh the ext4_ext_path struct after dropping i_data_sem.
fuse: fix page stealing
x86/sme: Use #define USE_EARLY_PGTABLE_L5 in mem_encrypt_identity.c
x86/cpu: Fix migration safety with X86_BUG_NULL_SEL
x86/irq: Ensure PI wakeup handler is unregistered before module unload
ASoC: soc-core: fix null-ptr-deref in snd_soc_del_component_unlocked()
ALSA: hda/realtek: Fixes HP Spectre x360 15-eb1xxx speakers
cavium: Return negative value when pci_alloc_irq_vectors() fails
scsi: qla2xxx: Return -ENOMEM if kzalloc() fails
scsi: qla2xxx: Fix unmap of already freed sgl
mISDN: Fix return values of the probe function
cavium: Fix return values of the probe function
sfc: Export fibre-specific supported link modes
sfc: Don't use netif_info before net_device setup
hyperv/vmbus: include linux/bitops.h
ARM: dts: sun7i: A20-olinuxino-lime2: Fix ethernet phy-mode
reset: socfpga: add empty driver allowing consumers to probe
mmc: winbond: don't build on M68K
drm: panel-orientation-quirks: Add quirk for Aya Neo 2021
fcnal-test: kill hanging ping/nettest binaries on cleanup
bpf: Define bpf_jit_alloc_exec_limit for arm64 JIT
bpf: Prevent increasing bpf_jit_limit above max
gpio: mlxbf2.c: Add check for bgpio_init failure
xen/netfront: stop tx queues during live migration
nvmet-tcp: fix a memory leak when releasing a queue
spi: spl022: fix Microwire full duplex mode
net: multicast: calculate csum of looped-back and forwarded packets
watchdog: Fix OMAP watchdog early handling
drm: panel-orientation-quirks: Add quirk for GPD Win3
block: schedule queue restart after BLK_STS_ZONE_RESOURCE
nvmet-tcp: fix header digest verification
r8169: Add device 10ec:8162 to driver r8169
vmxnet3: do not stop tx queues after netif_device_detach()
nfp: bpf: relax prog rejection for mtu check through max_pkt_offset
net/smc: Fix smc_link->llc_testlink_time overflow
net/smc: Correct spelling mistake to TCPF_SYN_RECV
rds: stop using dmapool
btrfs: clear MISSING device status bit in btrfs_close_one_device
btrfs: fix lost error handling when replaying directory deletes
btrfs: call btrfs_check_rw_degradable only if there is a missing device
KVM: VMX: Unregister posted interrupt wakeup handler on hardware unsetup
ia64: kprobes: Fix to pass correct trampoline address to the handler
selinux: fix race condition when computing ocontext SIDs
hwmon: (pmbus/lm25066) Add offset coefficients
regulator: s5m8767: do not use reset value as DVS voltage if GPIO DVS is disabled
regulator: dt-bindings: samsung,s5m8767: correct s5m8767,pmic-buck-default-dvs-idx property
EDAC/sb_edac: Fix top-of-high-memory value for Broadwell/Haswell
mwifiex: fix division by zero in fw download path
ath6kl: fix division by zero in send path
ath6kl: fix control-message timeout
ath10k: fix control-message timeout
ath10k: fix division by zero in send path
PCI: Mark Atheros QCA6174 to avoid bus reset
rtl8187: fix control-message timeouts
evm: mark evm_fixmode as __ro_after_init
ifb: Depend on netfilter alternatively to tc
wcn36xx: Fix HT40 capability for 2Ghz band
wcn36xx: Fix tx_status mechanism
wcn36xx: Fix (QoS) null data frame bitrate/modulation
PM: sleep: Do not let "syscore" devices runtime-suspend during system transitions
mwifiex: Read a PCI register after writing the TX ring write pointer
mwifiex: Try waking the firmware until we get an interrupt
libata: fix checking of DMA state
wcn36xx: handle connection loss indication
rsi: fix occasional initialisation failure with BT coex
rsi: fix key enabled check causing unwanted encryption for vap_id > 0
rsi: fix rate mask set leading to P2P failure
rsi: Fix module dev_oper_mode parameter description
perf/x86/intel/uncore: Support extra IMC channel on Ice Lake server
perf/x86/intel/uncore: Fix Intel ICX IIO event constraints
RDMA/qedr: Fix NULL deref for query_qp on the GSI QP
signal: Remove the bogus sigkill_pending in ptrace_stop
memory: renesas-rpc-if: Correct QSPI data transfer in Manual mode
signal/mips: Update (_save|_restore)_fp_context to fail with -EFAULT
soc: fsl: dpio: replace smp_processor_id with raw_smp_processor_id
soc: fsl: dpio: use the combined functions to protect critical zone
mtd: rawnand: socrates: Keep the driver compatible with on-die ECC engines
power: supply: max17042_battery: Prevent int underflow in set_soc_threshold
power: supply: max17042_battery: use VFSOC for capacity when no rsns
KVM: arm64: Extract ESR_ELx.EC only
KVM: nVMX: Query current VMCS when determining if MSR bitmaps are in use
can: j1939: j1939_tp_cmd_recv(): ignore abort message in the BAM transport
can: j1939: j1939_can_recv(): ignore messages with invalid source address
powerpc/85xx: Fix oops when mpc85xx_smp_guts_ids node cannot be found
ring-buffer: Protect ring_buffer_reset() from reentrancy
serial: core: Fix initializing and restoring termios speed
ifb: fix building without CONFIG_NET_CLS_ACT
ALSA: mixer: oss: Fix racy access to slots
ALSA: mixer: fix deadlock in snd_mixer_oss_set_volume
xen/balloon: add late_initcall_sync() for initial ballooning done
ovl: fix use after free in struct ovl_aio_req
PCI: pci-bridge-emul: Fix emulation of W1C bits
PCI: cadence: Add cdns_plat_pcie_probe() missing return
PCI: aardvark: Do not clear status bits of masked interrupts
PCI: aardvark: Fix checking for link up via LTSSM state
PCI: aardvark: Do not unmask unused interrupts
PCI: aardvark: Fix reporting Data Link Layer Link Active
PCI: aardvark: Fix configuring Reference clock
PCI: aardvark: Fix return value of MSI domain .alloc() method
PCI: aardvark: Read all 16-bits from PCIE_MSI_PAYLOAD_REG
PCI: aardvark: Fix support for bus mastering and PCI_COMMAND on emulated bridge
PCI: aardvark: Fix support for PCI_BRIDGE_CTL_BUS_RESET on emulated bridge
PCI: aardvark: Set PCI Bridge Class Code to PCI Bridge
PCI: aardvark: Fix support for PCI_ROM_ADDRESS1 on emulated bridge
quota: check block number when reading the block in quota file
quota: correct error number in free_dqentry()
pinctrl: core: fix possible memory leak in pinctrl_enable()
coresight: cti: Correct the parameter for pm_runtime_put
iio: dac: ad5446: Fix ad5622_write() return value
iio: ad5770r: make devicetree property reading consistent
USB: serial: keyspan: fix memleak on probe errors
serial: 8250: fix racy uartclk update
most: fix control-message timeouts
USB: iowarrior: fix control-message timeouts
USB: chipidea: fix interrupt deadlock
power: supply: max17042_battery: Clear status bits in interrupt handler
dma-buf: WARN on dmabuf release with pending attachments
drm: panel-orientation-quirks: Update the Lenovo Ideapad D330 quirk (v2)
drm: panel-orientation-quirks: Add quirk for KD Kurio Smart C15200 2-in-1
drm: panel-orientation-quirks: Add quirk for the Samsung Galaxy Book 10.6
Bluetooth: sco: Fix lock_sock() blockage by memcpy_from_msg()
Bluetooth: fix use-after-free error in lock_sock_nested()
drm/panel-orientation-quirks: add Valve Steam Deck
rcutorture: Avoid problematic critical section nesting on PREEMPT_RT
platform/x86: wmi: do not fail if disabling fails
MIPS: lantiq: dma: add small delay after reset
MIPS: lantiq: dma: reset correct number of channel
locking/lockdep: Avoid RCU-induced noinstr fail
net: sched: update default qdisc visibility after Tx queue cnt changes
rcu-tasks: Move RTGS_WAIT_CBS to beginning of rcu_tasks_kthread() loop
smackfs: Fix use-after-free in netlbl_catmap_walk()
ath11k: Align bss_chan_info structure with firmware
x86: Increase exception stack sizes
mwifiex: Run SET_BSS_MODE when changing from P2P to STATION vif-type
mwifiex: Properly initialize private structure on interface type changes
fscrypt: allow 256-bit master keys with AES-256-XTS
drm/amdgpu: Fix MMIO access page fault
ath11k: Avoid reg rules update during firmware recovery
ath11k: add handler for scan event WMI_SCAN_EVENT_DEQUEUED
ath11k: Change DMA_FROM_DEVICE to DMA_TO_DEVICE when map reinjected packets
ath10k: high latency fixes for beacon buffer
media: mt9p031: Fix corrupted frame after restarting stream
media: netup_unidvb: handle interrupt properly according to the firmware
media: atomisp: Fix error handling in probe
media: stm32: Potential NULL pointer dereference in dcmi_irq_thread()
media: uvcvideo: Set capability in s_param
media: uvcvideo: Return -EIO for control errors
media: uvcvideo: Set unique vdev name based in type
media: s5p-mfc: fix possible null-pointer dereference in s5p_mfc_probe()
media: s5p-mfc: Add checking to s5p_mfc_probe().
media: imx: set a media_device bus_info string
media: mceusb: return without resubmitting URB in case of -EPROTO error.
ia64: don't do IA64_CMPXCHG_DEBUG without CONFIG_PRINTK
rtw88: fix RX clock gate setting while fifo dump
brcmfmac: Add DMI nvram filename quirk for Cyberbook T116 tablet
media: rcar-csi2: Add checking to rcsi2_start_receiver()
ipmi: Disable some operations during a panic
fs/proc/uptime.c: Fix idle time reporting in /proc/uptime
ACPICA: Avoid evaluating methods too early during system resume
media: ipu3-imgu: imgu_fmt: Handle properly try
media: ipu3-imgu: VIDIOC_QUERYCAP: Fix bus_info
media: usb: dvd-usb: fix uninit-value bug in dibusb_read_eeprom_byte()
net-sysfs: try not to restart the syscall if it will fail eventually
tracefs: Have tracefs directories not set OTH permission bits by default
ath: dfs_pattern_detector: Fix possible null-pointer dereference in channel_detector_create()
mmc: moxart: Fix reference count leaks in moxart_probe
iov_iter: Fix iov_iter_get_pages{,_alloc} page fault return value
ACPI: battery: Accept charges over the design capacity as full
drm/amdkfd: fix resume error when iommu disabled in Picasso
net: phy: micrel: make *-skew-ps check more lenient
leaking_addresses: Always print a trailing newline
drm/msm: prevent NULL dereference in msm_gpu_crashstate_capture()
block: bump max plugged deferred size from 16 to 32
md: update superblock after changing rdev flags in state_store
memstick: r592: Fix a UAF bug when removing the driver
lib/xz: Avoid overlapping memcpy() with invalid input with in-place decompression
lib/xz: Validate the value before assigning it to an enum variable
workqueue: make sysfs of unbound kworker cpumask more clever
tracing/cfi: Fix cmp_entries_* functions signature mismatch
mt76: mt7915: fix an off-by-one bound check
mwl8k: Fix use-after-free in mwl8k_fw_state_machine()
block: remove inaccurate requeue check
media: allegro: ignore interrupt if mailbox is not initialized
nvmet: fix use-after-free when a port is removed
nvmet-rdma: fix use-after-free when a port is removed
nvmet-tcp: fix use-after-free when a port is removed
nvme: drop scan_lock and always kick requeue list when removing namespaces
PM: hibernate: Get block device exclusively in swsusp_check()
selftests: kvm: fix mismatched fclose() after popen()
selftests/bpf: Fix perf_buffer test on system with offline cpus
iwlwifi: mvm: disable RX-diversity in powersave
smackfs: use __GFP_NOFAIL for smk_cipso_doi()
ARM: clang: Do not rely on lr register for stacktrace
gre/sit: Don't generate link-local addr if addr_gen_mode is IN6_ADDR_GEN_MODE_NONE
gfs2: Cancel remote delete work asynchronously
gfs2: Fix glock_hash_walk bugs
ARM: 9136/1: ARMv7-M uses BE-8, not BE-32
vrf: run conntrack only in context of lower/physdev for locally generated packets
net: annotate data-race in neigh_output()
ACPI: AC: Quirk GK45 to skip reading _PSR
btrfs: reflink: initialize return value to 0 in btrfs_extent_same()
btrfs: do not take the uuid_mutex in btrfs_rm_device
spi: bcm-qspi: Fix missing clk_disable_unprepare() on error in bcm_qspi_probe()
wcn36xx: Correct band/freq reporting on RX
x86/hyperv: Protect set_hv_tscchange_cb() against getting preempted
drm/amd/display: dcn20_resource_construct reduce scope of FPU enabled
selftests/core: fix conflicting types compile error for close_range()
parisc: fix warning in flush_tlb_all
task_stack: Fix end_of_stack() for architectures with upwards-growing stack
erofs: don't trigger WARN() when decompression fails
parisc/unwind: fix unwinder when CONFIG_64BIT is enabled
parisc/kgdb: add kgdb_roundup() to make kgdb work with idle polling
netfilter: conntrack: set on IPS_ASSURED if flows enters internal stream state
selftests/bpf: Fix strobemeta selftest regression
Bluetooth: fix init and cleanup of sco_conn.timeout_work
rcu: Fix existing exp request check in sync_sched_exp_online_cleanup()
MIPS: lantiq: dma: fix burst length for DEU
objtool: Add xen_start_kernel() to noreturn list
x86/xen: Mark cpu_bringup_and_idle() as dead_end_function
objtool: Fix static_call list generation
drm/v3d: fix wait for TMU write combiner flush
virtio-gpu: fix possible memory allocation failure
lockdep: Let lock_is_held_type() detect recursive read as read
net: net_namespace: Fix undefined member in key_remove_domain()
cgroup: Make rebind_subsystems() disable v2 controllers all at once
wcn36xx: Fix Antenna Diversity Switching
wilc1000: fix possible memory leak in cfg_scan_result()
Bluetooth: btmtkuart: fix a memleak in mtk_hci_wmt_sync
crypto: caam - disable pkc for non-E SoCs
rxrpc: Fix _usecs_to_jiffies() by using usecs_to_jiffies()
net: dsa: rtl8366rb: Fix off-by-one bug
ath11k: fix some sleeping in atomic bugs
ath11k: Avoid race during regd updates
ath11k: fix packet drops due to incorrect 6 GHz freq value in rx status
ath11k: Fix memory leak in ath11k_qmi_driver_event_work
ath10k: Fix missing frame timestamp for beacon/probe-resp
ath10k: sdio: Add missing BH locking around napi_schdule()
drm/ttm: stop calling tt_swapin in vm_access
arm64: mm: update max_pfn after memory hotplug
drm/amdgpu: fix warning for overflow check
media: em28xx: add missing em28xx_close_extension
media: cxd2880-spi: Fix a null pointer dereference on error handling path
media: dvb-usb: fix ununit-value in az6027_rc_query
media: v4l2-ioctl: S_CTRL output the right value
media: TDA1997x: handle short reads of hdmi info frame.
media: mtk-vpu: Fix a resource leak in the error handling path of 'mtk_vpu_probe()'
media: radio-wl1273: Avoid card name truncation
media: si470x: Avoid card name truncation
media: tm6000: Avoid card name truncation
media: cx23885: Fix snd_card_free call on null card pointer
kprobes: Do not use local variable when creating debugfs file
crypto: ecc - fix CRYPTO_DEFAULT_RNG dependency
cpuidle: Fix kobject memory leaks in error paths
media: em28xx: Don't use ops->suspend if it is NULL
ath9k: Fix potential interrupt storm on queue reset
PM: EM: Fix inefficient states detection
EDAC/amd64: Handle three rank interleaving mode
rcu: Always inline rcu_dynticks_task*_{enter,exit}()
netfilter: nft_dynset: relax superfluous check on set updates
media: dvb-frontends: mn88443x: Handle errors of clk_prepare_enable()
crypto: qat - detect PFVF collision after ACK
crypto: qat - disregard spurious PFVF interrupts
hwrng: mtk - Force runtime pm ops for sleep ops
b43legacy: fix a lower bounds test
b43: fix a lower bounds test
gve: Recover from queue stall due to missed IRQ
mmc: sdhci-omap: Fix NULL pointer exception if regulator is not configured
mmc: sdhci-omap: Fix context restore
memstick: avoid out-of-range warning
memstick: jmb38x_ms: use appropriate free function in jmb38x_ms_alloc_host()
net, neigh: Fix NTF_EXT_LEARNED in combination with NTF_USE
hwmon: Fix possible memleak in __hwmon_device_register()
hwmon: (pmbus/lm25066) Let compiler determine outer dimension of lm25066_coeff
ath10k: fix max antenna gain unit
kernel/sched: Fix sched_fork() access an invalid sched_task_group
tcp: switch orphan_count to bare per-cpu counters
drm/msm: potential error pointer dereference in init()
drm/msm: uninitialized variable in msm_gem_import()
net: stream: don't purge sk_error_queue in sk_stream_kill_queues()
media: ir_toy: assignment to be16 should be of correct type
mmc: mxs-mmc: disable regulator on error and in the remove function
platform/x86: thinkpad_acpi: Fix bitwise vs. logical warning
mt76: mt7615: fix endianness warning in mt7615_mac_write_txwi
mt76: mt76x02: fix endianness warnings in mt76x02_mac.c
mt76: mt7915: fix possible infinite loop release semaphore
mt76: mt7915: fix sta_rec_wtbl tag len
mt76: mt7915: fix muar_idx in mt7915_mcu_alloc_sta_req()
rsi: stop thread firstly in rsi_91x_init() error handling
mwifiex: Send DELBA requests according to spec
net: enetc: unmap DMA in enetc_send_cmd()
phy: micrel: ksz8041nl: do not use power down mode
nvme-rdma: fix error code in nvme_rdma_setup_ctrl
PM: hibernate: fix sparse warnings
clocksource/drivers/timer-ti-dm: Select TIMER_OF
x86/sev: Fix stack type check in vc_switch_off_ist()
drm/msm: Fix potential NULL dereference in DPU SSPP
smackfs: use netlbl_cfg_cipsov4_del() for deleting cipso_v4_doi
KVM: selftests: Add operand to vmsave/vmload/vmrun in svm.c
KVM: selftests: Fix nested SVM tests when built with clang
bpftool: Avoid leaking the JSON writer prepared for program metadata
libbpf: Fix BTF data layout checks and allow empty BTF
libbpf: Allow loading empty BTFs
libbpf: Fix overflow in BTF sanity checks
libbpf: Fix BTF header parsing checks
s390/gmap: don't unconditionally call pte_unmap_unlock() in __gmap_zap()
KVM: s390: pv: avoid double free of sida page
KVM: s390: pv: avoid stalls for kvm_s390_pv_init_vm
irq: mips: avoid nested irq_enter()
tpm: fix Atmel TPM crash caused by too frequent queries
tpm_tis_spi: Add missing SPI ID
libbpf: Fix endianness detection in BPF_CORE_READ_BITFIELD_PROBED()
tcp: don't free a FIN sk_buff in tcp_remove_empty_skb()
spi: spi-rpc-if: Check return value of rpcif_sw_init()
samples/kretprobes: Fix return value if register_kretprobe() failed
KVM: s390: Fix handle_sske page fault handling
libertas_tf: Fix possible memory leak in probe and disconnect
libertas: Fix possible memory leak in probe and disconnect
wcn36xx: add proper DMA memory barriers in rx path
wcn36xx: Fix discarded frames due to wrong sequence number
drm/amdgpu/gmc6: fix DMA mask from 44 to 40 bits
selftests: bpf: Convert sk_lookup ctx access tests to PROG_TEST_RUN
selftests/bpf: Fix fd cleanup in sk_lookup test
net: amd-xgbe: Toggle PLL settings during rate change
net: phylink: avoid mvneta warning when setting pause parameters
crypto: pcrypt - Delay write to padata->info
selftests/bpf: Fix fclose/pclose mismatch in test_progs
udp6: allow SO_MARK ctrl msg to affect routing
ibmvnic: don't stop queue in xmit
ibmvnic: Process crqs after enabling interrupts
cgroup: Fix rootcg cpu.stat guest double counting
bpf: Fix propagation of bounds from 64-bit min/max into 32-bit and var_off.
bpf: Fix propagation of signed bounds from 64-bit min/max into 32-bit.
of: unittest: fix EXPECT text for gpio hog errors
iio: st_sensors: Call st_sensors_power_enable() from bus drivers
iio: st_sensors: disable regulators after device unregistration
RDMA/rxe: Fix wrong port_cap_flags
ARM: dts: BCM5301X: Fix memory nodes names
clk: mvebu: ap-cpu-clk: Fix a memory leak in error handling paths
ARM: s3c: irq-s3c24xx: Fix return value check for s3c24xx_init_intc()
arm64: dts: rockchip: Fix GPU register width for RK3328
ARM: dts: qcom: msm8974: Add xo_board reference clock to DSI0 PHY
RDMA/bnxt_re: Fix query SRQ failure
arm64: dts: ti: k3-j721e-main: Fix "max-virtual-functions" in PCIe EP nodes
arm64: dts: ti: k3-j721e-main: Fix "bus-range" upto 256 bus number for PCIe
arm64: dts: meson-g12a: Fix the pwm regulator supply properties
arm64: dts: meson-g12b: Fix the pwm regulator supply properties
bus: ti-sysc: Fix timekeeping_suspended warning on resume
ARM: dts: at91: tse850: the emac<->phy interface is rmii
scsi: dc395: Fix error case unwinding
MIPS: loongson64: make CPU_LOONGSON64 depends on MIPS_FP_SUPPORT
JFS: fix memleak in jfs_mount
arm64: dts: qcom: msm8916: Fix Secondary MI2S bit clock
arm64: dts: renesas: beacon: Fix Ethernet PHY mode
arm64: dts: qcom: pm8916: Remove wrong reg-names for rtc@6000
ALSA: hda: Reduce udelay() at SKL+ position reporting
ALSA: hda: Release controller display power during shutdown/reboot
ALSA: hda: Fix hang during shutdown due to link reset
ALSA: hda: Use position buffer for SKL+ again
soundwire: debugfs: use controller id and link_id for debugfs
scsi: pm80xx: Fix misleading log statement in pm8001_mpi_get_nvmd_resp()
driver core: Fix possible memory leak in device_link_add()
arm: dts: omap3-gta04a4: accelerometer irq fix
ASoC: SOF: topology: do not power down primary core during topology removal
soc/tegra: Fix an error handling path in tegra_powergate_power_up()
memory: fsl_ifc: fix leak of irq and nand_irq in fsl_ifc_ctrl_probe
clk: at91: check pmc node status before registering syscore ops
video: fbdev: chipsfb: use memset_io() instead of memset()
powerpc: Refactor is_kvm_guest() declaration to new header
powerpc: Rename is_kvm_guest() to check_kvm_guest()
powerpc: Reintroduce is_kvm_guest() as a fast-path check
powerpc: Fix is_kvm_guest() / kvm_para_available()
powerpc: fix unbalanced node refcount in check_kvm_guest()
serial: 8250_dw: Drop wrong use of ACPI_PTR()
usb: gadget: hid: fix error code in do_config()
power: supply: rt5033_battery: Change voltage values to µV
power: supply: max17040: fix null-ptr-deref in max17040_probe()
scsi: csiostor: Uninitialized data in csio_ln_vnp_read_cbfn()
RDMA/mlx4: Return missed an error if device doesn't support steering
usb: musb: select GENERIC_PHY instead of depending on it
staging: most: dim2: do not double-register the same device
staging: ks7010: select CRYPTO_HASH/CRYPTO_MICHAEL_MIC
pinctrl: renesas: checker: Fix off-by-one bug in drive register check
ARM: dts: stm32: Reduce DHCOR SPI NOR frequency to 50 MHz
ARM: dts: stm32: fix SAI sub nodes register range
ARM: dts: stm32: fix AV96 board SAI2 pin muxing on stm32mp15
ASoC: cs42l42: Correct some register default values
ASoC: cs42l42: Defer probe if request_threaded_irq() returns EPROBE_DEFER
soc: qcom: rpmhpd: Provide some missing struct member descriptions
soc: qcom: rpmhpd: Make power_on actually enable the domain
usb: typec: STUSB160X should select REGMAP_I2C
iio: adis: do not disabe IRQs in 'adis_init()'
scsi: ufs: Refactor ufshcd_setup_clocks() to remove skip_ref_clk
scsi: ufs: ufshcd-pltfrm: Fix memory leak due to probe defer
serial: imx: fix detach/attach of serial console
usb: dwc2: drd: fix dwc2_force_mode call in dwc2_ovr_init
usb: dwc2: drd: fix dwc2_drd_role_sw_set when clock could be disabled
usb: dwc2: drd: reset current session before setting the new one
firmware: qcom_scm: Fix error retval in __qcom_scm_is_call_available()
soc: qcom: apr: Add of_node_put() before return
pinctrl: equilibrium: Fix function addition in multiple groups
phy: qcom-qusb2: Fix a memory leak on probe
phy: ti: gmii-sel: check of_get_address() for failure
phy: qcom-snps: Correct the FSEL_MASK
serial: xilinx_uartps: Fix race condition causing stuck TX
clk: at91: sam9x60-pll: use DIV_ROUND_CLOSEST_ULL
HID: u2fzero: clarify error check and length calculations
HID: u2fzero: properly handle timeouts in usb_submit_urb
powerpc/44x/fsp2: add missing of_node_put
ASoC: cs42l42: Disable regulators if probe fails
ASoC: cs42l42: Use device_property API instead of of_property
ASoC: cs42l42: Correct configuring of switch inversion from ts-inv
virtio_ring: check desc == NULL when using indirect with packed
mips: cm: Convert to bitfield API to fix out-of-bounds access
power: supply: bq27xxx: Fix kernel crash on IRQ handler register error
apparmor: fix error check
rpmsg: Fix rpmsg_create_ept return when RPMSG config is not defined
nfsd: don't alloc under spinlock in rpc_parse_scope_id
i2c: mediatek: fixing the incorrect register offset
NFS: Fix dentry verifier races
pnfs/flexfiles: Fix misplaced barrier in nfs4_ff_layout_prepare_ds
drm/plane-helper: fix uninitialized variable reference
PCI: aardvark: Don't spam about PIO Response Status
PCI: aardvark: Fix preserving PCI_EXP_RTCTL_CRSSVE flag on emulated bridge
opp: Fix return in _opp_add_static_v2()
NFS: Fix deadlocks in nfs_scan_commit_list()
fs: orangefs: fix error return code of orangefs_revalidate_lookup()
mtd: spi-nor: hisi-sfc: Remove excessive clk_disable_unprepare()
PCI: uniphier: Serialize INTx masking/unmasking and fix the bit operation
mtd: core: don't remove debugfs directory if device is in use
remoteproc: Fix a memory leak in an error handling path in 'rproc_handle_vdev()'
rtc: rv3032: fix error handling in rv3032_clkout_set_rate()
dmaengine: at_xdmac: fix AT_XDMAC_CC_PERID() macro
NFS: Fix up commit deadlocks
NFS: Fix an Oops in pnfs_mark_request_commit()
Fix user namespace leak
auxdisplay: img-ascii-lcd: Fix lock-up when displaying empty string
auxdisplay: ht16k33: Connect backlight to fbdev
auxdisplay: ht16k33: Fix frame buffer device blanking
soc: fsl: dpaa2-console: free buffer before returning from dpaa2_console_read
netfilter: nfnetlink_queue: fix OOB when mac header was cleared
dmaengine: dmaengine_desc_callback_valid(): Check for `callback_result`
signal/sh: Use force_sig(SIGKILL) instead of do_group_exit(SIGKILL)
m68k: set a default value for MEMORY_RESERVE
watchdog: f71808e_wdt: fix inaccurate report in WDIOC_GETTIMEOUT
ar7: fix kernel builds for compiler test
scsi: qla2xxx: Changes to support FCP2 Target
scsi: qla2xxx: Relogin during fabric disturbance
scsi: qla2xxx: Fix gnl list corruption
scsi: qla2xxx: Turn off target reset during issue_lip
NFSv4: Fix a regression in nfs_set_open_stateid_locked()
i2c: xlr: Fix a resource leak in the error handling path of 'xlr_i2c_probe()'
xen-pciback: Fix return in pm_ctrl_init()
net: davinci_emac: Fix interrupt pacing disable
ethtool: fix ethtool msg len calculation for pause stats
openrisc: fix SMP tlb flush NULL pointer dereference
net: vlan: fix a UAF in vlan_dev_real_dev()
ice: Fix replacing VF hardware MAC to existing MAC filter
ice: Fix not stopping Tx queues for VFs
ACPI: PMIC: Fix intel_pmic_regs_handler() read accesses
drm/nouveau/svm: Fix refcount leak bug and missing check against null bug
net: phy: fix duplex out of sync problem while changing settings
bonding: Fix a use-after-free problem when bond_sysfs_slave_add() failed
mfd: core: Add missing of_node_put for loop iteration
can: mcp251xfd: mcp251xfd_chip_start(): fix error handling for mcp251xfd_chip_rx_int_enable()
mm/zsmalloc.c: close race window between zs_pool_dec_isolated() and zs_unregister_migration()
zram: off by one in read_block_state()
perf bpf: Add missing free to bpf_event__print_bpf_prog_info()
llc: fix out-of-bound array index in llc_sk_dev_hash()
nfc: pn533: Fix double free when pn533_fill_fragment_skbs() fails
arm64: pgtable: make __pte_to_phys/__phys_to_pte_val inline functions
bpf, sockmap: Remove unhash handler for BPF sockmap usage
bpf: sockmap, strparser, and tls are reusing qdisc_skb_cb and colliding
gve: Fix off by one in gve_tx_timeout()
seq_file: fix passing wrong private data
net/sched: sch_taprio: fix undefined behavior in ktime_mono_to_any
net: hns3: fix kernel crash when unload VF while it is being reset
net: hns3: allow configure ETS bandwidth of all TCs
net: stmmac: allow a tc-taprio base-time of zero
vsock: prevent unnecessary refcnt inc for nonblocking connect
net/smc: fix sk_refcnt underflow on linkdown and fallback
cxgb4: fix eeprom len when diagnostics not implemented
selftests/net: udpgso_bench_rx: fix port argument
ARM: 9155/1: fix early early_iounmap()
ARM: 9156/1: drop cc-option fallbacks for architecture selection
parisc: Fix backtrace to always include init funtion names
MIPS: Fix assembly error from MIPSr2 code used within MIPS_ISA_ARCH_LEVEL
x86/mce: Add errata workaround for Skylake SKX37
posix-cpu-timers: Clear task::posix_cputimers_work in copy_process()
irqchip/sifive-plic: Fixup EOI failed when masked
f2fs: should use GFP_NOFS for directory inodes
net, neigh: Enable state migration between NUD_PERMANENT and NTF_USE
9p/net: fix missing error check in p9_check_errors
memcg: prohibit unconditional exceeding the limit of dying tasks
powerpc/lib: Add helper to check if offset is within conditional branch range
powerpc/bpf: Validate branch ranges
powerpc/security: Add a helper to query stf_barrier type
powerpc/bpf: Emit stf barrier instruction sequences for BPF_NOSPEC
mm, oom: pagefault_out_of_memory: don't force global OOM for dying tasks
mm, oom: do not trigger out_of_memory from the #PF
mfd: dln2: Add cell for initializing DLN2 ADC
video: backlight: Drop maximum brightness override for brightness zero
s390/cio: check the subchannel validity for dev_busid
s390/tape: fix timer initialization in tape_std_assign()
s390/ap: Fix hanging ioctl caused by orphaned replies
s390/cio: make ccw_device_dma_* more robust
mtd: rawnand: ams-delta: Keep the driver compatible with on-die ECC engines
mtd: rawnand: xway: Keep the driver compatible with on-die ECC engines
mtd: rawnand: mpc5121: Keep the driver compatible with on-die ECC engines
mtd: rawnand: gpio: Keep the driver compatible with on-die ECC engines
mtd: rawnand: pasemi: Keep the driver compatible with on-die ECC engines
mtd: rawnand: orion: Keep the driver compatible with on-die ECC engines
mtd: rawnand: plat_nand: Keep the driver compatible with on-die ECC engines
mtd: rawnand: au1550nd: Keep the driver compatible with on-die ECC engines
powerpc/powernv/prd: Unregister OPAL_MSG_PRD2 notifier during module unload
powerpc/85xx: fix timebase sync issue when CONFIG_HOTPLUG_CPU=n
drm/sun4i: Fix macros in sun8i_csc.h
PCI: Add PCI_EXP_DEVCTL_PAYLOAD_* macros
PCI: aardvark: Fix PCIe Max Payload Size setting
SUNRPC: Partial revert of commit 6f9f17287e
ath10k: fix invalid dma_addr_t token assignment
mmc: moxart: Fix null pointer dereference on pointer host
selftests/bpf: Fix also no-alu32 strobemeta selftest
arch/cc: Introduce a function to check for confidential computing features
x86/sev: Add an x86 version of cc_platform_has()
x86/sev: Make the #VC exception stacks part of the default stacks storage
soc/tegra: pmc: Fix imbalanced clock disabling in error code path
Linux 5.10.80
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I21c750863965fbf584251fa2de3c941ae5922d3f
1209 lines
32 KiB
C
1209 lines
32 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* linux/mm/oom_kill.c
|
|
*
|
|
* Copyright (C) 1998,2000 Rik van Riel
|
|
* Thanks go out to Claus Fischer for some serious inspiration and
|
|
* for goading me into coding this file...
|
|
* Copyright (C) 2010 Google, Inc.
|
|
* Rewritten by David Rientjes
|
|
*
|
|
* The routines in this file are used to kill a process when
|
|
* we're seriously out of memory. This gets called from __alloc_pages()
|
|
* in mm/page_alloc.c when we really run out of memory.
|
|
*
|
|
* Since we won't call these routines often (on a well-configured
|
|
* machine) this file will double as a 'coding guide' and a signpost
|
|
* for newbie kernel hackers. It features several pointers to major
|
|
* kernel subsystems and hints as to where to find out what things do.
|
|
*/
|
|
|
|
#include <linux/oom.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/err.h>
|
|
#include <linux/gfp.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/sched/mm.h>
|
|
#include <linux/sched/coredump.h>
|
|
#include <linux/sched/task.h>
|
|
#include <linux/sched/debug.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/timex.h>
|
|
#include <linux/jiffies.h>
|
|
#include <linux/cpuset.h>
|
|
#include <linux/export.h>
|
|
#include <linux/notifier.h>
|
|
#include <linux/memcontrol.h>
|
|
#include <linux/mempolicy.h>
|
|
#include <linux/security.h>
|
|
#include <linux/ptrace.h>
|
|
#include <linux/freezer.h>
|
|
#include <linux/ftrace.h>
|
|
#include <linux/ratelimit.h>
|
|
#include <linux/kthread.h>
|
|
#include <linux/init.h>
|
|
#include <linux/mmu_notifier.h>
|
|
|
|
#include <asm/tlb.h>
|
|
#include "internal.h"
|
|
#include "slab.h"
|
|
|
|
#define CREATE_TRACE_POINTS
|
|
#include <trace/events/oom.h>
|
|
|
|
#undef CREATE_TRACE_POINTS
|
|
#include <trace/hooks/mm.h>
|
|
|
|
/* Panic policy knob; interpreted by check_panic_on_oom() (0 = never panic). */
int sysctl_panic_on_oom;
/* Read by out_of_memory() for OOMs that are not memcg OOMs. */
int sysctl_oom_kill_allocating_task;
/* Non-zero makes dump_header() print the per-task table; on by default. */
int sysctl_oom_dump_tasks = 1;

/*
 * Every out_of_memory() invocation, whatever context it comes from, is
 * serialized on this lock so that the oom killer is not run over-eagerly
 * (for instance when it is triggered from several domains at once).
 *
 * oom_killer_disable() also takes this lock so that oom_killer_disabled
 * and mark_oom_victim are stable with respect to each other.
 */
DEFINE_MUTEX(oom_lock);
/* Guards updates of oom_score_adj and oom_score_adj_min. */
DEFINE_MUTEX(oom_adj_mutex);
|
|
|
|
static inline bool is_memcg_oom(struct oom_control *oc)
|
|
{
|
|
return oc->memcg != NULL;
|
|
}
|
|
|
|
#ifdef CONFIG_NUMA
|
|
/**
|
|
* oom_cpuset_eligible() - check task eligiblity for kill
|
|
* @start: task struct of which task to consider
|
|
* @oc: pointer to struct oom_control
|
|
*
|
|
* Task eligibility is determined by whether or not a candidate task, @tsk,
|
|
* shares the same mempolicy nodes as current if it is bound by such a policy
|
|
* and whether or not it has the same set of allowed cpuset nodes.
|
|
*
|
|
* This function is assuming oom-killer context and 'current' has triggered
|
|
* the oom-killer.
|
|
*/
|
|
static bool oom_cpuset_eligible(struct task_struct *start,
|
|
struct oom_control *oc)
|
|
{
|
|
struct task_struct *tsk;
|
|
bool ret = false;
|
|
const nodemask_t *mask = oc->nodemask;
|
|
|
|
if (is_memcg_oom(oc))
|
|
return true;
|
|
|
|
rcu_read_lock();
|
|
for_each_thread(start, tsk) {
|
|
if (mask) {
|
|
/*
|
|
* If this is a mempolicy constrained oom, tsk's
|
|
* cpuset is irrelevant. Only return true if its
|
|
* mempolicy intersects current, otherwise it may be
|
|
* needlessly killed.
|
|
*/
|
|
ret = mempolicy_nodemask_intersects(tsk, mask);
|
|
} else {
|
|
/*
|
|
* This is not a mempolicy constrained oom, so only
|
|
* check the mems of tsk's cpuset.
|
|
*/
|
|
ret = cpuset_mems_allowed_intersects(current, tsk);
|
|
}
|
|
if (ret)
|
|
break;
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
}
|
|
#else
|
|
static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
|
|
{
|
|
return true;
|
|
}
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
/*
|
|
* The process p may have detached its own ->mm while exiting or through
|
|
* kthread_use_mm(), but one or more of its subthreads may still have a valid
|
|
* pointer. Return p, or any of its subthreads with a valid ->mm, with
|
|
* task_lock() held.
|
|
*/
|
|
struct task_struct *find_lock_task_mm(struct task_struct *p)
|
|
{
|
|
struct task_struct *t;
|
|
|
|
rcu_read_lock();
|
|
|
|
for_each_thread(p, t) {
|
|
task_lock(t);
|
|
if (likely(t->mm))
|
|
goto found;
|
|
task_unlock(t);
|
|
}
|
|
t = NULL;
|
|
found:
|
|
rcu_read_unlock();
|
|
|
|
return t;
|
|
}
|
|
|
|
/*
|
|
* order == -1 means the oom kill is required by sysrq, otherwise only
|
|
* for display purposes.
|
|
*/
|
|
static inline bool is_sysrq_oom(struct oom_control *oc)
|
|
{
|
|
return oc->order == -1;
|
|
}
|
|
|
|
/* return true if the task is not adequate as candidate victim task. */
|
|
static bool oom_unkillable_task(struct task_struct *p)
|
|
{
|
|
if (is_global_init(p))
|
|
return true;
|
|
if (p->flags & PF_KTHREAD)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Print out unreclaimble slabs info when unreclaimable slabs amount is greater
|
|
* than all user memory (LRU pages)
|
|
*/
|
|
static bool is_dump_unreclaim_slabs(void)
|
|
{
|
|
unsigned long nr_lru;
|
|
|
|
nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
|
|
global_node_page_state(NR_INACTIVE_ANON) +
|
|
global_node_page_state(NR_ACTIVE_FILE) +
|
|
global_node_page_state(NR_INACTIVE_FILE) +
|
|
global_node_page_state(NR_ISOLATED_ANON) +
|
|
global_node_page_state(NR_ISOLATED_FILE) +
|
|
global_node_page_state(NR_UNEVICTABLE);
|
|
|
|
return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
|
|
}
|
|
|
|
/**
|
|
* oom_badness - heuristic function to determine which candidate task to kill
|
|
* @p: task struct of which task we should calculate
|
|
* @totalpages: total present RAM allowed for page allocation
|
|
*
|
|
* The heuristic for determining which task to kill is made to be as simple and
|
|
* predictable as possible. The goal is to return the highest value for the
|
|
* task consuming the most memory to avoid subsequent oom failures.
|
|
*/
|
|
long oom_badness(struct task_struct *p, unsigned long totalpages)
|
|
{
|
|
long points;
|
|
long adj;
|
|
|
|
if (oom_unkillable_task(p))
|
|
return LONG_MIN;
|
|
|
|
p = find_lock_task_mm(p);
|
|
if (!p)
|
|
return LONG_MIN;
|
|
|
|
/*
|
|
* Do not even consider tasks which are explicitly marked oom
|
|
* unkillable or have been already oom reaped or the are in
|
|
* the middle of vfork
|
|
*/
|
|
adj = (long)p->signal->oom_score_adj;
|
|
if (adj == OOM_SCORE_ADJ_MIN ||
|
|
test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
|
|
in_vfork(p)) {
|
|
task_unlock(p);
|
|
return LONG_MIN;
|
|
}
|
|
|
|
/*
|
|
* The baseline for the badness score is the proportion of RAM that each
|
|
* task's rss, pagetable and swap space use.
|
|
*/
|
|
points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
|
|
mm_pgtables_bytes(p->mm) / PAGE_SIZE;
|
|
task_unlock(p);
|
|
|
|
/* Normalize to oom_score_adj units */
|
|
adj *= totalpages / 1000;
|
|
points += adj;
|
|
|
|
return points;
|
|
}
|
|
|
|
static const char * const oom_constraint_text[] = {
|
|
[CONSTRAINT_NONE] = "CONSTRAINT_NONE",
|
|
[CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
|
|
[CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
|
|
[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
|
|
};
|
|
|
|
/*
|
|
* Determine the type of allocation constraint.
|
|
*/
|
|
static enum oom_constraint constrained_alloc(struct oom_control *oc)
|
|
{
|
|
struct zone *zone;
|
|
struct zoneref *z;
|
|
enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
|
|
bool cpuset_limited = false;
|
|
int nid;
|
|
|
|
if (is_memcg_oom(oc)) {
|
|
oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
|
|
return CONSTRAINT_MEMCG;
|
|
}
|
|
|
|
/* Default to all available memory */
|
|
oc->totalpages = totalram_pages() + total_swap_pages;
|
|
|
|
if (!IS_ENABLED(CONFIG_NUMA))
|
|
return CONSTRAINT_NONE;
|
|
|
|
if (!oc->zonelist)
|
|
return CONSTRAINT_NONE;
|
|
/*
|
|
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
|
|
* to kill current.We have to random task kill in this case.
|
|
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
|
|
*/
|
|
if (oc->gfp_mask & __GFP_THISNODE)
|
|
return CONSTRAINT_NONE;
|
|
|
|
/*
|
|
* This is not a __GFP_THISNODE allocation, so a truncated nodemask in
|
|
* the page allocator means a mempolicy is in effect. Cpuset policy
|
|
* is enforced in get_page_from_freelist().
|
|
*/
|
|
if (oc->nodemask &&
|
|
!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
|
|
oc->totalpages = total_swap_pages;
|
|
for_each_node_mask(nid, *oc->nodemask)
|
|
oc->totalpages += node_present_pages(nid);
|
|
return CONSTRAINT_MEMORY_POLICY;
|
|
}
|
|
|
|
/* Check this allocation failure is caused by cpuset's wall function */
|
|
for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
|
|
highest_zoneidx, oc->nodemask)
|
|
if (!cpuset_zone_allowed(zone, oc->gfp_mask))
|
|
cpuset_limited = true;
|
|
|
|
if (cpuset_limited) {
|
|
oc->totalpages = total_swap_pages;
|
|
for_each_node_mask(nid, cpuset_current_mems_allowed)
|
|
oc->totalpages += node_present_pages(nid);
|
|
return CONSTRAINT_CPUSET;
|
|
}
|
|
return CONSTRAINT_NONE;
|
|
}
|
|
|
|
static int oom_evaluate_task(struct task_struct *task, void *arg)
|
|
{
|
|
struct oom_control *oc = arg;
|
|
long points;
|
|
|
|
if (oom_unkillable_task(task))
|
|
goto next;
|
|
|
|
/* p may not have freeable memory in nodemask */
|
|
if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
|
|
goto next;
|
|
|
|
/*
|
|
* This task already has access to memory reserves and is being killed.
|
|
* Don't allow any other task to have access to the reserves unless
|
|
* the task has MMF_OOM_SKIP because chances that it would release
|
|
* any memory is quite low.
|
|
*/
|
|
if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
|
|
if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
|
|
goto next;
|
|
goto abort;
|
|
}
|
|
|
|
/*
|
|
* If task is allocating a lot of memory and has been marked to be
|
|
* killed first if it triggers an oom, then select it.
|
|
*/
|
|
if (oom_task_origin(task)) {
|
|
points = LONG_MAX;
|
|
goto select;
|
|
}
|
|
|
|
points = oom_badness(task, oc->totalpages);
|
|
|
|
if (points == LONG_MIN)
|
|
goto next;
|
|
|
|
/*
|
|
* Check to see if this is the worst task with a non-negative
|
|
* ADJ score seen so far
|
|
*/
|
|
if (task->signal->oom_score_adj >= 0 &&
|
|
points > oc->chosen_non_negative_adj_points) {
|
|
if (oc->chosen_non_negative_adj)
|
|
put_task_struct(oc->chosen_non_negative_adj);
|
|
get_task_struct(task);
|
|
oc->chosen_non_negative_adj = task;
|
|
oc->chosen_non_negative_adj_points = points;
|
|
}
|
|
|
|
if (points < oc->chosen_points)
|
|
goto next;
|
|
|
|
select:
|
|
if (oc->chosen)
|
|
put_task_struct(oc->chosen);
|
|
get_task_struct(task);
|
|
oc->chosen = task;
|
|
oc->chosen_points = points;
|
|
next:
|
|
return 0;
|
|
abort:
|
|
if (oc->chosen_non_negative_adj)
|
|
put_task_struct(oc->chosen_non_negative_adj);
|
|
if (oc->chosen)
|
|
put_task_struct(oc->chosen);
|
|
oc->chosen_non_negative_adj = NULL;
|
|
oc->chosen = (void *)-1UL;
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* Simple selection loop. We choose the process with the highest number of
|
|
* 'points'. In case scan was aborted, oc->chosen is set to -1.
|
|
*/
|
|
static void select_bad_process(struct oom_control *oc)
|
|
{
|
|
oc->chosen_points = LONG_MIN;
|
|
oc->chosen_non_negative_adj_points = LONG_MIN;
|
|
oc->chosen_non_negative_adj = NULL;
|
|
|
|
if (is_memcg_oom(oc))
|
|
mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
|
|
else {
|
|
struct task_struct *p;
|
|
|
|
rcu_read_lock();
|
|
for_each_process(p)
|
|
if (oom_evaluate_task(p, oc))
|
|
break;
|
|
rcu_read_unlock();
|
|
}
|
|
|
|
if (oc->chosen_non_negative_adj) {
|
|
/*
|
|
* If oc->chosen has a negative ADJ, and we found a task with
|
|
* a postive ADJ to kill, kill the task with the positive ADJ
|
|
* instead.
|
|
*/
|
|
if (oc->chosen && oc->chosen->signal->oom_score_adj < 0) {
|
|
put_task_struct(oc->chosen);
|
|
oc->chosen = oc->chosen_non_negative_adj;
|
|
oc->chosen_points = oc->chosen_non_negative_adj_points;
|
|
} else
|
|
put_task_struct(oc->chosen_non_negative_adj);
|
|
}
|
|
}
|
|
|
|
static int dump_task(struct task_struct *p, void *arg)
|
|
{
|
|
struct oom_control *oc = arg;
|
|
struct task_struct *task;
|
|
|
|
if (oom_unkillable_task(p))
|
|
return 0;
|
|
|
|
/* p may not have freeable memory in nodemask */
|
|
if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
|
|
return 0;
|
|
|
|
task = find_lock_task_mm(p);
|
|
if (!task) {
|
|
/*
|
|
* This is a kthread or all of p's threads have already
|
|
* detached their mm's. There's no need to report
|
|
* them; they can't be oom killed anyway.
|
|
*/
|
|
return 0;
|
|
}
|
|
|
|
pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
|
|
task->pid, from_kuid(&init_user_ns, task_uid(task)),
|
|
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
|
|
mm_pgtables_bytes(task->mm),
|
|
get_mm_counter(task->mm, MM_SWAPENTS),
|
|
task->signal->oom_score_adj, task->comm);
|
|
task_unlock(task);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* dump_tasks - dump current memory state of all system tasks
|
|
* @oc: pointer to struct oom_control
|
|
*
|
|
* Dumps the current memory state of all eligible tasks. Tasks not in the same
|
|
* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
|
|
* are not shown.
|
|
* State information includes task's pid, uid, tgid, vm size, rss,
|
|
* pgtables_bytes, swapents, oom_score_adj value, and name.
|
|
*/
|
|
static void dump_tasks(struct oom_control *oc)
|
|
{
|
|
pr_info("Tasks state (memory values in pages):\n");
|
|
pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
|
|
|
|
if (is_memcg_oom(oc))
|
|
mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
|
|
else {
|
|
struct task_struct *p;
|
|
|
|
rcu_read_lock();
|
|
for_each_process(p)
|
|
dump_task(p, oc);
|
|
rcu_read_unlock();
|
|
}
|
|
}
|
|
|
|
static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
|
|
{
|
|
/* one line summary of the oom killer context. */
|
|
pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
|
|
oom_constraint_text[oc->constraint],
|
|
nodemask_pr_args(oc->nodemask));
|
|
cpuset_print_current_mems_allowed();
|
|
mem_cgroup_print_oom_context(oc->memcg, victim);
|
|
pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
|
|
from_kuid(&init_user_ns, task_uid(victim)));
|
|
}
|
|
|
|
static void dump_header(struct oom_control *oc, struct task_struct *p)
|
|
{
|
|
pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
|
|
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
|
|
current->signal->oom_score_adj);
|
|
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
|
|
pr_warn("COMPACTION is disabled!!!\n");
|
|
|
|
dump_stack();
|
|
if (is_memcg_oom(oc))
|
|
mem_cgroup_print_oom_meminfo(oc->memcg);
|
|
else {
|
|
show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
|
|
if (is_dump_unreclaim_slabs())
|
|
dump_unreclaimable_slab();
|
|
}
|
|
if (sysctl_oom_dump_tasks)
|
|
dump_tasks(oc);
|
|
if (p)
|
|
dump_oom_summary(oc, p);
|
|
}
|
|
|
|
/*
|
|
* Number of OOM victims in flight
|
|
*/
|
|
static atomic_t oom_victims = ATOMIC_INIT(0);
|
|
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
|
|
|
|
static bool oom_killer_disabled __read_mostly;
|
|
|
|
#define K(x) ((x) << (PAGE_SHIFT-10))
|
|
|
|
/*
|
|
* task->mm can be NULL if the task is the exited group leader. So to
|
|
* determine whether the task is using a particular mm, we examine all the
|
|
* task's threads: if one of those is using this mm then this task was also
|
|
* using it.
|
|
*/
|
|
bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
|
|
{
|
|
struct task_struct *t;
|
|
|
|
for_each_thread(p, t) {
|
|
struct mm_struct *t_mm = READ_ONCE(t->mm);
|
|
if (t_mm)
|
|
return t_mm == mm;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
#ifdef CONFIG_MMU
|
|
/*
 * The OOM reaper is a kernel thread that tries to reap the memory used by
 * an OOM victim (when that is possible) so that the OOM killer can move on.
 */
static struct task_struct *oom_reaper_th;
/* The reaper sleeps here until oom_reaper_list becomes non-NULL. */
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
/* Head of the queue of victims, chained through task->oom_reaper_list. */
static struct task_struct *oom_reaper_list;
/* Taken around every update of oom_reaper_list. */
static DEFINE_SPINLOCK(oom_reaper_lock);
|
|
|
|
bool __oom_reap_task_mm(struct mm_struct *mm)
|
|
{
|
|
struct vm_area_struct *vma;
|
|
bool ret = true;
|
|
|
|
/*
|
|
* Tell all users of get_user/copy_from_user etc... that the content
|
|
* is no longer stable. No barriers really needed because unmapping
|
|
* should imply barriers already and the reader would hit a page fault
|
|
* if it stumbled over a reaped memory.
|
|
*/
|
|
set_bit(MMF_UNSTABLE, &mm->flags);
|
|
|
|
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
|
|
if (!can_madv_lru_vma(vma))
|
|
continue;
|
|
|
|
/*
|
|
* Only anonymous pages have a good chance to be dropped
|
|
* without additional steps which we cannot afford as we
|
|
* are OOM already.
|
|
*
|
|
* We do not even care about fs backed pages because all
|
|
* which are reclaimable have already been reclaimed and
|
|
* we do not want to block exit_mmap by keeping mm ref
|
|
* count elevated without a good reason.
|
|
*/
|
|
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
|
|
struct mmu_notifier_range range;
|
|
struct mmu_gather tlb;
|
|
|
|
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
|
|
vma, mm, vma->vm_start,
|
|
vma->vm_end);
|
|
tlb_gather_mmu(&tlb, mm, range.start, range.end);
|
|
if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
|
|
tlb_finish_mmu(&tlb, range.start, range.end);
|
|
ret = false;
|
|
continue;
|
|
}
|
|
unmap_page_range(&tlb, vma, range.start, range.end, NULL);
|
|
mmu_notifier_invalidate_range_end(&range);
|
|
tlb_finish_mmu(&tlb, range.start, range.end);
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Reaps the address space of the give task.
|
|
*
|
|
* Returns true on success and false if none or part of the address space
|
|
* has been reclaimed and the caller should retry later.
|
|
*/
|
|
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
|
|
{
|
|
bool ret = true;
|
|
|
|
if (!mmap_read_trylock(mm)) {
|
|
trace_skip_task_reaping(tsk->pid);
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
|
|
* work on the mm anymore. The check for MMF_OOM_SKIP must run
|
|
* under mmap_lock for reading because it serializes against the
|
|
* mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
|
|
*/
|
|
if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
|
|
trace_skip_task_reaping(tsk->pid);
|
|
goto out_unlock;
|
|
}
|
|
|
|
trace_start_task_reaping(tsk->pid);
|
|
|
|
/* failed to reap part of the address space. Try again later */
|
|
ret = __oom_reap_task_mm(mm);
|
|
if (!ret)
|
|
goto out_finish;
|
|
|
|
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
|
|
task_pid_nr(tsk), tsk->comm,
|
|
K(get_mm_counter(mm, MM_ANONPAGES)),
|
|
K(get_mm_counter(mm, MM_FILEPAGES)),
|
|
K(get_mm_counter(mm, MM_SHMEMPAGES)));
|
|
out_finish:
|
|
trace_finish_task_reaping(tsk->pid);
|
|
out_unlock:
|
|
mmap_read_unlock(mm);
|
|
|
|
return ret;
|
|
}
|
|
|
|
#define MAX_OOM_REAP_RETRIES 10
|
|
static void oom_reap_task(struct task_struct *tsk)
|
|
{
|
|
int attempts = 0;
|
|
struct mm_struct *mm = tsk->signal->oom_mm;
|
|
|
|
/* Retry the mmap_read_trylock(mm) a few times */
|
|
while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
|
|
schedule_timeout_idle(HZ/10);
|
|
|
|
if (attempts <= MAX_OOM_REAP_RETRIES ||
|
|
test_bit(MMF_OOM_SKIP, &mm->flags))
|
|
goto done;
|
|
|
|
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
|
|
task_pid_nr(tsk), tsk->comm);
|
|
sched_show_task(tsk);
|
|
debug_show_all_locks();
|
|
|
|
done:
|
|
tsk->oom_reaper_list = NULL;
|
|
|
|
/*
|
|
* Hide this mm from OOM killer because it has been either reaped or
|
|
* somebody can't call mmap_write_unlock(mm).
|
|
*/
|
|
set_bit(MMF_OOM_SKIP, &mm->flags);
|
|
|
|
/* Drop a reference taken by wake_oom_reaper */
|
|
put_task_struct(tsk);
|
|
}
|
|
|
|
static int oom_reaper(void *unused)
|
|
{
|
|
while (true) {
|
|
struct task_struct *tsk = NULL;
|
|
|
|
wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
|
|
spin_lock(&oom_reaper_lock);
|
|
if (oom_reaper_list != NULL) {
|
|
tsk = oom_reaper_list;
|
|
oom_reaper_list = tsk->oom_reaper_list;
|
|
}
|
|
spin_unlock(&oom_reaper_lock);
|
|
|
|
if (tsk)
|
|
oom_reap_task(tsk);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void wake_oom_reaper(struct task_struct *tsk)
|
|
{
|
|
/* mm is already queued? */
|
|
if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
|
|
return;
|
|
|
|
get_task_struct(tsk);
|
|
|
|
spin_lock(&oom_reaper_lock);
|
|
tsk->oom_reaper_list = oom_reaper_list;
|
|
oom_reaper_list = tsk;
|
|
spin_unlock(&oom_reaper_lock);
|
|
trace_wake_reaper(tsk->pid);
|
|
wake_up(&oom_reaper_wait);
|
|
}
|
|
|
|
static int __init oom_init(void)
|
|
{
|
|
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
|
|
return 0;
|
|
}
|
|
subsys_initcall(oom_init)
|
|
#else
|
|
/* !CONFIG_MMU variant: there is no reaper thread, so this is a no-op. */
static inline void wake_oom_reaper(struct task_struct *tsk)
{
}
|
|
#endif /* CONFIG_MMU */
|
|
|
|
/**
|
|
* tsk->mm has to be non NULL and caller has to guarantee it is stable (either
|
|
* under task_lock or operate on the current).
|
|
*/
|
|
static void __mark_oom_victim(struct task_struct *tsk)
|
|
{
|
|
struct mm_struct *mm = tsk->mm;
|
|
|
|
if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
|
|
mmgrab(tsk->signal->oom_mm);
|
|
set_bit(MMF_OOM_VICTIM, &mm->flags);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* mark_oom_victim - mark the given task as OOM victim
|
|
* @tsk: task to mark
|
|
*
|
|
* Has to be called with oom_lock held and never after
|
|
* oom has been disabled already.
|
|
*
|
|
* tsk->mm has to be non NULL and caller has to guarantee it is stable (either
|
|
* under task_lock or operate on the current).
|
|
*/
|
|
static void mark_oom_victim(struct task_struct *tsk)
|
|
{
|
|
WARN_ON(oom_killer_disabled);
|
|
/* OOM killer might race with memcg OOM */
|
|
if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
|
|
return;
|
|
|
|
/* oom_mm is bound to the signal struct life time. */
|
|
__mark_oom_victim(tsk);
|
|
|
|
/*
|
|
* Make sure that the task is woken up from uninterruptible sleep
|
|
* if it is frozen because OOM killer wouldn't be able to free
|
|
* any memory and livelock. freezing_slow_path will tell the freezer
|
|
* that TIF_MEMDIE tasks should be ignored.
|
|
*/
|
|
__thaw_task(tsk);
|
|
atomic_inc(&oom_victims);
|
|
trace_mark_victim(tsk->pid);
|
|
}
|
|
|
|
/**
|
|
* exit_oom_victim - note the exit of an OOM victim
|
|
*/
|
|
void exit_oom_victim(void)
|
|
{
|
|
clear_thread_flag(TIF_MEMDIE);
|
|
|
|
if (!atomic_dec_return(&oom_victims))
|
|
wake_up_all(&oom_victims_wait);
|
|
}
|
|
|
|
/**
|
|
* oom_killer_enable - enable OOM killer
|
|
*/
|
|
void oom_killer_enable(void)
|
|
{
|
|
oom_killer_disabled = false;
|
|
pr_info("OOM killer enabled.\n");
|
|
}
|
|
|
|
/**
|
|
* oom_killer_disable - disable OOM killer
|
|
* @timeout: maximum timeout to wait for oom victims in jiffies
|
|
*
|
|
* Forces all page allocations to fail rather than trigger OOM killer.
|
|
* Will block and wait until all OOM victims are killed or the given
|
|
* timeout expires.
|
|
*
|
|
* The function cannot be called when there are runnable user tasks because
|
|
* the userspace would see unexpected allocation failures as a result. Any
|
|
* new usage of this function should be consulted with MM people.
|
|
*
|
|
* Returns true if successful and false if the OOM killer cannot be
|
|
* disabled.
|
|
*/
|
|
bool oom_killer_disable(signed long timeout)
|
|
{
|
|
signed long ret;
|
|
|
|
/*
|
|
* Make sure to not race with an ongoing OOM killer. Check that the
|
|
* current is not killed (possibly due to sharing the victim's memory).
|
|
*/
|
|
if (mutex_lock_killable(&oom_lock))
|
|
return false;
|
|
oom_killer_disabled = true;
|
|
mutex_unlock(&oom_lock);
|
|
|
|
ret = wait_event_interruptible_timeout(oom_victims_wait,
|
|
!atomic_read(&oom_victims), timeout);
|
|
if (ret <= 0) {
|
|
oom_killer_enable();
|
|
return false;
|
|
}
|
|
pr_info("OOM killer disabled.\n");
|
|
|
|
return true;
|
|
}
|
|
|
|
static inline bool __task_will_free_mem(struct task_struct *task)
|
|
{
|
|
struct signal_struct *sig = task->signal;
|
|
|
|
/*
|
|
* A coredumping process may sleep for an extended period in exit_mm(),
|
|
* so the oom killer cannot assume that the process will promptly exit
|
|
* and release memory.
|
|
*/
|
|
if (sig->flags & SIGNAL_GROUP_COREDUMP)
|
|
return false;
|
|
|
|
if (sig->flags & SIGNAL_GROUP_EXIT)
|
|
return true;
|
|
|
|
if (thread_group_empty(task) && (task->flags & PF_EXITING))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Checks whether the given task is dying or exiting and likely to
|
|
* release its address space. This means that all threads and processes
|
|
* sharing the same mm have to be killed or exiting.
|
|
* Caller has to make sure that task->mm is stable (hold task_lock or
|
|
* it operates on the current).
|
|
*/
|
|
static bool task_will_free_mem(struct task_struct *task)
|
|
{
|
|
struct mm_struct *mm = task->mm;
|
|
struct task_struct *p;
|
|
bool ret = true;
|
|
|
|
/*
|
|
* Skip tasks without mm because it might have passed its exit_mm and
|
|
* exit_oom_victim. oom_reaper could have rescued that but do not rely
|
|
* on that for now. We can consider find_lock_task_mm in future.
|
|
*/
|
|
if (!mm)
|
|
return false;
|
|
|
|
if (!__task_will_free_mem(task))
|
|
return false;
|
|
|
|
/*
|
|
* This task has already been drained by the oom reaper so there are
|
|
* only small chances it will free some more
|
|
*/
|
|
if (test_bit(MMF_OOM_SKIP, &mm->flags))
|
|
return false;
|
|
|
|
if (atomic_read(&mm->mm_users) <= 1)
|
|
return true;
|
|
|
|
/*
|
|
* Make sure that all tasks which share the mm with the given tasks
|
|
* are dying as well to make sure that a) nobody pins its mm and
|
|
* b) the task is also reapable by the oom reaper.
|
|
*/
|
|
rcu_read_lock();
|
|
for_each_process(p) {
|
|
if (!process_shares_mm(p, mm))
|
|
continue;
|
|
if (same_thread_group(task, p))
|
|
continue;
|
|
ret = __task_will_free_mem(p);
|
|
if (!ret)
|
|
break;
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
}
|
|
|
|
static void __oom_kill_process(struct task_struct *victim, const char *message)
|
|
{
|
|
struct task_struct *p;
|
|
struct mm_struct *mm;
|
|
bool can_oom_reap = true;
|
|
|
|
p = find_lock_task_mm(victim);
|
|
if (!p) {
|
|
pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
|
|
message, task_pid_nr(victim), victim->comm);
|
|
put_task_struct(victim);
|
|
return;
|
|
} else if (victim != p) {
|
|
get_task_struct(p);
|
|
put_task_struct(victim);
|
|
victim = p;
|
|
}
|
|
|
|
/* Get a reference to safely compare mm after task_unlock(victim) */
|
|
mm = victim->mm;
|
|
mmgrab(mm);
|
|
|
|
/* Raise event before sending signal: task reaper must see this */
|
|
count_vm_event(OOM_KILL);
|
|
memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
|
|
|
|
/*
|
|
* We should send SIGKILL before granting access to memory reserves
|
|
* in order to prevent the OOM victim from depleting the memory
|
|
* reserves from the user space under its control.
|
|
*/
|
|
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
|
|
mark_oom_victim(victim);
|
|
pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
|
|
message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
|
|
K(get_mm_counter(mm, MM_ANONPAGES)),
|
|
K(get_mm_counter(mm, MM_FILEPAGES)),
|
|
K(get_mm_counter(mm, MM_SHMEMPAGES)),
|
|
from_kuid(&init_user_ns, task_uid(victim)),
|
|
mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
|
|
task_unlock(victim);
|
|
|
|
/*
|
|
* Kill all user processes sharing victim->mm in other thread groups, if
|
|
* any. They don't get access to memory reserves, though, to avoid
|
|
* depletion of all memory. This prevents mm->mmap_lock livelock when an
|
|
* oom killed thread cannot exit because it requires the semaphore and
|
|
* its contended by another thread trying to allocate memory itself.
|
|
* That thread will now get access to memory reserves since it has a
|
|
* pending fatal signal.
|
|
*/
|
|
rcu_read_lock();
|
|
for_each_process(p) {
|
|
if (!process_shares_mm(p, mm))
|
|
continue;
|
|
if (same_thread_group(p, victim))
|
|
continue;
|
|
if (is_global_init(p)) {
|
|
can_oom_reap = false;
|
|
set_bit(MMF_OOM_SKIP, &mm->flags);
|
|
pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
|
|
task_pid_nr(victim), victim->comm,
|
|
task_pid_nr(p), p->comm);
|
|
continue;
|
|
}
|
|
/*
|
|
* No kthead_use_mm() user needs to read from the userspace so
|
|
* we are ok to reap it.
|
|
*/
|
|
if (unlikely(p->flags & PF_KTHREAD))
|
|
continue;
|
|
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
if (can_oom_reap)
|
|
wake_oom_reaper(victim);
|
|
|
|
mmdrop(mm);
|
|
put_task_struct(victim);
|
|
}
|
|
#undef K
|
|
|
|
/*
|
|
* Kill provided task unless it's secured by setting
|
|
* oom_score_adj to OOM_SCORE_ADJ_MIN.
|
|
*/
|
|
static int oom_kill_memcg_member(struct task_struct *task, void *message)
|
|
{
|
|
if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
|
|
!is_global_init(task)) {
|
|
get_task_struct(task);
|
|
__oom_kill_process(task, message);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static void oom_kill_process(struct oom_control *oc, const char *message)
|
|
{
|
|
struct task_struct *victim = oc->chosen;
|
|
struct mem_cgroup *oom_group;
|
|
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
|
|
DEFAULT_RATELIMIT_BURST);
|
|
|
|
/*
|
|
* If the task is already exiting, don't alarm the sysadmin or kill
|
|
* its children or threads, just give it access to memory reserves
|
|
* so it can die quickly
|
|
*/
|
|
task_lock(victim);
|
|
if (task_will_free_mem(victim)) {
|
|
mark_oom_victim(victim);
|
|
wake_oom_reaper(victim);
|
|
task_unlock(victim);
|
|
put_task_struct(victim);
|
|
return;
|
|
}
|
|
task_unlock(victim);
|
|
|
|
if (__ratelimit(&oom_rs))
|
|
dump_header(oc, victim);
|
|
|
|
/*
|
|
* Do we need to kill the entire memory cgroup?
|
|
* Or even one of the ancestor memory cgroups?
|
|
* Check this out before killing the victim task.
|
|
*/
|
|
oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
|
|
|
|
__oom_kill_process(victim, message);
|
|
|
|
/*
|
|
* If necessary, kill all tasks in the selected memory cgroup.
|
|
*/
|
|
if (oom_group) {
|
|
mem_cgroup_print_oom_group(oom_group);
|
|
mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
|
|
(void*)message);
|
|
mem_cgroup_put(oom_group);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
|
|
*/
|
|
static void check_panic_on_oom(struct oom_control *oc)
|
|
{
|
|
if (likely(!sysctl_panic_on_oom))
|
|
return;
|
|
if (sysctl_panic_on_oom != 2) {
|
|
/*
|
|
* panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
|
|
* does not panic for cpuset, mempolicy, or memcg allocation
|
|
* failures.
|
|
*/
|
|
if (oc->constraint != CONSTRAINT_NONE)
|
|
return;
|
|
}
|
|
/* Do not panic for oom kills triggered by sysrq */
|
|
if (is_sysrq_oom(oc))
|
|
return;
|
|
dump_header(oc, NULL);
|
|
panic("Out of memory: %s panic_on_oom is enabled\n",
|
|
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
|
|
}
|
|
|
|
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
|
|
|
|
int register_oom_notifier(struct notifier_block *nb)
|
|
{
|
|
return blocking_notifier_chain_register(&oom_notify_list, nb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(register_oom_notifier);
|
|
|
|
int unregister_oom_notifier(struct notifier_block *nb)
|
|
{
|
|
return blocking_notifier_chain_unregister(&oom_notify_list, nb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
|
|
|
|
/**
|
|
* out_of_memory - kill the "best" process when we run out of memory
|
|
* @oc: pointer to struct oom_control
|
|
*
|
|
* If we run out of memory, we have the choice between either
|
|
* killing a random task (bad), letting the system crash (worse)
|
|
* OR try to be smart about which process to kill. Note that we
|
|
* don't have to be perfect here, we just have to be good.
|
|
*/
|
|
bool out_of_memory(struct oom_control *oc)
|
|
{
|
|
unsigned long freed = 0;
|
|
|
|
if (oom_killer_disabled)
|
|
return false;
|
|
|
|
if (!is_memcg_oom(oc)) {
|
|
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
|
|
if (freed > 0)
|
|
/* Got some memory back in the last second. */
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* If current has a pending SIGKILL or is exiting, then automatically
|
|
* select it. The goal is to allow it to allocate so that it may
|
|
* quickly exit and free its memory.
|
|
*/
|
|
if (task_will_free_mem(current)) {
|
|
mark_oom_victim(current);
|
|
wake_oom_reaper(current);
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* The OOM killer does not compensate for IO-less reclaim.
|
|
* pagefault_out_of_memory lost its gfp context so we have to
|
|
* make sure exclude 0 mask - all other users should have at least
|
|
* ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
|
|
* invoke the OOM killer even if it is a GFP_NOFS allocation.
|
|
*/
|
|
if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
|
|
return true;
|
|
|
|
/*
|
|
* Check if there were limitations on the allocation (only relevant for
|
|
* NUMA and memcg) that may require different handling.
|
|
*/
|
|
oc->constraint = constrained_alloc(oc);
|
|
if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
|
|
oc->nodemask = NULL;
|
|
check_panic_on_oom(oc);
|
|
|
|
if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
|
|
current->mm && !oom_unkillable_task(current) &&
|
|
oom_cpuset_eligible(current, oc) &&
|
|
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
|
|
get_task_struct(current);
|
|
oc->chosen = current;
|
|
oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
|
|
return true;
|
|
}
|
|
|
|
select_bad_process(oc);
|
|
/* Found nothing?!?! */
|
|
if (!oc->chosen) {
|
|
int ret = false;
|
|
|
|
trace_android_vh_oom_check_panic(oc, &ret);
|
|
if (ret)
|
|
return true;
|
|
|
|
dump_header(oc, NULL);
|
|
pr_warn("Out of memory and no killable processes...\n");
|
|
/*
|
|
* If we got here due to an actual allocation at the
|
|
* system level, we cannot survive this and will enter
|
|
* an endless loop in the allocator. Bail out now.
|
|
*/
|
|
if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
|
|
panic("System is deadlocked on memory\n");
|
|
}
|
|
if (oc->chosen && oc->chosen != (void *)-1UL)
|
|
oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
|
|
"Memory cgroup out of memory");
|
|
return !!oc->chosen;
|
|
}
|
|
|
|
/*
|
|
* The pagefault handler calls here because some allocation has failed. We have
|
|
* to take care of the memcg OOM here because this is the only safe context without
|
|
* any locks held but let the oom killer triggered from the allocation context care
|
|
* about the global OOM.
|
|
*/
|
|
void pagefault_out_of_memory(void)
|
|
{
|
|
static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
|
|
DEFAULT_RATELIMIT_BURST);
|
|
|
|
if (mem_cgroup_oom_synchronize(true))
|
|
return;
|
|
|
|
if (fatal_signal_pending(current))
|
|
return;
|
|
|
|
if (__ratelimit(&pfoom_rs))
|
|
pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
|
|
}
|
|
|
|
/*
 * Hand a task over to the OOM reaper if it is about to free its memory.
 *
 * @p: task to look up via find_lock_task_mm().
 *
 * Does nothing when find_lock_task_mm() returns NULL. Otherwise the returned
 * task is pinned with a reference for the duration of the call, marked as an
 * OOM victim and passed to wake_oom_reaper() when task_will_free_mem() says
 * so, and then unlocked and released.
 */
void add_to_oom_reaper(struct task_struct *p)
{
	struct task_struct *locked;

	locked = find_lock_task_mm(p);
	if (!locked)
		return;

	get_task_struct(locked);

	if (task_will_free_mem(locked)) {
		__mark_oom_victim(locked);
		wake_oom_reaper(locked);
	}

	/* Pairs with the lock taken by find_lock_task_mm(). */
	task_unlock(locked);
	put_task_struct(locked);
}
|