linux/mm/oom_kill.c
Greg Kroah-Hartman c553d9a246 This is the 5.10.80 stable release
-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAmGWT+QACgkQONu9yGCS
 aT5mYw//ZXKzugaeJjuIaFqr7tcM7x8EefbKd2H4oMr8SW3IFElJIbNPJGMJAG/C
 tLZVWZvIum7QzZoxTL+JCCKpDzBERNTo4e5u7UwzAdVqiEX69YkNU0FBOzb4qXJ7
 gOZMBhy4UMIKdKD12CSXXf7ZspocsNXfzdmulRQ7CQcPoPrIMKpc4vuagN1Fy/Dz
 JgXYvRUAkLxtFHoQ/TeXvR4Gv9+w2ToMdb02mI48QBO+YYrFaGt+Rza2eHTv75H+
 Lydz37Nv1Pk32tA1q2jWxCzz16+Kzn+AviKiCfQK0Fb9IqnJksUIWLHSiODlVIcf
 kQHejanPn/p1BnBl8puPk1KFtDW45p2GwYhXG7hjGh08DGlR7QLHBS5Aa3xPYfdd
 uOy4ctygSVTx5nLjPH5vr3OE0wk/TuSSf/eyk2fmcUCspwAgBOnSYSmnJOem7LTK
 VqIgXFdCRplsqN415D35ddruP2BLCKqBu4KjwJ1LGIwgsx/Pmz4hlc5YcpLm8uRg
 XMqGTdcieQFOGmZJjJ2q3ecaCjfb0nmTrOylP5b55/74TFwFo042YR1ua0fEtpD4
 euoHLfYv3BY1dCp34TOUFGX0l+J1kAtf//vfD/JgJx/nX+ksdFBHhYwdbSi2oQG/
 9CceXYJ5duEnG+JmDOWJvcZ3T49K5XaIDNfY2zGpcSu1VZKubWg=
 =tQ0m
 -----END PGP SIGNATURE-----

Merge 5.10.80 into android12-5.10-lts

Changes in 5.10.80
	xhci: Fix USB 3.1 enumeration issues by increasing roothub power-on-good delay
	usb: xhci: Enable runtime-pm by default on AMD Yellow Carp platform
	binder: use euid from cred instead of using task
	binder: use cred instead of task for selinux checks
	binder: use cred instead of task for getsecid
	Input: iforce - fix control-message timeout
	Input: elantench - fix misreporting trackpoint coordinates
	Input: i8042 - Add quirk for Fujitsu Lifebook T725
	libata: fix read log timeout value
	ocfs2: fix data corruption on truncate
	scsi: core: Remove command size deduction from scsi_setup_scsi_cmnd()
	scsi: qla2xxx: Fix kernel crash when accessing port_speed sysfs file
	scsi: qla2xxx: Fix use after free in eh_abort path
	mmc: mtk-sd: Add wait dma stop done flow
	mmc: dw_mmc: Dont wait for DRTO on Write RSP error
	exfat: fix incorrect loading of i_blocks for large files
	parisc: Fix set_fixmap() on PA1.x CPUs
	parisc: Fix ptrace check on syscall return
	tpm: Check for integer overflow in tpm2_map_response_body()
	firmware/psci: fix application of sizeof to pointer
	crypto: s5p-sss - Add error handling in s5p_aes_probe()
	media: rkvdec: Do not override sizeimage for output format
	media: ite-cir: IR receiver stop working after receive overflow
	media: rkvdec: Support dynamic resolution changes
	media: ir-kbd-i2c: improve responsiveness of hauppauge zilog receivers
	media: v4l2-ioctl: Fix check_ext_ctrls
	ALSA: hda/realtek: Fix mic mute LED for the HP Spectre x360 14
	ALSA: hda/realtek: Add a quirk for HP OMEN 15 mute LED
	ALSA: hda/realtek: Add quirk for Clevo PC70HS
	ALSA: hda/realtek: Headset fixup for Clevo NH77HJQ
	ALSA: hda/realtek: Add a quirk for Acer Spin SP513-54N
	ALSA: hda/realtek: Add quirk for ASUS UX550VE
	ALSA: hda/realtek: Add quirk for HP EliteBook 840 G7 mute LED
	ALSA: ua101: fix division by zero at probe
	ALSA: 6fire: fix control and bulk message timeouts
	ALSA: line6: fix control and interrupt message timeouts
	ALSA: usb-audio: Line6 HX-Stomp XL USB_ID for 48k-fixed quirk
	ALSA: usb-audio: Add registration quirk for JBL Quantum 400
	ALSA: hda: Free card instance properly at probe errors
	ALSA: synth: missing check for possible NULL after the call to kstrdup
	ALSA: timer: Fix use-after-free problem
	ALSA: timer: Unconditionally unlink slave instances, too
	ext4: fix lazy initialization next schedule time computation in more granular unit
	ext4: ensure enough credits in ext4_ext_shift_path_extents
	ext4: refresh the ext4_ext_path struct after dropping i_data_sem.
	fuse: fix page stealing
	x86/sme: Use #define USE_EARLY_PGTABLE_L5 in mem_encrypt_identity.c
	x86/cpu: Fix migration safety with X86_BUG_NULL_SEL
	x86/irq: Ensure PI wakeup handler is unregistered before module unload
	ASoC: soc-core: fix null-ptr-deref in snd_soc_del_component_unlocked()
	ALSA: hda/realtek: Fixes HP Spectre x360 15-eb1xxx speakers
	cavium: Return negative value when pci_alloc_irq_vectors() fails
	scsi: qla2xxx: Return -ENOMEM if kzalloc() fails
	scsi: qla2xxx: Fix unmap of already freed sgl
	mISDN: Fix return values of the probe function
	cavium: Fix return values of the probe function
	sfc: Export fibre-specific supported link modes
	sfc: Don't use netif_info before net_device setup
	hyperv/vmbus: include linux/bitops.h
	ARM: dts: sun7i: A20-olinuxino-lime2: Fix ethernet phy-mode
	reset: socfpga: add empty driver allowing consumers to probe
	mmc: winbond: don't build on M68K
	drm: panel-orientation-quirks: Add quirk for Aya Neo 2021
	fcnal-test: kill hanging ping/nettest binaries on cleanup
	bpf: Define bpf_jit_alloc_exec_limit for arm64 JIT
	bpf: Prevent increasing bpf_jit_limit above max
	gpio: mlxbf2.c: Add check for bgpio_init failure
	xen/netfront: stop tx queues during live migration
	nvmet-tcp: fix a memory leak when releasing a queue
	spi: spl022: fix Microwire full duplex mode
	net: multicast: calculate csum of looped-back and forwarded packets
	watchdog: Fix OMAP watchdog early handling
	drm: panel-orientation-quirks: Add quirk for GPD Win3
	block: schedule queue restart after BLK_STS_ZONE_RESOURCE
	nvmet-tcp: fix header digest verification
	r8169: Add device 10ec:8162 to driver r8169
	vmxnet3: do not stop tx queues after netif_device_detach()
	nfp: bpf: relax prog rejection for mtu check through max_pkt_offset
	net/smc: Fix smc_link->llc_testlink_time overflow
	net/smc: Correct spelling mistake to TCPF_SYN_RECV
	rds: stop using dmapool
	btrfs: clear MISSING device status bit in btrfs_close_one_device
	btrfs: fix lost error handling when replaying directory deletes
	btrfs: call btrfs_check_rw_degradable only if there is a missing device
	KVM: VMX: Unregister posted interrupt wakeup handler on hardware unsetup
	ia64: kprobes: Fix to pass correct trampoline address to the handler
	selinux: fix race condition when computing ocontext SIDs
	hwmon: (pmbus/lm25066) Add offset coefficients
	regulator: s5m8767: do not use reset value as DVS voltage if GPIO DVS is disabled
	regulator: dt-bindings: samsung,s5m8767: correct s5m8767,pmic-buck-default-dvs-idx property
	EDAC/sb_edac: Fix top-of-high-memory value for Broadwell/Haswell
	mwifiex: fix division by zero in fw download path
	ath6kl: fix division by zero in send path
	ath6kl: fix control-message timeout
	ath10k: fix control-message timeout
	ath10k: fix division by zero in send path
	PCI: Mark Atheros QCA6174 to avoid bus reset
	rtl8187: fix control-message timeouts
	evm: mark evm_fixmode as __ro_after_init
	ifb: Depend on netfilter alternatively to tc
	wcn36xx: Fix HT40 capability for 2Ghz band
	wcn36xx: Fix tx_status mechanism
	wcn36xx: Fix (QoS) null data frame bitrate/modulation
	PM: sleep: Do not let "syscore" devices runtime-suspend during system transitions
	mwifiex: Read a PCI register after writing the TX ring write pointer
	mwifiex: Try waking the firmware until we get an interrupt
	libata: fix checking of DMA state
	wcn36xx: handle connection loss indication
	rsi: fix occasional initialisation failure with BT coex
	rsi: fix key enabled check causing unwanted encryption for vap_id > 0
	rsi: fix rate mask set leading to P2P failure
	rsi: Fix module dev_oper_mode parameter description
	perf/x86/intel/uncore: Support extra IMC channel on Ice Lake server
	perf/x86/intel/uncore: Fix Intel ICX IIO event constraints
	RDMA/qedr: Fix NULL deref for query_qp on the GSI QP
	signal: Remove the bogus sigkill_pending in ptrace_stop
	memory: renesas-rpc-if: Correct QSPI data transfer in Manual mode
	signal/mips: Update (_save|_restore)_fp_context to fail with -EFAULT
	soc: fsl: dpio: replace smp_processor_id with raw_smp_processor_id
	soc: fsl: dpio: use the combined functions to protect critical zone
	mtd: rawnand: socrates: Keep the driver compatible with on-die ECC engines
	power: supply: max17042_battery: Prevent int underflow in set_soc_threshold
	power: supply: max17042_battery: use VFSOC for capacity when no rsns
	KVM: arm64: Extract ESR_ELx.EC only
	KVM: nVMX: Query current VMCS when determining if MSR bitmaps are in use
	can: j1939: j1939_tp_cmd_recv(): ignore abort message in the BAM transport
	can: j1939: j1939_can_recv(): ignore messages with invalid source address
	powerpc/85xx: Fix oops when mpc85xx_smp_guts_ids node cannot be found
	ring-buffer: Protect ring_buffer_reset() from reentrancy
	serial: core: Fix initializing and restoring termios speed
	ifb: fix building without CONFIG_NET_CLS_ACT
	ALSA: mixer: oss: Fix racy access to slots
	ALSA: mixer: fix deadlock in snd_mixer_oss_set_volume
	xen/balloon: add late_initcall_sync() for initial ballooning done
	ovl: fix use after free in struct ovl_aio_req
	PCI: pci-bridge-emul: Fix emulation of W1C bits
	PCI: cadence: Add cdns_plat_pcie_probe() missing return
	PCI: aardvark: Do not clear status bits of masked interrupts
	PCI: aardvark: Fix checking for link up via LTSSM state
	PCI: aardvark: Do not unmask unused interrupts
	PCI: aardvark: Fix reporting Data Link Layer Link Active
	PCI: aardvark: Fix configuring Reference clock
	PCI: aardvark: Fix return value of MSI domain .alloc() method
	PCI: aardvark: Read all 16-bits from PCIE_MSI_PAYLOAD_REG
	PCI: aardvark: Fix support for bus mastering and PCI_COMMAND on emulated bridge
	PCI: aardvark: Fix support for PCI_BRIDGE_CTL_BUS_RESET on emulated bridge
	PCI: aardvark: Set PCI Bridge Class Code to PCI Bridge
	PCI: aardvark: Fix support for PCI_ROM_ADDRESS1 on emulated bridge
	quota: check block number when reading the block in quota file
	quota: correct error number in free_dqentry()
	pinctrl: core: fix possible memory leak in pinctrl_enable()
	coresight: cti: Correct the parameter for pm_runtime_put
	iio: dac: ad5446: Fix ad5622_write() return value
	iio: ad5770r: make devicetree property reading consistent
	USB: serial: keyspan: fix memleak on probe errors
	serial: 8250: fix racy uartclk update
	most: fix control-message timeouts
	USB: iowarrior: fix control-message timeouts
	USB: chipidea: fix interrupt deadlock
	power: supply: max17042_battery: Clear status bits in interrupt handler
	dma-buf: WARN on dmabuf release with pending attachments
	drm: panel-orientation-quirks: Update the Lenovo Ideapad D330 quirk (v2)
	drm: panel-orientation-quirks: Add quirk for KD Kurio Smart C15200 2-in-1
	drm: panel-orientation-quirks: Add quirk for the Samsung Galaxy Book 10.6
	Bluetooth: sco: Fix lock_sock() blockage by memcpy_from_msg()
	Bluetooth: fix use-after-free error in lock_sock_nested()
	drm/panel-orientation-quirks: add Valve Steam Deck
	rcutorture: Avoid problematic critical section nesting on PREEMPT_RT
	platform/x86: wmi: do not fail if disabling fails
	MIPS: lantiq: dma: add small delay after reset
	MIPS: lantiq: dma: reset correct number of channel
	locking/lockdep: Avoid RCU-induced noinstr fail
	net: sched: update default qdisc visibility after Tx queue cnt changes
	rcu-tasks: Move RTGS_WAIT_CBS to beginning of rcu_tasks_kthread() loop
	smackfs: Fix use-after-free in netlbl_catmap_walk()
	ath11k: Align bss_chan_info structure with firmware
	x86: Increase exception stack sizes
	mwifiex: Run SET_BSS_MODE when changing from P2P to STATION vif-type
	mwifiex: Properly initialize private structure on interface type changes
	fscrypt: allow 256-bit master keys with AES-256-XTS
	drm/amdgpu: Fix MMIO access page fault
	ath11k: Avoid reg rules update during firmware recovery
	ath11k: add handler for scan event WMI_SCAN_EVENT_DEQUEUED
	ath11k: Change DMA_FROM_DEVICE to DMA_TO_DEVICE when map reinjected packets
	ath10k: high latency fixes for beacon buffer
	media: mt9p031: Fix corrupted frame after restarting stream
	media: netup_unidvb: handle interrupt properly according to the firmware
	media: atomisp: Fix error handling in probe
	media: stm32: Potential NULL pointer dereference in dcmi_irq_thread()
	media: uvcvideo: Set capability in s_param
	media: uvcvideo: Return -EIO for control errors
	media: uvcvideo: Set unique vdev name based in type
	media: s5p-mfc: fix possible null-pointer dereference in s5p_mfc_probe()
	media: s5p-mfc: Add checking to s5p_mfc_probe().
	media: imx: set a media_device bus_info string
	media: mceusb: return without resubmitting URB in case of -EPROTO error.
	ia64: don't do IA64_CMPXCHG_DEBUG without CONFIG_PRINTK
	rtw88: fix RX clock gate setting while fifo dump
	brcmfmac: Add DMI nvram filename quirk for Cyberbook T116 tablet
	media: rcar-csi2: Add checking to rcsi2_start_receiver()
	ipmi: Disable some operations during a panic
	fs/proc/uptime.c: Fix idle time reporting in /proc/uptime
	ACPICA: Avoid evaluating methods too early during system resume
	media: ipu3-imgu: imgu_fmt: Handle properly try
	media: ipu3-imgu: VIDIOC_QUERYCAP: Fix bus_info
	media: usb: dvd-usb: fix uninit-value bug in dibusb_read_eeprom_byte()
	net-sysfs: try not to restart the syscall if it will fail eventually
	tracefs: Have tracefs directories not set OTH permission bits by default
	ath: dfs_pattern_detector: Fix possible null-pointer dereference in channel_detector_create()
	mmc: moxart: Fix reference count leaks in moxart_probe
	iov_iter: Fix iov_iter_get_pages{,_alloc} page fault return value
	ACPI: battery: Accept charges over the design capacity as full
	drm/amdkfd: fix resume error when iommu disabled in Picasso
	net: phy: micrel: make *-skew-ps check more lenient
	leaking_addresses: Always print a trailing newline
	drm/msm: prevent NULL dereference in msm_gpu_crashstate_capture()
	block: bump max plugged deferred size from 16 to 32
	md: update superblock after changing rdev flags in state_store
	memstick: r592: Fix a UAF bug when removing the driver
	lib/xz: Avoid overlapping memcpy() with invalid input with in-place decompression
	lib/xz: Validate the value before assigning it to an enum variable
	workqueue: make sysfs of unbound kworker cpumask more clever
	tracing/cfi: Fix cmp_entries_* functions signature mismatch
	mt76: mt7915: fix an off-by-one bound check
	mwl8k: Fix use-after-free in mwl8k_fw_state_machine()
	block: remove inaccurate requeue check
	media: allegro: ignore interrupt if mailbox is not initialized
	nvmet: fix use-after-free when a port is removed
	nvmet-rdma: fix use-after-free when a port is removed
	nvmet-tcp: fix use-after-free when a port is removed
	nvme: drop scan_lock and always kick requeue list when removing namespaces
	PM: hibernate: Get block device exclusively in swsusp_check()
	selftests: kvm: fix mismatched fclose() after popen()
	selftests/bpf: Fix perf_buffer test on system with offline cpus
	iwlwifi: mvm: disable RX-diversity in powersave
	smackfs: use __GFP_NOFAIL for smk_cipso_doi()
	ARM: clang: Do not rely on lr register for stacktrace
	gre/sit: Don't generate link-local addr if addr_gen_mode is IN6_ADDR_GEN_MODE_NONE
	gfs2: Cancel remote delete work asynchronously
	gfs2: Fix glock_hash_walk bugs
	ARM: 9136/1: ARMv7-M uses BE-8, not BE-32
	vrf: run conntrack only in context of lower/physdev for locally generated packets
	net: annotate data-race in neigh_output()
	ACPI: AC: Quirk GK45 to skip reading _PSR
	btrfs: reflink: initialize return value to 0 in btrfs_extent_same()
	btrfs: do not take the uuid_mutex in btrfs_rm_device
	spi: bcm-qspi: Fix missing clk_disable_unprepare() on error in bcm_qspi_probe()
	wcn36xx: Correct band/freq reporting on RX
	x86/hyperv: Protect set_hv_tscchange_cb() against getting preempted
	drm/amd/display: dcn20_resource_construct reduce scope of FPU enabled
	selftests/core: fix conflicting types compile error for close_range()
	parisc: fix warning in flush_tlb_all
	task_stack: Fix end_of_stack() for architectures with upwards-growing stack
	erofs: don't trigger WARN() when decompression fails
	parisc/unwind: fix unwinder when CONFIG_64BIT is enabled
	parisc/kgdb: add kgdb_roundup() to make kgdb work with idle polling
	netfilter: conntrack: set on IPS_ASSURED if flows enters internal stream state
	selftests/bpf: Fix strobemeta selftest regression
	Bluetooth: fix init and cleanup of sco_conn.timeout_work
	rcu: Fix existing exp request check in sync_sched_exp_online_cleanup()
	MIPS: lantiq: dma: fix burst length for DEU
	objtool: Add xen_start_kernel() to noreturn list
	x86/xen: Mark cpu_bringup_and_idle() as dead_end_function
	objtool: Fix static_call list generation
	drm/v3d: fix wait for TMU write combiner flush
	virtio-gpu: fix possible memory allocation failure
	lockdep: Let lock_is_held_type() detect recursive read as read
	net: net_namespace: Fix undefined member in key_remove_domain()
	cgroup: Make rebind_subsystems() disable v2 controllers all at once
	wcn36xx: Fix Antenna Diversity Switching
	wilc1000: fix possible memory leak in cfg_scan_result()
	Bluetooth: btmtkuart: fix a memleak in mtk_hci_wmt_sync
	crypto: caam - disable pkc for non-E SoCs
	rxrpc: Fix _usecs_to_jiffies() by using usecs_to_jiffies()
	net: dsa: rtl8366rb: Fix off-by-one bug
	ath11k: fix some sleeping in atomic bugs
	ath11k: Avoid race during regd updates
	ath11k: fix packet drops due to incorrect 6 GHz freq value in rx status
	ath11k: Fix memory leak in ath11k_qmi_driver_event_work
	ath10k: Fix missing frame timestamp for beacon/probe-resp
	ath10k: sdio: Add missing BH locking around napi_schdule()
	drm/ttm: stop calling tt_swapin in vm_access
	arm64: mm: update max_pfn after memory hotplug
	drm/amdgpu: fix warning for overflow check
	media: em28xx: add missing em28xx_close_extension
	media: cxd2880-spi: Fix a null pointer dereference on error handling path
	media: dvb-usb: fix ununit-value in az6027_rc_query
	media: v4l2-ioctl: S_CTRL output the right value
	media: TDA1997x: handle short reads of hdmi info frame.
	media: mtk-vpu: Fix a resource leak in the error handling path of 'mtk_vpu_probe()'
	media: radio-wl1273: Avoid card name truncation
	media: si470x: Avoid card name truncation
	media: tm6000: Avoid card name truncation
	media: cx23885: Fix snd_card_free call on null card pointer
	kprobes: Do not use local variable when creating debugfs file
	crypto: ecc - fix CRYPTO_DEFAULT_RNG dependency
	cpuidle: Fix kobject memory leaks in error paths
	media: em28xx: Don't use ops->suspend if it is NULL
	ath9k: Fix potential interrupt storm on queue reset
	PM: EM: Fix inefficient states detection
	EDAC/amd64: Handle three rank interleaving mode
	rcu: Always inline rcu_dynticks_task*_{enter,exit}()
	netfilter: nft_dynset: relax superfluous check on set updates
	media: dvb-frontends: mn88443x: Handle errors of clk_prepare_enable()
	crypto: qat - detect PFVF collision after ACK
	crypto: qat - disregard spurious PFVF interrupts
	hwrng: mtk - Force runtime pm ops for sleep ops
	b43legacy: fix a lower bounds test
	b43: fix a lower bounds test
	gve: Recover from queue stall due to missed IRQ
	mmc: sdhci-omap: Fix NULL pointer exception if regulator is not configured
	mmc: sdhci-omap: Fix context restore
	memstick: avoid out-of-range warning
	memstick: jmb38x_ms: use appropriate free function in jmb38x_ms_alloc_host()
	net, neigh: Fix NTF_EXT_LEARNED in combination with NTF_USE
	hwmon: Fix possible memleak in __hwmon_device_register()
	hwmon: (pmbus/lm25066) Let compiler determine outer dimension of lm25066_coeff
	ath10k: fix max antenna gain unit
	kernel/sched: Fix sched_fork() access an invalid sched_task_group
	tcp: switch orphan_count to bare per-cpu counters
	drm/msm: potential error pointer dereference in init()
	drm/msm: uninitialized variable in msm_gem_import()
	net: stream: don't purge sk_error_queue in sk_stream_kill_queues()
	media: ir_toy: assignment to be16 should be of correct type
	mmc: mxs-mmc: disable regulator on error and in the remove function
	platform/x86: thinkpad_acpi: Fix bitwise vs. logical warning
	mt76: mt7615: fix endianness warning in mt7615_mac_write_txwi
	mt76: mt76x02: fix endianness warnings in mt76x02_mac.c
	mt76: mt7915: fix possible infinite loop release semaphore
	mt76: mt7915: fix sta_rec_wtbl tag len
	mt76: mt7915: fix muar_idx in mt7915_mcu_alloc_sta_req()
	rsi: stop thread firstly in rsi_91x_init() error handling
	mwifiex: Send DELBA requests according to spec
	net: enetc: unmap DMA in enetc_send_cmd()
	phy: micrel: ksz8041nl: do not use power down mode
	nvme-rdma: fix error code in nvme_rdma_setup_ctrl
	PM: hibernate: fix sparse warnings
	clocksource/drivers/timer-ti-dm: Select TIMER_OF
	x86/sev: Fix stack type check in vc_switch_off_ist()
	drm/msm: Fix potential NULL dereference in DPU SSPP
	smackfs: use netlbl_cfg_cipsov4_del() for deleting cipso_v4_doi
	KVM: selftests: Add operand to vmsave/vmload/vmrun in svm.c
	KVM: selftests: Fix nested SVM tests when built with clang
	bpftool: Avoid leaking the JSON writer prepared for program metadata
	libbpf: Fix BTF data layout checks and allow empty BTF
	libbpf: Allow loading empty BTFs
	libbpf: Fix overflow in BTF sanity checks
	libbpf: Fix BTF header parsing checks
	s390/gmap: don't unconditionally call pte_unmap_unlock() in __gmap_zap()
	KVM: s390: pv: avoid double free of sida page
	KVM: s390: pv: avoid stalls for kvm_s390_pv_init_vm
	irq: mips: avoid nested irq_enter()
	tpm: fix Atmel TPM crash caused by too frequent queries
	tpm_tis_spi: Add missing SPI ID
	libbpf: Fix endianness detection in BPF_CORE_READ_BITFIELD_PROBED()
	tcp: don't free a FIN sk_buff in tcp_remove_empty_skb()
	spi: spi-rpc-if: Check return value of rpcif_sw_init()
	samples/kretprobes: Fix return value if register_kretprobe() failed
	KVM: s390: Fix handle_sske page fault handling
	libertas_tf: Fix possible memory leak in probe and disconnect
	libertas: Fix possible memory leak in probe and disconnect
	wcn36xx: add proper DMA memory barriers in rx path
	wcn36xx: Fix discarded frames due to wrong sequence number
	drm/amdgpu/gmc6: fix DMA mask from 44 to 40 bits
	selftests: bpf: Convert sk_lookup ctx access tests to PROG_TEST_RUN
	selftests/bpf: Fix fd cleanup in sk_lookup test
	net: amd-xgbe: Toggle PLL settings during rate change
	net: phylink: avoid mvneta warning when setting pause parameters
	crypto: pcrypt - Delay write to padata->info
	selftests/bpf: Fix fclose/pclose mismatch in test_progs
	udp6: allow SO_MARK ctrl msg to affect routing
	ibmvnic: don't stop queue in xmit
	ibmvnic: Process crqs after enabling interrupts
	cgroup: Fix rootcg cpu.stat guest double counting
	bpf: Fix propagation of bounds from 64-bit min/max into 32-bit and var_off.
	bpf: Fix propagation of signed bounds from 64-bit min/max into 32-bit.
	of: unittest: fix EXPECT text for gpio hog errors
	iio: st_sensors: Call st_sensors_power_enable() from bus drivers
	iio: st_sensors: disable regulators after device unregistration
	RDMA/rxe: Fix wrong port_cap_flags
	ARM: dts: BCM5301X: Fix memory nodes names
	clk: mvebu: ap-cpu-clk: Fix a memory leak in error handling paths
	ARM: s3c: irq-s3c24xx: Fix return value check for s3c24xx_init_intc()
	arm64: dts: rockchip: Fix GPU register width for RK3328
	ARM: dts: qcom: msm8974: Add xo_board reference clock to DSI0 PHY
	RDMA/bnxt_re: Fix query SRQ failure
	arm64: dts: ti: k3-j721e-main: Fix "max-virtual-functions" in PCIe EP nodes
	arm64: dts: ti: k3-j721e-main: Fix "bus-range" upto 256 bus number for PCIe
	arm64: dts: meson-g12a: Fix the pwm regulator supply properties
	arm64: dts: meson-g12b: Fix the pwm regulator supply properties
	bus: ti-sysc: Fix timekeeping_suspended warning on resume
	ARM: dts: at91: tse850: the emac<->phy interface is rmii
	scsi: dc395: Fix error case unwinding
	MIPS: loongson64: make CPU_LOONGSON64 depends on MIPS_FP_SUPPORT
	JFS: fix memleak in jfs_mount
	arm64: dts: qcom: msm8916: Fix Secondary MI2S bit clock
	arm64: dts: renesas: beacon: Fix Ethernet PHY mode
	arm64: dts: qcom: pm8916: Remove wrong reg-names for rtc@6000
	ALSA: hda: Reduce udelay() at SKL+ position reporting
	ALSA: hda: Release controller display power during shutdown/reboot
	ALSA: hda: Fix hang during shutdown due to link reset
	ALSA: hda: Use position buffer for SKL+ again
	soundwire: debugfs: use controller id and link_id for debugfs
	scsi: pm80xx: Fix misleading log statement in pm8001_mpi_get_nvmd_resp()
	driver core: Fix possible memory leak in device_link_add()
	arm: dts: omap3-gta04a4: accelerometer irq fix
	ASoC: SOF: topology: do not power down primary core during topology removal
	soc/tegra: Fix an error handling path in tegra_powergate_power_up()
	memory: fsl_ifc: fix leak of irq and nand_irq in fsl_ifc_ctrl_probe
	clk: at91: check pmc node status before registering syscore ops
	video: fbdev: chipsfb: use memset_io() instead of memset()
	powerpc: Refactor is_kvm_guest() declaration to new header
	powerpc: Rename is_kvm_guest() to check_kvm_guest()
	powerpc: Reintroduce is_kvm_guest() as a fast-path check
	powerpc: Fix is_kvm_guest() / kvm_para_available()
	powerpc: fix unbalanced node refcount in check_kvm_guest()
	serial: 8250_dw: Drop wrong use of ACPI_PTR()
	usb: gadget: hid: fix error code in do_config()
	power: supply: rt5033_battery: Change voltage values to µV
	power: supply: max17040: fix null-ptr-deref in max17040_probe()
	scsi: csiostor: Uninitialized data in csio_ln_vnp_read_cbfn()
	RDMA/mlx4: Return missed an error if device doesn't support steering
	usb: musb: select GENERIC_PHY instead of depending on it
	staging: most: dim2: do not double-register the same device
	staging: ks7010: select CRYPTO_HASH/CRYPTO_MICHAEL_MIC
	pinctrl: renesas: checker: Fix off-by-one bug in drive register check
	ARM: dts: stm32: Reduce DHCOR SPI NOR frequency to 50 MHz
	ARM: dts: stm32: fix SAI sub nodes register range
	ARM: dts: stm32: fix AV96 board SAI2 pin muxing on stm32mp15
	ASoC: cs42l42: Correct some register default values
	ASoC: cs42l42: Defer probe if request_threaded_irq() returns EPROBE_DEFER
	soc: qcom: rpmhpd: Provide some missing struct member descriptions
	soc: qcom: rpmhpd: Make power_on actually enable the domain
	usb: typec: STUSB160X should select REGMAP_I2C
	iio: adis: do not disabe IRQs in 'adis_init()'
	scsi: ufs: Refactor ufshcd_setup_clocks() to remove skip_ref_clk
	scsi: ufs: ufshcd-pltfrm: Fix memory leak due to probe defer
	serial: imx: fix detach/attach of serial console
	usb: dwc2: drd: fix dwc2_force_mode call in dwc2_ovr_init
	usb: dwc2: drd: fix dwc2_drd_role_sw_set when clock could be disabled
	usb: dwc2: drd: reset current session before setting the new one
	firmware: qcom_scm: Fix error retval in __qcom_scm_is_call_available()
	soc: qcom: apr: Add of_node_put() before return
	pinctrl: equilibrium: Fix function addition in multiple groups
	phy: qcom-qusb2: Fix a memory leak on probe
	phy: ti: gmii-sel: check of_get_address() for failure
	phy: qcom-snps: Correct the FSEL_MASK
	serial: xilinx_uartps: Fix race condition causing stuck TX
	clk: at91: sam9x60-pll: use DIV_ROUND_CLOSEST_ULL
	HID: u2fzero: clarify error check and length calculations
	HID: u2fzero: properly handle timeouts in usb_submit_urb
	powerpc/44x/fsp2: add missing of_node_put
	ASoC: cs42l42: Disable regulators if probe fails
	ASoC: cs42l42: Use device_property API instead of of_property
	ASoC: cs42l42: Correct configuring of switch inversion from ts-inv
	virtio_ring: check desc == NULL when using indirect with packed
	mips: cm: Convert to bitfield API to fix out-of-bounds access
	power: supply: bq27xxx: Fix kernel crash on IRQ handler register error
	apparmor: fix error check
	rpmsg: Fix rpmsg_create_ept return when RPMSG config is not defined
	nfsd: don't alloc under spinlock in rpc_parse_scope_id
	i2c: mediatek: fixing the incorrect register offset
	NFS: Fix dentry verifier races
	pnfs/flexfiles: Fix misplaced barrier in nfs4_ff_layout_prepare_ds
	drm/plane-helper: fix uninitialized variable reference
	PCI: aardvark: Don't spam about PIO Response Status
	PCI: aardvark: Fix preserving PCI_EXP_RTCTL_CRSSVE flag on emulated bridge
	opp: Fix return in _opp_add_static_v2()
	NFS: Fix deadlocks in nfs_scan_commit_list()
	fs: orangefs: fix error return code of orangefs_revalidate_lookup()
	mtd: spi-nor: hisi-sfc: Remove excessive clk_disable_unprepare()
	PCI: uniphier: Serialize INTx masking/unmasking and fix the bit operation
	mtd: core: don't remove debugfs directory if device is in use
	remoteproc: Fix a memory leak in an error handling path in 'rproc_handle_vdev()'
	rtc: rv3032: fix error handling in rv3032_clkout_set_rate()
	dmaengine: at_xdmac: fix AT_XDMAC_CC_PERID() macro
	NFS: Fix up commit deadlocks
	NFS: Fix an Oops in pnfs_mark_request_commit()
	Fix user namespace leak
	auxdisplay: img-ascii-lcd: Fix lock-up when displaying empty string
	auxdisplay: ht16k33: Connect backlight to fbdev
	auxdisplay: ht16k33: Fix frame buffer device blanking
	soc: fsl: dpaa2-console: free buffer before returning from dpaa2_console_read
	netfilter: nfnetlink_queue: fix OOB when mac header was cleared
	dmaengine: dmaengine_desc_callback_valid(): Check for `callback_result`
	signal/sh: Use force_sig(SIGKILL) instead of do_group_exit(SIGKILL)
	m68k: set a default value for MEMORY_RESERVE
	watchdog: f71808e_wdt: fix inaccurate report in WDIOC_GETTIMEOUT
	ar7: fix kernel builds for compiler test
	scsi: qla2xxx: Changes to support FCP2 Target
	scsi: qla2xxx: Relogin during fabric disturbance
	scsi: qla2xxx: Fix gnl list corruption
	scsi: qla2xxx: Turn off target reset during issue_lip
	NFSv4: Fix a regression in nfs_set_open_stateid_locked()
	i2c: xlr: Fix a resource leak in the error handling path of 'xlr_i2c_probe()'
	xen-pciback: Fix return in pm_ctrl_init()
	net: davinci_emac: Fix interrupt pacing disable
	ethtool: fix ethtool msg len calculation for pause stats
	openrisc: fix SMP tlb flush NULL pointer dereference
	net: vlan: fix a UAF in vlan_dev_real_dev()
	ice: Fix replacing VF hardware MAC to existing MAC filter
	ice: Fix not stopping Tx queues for VFs
	ACPI: PMIC: Fix intel_pmic_regs_handler() read accesses
	drm/nouveau/svm: Fix refcount leak bug and missing check against null bug
	net: phy: fix duplex out of sync problem while changing settings
	bonding: Fix a use-after-free problem when bond_sysfs_slave_add() failed
	mfd: core: Add missing of_node_put for loop iteration
	can: mcp251xfd: mcp251xfd_chip_start(): fix error handling for mcp251xfd_chip_rx_int_enable()
	mm/zsmalloc.c: close race window between zs_pool_dec_isolated() and zs_unregister_migration()
	zram: off by one in read_block_state()
	perf bpf: Add missing free to bpf_event__print_bpf_prog_info()
	llc: fix out-of-bound array index in llc_sk_dev_hash()
	nfc: pn533: Fix double free when pn533_fill_fragment_skbs() fails
	arm64: pgtable: make __pte_to_phys/__phys_to_pte_val inline functions
	bpf, sockmap: Remove unhash handler for BPF sockmap usage
	bpf: sockmap, strparser, and tls are reusing qdisc_skb_cb and colliding
	gve: Fix off by one in gve_tx_timeout()
	seq_file: fix passing wrong private data
	net/sched: sch_taprio: fix undefined behavior in ktime_mono_to_any
	net: hns3: fix kernel crash when unload VF while it is being reset
	net: hns3: allow configure ETS bandwidth of all TCs
	net: stmmac: allow a tc-taprio base-time of zero
	vsock: prevent unnecessary refcnt inc for nonblocking connect
	net/smc: fix sk_refcnt underflow on linkdown and fallback
	cxgb4: fix eeprom len when diagnostics not implemented
	selftests/net: udpgso_bench_rx: fix port argument
	ARM: 9155/1: fix early early_iounmap()
	ARM: 9156/1: drop cc-option fallbacks for architecture selection
	parisc: Fix backtrace to always include init funtion names
	MIPS: Fix assembly error from MIPSr2 code used within MIPS_ISA_ARCH_LEVEL
	x86/mce: Add errata workaround for Skylake SKX37
	posix-cpu-timers: Clear task::posix_cputimers_work in copy_process()
	irqchip/sifive-plic: Fixup EOI failed when masked
	f2fs: should use GFP_NOFS for directory inodes
	net, neigh: Enable state migration between NUD_PERMANENT and NTF_USE
	9p/net: fix missing error check in p9_check_errors
	memcg: prohibit unconditional exceeding the limit of dying tasks
	powerpc/lib: Add helper to check if offset is within conditional branch range
	powerpc/bpf: Validate branch ranges
	powerpc/security: Add a helper to query stf_barrier type
	powerpc/bpf: Emit stf barrier instruction sequences for BPF_NOSPEC
	mm, oom: pagefault_out_of_memory: don't force global OOM for dying tasks
	mm, oom: do not trigger out_of_memory from the #PF
	mfd: dln2: Add cell for initializing DLN2 ADC
	video: backlight: Drop maximum brightness override for brightness zero
	s390/cio: check the subchannel validity for dev_busid
	s390/tape: fix timer initialization in tape_std_assign()
	s390/ap: Fix hanging ioctl caused by orphaned replies
	s390/cio: make ccw_device_dma_* more robust
	mtd: rawnand: ams-delta: Keep the driver compatible with on-die ECC engines
	mtd: rawnand: xway: Keep the driver compatible with on-die ECC engines
	mtd: rawnand: mpc5121: Keep the driver compatible with on-die ECC engines
	mtd: rawnand: gpio: Keep the driver compatible with on-die ECC engines
	mtd: rawnand: pasemi: Keep the driver compatible with on-die ECC engines
	mtd: rawnand: orion: Keep the driver compatible with on-die ECC engines
	mtd: rawnand: plat_nand: Keep the driver compatible with on-die ECC engines
	mtd: rawnand: au1550nd: Keep the driver compatible with on-die ECC engines
	powerpc/powernv/prd: Unregister OPAL_MSG_PRD2 notifier during module unload
	powerpc/85xx: fix timebase sync issue when CONFIG_HOTPLUG_CPU=n
	drm/sun4i: Fix macros in sun8i_csc.h
	PCI: Add PCI_EXP_DEVCTL_PAYLOAD_* macros
	PCI: aardvark: Fix PCIe Max Payload Size setting
	SUNRPC: Partial revert of commit 6f9f17287e
	ath10k: fix invalid dma_addr_t token assignment
	mmc: moxart: Fix null pointer dereference on pointer host
	selftests/bpf: Fix also no-alu32 strobemeta selftest
	arch/cc: Introduce a function to check for confidential computing features
	x86/sev: Add an x86 version of cc_platform_has()
	x86/sev: Make the #VC exception stacks part of the default stacks storage
	soc/tegra: pmc: Fix imbalanced clock disabling in error code path
	Linux 5.10.80

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I21c750863965fbf584251fa2de3c941ae5922d3f
2021-11-19 11:50:41 +01:00

1209 lines
32 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/oom_kill.c
*
* Copyright (C) 1998,2000 Rik van Riel
* Thanks go out to Claus Fischer for some serious inspiration and
* for goading me into coding this file...
* Copyright (C) 2010 Google, Inc.
* Rewritten by David Rientjes
*
* The routines in this file are used to kill a process when
* we're seriously out of memory. This gets called from __alloc_pages()
* in mm/page_alloc.c when we really run out of memory.
*
* Since we won't call these routines often (on a well-configured
* machine) this file will double as a 'coding guide' and a signpost
* for newbie kernel hackers. It features several pointers to major
* kernel subsystems and hints as to where to find out what things do.
*/
#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/init.h>
#include <linux/mmu_notifier.h>
#include <asm/tlb.h>
#include "internal.h"
#include "slab.h"
#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>
#undef CREATE_TRACE_POINTS
#include <trace/hooks/mm.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
/*
* Serializes oom killer invocations (out_of_memory()) from all contexts to
* prevent from over eager oom killing (e.g. when the oom killer is invoked
* from different domains).
*
* oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
* and mark_oom_victim
*/
DEFINE_MUTEX(oom_lock);
/* Serializes oom_score_adj and oom_score_adj_min updates */
DEFINE_MUTEX(oom_adj_mutex);
static inline bool is_memcg_oom(struct oom_control *oc)
{
return oc->memcg != NULL;
}
#ifdef CONFIG_NUMA
/**
* oom_cpuset_eligible() - check task eligiblity for kill
* @start: task struct of which task to consider
* @oc: pointer to struct oom_control
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
* and whether or not it has the same set of allowed cpuset nodes.
*
* This function is assuming oom-killer context and 'current' has triggered
* the oom-killer.
*/
static bool oom_cpuset_eligible(struct task_struct *start,
struct oom_control *oc)
{
struct task_struct *tsk;
bool ret = false;
const nodemask_t *mask = oc->nodemask;
if (is_memcg_oom(oc))
return true;
rcu_read_lock();
for_each_thread(start, tsk) {
if (mask) {
/*
* If this is a mempolicy constrained oom, tsk's
* cpuset is irrelevant. Only return true if its
* mempolicy intersects current, otherwise it may be
* needlessly killed.
*/
ret = mempolicy_nodemask_intersects(tsk, mask);
} else {
/*
* This is not a mempolicy constrained oom, so only
* check the mems of tsk's cpuset.
*/
ret = cpuset_mems_allowed_intersects(current, tsk);
}
if (ret)
break;
}
rcu_read_unlock();
return ret;
}
#else
static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
{
return true;
}
#endif /* CONFIG_NUMA */
/*
* The process p may have detached its own ->mm while exiting or through
* kthread_use_mm(), but one or more of its subthreads may still have a valid
* pointer. Return p, or any of its subthreads with a valid ->mm, with
* task_lock() held.
*/
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
struct task_struct *t;
rcu_read_lock();
for_each_thread(p, t) {
task_lock(t);
if (likely(t->mm))
goto found;
task_unlock(t);
}
t = NULL;
found:
rcu_read_unlock();
return t;
}
/*
* order == -1 means the oom kill is required by sysrq, otherwise only
* for display purposes.
*/
static inline bool is_sysrq_oom(struct oom_control *oc)
{
return oc->order == -1;
}
/* return true if the task is not adequate as candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p)
{
if (is_global_init(p))
return true;
if (p->flags & PF_KTHREAD)
return true;
return false;
}
/*
* Print out unreclaimble slabs info when unreclaimable slabs amount is greater
* than all user memory (LRU pages)
*/
static bool is_dump_unreclaim_slabs(void)
{
unsigned long nr_lru;
nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
global_node_page_state(NR_INACTIVE_ANON) +
global_node_page_state(NR_ACTIVE_FILE) +
global_node_page_state(NR_INACTIVE_FILE) +
global_node_page_state(NR_ISOLATED_ANON) +
global_node_page_state(NR_ISOLATED_FILE) +
global_node_page_state(NR_UNEVICTABLE);
return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
}
/**
* oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
* @totalpages: total present RAM allowed for page allocation
*
* The heuristic for determining which task to kill is made to be as simple and
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
long oom_badness(struct task_struct *p, unsigned long totalpages)
{
long points;
long adj;
if (oom_unkillable_task(p))
return LONG_MIN;
p = find_lock_task_mm(p);
if (!p)
return LONG_MIN;
/*
* Do not even consider tasks which are explicitly marked oom
* unkillable or have been already oom reaped or the are in
* the middle of vfork
*/
adj = (long)p->signal->oom_score_adj;
if (adj == OOM_SCORE_ADJ_MIN ||
test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
in_vfork(p)) {
task_unlock(p);
return LONG_MIN;
}
/*
* The baseline for the badness score is the proportion of RAM that each
* task's rss, pagetable and swap space use.
*/
points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
mm_pgtables_bytes(p->mm) / PAGE_SIZE;
task_unlock(p);
/* Normalize to oom_score_adj units */
adj *= totalpages / 1000;
points += adj;
return points;
}
static const char * const oom_constraint_text[] = {
[CONSTRAINT_NONE] = "CONSTRAINT_NONE",
[CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
[CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
};
/*
* Determine the type of allocation constraint.
*/
static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
struct zone *zone;
struct zoneref *z;
enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
bool cpuset_limited = false;
int nid;
if (is_memcg_oom(oc)) {
oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
return CONSTRAINT_MEMCG;
}
/* Default to all available memory */
oc->totalpages = totalram_pages() + total_swap_pages;
if (!IS_ENABLED(CONFIG_NUMA))
return CONSTRAINT_NONE;
if (!oc->zonelist)
return CONSTRAINT_NONE;
/*
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
* to kill current.We have to random task kill in this case.
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
*/
if (oc->gfp_mask & __GFP_THISNODE)
return CONSTRAINT_NONE;
/*
* This is not a __GFP_THISNODE allocation, so a truncated nodemask in
* the page allocator means a mempolicy is in effect. Cpuset policy
* is enforced in get_page_from_freelist().
*/
if (oc->nodemask &&
!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
oc->totalpages = total_swap_pages;
for_each_node_mask(nid, *oc->nodemask)
oc->totalpages += node_present_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
}
/* Check this allocation failure is caused by cpuset's wall function */
for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
highest_zoneidx, oc->nodemask)
if (!cpuset_zone_allowed(zone, oc->gfp_mask))
cpuset_limited = true;
if (cpuset_limited) {
oc->totalpages = total_swap_pages;
for_each_node_mask(nid, cpuset_current_mems_allowed)
oc->totalpages += node_present_pages(nid);
return CONSTRAINT_CPUSET;
}
return CONSTRAINT_NONE;
}
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
struct oom_control *oc = arg;
long points;
if (oom_unkillable_task(task))
goto next;
/* p may not have freeable memory in nodemask */
if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
goto next;
/*
* This task already has access to memory reserves and is being killed.
* Don't allow any other task to have access to the reserves unless
* the task has MMF_OOM_SKIP because chances that it would release
* any memory is quite low.
*/
if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
goto next;
goto abort;
}
/*
* If task is allocating a lot of memory and has been marked to be
* killed first if it triggers an oom, then select it.
*/
if (oom_task_origin(task)) {
points = LONG_MAX;
goto select;
}
points = oom_badness(task, oc->totalpages);
if (points == LONG_MIN)
goto next;
/*
* Check to see if this is the worst task with a non-negative
* ADJ score seen so far
*/
if (task->signal->oom_score_adj >= 0 &&
points > oc->chosen_non_negative_adj_points) {
if (oc->chosen_non_negative_adj)
put_task_struct(oc->chosen_non_negative_adj);
get_task_struct(task);
oc->chosen_non_negative_adj = task;
oc->chosen_non_negative_adj_points = points;
}
if (points < oc->chosen_points)
goto next;
select:
if (oc->chosen)
put_task_struct(oc->chosen);
get_task_struct(task);
oc->chosen = task;
oc->chosen_points = points;
next:
return 0;
abort:
if (oc->chosen_non_negative_adj)
put_task_struct(oc->chosen_non_negative_adj);
if (oc->chosen)
put_task_struct(oc->chosen);
oc->chosen_non_negative_adj = NULL;
oc->chosen = (void *)-1UL;
return 1;
}
/*
* Simple selection loop. We choose the process with the highest number of
* 'points'. In case scan was aborted, oc->chosen is set to -1.
*/
static void select_bad_process(struct oom_control *oc)
{
oc->chosen_points = LONG_MIN;
oc->chosen_non_negative_adj_points = LONG_MIN;
oc->chosen_non_negative_adj = NULL;
if (is_memcg_oom(oc))
mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
else {
struct task_struct *p;
rcu_read_lock();
for_each_process(p)
if (oom_evaluate_task(p, oc))
break;
rcu_read_unlock();
}
if (oc->chosen_non_negative_adj) {
/*
* If oc->chosen has a negative ADJ, and we found a task with
* a postive ADJ to kill, kill the task with the positive ADJ
* instead.
*/
if (oc->chosen && oc->chosen->signal->oom_score_adj < 0) {
put_task_struct(oc->chosen);
oc->chosen = oc->chosen_non_negative_adj;
oc->chosen_points = oc->chosen_non_negative_adj_points;
} else
put_task_struct(oc->chosen_non_negative_adj);
}
}
static int dump_task(struct task_struct *p, void *arg)
{
struct oom_control *oc = arg;
struct task_struct *task;
if (oom_unkillable_task(p))
return 0;
/* p may not have freeable memory in nodemask */
if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
return 0;
task = find_lock_task_mm(p);
if (!task) {
/*
* This is a kthread or all of p's threads have already
* detached their mm's. There's no need to report
* them; they can't be oom killed anyway.
*/
return 0;
}
pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
mm_pgtables_bytes(task->mm),
get_mm_counter(task->mm, MM_SWAPENTS),
task->signal->oom_score_adj, task->comm);
task_unlock(task);
return 0;
}
/**
* dump_tasks - dump current memory state of all system tasks
* @oc: pointer to struct oom_control
*
* Dumps the current memory state of all eligible tasks. Tasks not in the same
* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
* are not shown.
* State information includes task's pid, uid, tgid, vm size, rss,
* pgtables_bytes, swapents, oom_score_adj value, and name.
*/
static void dump_tasks(struct oom_control *oc)
{
pr_info("Tasks state (memory values in pages):\n");
pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
if (is_memcg_oom(oc))
mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
else {
struct task_struct *p;
rcu_read_lock();
for_each_process(p)
dump_task(p, oc);
rcu_read_unlock();
}
}
static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
{
/* one line summary of the oom killer context. */
pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
oom_constraint_text[oc->constraint],
nodemask_pr_args(oc->nodemask));
cpuset_print_current_mems_allowed();
mem_cgroup_print_oom_context(oc->memcg, victim);
pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
from_kuid(&init_user_ns, task_uid(victim)));
}
static void dump_header(struct oom_control *oc, struct task_struct *p)
{
pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
pr_warn("COMPACTION is disabled!!!\n");
dump_stack();
if (is_memcg_oom(oc))
mem_cgroup_print_oom_meminfo(oc->memcg);
else {
show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
if (is_dump_unreclaim_slabs())
dump_unreclaimable_slab();
}
if (sysctl_oom_dump_tasks)
dump_tasks(oc);
if (p)
dump_oom_summary(oc, p);
}
/*
* Number of OOM victims in flight
*/
static atomic_t oom_victims = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
static bool oom_killer_disabled __read_mostly;
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* task->mm can be NULL if the task is the exited group leader. So to
* determine whether the task is using a particular mm, we examine all the
* task's threads: if one of those is using this mm then this task was also
* using it.
*/
bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
{
struct task_struct *t;
for_each_thread(p, t) {
struct mm_struct *t_mm = READ_ONCE(t->mm);
if (t_mm)
return t_mm == mm;
}
return false;
}
#ifdef CONFIG_MMU
/*
* OOM Reaper kernel thread which tries to reap the memory used by the OOM
* victim (if that is possible) to help the OOM killer to move on.
*/
static struct task_struct *oom_reaper_th;
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
bool __oom_reap_task_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
bool ret = true;
/*
* Tell all users of get_user/copy_from_user etc... that the content
* is no longer stable. No barriers really needed because unmapping
* should imply barriers already and the reader would hit a page fault
* if it stumbled over a reaped memory.
*/
set_bit(MMF_UNSTABLE, &mm->flags);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
if (!can_madv_lru_vma(vma))
continue;
/*
* Only anonymous pages have a good chance to be dropped
* without additional steps which we cannot afford as we
* are OOM already.
*
* We do not even care about fs backed pages because all
* which are reclaimable have already been reclaimed and
* we do not want to block exit_mmap by keeping mm ref
* count elevated without a good reason.
*/
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
struct mmu_notifier_range range;
struct mmu_gather tlb;
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
vma, mm, vma->vm_start,
vma->vm_end);
tlb_gather_mmu(&tlb, mm, range.start, range.end);
if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
tlb_finish_mmu(&tlb, range.start, range.end);
ret = false;
continue;
}
unmap_page_range(&tlb, vma, range.start, range.end, NULL);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb, range.start, range.end);
}
}
return ret;
}
/*
* Reaps the address space of the give task.
*
* Returns true on success and false if none or part of the address space
* has been reclaimed and the caller should retry later.
*/
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
bool ret = true;
if (!mmap_read_trylock(mm)) {
trace_skip_task_reaping(tsk->pid);
return false;
}
/*
* MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
* work on the mm anymore. The check for MMF_OOM_SKIP must run
* under mmap_lock for reading because it serializes against the
* mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
*/
if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
trace_skip_task_reaping(tsk->pid);
goto out_unlock;
}
trace_start_task_reaping(tsk->pid);
/* failed to reap part of the address space. Try again later */
ret = __oom_reap_task_mm(mm);
if (!ret)
goto out_finish;
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
K(get_mm_counter(mm, MM_ANONPAGES)),
K(get_mm_counter(mm, MM_FILEPAGES)),
K(get_mm_counter(mm, MM_SHMEMPAGES)));
out_finish:
trace_finish_task_reaping(tsk->pid);
out_unlock:
mmap_read_unlock(mm);
return ret;
}
#define MAX_OOM_REAP_RETRIES 10
static void oom_reap_task(struct task_struct *tsk)
{
int attempts = 0;
struct mm_struct *mm = tsk->signal->oom_mm;
/* Retry the mmap_read_trylock(mm) a few times */
while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
schedule_timeout_idle(HZ/10);
if (attempts <= MAX_OOM_REAP_RETRIES ||
test_bit(MMF_OOM_SKIP, &mm->flags))
goto done;
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
sched_show_task(tsk);
debug_show_all_locks();
done:
tsk->oom_reaper_list = NULL;
/*
* Hide this mm from OOM killer because it has been either reaped or
* somebody can't call mmap_write_unlock(mm).
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
/* Drop a reference taken by wake_oom_reaper */
put_task_struct(tsk);
}
static int oom_reaper(void *unused)
{
while (true) {
struct task_struct *tsk = NULL;
wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
spin_lock(&oom_reaper_lock);
if (oom_reaper_list != NULL) {
tsk = oom_reaper_list;
oom_reaper_list = tsk->oom_reaper_list;
}
spin_unlock(&oom_reaper_lock);
if (tsk)
oom_reap_task(tsk);
}
return 0;
}
static void wake_oom_reaper(struct task_struct *tsk)
{
/* mm is already queued? */
if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
return;
get_task_struct(tsk);
spin_lock(&oom_reaper_lock);
tsk->oom_reaper_list = oom_reaper_list;
oom_reaper_list = tsk;
spin_unlock(&oom_reaper_lock);
trace_wake_reaper(tsk->pid);
wake_up(&oom_reaper_wait);
}
static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
return 0;
}
subsys_initcall(oom_init)
#else
static inline void wake_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */
/**
* tsk->mm has to be non NULL and caller has to guarantee it is stable (either
* under task_lock or operate on the current).
*/
static void __mark_oom_victim(struct task_struct *tsk)
{
struct mm_struct *mm = tsk->mm;
if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
mmgrab(tsk->signal->oom_mm);
set_bit(MMF_OOM_VICTIM, &mm->flags);
}
}
/**
* mark_oom_victim - mark the given task as OOM victim
* @tsk: task to mark
*
* Has to be called with oom_lock held and never after
* oom has been disabled already.
*
* tsk->mm has to be non NULL and caller has to guarantee it is stable (either
* under task_lock or operate on the current).
*/
static void mark_oom_victim(struct task_struct *tsk)
{
WARN_ON(oom_killer_disabled);
/* OOM killer might race with memcg OOM */
if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
return;
/* oom_mm is bound to the signal struct life time. */
__mark_oom_victim(tsk);
/*
* Make sure that the task is woken up from uninterruptible sleep
* if it is frozen because OOM killer wouldn't be able to free
* any memory and livelock. freezing_slow_path will tell the freezer
* that TIF_MEMDIE tasks should be ignored.
*/
__thaw_task(tsk);
atomic_inc(&oom_victims);
trace_mark_victim(tsk->pid);
}
/**
* exit_oom_victim - note the exit of an OOM victim
*/
void exit_oom_victim(void)
{
clear_thread_flag(TIF_MEMDIE);
if (!atomic_dec_return(&oom_victims))
wake_up_all(&oom_victims_wait);
}
/**
* oom_killer_enable - enable OOM killer
*/
void oom_killer_enable(void)
{
oom_killer_disabled = false;
pr_info("OOM killer enabled.\n");
}
/**
* oom_killer_disable - disable OOM killer
* @timeout: maximum timeout to wait for oom victims in jiffies
*
* Forces all page allocations to fail rather than trigger OOM killer.
* Will block and wait until all OOM victims are killed or the given
* timeout expires.
*
* The function cannot be called when there are runnable user tasks because
* the userspace would see unexpected allocation failures as a result. Any
* new usage of this function should be consulted with MM people.
*
* Returns true if successful and false if the OOM killer cannot be
* disabled.
*/
bool oom_killer_disable(signed long timeout)
{
signed long ret;
/*
* Make sure to not race with an ongoing OOM killer. Check that the
* current is not killed (possibly due to sharing the victim's memory).
*/
if (mutex_lock_killable(&oom_lock))
return false;
oom_killer_disabled = true;
mutex_unlock(&oom_lock);
ret = wait_event_interruptible_timeout(oom_victims_wait,
!atomic_read(&oom_victims), timeout);
if (ret <= 0) {
oom_killer_enable();
return false;
}
pr_info("OOM killer disabled.\n");
return true;
}
static inline bool __task_will_free_mem(struct task_struct *task)
{
struct signal_struct *sig = task->signal;
/*
* A coredumping process may sleep for an extended period in exit_mm(),
* so the oom killer cannot assume that the process will promptly exit
* and release memory.
*/
if (sig->flags & SIGNAL_GROUP_COREDUMP)
return false;
if (sig->flags & SIGNAL_GROUP_EXIT)
return true;
if (thread_group_empty(task) && (task->flags & PF_EXITING))
return true;
return false;
}
/*
* Checks whether the given task is dying or exiting and likely to
* release its address space. This means that all threads and processes
* sharing the same mm have to be killed or exiting.
* Caller has to make sure that task->mm is stable (hold task_lock or
* it operates on the current).
*/
static bool task_will_free_mem(struct task_struct *task)
{
struct mm_struct *mm = task->mm;
struct task_struct *p;
bool ret = true;
/*
* Skip tasks without mm because it might have passed its exit_mm and
* exit_oom_victim. oom_reaper could have rescued that but do not rely
* on that for now. We can consider find_lock_task_mm in future.
*/
if (!mm)
return false;
if (!__task_will_free_mem(task))
return false;
/*
* This task has already been drained by the oom reaper so there are
* only small chances it will free some more
*/
if (test_bit(MMF_OOM_SKIP, &mm->flags))
return false;
if (atomic_read(&mm->mm_users) <= 1)
return true;
/*
* Make sure that all tasks which share the mm with the given tasks
* are dying as well to make sure that a) nobody pins its mm and
* b) the task is also reapable by the oom reaper.
*/
rcu_read_lock();
for_each_process(p) {
if (!process_shares_mm(p, mm))
continue;
if (same_thread_group(task, p))
continue;
ret = __task_will_free_mem(p);
if (!ret)
break;
}
rcu_read_unlock();
return ret;
}
static void __oom_kill_process(struct task_struct *victim, const char *message)
{
struct task_struct *p;
struct mm_struct *mm;
bool can_oom_reap = true;
p = find_lock_task_mm(victim);
if (!p) {
pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
message, task_pid_nr(victim), victim->comm);
put_task_struct(victim);
return;
} else if (victim != p) {
get_task_struct(p);
put_task_struct(victim);
victim = p;
}
/* Get a reference to safely compare mm after task_unlock(victim) */
mm = victim->mm;
mmgrab(mm);
/* Raise event before sending signal: task reaper must see this */
count_vm_event(OOM_KILL);
memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
/*
* We should send SIGKILL before granting access to memory reserves
* in order to prevent the OOM victim from depleting the memory
* reserves from the user space under its control.
*/
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
mark_oom_victim(victim);
pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
K(get_mm_counter(mm, MM_ANONPAGES)),
K(get_mm_counter(mm, MM_FILEPAGES)),
K(get_mm_counter(mm, MM_SHMEMPAGES)),
from_kuid(&init_user_ns, task_uid(victim)),
mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
task_unlock(victim);
/*
* Kill all user processes sharing victim->mm in other thread groups, if
* any. They don't get access to memory reserves, though, to avoid
* depletion of all memory. This prevents mm->mmap_lock livelock when an
* oom killed thread cannot exit because it requires the semaphore and
* its contended by another thread trying to allocate memory itself.
* That thread will now get access to memory reserves since it has a
* pending fatal signal.
*/
rcu_read_lock();
for_each_process(p) {
if (!process_shares_mm(p, mm))
continue;
if (same_thread_group(p, victim))
continue;
if (is_global_init(p)) {
can_oom_reap = false;
set_bit(MMF_OOM_SKIP, &mm->flags);
pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
task_pid_nr(victim), victim->comm,
task_pid_nr(p), p->comm);
continue;
}
/*
* No kthead_use_mm() user needs to read from the userspace so
* we are ok to reap it.
*/
if (unlikely(p->flags & PF_KTHREAD))
continue;
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
}
rcu_read_unlock();
if (can_oom_reap)
wake_oom_reaper(victim);
mmdrop(mm);
put_task_struct(victim);
}
#undef K
/*
* Kill provided task unless it's secured by setting
* oom_score_adj to OOM_SCORE_ADJ_MIN.
*/
static int oom_kill_memcg_member(struct task_struct *task, void *message)
{
if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
!is_global_init(task)) {
get_task_struct(task);
__oom_kill_process(task, message);
}
return 0;
}
static void oom_kill_process(struct oom_control *oc, const char *message)
{
struct task_struct *victim = oc->chosen;
struct mem_cgroup *oom_group;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
/*
* If the task is already exiting, don't alarm the sysadmin or kill
* its children or threads, just give it access to memory reserves
* so it can die quickly
*/
task_lock(victim);
if (task_will_free_mem(victim)) {
mark_oom_victim(victim);
wake_oom_reaper(victim);
task_unlock(victim);
put_task_struct(victim);
return;
}
task_unlock(victim);
if (__ratelimit(&oom_rs))
dump_header(oc, victim);
/*
* Do we need to kill the entire memory cgroup?
* Or even one of the ancestor memory cgroups?
* Check this out before killing the victim task.
*/
oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
__oom_kill_process(victim, message);
/*
* If necessary, kill all tasks in the selected memory cgroup.
*/
if (oom_group) {
mem_cgroup_print_oom_group(oom_group);
mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
(void*)message);
mem_cgroup_put(oom_group);
}
}
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
static void check_panic_on_oom(struct oom_control *oc)
{
if (likely(!sysctl_panic_on_oom))
return;
if (sysctl_panic_on_oom != 2) {
/*
* panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
* does not panic for cpuset, mempolicy, or memcg allocation
* failures.
*/
if (oc->constraint != CONSTRAINT_NONE)
return;
}
/* Do not panic for oom kills triggered by sysrq */
if (is_sysrq_oom(oc))
return;
dump_header(oc, NULL);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
int register_oom_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_oom_notifier);
int unregister_oom_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
/**
* out_of_memory - kill the "best" process when we run out of memory
* @oc: pointer to struct oom_control
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
bool out_of_memory(struct oom_control *oc)
{
unsigned long freed = 0;
if (oom_killer_disabled)
return false;
if (!is_memcg_oom(oc)) {
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
return true;
}
/*
* If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
if (task_will_free_mem(current)) {
mark_oom_victim(current);
wake_oom_reaper(current);
return true;
}
/*
* The OOM killer does not compensate for IO-less reclaim.
* pagefault_out_of_memory lost its gfp context so we have to
* make sure exclude 0 mask - all other users should have at least
* ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
* invoke the OOM killer even if it is a GFP_NOFS allocation.
*/
if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
return true;
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA and memcg) that may require different handling.
*/
oc->constraint = constrained_alloc(oc);
if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
oc->nodemask = NULL;
check_panic_on_oom(oc);
if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
current->mm && !oom_unkillable_task(current) &&
oom_cpuset_eligible(current, oc) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
oc->chosen = current;
oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
return true;
}
select_bad_process(oc);
/* Found nothing?!?! */
if (!oc->chosen) {
int ret = false;
trace_android_vh_oom_check_panic(oc, &ret);
if (ret)
return true;
dump_header(oc, NULL);
pr_warn("Out of memory and no killable processes...\n");
/*
* If we got here due to an actual allocation at the
* system level, we cannot survive this and will enter
* an endless loop in the allocator. Bail out now.
*/
if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
panic("System is deadlocked on memory\n");
}
if (oc->chosen && oc->chosen != (void *)-1UL)
oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
"Memory cgroup out of memory");
return !!oc->chosen;
}
/*
* The pagefault handler calls here because some allocation has failed. We have
* to take care of the memcg OOM here because this is the only safe context without
* any locks held but let the oom killer triggered from the allocation context care
* about the global OOM.
*/
void pagefault_out_of_memory(void)
{
static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
if (mem_cgroup_oom_synchronize(true))
return;
if (fatal_signal_pending(current))
return;
if (__ratelimit(&pfoom_rs))
pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
}
void add_to_oom_reaper(struct task_struct *p)
{
p = find_lock_task_mm(p);
if (!p)
return;
get_task_struct(p);
if (task_will_free_mem(p)) {
__mark_oom_victim(p);
wake_oom_reaper(p);
}
task_unlock(p);
put_task_struct(p);
}