Greg Kroah-Hartman d8c7f0a3cd Merge 5.10.20 into android12-5.10
Changes in 5.10.20
	vmlinux.lds.h: add DWARF v5 sections
	vdpa/mlx5: fix param validation in mlx5_vdpa_get_config()
	debugfs: be more robust at handling improper input in debugfs_lookup()
	debugfs: do not attempt to create a new file before the filesystem is initalized
	scsi: libsas: docs: Remove notify_ha_event()
	scsi: qla2xxx: Fix mailbox Ch erroneous error
	kdb: Make memory allocations more robust
	w1: w1_therm: Fix conversion result for negative temperatures
	PCI: qcom: Use PHY_REFCLK_USE_PAD only for ipq8064
	PCI: Decline to resize resources if boot config must be preserved
	virt: vbox: Do not use wait_event_interruptible when called from kernel context
	bfq: Avoid false bfq queue merging
	ALSA: usb-audio: Fix PCM buffer allocation in non-vmalloc mode
	MIPS: vmlinux.lds.S: add missing PAGE_ALIGNED_DATA() section
	vmlinux.lds.h: Define SANTIZER_DISCARDS with CONFIG_GCOV_KERNEL=y
	random: fix the RNDRESEEDCRNG ioctl
	ALSA: pcm: Call sync_stop at disconnection
	ALSA: pcm: Assure sync with the pending stop operation at suspend
	ALSA: pcm: Don't call sync_stop if it hasn't been stopped
	drm/i915/gt: One more flush for Baytrail clear residuals
	ath10k: Fix error handling in case of CE pipe init failure
	Bluetooth: btqcomsmd: Fix a resource leak in error handling paths in the probe function
	Bluetooth: hci_uart: Fix a race for write_work scheduling
	Bluetooth: Fix initializing response id after clearing struct
	arm64: dts: renesas: beacon kit: Fix choppy Bluetooth Audio
	arm64: dts: renesas: beacon: Fix audio-1.8V pin enable
	ARM: dts: exynos: correct PMIC interrupt trigger level on Artik 5
	ARM: dts: exynos: correct PMIC interrupt trigger level on Monk
	ARM: dts: exynos: correct PMIC interrupt trigger level on Rinato
	ARM: dts: exynos: correct PMIC interrupt trigger level on Spring
	ARM: dts: exynos: correct PMIC interrupt trigger level on Arndale Octa
	ARM: dts: exynos: correct PMIC interrupt trigger level on Odroid XU3 family
	arm64: dts: exynos: correct PMIC interrupt trigger level on TM2
	arm64: dts: exynos: correct PMIC interrupt trigger level on Espresso
	memory: mtk-smi: Fix PM usage counter unbalance in mtk_smi ops
	Bluetooth: hci_qca: Fix memleak in qca_controller_memdump
	staging: vchiq: Fix bulk userdata handling
	staging: vchiq: Fix bulk transfers on 64-bit builds
	arm64: dts: qcom: msm8916-samsung-a5u: Fix iris compatible
	net: stmmac: dwmac-meson8b: fix enabling the timing-adjustment clock
	bpf: Add bpf_patch_call_args prototype to include/linux/bpf.h
	bpf: Avoid warning when re-casting __bpf_call_base into __bpf_call_base_args
	firmware: arm_scmi: Fix call site of scmi_notification_exit
	arm64: dts: allwinner: A64: properly connect USB PHY to port 0
	arm64: dts: allwinner: H6: properly connect USB PHY to port 0
	arm64: dts: allwinner: Drop non-removable from SoPine/LTS SD card
	arm64: dts: allwinner: H6: Allow up to 150 MHz MMC bus frequency
	arm64: dts: allwinner: A64: Limit MMC2 bus frequency to 150 MHz
	arm64: dts: qcom: msm8916-samsung-a2015: Fix sensors
	cpufreq: brcmstb-avs-cpufreq: Free resources in error path
	cpufreq: brcmstb-avs-cpufreq: Fix resource leaks in ->remove()
	arm64: dts: rockchip: rk3328: Add clock_in_out property to gmac2phy node
	ACPICA: Fix exception code class checks
	usb: gadget: u_audio: Free requests only after callback
	arm64: dts: qcom: sdm845-db845c: Fix reset-pin of ov8856 node
	soc: qcom: socinfo: Fix an off by one in qcom_show_pmic_model()
	soc: ti: pm33xx: Fix some resource leak in the error handling paths of the probe function
	staging: media: atomisp: Fix size_t format specifier in hmm_alloc() debug statemenet
	Bluetooth: drop HCI device reference before return
	Bluetooth: Put HCI device if inquiry procedure interrupts
	memory: ti-aemif: Drop child node when jumping out loop
	ARM: dts: Configure missing thermal interrupt for 4430
	usb: dwc2: Do not update data length if it is 0 on inbound transfers
	usb: dwc2: Abort transaction after errors with unknown reason
	usb: dwc2: Make "trimming xfer length" a debug message
	staging: rtl8723bs: wifi_regd.c: Fix incorrect number of regulatory rules
	x86/MSR: Filter MSR writes through X86_IOC_WRMSR_REGS ioctl too
	arm64: dts: renesas: beacon: Fix EEPROM compatible value
	can: mcp251xfd: mcp251xfd_probe(): fix errata reference
	ARM: dts: armada388-helios4: assign pinctrl to LEDs
	ARM: dts: armada388-helios4: assign pinctrl to each fan
	arm64: dts: armada-3720-turris-mox: rename u-boot mtd partition to a53-firmware
	opp: Correct debug message in _opp_add_static_v2()
	Bluetooth: btusb: Fix memory leak in btusb_mtk_wmt_recv
	soc: qcom: ocmem: don't return NULL in of_get_ocmem
	arm64: dts: msm8916: Fix reserved and rfsa nodes unit address
	arm64: dts: meson: fix broken wifi node for Khadas VIM3L
	iwlwifi: mvm: set enabled in the PPAG command properly
	ARM: s3c: fix fiq for clang IAS
	optee: simplify i2c access
	staging: wfx: fix possible panic with re-queued frames
	ARM: at91: use proper asm syntax in pm_suspend
	ath10k: Fix suspicious RCU usage warning in ath10k_wmi_tlv_parse_peer_stats_info()
	ath10k: Fix lockdep assertion warning in ath10k_sta_statistics
	ath11k: fix a locking bug in ath11k_mac_op_start()
	soc: aspeed: snoop: Add clock control logic
	iwlwifi: mvm: fix the type we use in the PPAG table validity checks
	iwlwifi: mvm: store PPAG enabled/disabled flag properly
	iwlwifi: mvm: send stored PPAG command instead of local
	iwlwifi: mvm: assign SAR table revision to the command later
	iwlwifi: mvm: don't check if CSA event is running before removing
	bpf_lru_list: Read double-checked variable once without lock
	iwlwifi: pnvm: set the PNVM again if it was already loaded
	iwlwifi: pnvm: increment the pointer before checking the TLV
	ath9k: fix data bus crash when setting nf_override via debugfs
	selftests/bpf: Convert test_xdp_redirect.sh to bash
	ibmvnic: Set to CLOSED state even on error
	bnxt_en: reverse order of TX disable and carrier off
	bnxt_en: Fix devlink info's stored fw.psid version format.
	xen/netback: fix spurious event detection for common event case
	dpaa2-eth: fix memory leak in XDP_REDIRECT
	net: phy: consider that suspend2ram may cut off PHY power
	net/mlx5e: Don't change interrupt moderation params when DIM is enabled
	net/mlx5e: Change interrupt moderation channel params also when channels are closed
	net/mlx5: Fix health error state handling
	net/mlx5e: Replace synchronize_rcu with synchronize_net
	net/mlx5e: kTLS, Use refcounts to free kTLS RX priv context
	net/mlx5: Disable devlink reload for multi port slave device
	net/mlx5: Disallow RoCE on multi port slave device
	net/mlx5: Disallow RoCE on lag device
	net/mlx5: Disable devlink reload for lag devices
	net/mlx5e: CT: manage the lifetime of the ct entry object
	net/mlx5e: Check tunnel offload is required before setting SWP
	mac80211: fix potential overflow when multiplying to u32 integers
	libbpf: Ignore non function pointer member in struct_ops
	bpf: Fix an unitialized value in bpf_iter
	bpf, devmap: Use GFP_KERNEL for xdp bulk queue allocation
	bpf: Fix bpf_fib_lookup helper MTU check for SKB ctx
	selftests: mptcp: fix ACKRX debug message
	tcp: fix SO_RCVLOWAT related hangs under mem pressure
	net: axienet: Handle deferred probe on clock properly
	cxgb4/chtls/cxgbit: Keeping the max ofld immediate data size same in cxgb4 and ulds
	b43: N-PHY: Fix the update of coef for the PHY revision >= 3case
	bpf: Clear subreg_def for global function return values
	ibmvnic: add memory barrier to protect long term buffer
	ibmvnic: skip send_request_unmap for timeout reset
	net: dsa: felix: perform teardown in reverse order of setup
	net: dsa: felix: don't deinitialize unused ports
	net: phy: mscc: adding LCPLL reset to VSC8514
	net: amd-xgbe: Reset the PHY rx data path when mailbox command timeout
	net: amd-xgbe: Fix NETDEV WATCHDOG transmit queue timeout warning
	net: amd-xgbe: Reset link when the link never comes back
	net: amd-xgbe: Fix network fluctuations when using 1G BELFUSE SFP
	net: mvneta: Remove per-cpu queue mapping for Armada 3700
	net: enetc: fix destroyed phylink dereference during unbind
	tty: convert tty_ldisc_ops 'read()' function to take a kernel pointer
	tty: implement read_iter
	fbdev: aty: SPARC64 requires FB_ATY_CT
	drm/gma500: Fix error return code in psb_driver_load()
	gma500: clean up error handling in init
	drm/fb-helper: Add missed unlocks in setcmap_legacy()
	drm/panel: mantix: Tweak init sequence
	drm/vc4: hdmi: Take into account the clock doubling flag in atomic_check
	crypto: sun4i-ss - linearize buffers content must be kept
	crypto: sun4i-ss - fix kmap usage
	crypto: arm64/aes-ce - really hide slower algos when faster ones are enabled
	hwrng: ingenic - Fix a resource leak in an error handling path
	media: allegro: Fix use after free on error
	kcsan: Rewrite kcsan_prandom_u32_max() without prandom_u32_state()
	drm: rcar-du: Fix PM reference leak in rcar_cmm_enable()
	drm: rcar-du: Fix crash when using LVDS1 clock for CRTC
	drm: rcar-du: Fix the return check of of_parse_phandle and of_find_device_by_node
	drm/amdgpu: Fix macro name _AMDGPU_TRACE_H_ in preprocessor if condition
	MIPS: c-r4k: Fix section mismatch for loongson2_sc_init
	MIPS: lantiq: Explicitly compare LTQ_EBU_PCC_ISTAT against 0
	drm/virtio: make sure context is created in gem open
	drm/fourcc: fix Amlogic format modifier masks
	media: ipu3-cio2: Build only for x86
	media: i2c: ov5670: Fix PIXEL_RATE minimum value
	media: imx: Unregister csc/scaler only if registered
	media: imx: Fix csc/scaler unregister
	media: mtk-vcodec: fix error return code in vdec_vp9_decode()
	media: camss: missing error code in msm_video_register()
	media: vsp1: Fix an error handling path in the probe function
	media: em28xx: Fix use-after-free in em28xx_alloc_urbs
	media: media/pci: Fix memleak in empress_init
	media: tm6000: Fix memleak in tm6000_start_stream
	media: aspeed: fix error return code in aspeed_video_setup_video()
	ASoC: cs42l56: fix up error handling in probe
	ASoC: qcom: qdsp6: Move frontend AIFs to q6asm-dai
	evm: Fix memleak in init_desc
	crypto: bcm - Rename struct device_private to bcm_device_private
	sched/fair: Avoid stale CPU util_est value for schedutil in task dequeue
	drm/sun4i: tcon: fix inverted DCLK polarity
	media: imx7: csi: Fix regression for parallel cameras on i.MX6UL
	media: imx7: csi: Fix pad link validation
	media: ti-vpe: cal: fix write to unallocated memory
	MIPS: properly stop .eh_frame generation
	MIPS: Compare __SYNC_loongson3_war against 0
	drm/tegra: Fix reference leak when pm_runtime_get_sync() fails
	drm/amdgpu: toggle on DF Cstate after finishing xgmi injection
	bsg: free the request before return error code
	macintosh/adb-iop: Use big-endian autopoll mask
	drm/amd/display: Fix 10/12 bpc setup in DCE output bit depth reduction.
	drm/amd/display: Fix HDMI deep color output for DCE 6-11.
	media: software_node: Fix refcounts in software_node_get_next_child()
	media: lmedm04: Fix misuse of comma
	media: vidtv: psi: fix missing crc for PMT
	media: atomisp: Fix a buffer overflow in debug code
	media: qm1d1c0042: fix error return code in qm1d1c0042_init()
	media: cx25821: Fix a bug when reallocating some dma memory
	media: mtk-vcodec: fix argument used when DEBUG is defined
	media: pxa_camera: declare variable when DEBUG is defined
	media: uvcvideo: Accept invalid bFormatIndex and bFrameIndex values
	sched/eas: Don't update misfit status if the task is pinned
	f2fs: compress: fix potential deadlock
	ASoC: qcom: lpass-cpu: Remove bit clock state check
	ASoC: SOF: Intel: hda: cancel D0i3 work during runtime suspend
	perf/arm-cmn: Fix PMU instance naming
	perf/arm-cmn: Move IRQs when migrating context
	mtd: parser: imagetag: fix error codes in bcm963xx_parse_imagetag_partitions()
	crypto: talitos - Work around SEC6 ERRATA (AES-CTR mode data size error)
	crypto: talitos - Fix ctr(aes) on SEC1
	drm/nouveau: bail out of nouveau_channel_new if channel init fails
	mm: proc: Invalidate TLB after clearing soft-dirty page state
	ata: ahci_brcm: Add back regulators management
	ASoC: cpcap: fix microphone timeslot mask
	ASoC: codecs: add missing max_register in regmap config
	mtd: parsers: afs: Fix freeing the part name memory in failure
	f2fs: fix to avoid inconsistent quota data
	drm/amdgpu: Prevent shift wrapping in amdgpu_read_mask()
	f2fs: fix a wrong condition in __submit_bio
	ASoC: qcom: Fix typo error in HDMI regmap config callbacks
	KVM: nSVM: Don't strip host's C-bit from guest's CR3 when reading PDPTRs
	drm/mediatek: Check if fb is null
	Drivers: hv: vmbus: Avoid use-after-free in vmbus_onoffer_rescind()
	ASoC: Intel: sof_sdw: add missing TGL_HDMI quirk for Dell SKU 0A5E
	ASoC: Intel: sof_sdw: add missing TGL_HDMI quirk for Dell SKU 0A3E
	locking/lockdep: Avoid unmatched unlock
	ASoC: qcom: lpass: Fix i2s ctl register bit map
	ASoC: rt5682: Fix panic in rt5682_jack_detect_handler happening during system shutdown
	ASoC: SOF: debug: Fix a potential issue on string buffer termination
	btrfs: clarify error returns values in __load_free_space_cache
	btrfs: fix double accounting of ordered extent for subpage case in btrfs_invalidapge
	KVM: x86: Restore all 64 bits of DR6 and DR7 during RSM on x86-64
	s390/zcrypt: return EIO when msg retry limit reached
	drm/vc4: hdmi: Move hdmi reset to bind
	drm/vc4: hdmi: Fix register offset with longer CEC messages
	drm/vc4: hdmi: Fix up CEC registers
	drm/vc4: hdmi: Restore cec physical address on reconnect
	drm/vc4: hdmi: Compute the CEC clock divider from the clock rate
	drm/vc4: hdmi: Update the CEC clock divider on HSM rate change
	drm/lima: fix reference leak in lima_pm_busy
	drm/dp_mst: Don't cache EDIDs for physical ports
	hwrng: timeriomem - Fix cooldown period calculation
	crypto: ecdh_helper - Ensure 'len >= secret.len' in decode_key()
	io_uring: fix possible deadlock in io_uring_poll
	nvmet-tcp: fix receive data digest calculation for multiple h2cdata PDUs
	nvmet-tcp: fix potential race of tcp socket closing accept_work
	nvme-multipath: set nr_zones for zoned namespaces
	nvmet: remove extra variable in identify ns
	nvmet: set status to 0 in case for invalid nsid
	ASoC: SOF: sof-pci-dev: add missing Up-Extreme quirk
	ima: Free IMA measurement buffer on error
	ima: Free IMA measurement buffer after kexec syscall
	ASoC: simple-card-utils: Fix device module clock
	fs/jfs: fix potential integer overflow on shift of a int
	jffs2: fix use after free in jffs2_sum_write_data()
	ubifs: Fix memleak in ubifs_init_authentication
	ubifs: replay: Fix high stack usage, again
	ubifs: Fix error return code in alloc_wbufs()
	irqchip/imx: IMX_INTMUX should not default to y, unconditionally
	smp: Process pending softirqs in flush_smp_call_function_from_idle()
	drm/amdgpu/display: remove hdcp_srm sysfs on device removal
	capabilities: Don't allow writing ambiguous v3 file capabilities
	HSI: Fix PM usage counter unbalance in ssi_hw_init
	power: supply: cpcap: Add missing IRQF_ONESHOT to fix regression
	clk: meson: clk-pll: fix initializing the old rate (fallback) for a PLL
	clk: meson: clk-pll: make "ret" a signed integer
	clk: meson: clk-pll: propagate the error from meson_clk_pll_set_rate()
	selftests/powerpc: Make the test check in eeh-basic.sh posix compliant
	regulator: qcom-rpmh-regulator: add pm8009-1 chip revision
	arm64: dts: qcom: qrb5165-rb5: fix pm8009 regulators
	quota: Fix memory leak when handling corrupted quota file
	i2c: iproc: handle only slave interrupts which are enabled
	i2c: iproc: update slave isr mask (ISR_MASK_SLAVE)
	i2c: iproc: handle master read request
	spi: cadence-quadspi: Abort read if dummy cycles required are too many
	clk: sunxi-ng: h6: Fix CEC clock
	clk: renesas: r8a779a0: Remove non-existent S2 clock
	clk: renesas: r8a779a0: Fix parent of CBFUSA clock
	HID: core: detect and skip invalid inputs to snto32()
	RDMA/siw: Fix handling of zero-sized Read and Receive Queues.
	dmaengine: fsldma: Fix a resource leak in the remove function
	dmaengine: fsldma: Fix a resource leak in an error handling path of the probe function
	dmaengine: owl-dma: Fix a resource leak in the remove function
	dmaengine: hsu: disable spurious interrupt
	mfd: bd9571mwv: Use devm_mfd_add_devices()
	power: supply: cpcap-charger: Fix missing power_supply_put()
	power: supply: cpcap-battery: Fix missing power_supply_put()
	power: supply: cpcap-charger: Fix power_supply_put on null battery pointer
	fdt: Properly handle "no-map" field in the memory region
	of/fdt: Make sure no-map does not remove already reserved regions
	RDMA/rtrs: Extend ibtrs_cq_qp_create
	RDMA/rtrs-srv: Release lock before call into close_sess
	RDMA/rtrs-srv: Use sysfs_remove_file_self for disconnect
	RDMA/rtrs-clt: Set mininum limit when create QP
	RDMA/rtrs: Call kobject_put in the failure path
	RDMA/rtrs-srv: Fix missing wr_cqe
	RDMA/rtrs-clt: Refactor the failure cases in alloc_clt
	RDMA/rtrs-srv: Init wr_cnt as 1
	power: reset: at91-sama5d2_shdwc: fix wkupdbc mask
	rtc: s5m: select REGMAP_I2C
	dmaengine: idxd: set DMA channel to be private
	power: supply: fix sbs-charger build, needs REGMAP_I2C
	clocksource/drivers/ixp4xx: Select TIMER_OF when needed
	clocksource/drivers/mxs_timer: Add missing semicolon when DEBUG is defined
	spi: imx: Don't print error on -EPROBEDEFER
	RDMA/mlx5: Use the correct obj_id upon DEVX TIR creation
	IB/mlx5: Add mutex destroy call to cap_mask_mutex mutex
	clk: sunxi-ng: h6: Fix clock divider range on some clocks
	platform/chrome: cros_ec_proto: Use EC_HOST_EVENT_MASK not BIT
	platform/chrome: cros_ec_proto: Add LID and BATTERY to default mask
	regulator: axp20x: Fix reference cout leak
	watch_queue: Drop references to /dev/watch_queue
	certs: Fix blacklist flag type confusion
	regulator: s5m8767: Fix reference count leak
	spi: atmel: Put allocated master before return
	regulator: s5m8767: Drop regulators OF node reference
	power: supply: axp20x_usb_power: Init work before enabling IRQs
	power: supply: smb347-charger: Fix interrupt usage if interrupt is unavailable
	regulator: core: Avoid debugfs: Directory ... already present! error
	isofs: release buffer head before return
	watchdog: intel-mid_wdt: Postpone IRQ handler registration till SCU is ready
	auxdisplay: ht16k33: Fix refresh rate handling
	objtool: Fix error handling for STD/CLD warnings
	objtool: Fix retpoline detection in asm code
	objtool: Fix ".cold" section suffix check for newer versions of GCC
	scsi: lpfc: Fix ancient double free
	iommu: Switch gather->end to the inclusive end
	IB/umad: Return EIO in case of when device disassociated
	IB/umad: Return EPOLLERR in case of when device disassociated
	KVM: PPC: Make the VMX instruction emulation routines static
	powerpc/47x: Disable 256k page size
	powerpc/time: Enable sched clock for irqtime
	mmc: owl-mmc: Fix a resource leak in an error handling path and in the remove function
	mmc: sdhci-sprd: Fix some resource leaks in the remove function
	mmc: usdhi6rol0: Fix a resource leak in the error handling path of the probe
	mmc: renesas_sdhi_internal_dmac: Fix DMA buffer alignment from 8 to 128-bytes
	ARM: 9046/1: decompressor: Do not clear SCTLR.nTLSMD for ARMv7+ cores
	i2c: qcom-geni: Store DMA mapping data in geni_i2c_dev struct
	amba: Fix resource leak for drivers without .remove
	iommu: Move iotlb_sync_map out from __iommu_map
	iommu: Properly pass gfp_t in _iommu_map() to avoid atomic sleeping
	IB/mlx5: Return appropriate error code instead of ENOMEM
	IB/cm: Avoid a loop when device has 255 ports
	tracepoint: Do not fail unregistering a probe due to memory failure
	rtc: zynqmp: depend on HAS_IOMEM
	perf tools: Fix DSO filtering when not finding a map for a sampled address
	perf vendor events arm64: Fix Ampere eMag event typo
	RDMA/rxe: Fix coding error in rxe_recv.c
	RDMA/rxe: Fix coding error in rxe_rcv_mcast_pkt
	RDMA/rxe: Correct skb on loopback path
	spi: stm32: properly handle 0 byte transfer
	mfd: altera-sysmgr: Fix physical address storing more
	mfd: wm831x-auxadc: Prevent use after free in wm831x_auxadc_read_irq()
	powerpc/pseries/dlpar: handle ibm, configure-connector delay status
	powerpc/8xx: Fix software emulation interrupt
	clk: qcom: gcc-msm8998: Fix Alpha PLL type for all GPLLs
	kunit: tool: fix unit test cleanup handling
	kselftests: dmabuf-heaps: Fix Makefile's inclusion of the kernel's usr/include dir
	RDMA/hns: Fixed wrong judgments in the goto branch
	RDMA/siw: Fix calculation of tx_valid_cpus size
	RDMA/hns: Fix type of sq_signal_bits
	RDMA/hns: Disable RQ inline by default
	clk: divider: fix initialization with parent_hw
	spi: pxa2xx: Fix the controller numbering for Wildcat Point
	powerpc/uaccess: Avoid might_fault() when user access is enabled
	powerpc/kuap: Restore AMR after replaying soft interrupts
	regulator: qcom-rpmh: fix pm8009 ldo7
	clk: aspeed: Fix APLL calculate formula from ast2600-A2
	selftests/ftrace: Update synthetic event syntax errors
	perf symbols: Use (long) for iterator for bfd symbols
	regulator: bd718x7, bd71828, Fix dvs voltage levels
	spi: dw: Avoid stack content exposure
	spi: Skip zero-length transfers in spi_transfer_one_message()
	printk: avoid prb_first_valid_seq() where possible
	perf symbols: Fix return value when loading PE DSO
	nfsd: register pernet ops last, unregister first
	svcrdma: Hold private mutex while invoking rdma_accept()
	ceph: fix flush_snap logic after putting caps
	RDMA/hns: Fixes missing error code of CMDQ
	RDMA/ucma: Fix use-after-free bug in ucma_create_uevent
	RDMA/rtrs-srv: Fix stack-out-of-bounds
	RDMA/rtrs: Only allow addition of path to an already established session
	RDMA/rtrs-srv: fix memory leak by missing kobject free
	RDMA/rtrs-srv-sysfs: fix missing put_device
	RDMA/rtrs-srv: Do not pass a valid pointer to PTR_ERR()
	Input: sur40 - fix an error code in sur40_probe()
	perf record: Fix continue profiling after draining the buffer
	perf intel-pt: Fix missing CYC processing in PSB
	perf intel-pt: Fix premature IPC
	perf intel-pt: Fix IPC with CYC threshold
	perf test: Fix unaligned access in sample parsing test
	Input: elo - fix an error code in elo_connect()
	sparc64: only select COMPAT_BINFMT_ELF if BINFMT_ELF is set
	sparc: fix led.c driver when PROC_FS is not enabled
	Input: zinitix - fix return type of zinitix_init_touch()
	ARM: 9065/1: OABI compat: fix build when EPOLL is not enabled
	misc: eeprom_93xx46: Fix module alias to enable module autoprobe
	phy: rockchip-emmc: emmc_phy_init() always return 0
	phy: cadence-torrent: Fix error code in cdns_torrent_phy_probe()
	misc: eeprom_93xx46: Add module alias to avoid breaking support for non device tree users
	PCI: rcar: Always allocate MSI addresses in 32bit space
	soundwire: cadence: fix ACK/NAK handling
	pwm: rockchip: Enable APB clock during register access while probing
	pwm: rockchip: rockchip_pwm_probe(): Remove superfluous clk_unprepare()
	pwm: rockchip: Eliminate potential race condition when probing
	PCI: xilinx-cpm: Fix reference count leak on error path
	VMCI: Use set_page_dirty_lock() when unregistering guest memory
	PCI: Align checking of syscall user config accessors
	mei: hbm: call mei_set_devstate() on hbm stop response
	drm/msm: Fix MSM_INFO_GET_IOVA with carveout
	drm/msm/dsi: Correct io_start for MSM8994 (20nm PHY)
	drm/msm/mdp5: Fix wait-for-commit for cmd panels
	drm/msm: Fix race of GPU init vs timestamp power management.
	drm/msm: Fix races managing the OOB state for timestamp vs timestamps.
	drm/msm/dp: trigger unplug event in msm_dp_display_disable
	vfio/iommu_type1: Populate full dirty when detach non-pinned group
	vfio/iommu_type1: Fix some sanity checks in detach group
	vfio-pci/zdev: fix possible segmentation fault issue
	ext4: fix potential htree index checksum corruption
	phy: USB_LGM_PHY should depend on X86
	coresight: etm4x: Skip accessing TRCPDCR in save/restore
	nvmem: core: Fix a resource leak on error in nvmem_add_cells_from_of()
	nvmem: core: skip child nodes not matching binding
	soundwire: bus: use sdw_update_no_pm when initializing a device
	soundwire: bus: use sdw_write_no_pm when setting the bus scale registers
	soundwire: export sdw_write/read_no_pm functions
	soundwire: bus: fix confusion on device used by pm_runtime
	misc: fastrpc: fix incorrect usage of dma_map_sgtable
	remoteproc/mediatek: acknowledge watchdog IRQ after handled
	regmap: sdw: use _no_pm functions in regmap_read/write
	ext: EXT4_KUNIT_TESTS should depend on EXT4_FS instead of selecting it
	mailbox: sprd: correct definition of SPRD_OUTBOX_FIFO_FULL
	device-dax: Fix default return code of range_parse()
	PCI: pci-bridge-emul: Fix array overruns, improve safety
	PCI: cadence: Fix DMA range mapping early return error
	i40e: Fix flow for IPv6 next header (extension header)
	i40e: Add zero-initialization of AQ command structures
	i40e: Fix overwriting flow control settings during driver loading
	i40e: Fix addition of RX filters after enabling FW LLDP agent
	i40e: Fix VFs not created
	Take mmap lock in cacheflush syscall
	nios2: fixed broken sys_clone syscall
	i40e: Fix add TC filter for IPv6
	octeontx2-af: Fix an off by one in rvu_dbg_qsize_write()
	pwm: iqs620a: Fix overflow and optimize calculations
	vfio/type1: Use follow_pte()
	ice: report correct max number of TCs
	ice: Account for port VLAN in VF max packet size calculation
	ice: Fix state bits on LLDP mode switch
	ice: update the number of available RSS queues
	net: stmmac: fix CBS idleslope and sendslope calculation
	net/mlx4_core: Add missed mlx4_free_cmd_mailbox()
	PCI: rockchip: Make 'ep-gpios' DT property optional
	vxlan: move debug check after netdev unregister
	wireguard: device: do not generate ICMP for non-IP packets
	wireguard: kconfig: use arm chacha even with no neon
	ocfs2: fix a use after free on error
	mm: memcontrol: fix NR_ANON_THPS accounting in charge moving
	mm: memcontrol: fix slub memory accounting
	mm/memory.c: fix potential pte_unmap_unlock pte error
	mm/hugetlb: fix potential double free in hugetlb_register_node() error path
	mm/hugetlb: suppress wrong warning info when alloc gigantic page
	mm/compaction: fix misbehaviors of fast_find_migrateblock()
	r8169: fix jumbo packet handling on RTL8168e
	NFSv4: Fixes for nfs4_bitmask_adjust()
	KVM: SVM: Intercept INVPCID when it's disabled to inject #UD
	KVM: x86/mmu: Expand collapsible SPTE zap for TDP MMU to ZONE_DEVICE and HugeTLB pages
	arm64: Add missing ISB after invalidating TLB in __primary_switch
	i2c: brcmstb: Fix brcmstd_send_i2c_cmd condition
	i2c: exynos5: Preserve high speed master code
	mm,thp,shmem: make khugepaged obey tmpfs mount flags
	mm: fix memory_failure() handling of dax-namespace metadata
	mm/rmap: fix potential pte_unmap on an not mapped pte
	proc: use kvzalloc for our kernel buffer
	csky: Fix a size determination in gpr_get()
	scsi: bnx2fc: Fix Kconfig warning & CNIC build errors
	scsi: sd: sd_zbc: Don't pass GFP_NOIO to kvcalloc
	block: reopen the device in blkdev_reread_part
	ide/falconide: Fix module unload
	scsi: sd: Fix Opal support
	blk-settings: align max_sectors on "logical_block_size" boundary
	soundwire: intel: fix possible crash when no device is detected
	ACPI: property: Fix fwnode string properties matching
	ACPI: configfs: add missing check after configfs_register_default_group()
	cpufreq: ACPI: Set cpuinfo.max_freq directly if max boost is known
	HID: logitech-dj: add support for keyboard events in eQUAD step 4 Gaming
	HID: wacom: Ignore attempts to overwrite the touch_max value from HID
	Input: raydium_ts_i2c - do not send zero length
	Input: xpad - add support for PowerA Enhanced Wired Controller for Xbox Series X|S
	Input: joydev - prevent potential read overflow in ioctl
	Input: i8042 - add ASUS Zenbook Flip to noselftest list
	media: mceusb: Fix potential out-of-bounds shift
	USB: serial: option: update interface mapping for ZTE P685M
	usb: musb: Fix runtime PM race in musb_queue_resume_work
	usb: dwc3: gadget: Fix setting of DEPCFG.bInterval_m1
	usb: dwc3: gadget: Fix dep->interval for fullspeed interrupt
	USB: serial: ftdi_sio: fix FTX sub-integer prescaler
	USB: serial: pl2303: fix line-speed handling on newer chips
	USB: serial: mos7840: fix error code in mos7840_write()
	USB: serial: mos7720: fix error code in mos7720_write()
	phy: lantiq: rcu-usb2: wait after clock enable
	ALSA: fireface: fix to parse sync status register of latter protocol
	ALSA: hda: Add another CometLake-H PCI ID
	ALSA: hda/hdmi: Drop bogus check at closing a stream
	ALSA: hda/realtek: modify EAPD in the ALC886
	ALSA: hda/realtek: Quirk for HP Spectre x360 14 amp setup
	MIPS: Ingenic: Disable HPTLB for D0 XBurst CPUs too
	MIPS: Support binutils configured with --enable-mips-fix-loongson3-llsc=yes
	MIPS: VDSO: Use CLANG_FLAGS instead of filtering out '--target='
	Revert "MIPS: Octeon: Remove special handling of CONFIG_MIPS_ELF_APPENDED_DTB=y"
	Revert "bcache: Kill btree_io_wq"
	bcache: Give btree_io_wq correct semantics again
	bcache: Move journal work to new flush wq
	Revert "drm/amd/display: Update NV1x SR latency values"
	drm/amd/display: Add FPU wrappers to dcn21_validate_bandwidth()
	drm/amd/display: Remove Assert from dcn10_get_dig_frontend
	drm/amd/display: Add vupdate_no_lock interrupts for DCN2.1
	drm/amdkfd: Fix recursive lock warnings
	drm/amdgpu: Set reference clock to 100Mhz on Renoir (v2)
	drm/nouveau/kms: handle mDP connectors
	drm/modes: Switch to 64bit maths to avoid integer overflow
	drm/sched: Cancel and flush all outstanding jobs before finish.
	drm/panel: kd35t133: allow using non-continuous dsi clock
	drm/rockchip: Require the YTR modifier for AFBC
	ASoC: siu: Fix build error by a wrong const prefix
	selinux: fix inconsistency between inode_getxattr and inode_listsecurity
	erofs: initialized fields can only be observed after bit is set
	tpm_tis: Fix check_locality for correct locality acquisition
	tpm_tis: Clean up locality release
	KEYS: trusted: Fix incorrect handling of tpm_get_random()
	KEYS: trusted: Fix migratable=1 failing
	KEYS: trusted: Reserve TPM for seal and unseal operations
	btrfs: do not cleanup upper nodes in btrfs_backref_cleanup_node
	btrfs: do not warn if we can't find the reloc root when looking up backref
	btrfs: add asserts for deleting backref cache nodes
	btrfs: abort the transaction if we fail to inc ref in btrfs_copy_root
	btrfs: fix reloc root leak with 0 ref reloc roots on recovery
	btrfs: splice remaining dirty_bg's onto the transaction dirty bg list
	btrfs: handle space_info::total_bytes_pinned inside the delayed ref itself
	btrfs: account for new extents being deleted in total_bytes_pinned
	btrfs: fix extent buffer leak on failure to copy root
	drm/i915/gt: Flush before changing register state
	drm/i915/gt: Correct surface base address for renderclear
	crypto: arm64/sha - add missing module aliases
	crypto: aesni - prevent misaligned buffers on the stack
	crypto: michael_mic - fix broken misalignment handling
	crypto: sun4i-ss - checking sg length is not sufficient
	crypto: sun4i-ss - IV register does not work on A10 and A13
	crypto: sun4i-ss - handle BigEndian for cipher
	crypto: sun4i-ss - initialize need_fallback
	soc: samsung: exynos-asv: don't defer early on not-supported SoCs
	soc: samsung: exynos-asv: handle reading revision register error
	seccomp: Add missing return in non-void function
	arm64: ptrace: Fix seccomp of traced syscall -1 (NO_SYSCALL)
	misc: rtsx: init of rts522a add OCP power off when no card is present
	drivers/misc/vmw_vmci: restrict too big queue size in qp_host_alloc_queue
	pstore: Fix typo in compression option name
	dts64: mt7622: fix slow sd card access
	arm64: dts: agilex: fix phy interface bit shift for gmac1 and gmac2
	staging/mt7621-dma: mtk-hsdma.c->hsdma-mt7621.c
	staging: gdm724x: Fix DMA from stack
	staging: rtl8188eu: Add Edimax EW-7811UN V2 to device table
	floppy: reintroduce O_NDELAY fix
	media: i2c: max9286: fix access to unallocated memory
	media: ir_toy: add another IR Droid device
	media: ipu3-cio2: Fix mbus_code processing in cio2_subdev_set_fmt()
	media: marvell-ccic: power up the device on mclk enable
	media: smipcie: fix interrupt handling and IR timeout
	x86/virt: Eat faults on VMXOFF in reboot flows
	x86/reboot: Force all cpus to exit VMX root if VMX is supported
	x86/fault: Fix AMD erratum #91 errata fixup for user code
	x86/entry: Fix instrumentation annotation
	powerpc/prom: Fix "ibm,arch-vec-5-platform-support" scan
	rcu: Pull deferred rcuog wake up to rcu_eqs_enter() callers
	rcu/nocb: Perform deferred wake up before last idle's need_resched() check
	kprobes: Fix to delay the kprobes jump optimization
	arm64: Extend workaround for erratum 1024718 to all versions of Cortex-A55
	iommu/arm-smmu-qcom: Fix mask extraction for bootloader programmed SMRs
	arm64: kexec_file: fix memory leakage in create_dtb() when fdt_open_into() fails
	arm64: uprobe: Return EOPNOTSUPP for AARCH32 instruction probing
	arm64 module: set plt* section addresses to 0x0
	arm64: spectre: Prevent lockdep splat on v4 mitigation enable path
	riscv: Disable KSAN_SANITIZE for vDSO
	watchdog: qcom: Remove incorrect usage of QCOM_WDT_ENABLE_IRQ
	watchdog: mei_wdt: request stop on unregister
	coresight: etm4x: Handle accesses to TRCSTALLCTLR
	mtd: spi-nor: sfdp: Fix last erase region marking
	mtd: spi-nor: sfdp: Fix wrong erase type bitmask for overlaid region
	mtd: spi-nor: core: Fix erase type discovery for overlaid region
	mtd: spi-nor: core: Add erase size check for erase command initialization
	mtd: spi-nor: hisi-sfc: Put child node np on error path
	fs/affs: release old buffer head on error path
	seq_file: document how per-entry resources are managed.
	x86: fix seq_file iteration for pat/memtype.c
	mm: memcontrol: fix swap undercounting in cgroup2
	mm: memcontrol: fix get_active_memcg return value
	hugetlb: fix update_and_free_page contig page struct assumption
	hugetlb: fix copy_huge_page_from_user contig page struct assumption
	mm/vmscan: restore zone_reclaim_mode ABI
	mm, compaction: make fast_isolate_freepages() stay within zone
	KVM: nSVM: fix running nested guests when npt=0
	nvmem: qcom-spmi-sdam: Fix uninitialized pdev pointer
	module: Ignore _GLOBAL_OFFSET_TABLE_ when warning for undefined symbols
	mmc: sdhci-esdhc-imx: fix kernel panic when remove module
	mmc: sdhci-pci-o2micro: Bug fix for SDR104 HW tuning failure
	powerpc/32: Preserve cr1 in exception prolog stack check to fix build error
	powerpc/kexec_file: fix FDT size estimation for kdump kernel
	powerpc/32s: Add missing call to kuep_lock on syscall entry
	spmi: spmi-pmic-arb: Fix hw_irq overflow
	mei: fix transfer over dma with extended header
	mei: me: emmitsburg workstation DID
	mei: me: add adler lake point S DID
	mei: me: add adler lake point LP DID
	gpio: pcf857x: Fix missing first interrupt
	mfd: gateworks-gsc: Fix interrupt type
	printk: fix deadlock when kernel panic
	exfat: fix shift-out-of-bounds in exfat_fill_super()
	zonefs: Fix file size of zones in full condition
	kcmp: Support selection of SYS_kcmp without CHECKPOINT_RESTORE
	thermal: cpufreq_cooling: freq_qos_update_request() returns < 0 on error
	cpufreq: qcom-hw: drop devm_xxx() calls from init/exit hooks
	cpufreq: intel_pstate: Change intel_pstate_get_hwp_max() argument
	cpufreq: intel_pstate: Get per-CPU max freq via MSR_HWP_CAPABILITIES if available
	proc: don't allow async path resolution of /proc/thread-self components
	s390/vtime: fix inline assembly clobber list
	virtio/s390: implement virtio-ccw revision 2 correctly
	um: mm: check more comprehensively for stub changes
	um: defer killing userspace on page table update failures
	irqchip/loongson-pch-msi: Use bitmap_zalloc() to allocate bitmap
	f2fs: fix out-of-repair __setattr_copy()
	f2fs: enforce the immutable flag on open files
	f2fs: flush data when enabling checkpoint back
	sparc32: fix a user-triggerable oops in clear_user()
	spi: fsl: invert spisel_boot signal on MPC8309
	spi: spi-synquacer: fix set_cs handling
	gfs2: fix glock confusion in function signal_our_withdraw
	gfs2: Don't skip dlm unlock if glock has an lvb
	gfs2: Lock imbalance on error path in gfs2_recover_one
	gfs2: Recursive gfs2_quota_hold in gfs2_iomap_end
	dm: fix deadlock when swapping to encrypted device
	dm table: fix iterate_devices based device capability checks
	dm table: fix DAX iterate_devices based device capability checks
	dm table: fix zoned iterate_devices based device capability checks
	dm writecache: fix performance degradation in ssd mode
	dm writecache: return the exact table values that were set
	dm writecache: fix writing beyond end of underlying device when shrinking
	dm era: Recover committed writeset after crash
	dm era: Update in-core bitset after committing the metadata
	dm era: Verify the data block size hasn't changed
	dm era: Fix bitset memory leaks
	dm era: Use correct value size in equality function of writeset tree
	dm era: Reinitialize bitset cache before digesting a new writeset
	dm era: only resize metadata in preresume
	drm/i915: Reject 446-480MHz HDMI clock on GLK
	kgdb: fix to kill breakpoints on initmem after boot
	ipv6: silence compilation warning for non-IPV6 builds
	net: icmp: pass zeroed opts from icmp{,v6}_ndo_send before sending
	wireguard: selftests: test multiple parallel streams
	wireguard: queueing: get rid of per-peer ring buffers
	net: sched: fix police ext initialization
	net: qrtr: Fix memory leak in qrtr_tun_open
	net_sched: fix RTNL deadlock again caused by request_module()
	ARM: dts: aspeed: Add LCLK to lpc-snoop
	Linux 5.10.20

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I3fbcecd9413ce212dac68d5cc800c9457feba56a
2021-03-07 12:33:33 +01:00

2358 lines
59 KiB
C

// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
enum scan_result {
SCAN_FAIL,
SCAN_SUCCEED,
SCAN_PMD_NULL,
SCAN_EXCEED_NONE_PTE,
SCAN_EXCEED_SWAP_PTE,
SCAN_EXCEED_SHARED_PTE,
SCAN_PTE_NON_PRESENT,
SCAN_PTE_UFFD_WP,
SCAN_PAGE_RO,
SCAN_LACK_REFERENCED_PAGE,
SCAN_PAGE_NULL,
SCAN_SCAN_ABORT,
SCAN_PAGE_COUNT,
SCAN_PAGE_LRU,
SCAN_PAGE_LOCK,
SCAN_PAGE_ANON,
SCAN_PAGE_COMPOUND,
SCAN_ANY_PROCESS,
SCAN_VMA_NULL,
SCAN_VMA_CHECK,
SCAN_ADDRESS_RANGE,
SCAN_SWAP_CACHE_PAGE,
SCAN_DEL_PAGE_LRU,
SCAN_ALLOC_HUGE_PAGE_FAIL,
SCAN_CGROUP_CHARGE_FAIL,
SCAN_TRUNCATED,
SCAN_PAGE_HAS_PRIVATE,
};
#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static unsigned long khugepaged_sleep_expire;
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
* default collapse hugepages if there is at least one pte mapped like
* it would have happened if the vma was large enough during page
* fault.
*/
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
static struct kmem_cache *mm_slot_cache __read_mostly;
#define MAX_PTE_MAPPED_THP 8
/**
* struct mm_slot - hash lookup from mm to mm_slot
* @hash: hash collision list
* @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
* @mm: the mm that this information is valid for
*/
struct mm_slot {
struct hlist_node hash;
struct list_head mm_node;
struct mm_struct *mm;
/* pte-mapped THP in this mm */
int nr_pte_mapped_thp;
unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
};
/**
* struct khugepaged_scan - cursor for scanning
* @mm_head: the head of the mm list to scan
* @mm_slot: the current mm_slot we are scanning
* @address: the next address inside that to be scanned
*
* There is only the one khugepaged_scan instance of this cursor structure.
*/
struct khugepaged_scan {
struct list_head mm_head;
struct mm_slot *mm_slot;
unsigned long address;
};
static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
#ifdef CONFIG_SYSFS
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}
static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
unsigned long msecs;
int err;
err = kstrtoul(buf, 10, &msecs);
if (err || msecs > UINT_MAX)
return -EINVAL;
khugepaged_scan_sleep_millisecs = msecs;
khugepaged_sleep_expire = 0;
wake_up_interruptible(&khugepaged_wait);
return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
scan_sleep_millisecs_store);
static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}
static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
unsigned long msecs;
int err;
err = kstrtoul(buf, 10, &msecs);
if (err || msecs > UINT_MAX)
return -EINVAL;
khugepaged_alloc_sleep_millisecs = msecs;
khugepaged_sleep_expire = 0;
wake_up_interruptible(&khugepaged_wait);
return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
alloc_sleep_millisecs_store);
static ssize_t pages_to_scan_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int err;
unsigned long pages;
err = kstrtoul(buf, 10, &pages);
if (err || !pages || pages > UINT_MAX)
return -EINVAL;
khugepaged_pages_to_scan = pages;
return count;
}
static struct kobj_attribute pages_to_scan_attr =
__ATTR(pages_to_scan, 0644, pages_to_scan_show,
pages_to_scan_store);
static ssize_t pages_collapsed_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
__ATTR_RO(pages_collapsed);
static ssize_t full_scans_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
__ATTR_RO(full_scans);
static ssize_t khugepaged_defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return single_hugepage_flag_show(kobj, attr, buf,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t khugepaged_defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
return single_hugepage_flag_store(kobj, attr, buf, count,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
__ATTR(defrag, 0644, khugepaged_defrag_show,
khugepaged_defrag_store);
/*
* max_ptes_none controls if khugepaged should collapse hugepages over
* any unmapped ptes in turn potentially increasing the memory
* footprint of the vmas. When max_ptes_none is 0 khugepaged will not
* reduce the available free memory in the system as it
* runs. Increasing max_ptes_none will instead potentially reduce the
* free memory in the system during the khugepaged scan.
*/
static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
}
static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int err;
unsigned long max_ptes_none;
err = kstrtoul(buf, 10, &max_ptes_none);
if (err || max_ptes_none > HPAGE_PMD_NR-1)
return -EINVAL;
khugepaged_max_ptes_none = max_ptes_none;
return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
khugepaged_max_ptes_none_store);
static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
}
static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int err;
unsigned long max_ptes_swap;
err = kstrtoul(buf, 10, &max_ptes_swap);
if (err || max_ptes_swap > HPAGE_PMD_NR-1)
return -EINVAL;
khugepaged_max_ptes_swap = max_ptes_swap;
return count;
}
static struct kobj_attribute khugepaged_max_ptes_swap_attr =
__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
khugepaged_max_ptes_swap_store);
static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
}
static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
int err;
unsigned long max_ptes_shared;
err = kstrtoul(buf, 10, &max_ptes_shared);
if (err || max_ptes_shared > HPAGE_PMD_NR-1)
return -EINVAL;
khugepaged_max_ptes_shared = max_ptes_shared;
return count;
}
static struct kobj_attribute khugepaged_max_ptes_shared_attr =
__ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
khugepaged_max_ptes_shared_store);
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
&khugepaged_max_ptes_none_attr.attr,
&khugepaged_max_ptes_swap_attr.attr,
&khugepaged_max_ptes_shared_attr.attr,
&pages_to_scan_attr.attr,
&pages_collapsed_attr.attr,
&full_scans_attr.attr,
&scan_sleep_millisecs_attr.attr,
&alloc_sleep_millisecs_attr.attr,
NULL,
};
struct attribute_group khugepaged_attr_group = {
.attrs = khugepaged_attr,
.name = "khugepaged",
};
#endif /* CONFIG_SYSFS */
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
switch (advice) {
case MADV_HUGEPAGE:
#ifdef CONFIG_S390
/*
* qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
* can't handle this properly after s390_enable_sie, so we simply
* ignore the madvise to prevent qemu from causing a SIGSEGV.
*/
if (mm_has_pgste(vma->vm_mm))
return 0;
#endif
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
/*
* If the vma become good for khugepaged to scan,
* register it here without waiting a page fault that
* may not happen any time soon.
*/
if (!(*vm_flags & VM_NO_KHUGEPAGED) &&
khugepaged_enter_vma_merge(vma, *vm_flags))
return -ENOMEM;
break;
case MADV_NOHUGEPAGE:
*vm_flags &= ~VM_HUGEPAGE;
*vm_flags |= VM_NOHUGEPAGE;
/*
* Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
* this vma even if we leave the mm registered in khugepaged if
* it got registered before VM_NOHUGEPAGE was set.
*/
break;
}
return 0;
}
int __init khugepaged_init(void)
{
mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
sizeof(struct mm_slot),
__alignof__(struct mm_slot), 0, NULL);
if (!mm_slot_cache)
return -ENOMEM;
khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
return 0;
}
void __init khugepaged_destroy(void)
{
kmem_cache_destroy(mm_slot_cache);
}
static inline struct mm_slot *alloc_mm_slot(void)
{
if (!mm_slot_cache) /* initialization failed */
return NULL;
return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}
static inline void free_mm_slot(struct mm_slot *mm_slot)
{
kmem_cache_free(mm_slot_cache, mm_slot);
}
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
if (mm == mm_slot->mm)
return mm_slot;
return NULL;
}
static void insert_to_mm_slots_hash(struct mm_struct *mm,
struct mm_slot *mm_slot)
{
mm_slot->mm = mm;
hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
}
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
return atomic_read(&mm->mm_users) == 0;
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
unsigned long vm_flags)
{
/* Explicitly disabled through madvise. */
if ((vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
return false;
/* Enabled via shmem mount options or sysfs settings. */
if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
HPAGE_PMD_NR);
}
/* THP settings require madvise. */
if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
return false;
/* Read-only file mappings need to be aligned for THP to work. */
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
(vm_flags & VM_DENYWRITE)) {
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
HPAGE_PMD_NR);
}
if (!vma->anon_vma || vma->vm_ops)
return false;
if (vma_is_temporary_stack(vma))
return false;
return !(vm_flags & VM_NO_KHUGEPAGED);
}
int __khugepaged_enter(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
int wakeup;
mm_slot = alloc_mm_slot();
if (!mm_slot)
return -ENOMEM;
/* __khugepaged_exit() must not run from under us */
VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
return 0;
}
spin_lock(&khugepaged_mm_lock);
insert_to_mm_slots_hash(mm, mm_slot);
/*
* Insert just behind the scanning cursor, to let the area settle
* down a little.
*/
wakeup = list_empty(&khugepaged_scan.mm_head);
list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
spin_unlock(&khugepaged_mm_lock);
mmgrab(mm);
if (wakeup)
wake_up_interruptible(&khugepaged_wait);
return 0;
}
int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
unsigned long vm_flags)
{
unsigned long hstart, hend;
/*
* khugepaged only supports read-only files for non-shmem files.
* khugepaged does not yet work on special mappings. And
* file-private shmem THP is not supported.
*/
if (!hugepage_vma_check(vma, vm_flags))
return 0;
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
return khugepaged_enter(vma, vm_flags);
return 0;
}
void __khugepaged_exit(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
int free = 0;
spin_lock(&khugepaged_mm_lock);
mm_slot = get_mm_slot(mm);
if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
hash_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
free = 1;
}
spin_unlock(&khugepaged_mm_lock);
if (free) {
clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
free_mm_slot(mm_slot);
mmdrop(mm);
} else if (mm_slot) {
/*
* This is required to serialize against
* khugepaged_test_exit() (which is guaranteed to run
* under mmap sem read mode). Stop here (after we
* return all pagetables will be destroyed) until
* khugepaged has finished working on the pagetables
* under the mmap_lock.
*/
mmap_write_lock(mm);
mmap_write_unlock(mm);
}
}
static void release_pte_page(struct page *page)
{
mod_node_page_state(page_pgdat(page),
NR_ISOLATED_ANON + page_is_file_lru(page),
-compound_nr(page));
unlock_page(page);
putback_lru_page(page);
}
static void release_pte_pages(pte_t *pte, pte_t *_pte,
struct list_head *compound_pagelist)
{
struct page *page, *tmp;
while (--_pte >= pte) {
pte_t pteval = *_pte;
page = pte_page(pteval);
if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
!PageCompound(page))
release_pte_page(page);
}
list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
list_del(&page->lru);
release_pte_page(page);
}
}
static bool is_refcount_suitable(struct page *page)
{
int expected_refcount;
expected_refcount = total_mapcount(page);
if (PageSwapCache(page))
expected_refcount += compound_nr(page);
return page_count(page) == expected_refcount;
}
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
struct list_head *compound_pagelist)
{
struct page *page = NULL;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
bool writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
if (!userfaultfd_armed(vma) &&
++none_or_zero <= khugepaged_max_ptes_none) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
goto out;
}
}
if (!pte_present(pteval)) {
result = SCAN_PTE_NON_PRESENT;
goto out;
}
page = vm_normal_page(vma, address, pteval);
if (unlikely(!page)) {
result = SCAN_PAGE_NULL;
goto out;
}
VM_BUG_ON_PAGE(!PageAnon(page), page);
if (page_mapcount(page) > 1 &&
++shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
goto out;
}
if (PageCompound(page)) {
struct page *p;
page = compound_head(page);
/*
* Check if we have dealt with the compound page
* already
*/
list_for_each_entry(p, compound_pagelist, lru) {
if (page == p)
goto next;
}
}
/*
* We can do it before isolate_lru_page because the
* page can't be freed from under us. NOTE: PG_lock
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
if (!trylock_page(page)) {
result = SCAN_PAGE_LOCK;
goto out;
}
/*
* Check if the page has any GUP (or other external) pins.
*
* The page table that maps the page has been already unlinked
* from the page table tree and this process cannot get
* an additinal pin on the page.
*
* New pins can come later if the page is shared across fork,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
if (!is_refcount_suitable(page)) {
unlock_page(page);
result = SCAN_PAGE_COUNT;
goto out;
}
if (!pte_write(pteval) && PageSwapCache(page) &&
!reuse_swap_page(page, NULL)) {
/*
* Page is in the swap cache and cannot be re-used.
* It cannot be collapsed into a THP.
*/
unlock_page(page);
result = SCAN_SWAP_CACHE_PAGE;
goto out;
}
/*
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
*/
if (isolate_lru_page(page)) {
unlock_page(page);
result = SCAN_DEL_PAGE_LRU;
goto out;
}
mod_node_page_state(page_pgdat(page),
NR_ISOLATED_ANON + page_is_file_lru(page),
compound_nr(page));
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
if (PageCompound(page))
list_add_tail(&page->lru, compound_pagelist);
next:
/* There should be enough young pte to collapse the page */
if (pte_young(pteval) ||
page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced++;
if (pte_write(pteval))
writable = true;
}
if (likely(writable)) {
if (likely(referenced)) {
result = SCAN_SUCCEED;
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
return 1;
}
} else {
result = SCAN_PAGE_RO;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
return 0;
}
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
struct vm_area_struct *vma,
unsigned long address,
spinlock_t *ptl,
struct list_head *compound_pagelist)
{
struct page *src_page, *tmp;
pte_t *_pte;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, page++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
clear_user_highpage(page, address);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
if (is_zero_pfn(pte_pfn(pteval))) {
/*
* ptl mostly unnecessary.
*/
spin_lock(ptl);
/*
* paravirt calls inside pte_clear here are
* superfluous.
*/
pte_clear(vma->vm_mm, address, _pte);
spin_unlock(ptl);
}
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
if (!PageCompound(src_page))
release_pte_page(src_page);
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
* inside page_remove_rmap().
*/
spin_lock(ptl);
/*
* paravirt calls inside pte_clear here are
* superfluous.
*/
pte_clear(vma->vm_mm, address, _pte);
page_remove_rmap(src_page, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
}
list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
list_del(&src_page->lru);
release_pte_page(src_page);
}
}
static void khugepaged_alloc_sleep(void)
{
DEFINE_WAIT(wait);
add_wait_queue(&khugepaged_wait, &wait);
freezable_schedule_timeout_interruptible(
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
remove_wait_queue(&khugepaged_wait, &wait);
}
static int khugepaged_node_load[MAX_NUMNODES];
static bool khugepaged_scan_abort(int nid)
{
int i;
/*
* If node_reclaim_mode is disabled, then no extra effort is made to
* allocate memory locally.
*/
if (!node_reclaim_mode)
return false;
/* If there is a count for this node already, it must be acceptable */
if (khugepaged_node_load[nid])
return false;
for (i = 0; i < MAX_NUMNODES; i++) {
if (!khugepaged_node_load[i])
continue;
if (node_distance(nid, i) > node_reclaim_distance)
return true;
}
return false;
}
/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
}
#ifdef CONFIG_NUMA
static int khugepaged_find_target_node(void)
{
static int last_khugepaged_target_node = NUMA_NO_NODE;
int nid, target_node = 0, max_value = 0;
/* find first node with max normal pages hit */
for (nid = 0; nid < MAX_NUMNODES; nid++)
if (khugepaged_node_load[nid] > max_value) {
max_value = khugepaged_node_load[nid];
target_node = nid;
}
/* do some balance if several nodes have the same hit record */
if (target_node <= last_khugepaged_target_node)
for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
nid++)
if (max_value == khugepaged_node_load[nid]) {
target_node = nid;
break;
}
last_khugepaged_target_node = target_node;
return target_node;
}
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
if (IS_ERR(*hpage)) {
if (!*wait)
return false;
*wait = false;
*hpage = NULL;
khugepaged_alloc_sleep();
} else if (*hpage) {
put_page(*hpage);
*hpage = NULL;
}
return true;
}
static struct page *
khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
{
VM_BUG_ON_PAGE(*hpage, *hpage);
*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
return NULL;
}
prep_transhuge_page(*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
return *hpage;
}
#else
static int khugepaged_find_target_node(void)
{
return 0;
}
static inline struct page *alloc_khugepaged_hugepage(void)
{
struct page *page;
page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
HPAGE_PMD_ORDER);
if (page)
prep_transhuge_page(page);
return page;
}
static struct page *khugepaged_alloc_hugepage(bool *wait)
{
struct page *hpage;
do {
hpage = alloc_khugepaged_hugepage();
if (!hpage) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
if (!*wait)
return NULL;
*wait = false;
khugepaged_alloc_sleep();
} else
count_vm_event(THP_COLLAPSE_ALLOC);
} while (unlikely(!hpage) && likely(khugepaged_enabled()));
return hpage;
}
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
/*
* If the hpage allocated earlier was briefly exposed in page cache
* before collapse_file() failed, it is possible that racing lookups
* have not yet completed, and would then be unpleasantly surprised by
* finding the hpage reused for the same mapping at a different offset.
* Just release the previous allocation if there is any danger of that.
*/
if (*hpage && page_count(*hpage) > 1) {
put_page(*hpage);
*hpage = NULL;
}
if (!*hpage)
*hpage = khugepaged_alloc_hugepage(wait);
if (unlikely(!*hpage))
return false;
return true;
}
static struct page *
khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
{
VM_BUG_ON(!*hpage);
return *hpage;
}
#endif
/*
* If mmap_lock temporarily dropped, revalidate vma
* before taking mmap_lock.
* Return 0 if succeeds, otherwise return none-zero
* value (scan code).
*/
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct vm_area_struct **vmap)
{
struct vm_area_struct *vma;
unsigned long hstart, hend;
if (unlikely(khugepaged_test_exit(mm)))
return SCAN_ANY_PROCESS;
*vmap = vma = find_vma(mm, address);
if (!vma)
return SCAN_VMA_NULL;
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
return SCAN_ADDRESS_RANGE;
if (!hugepage_vma_check(vma, vma->vm_flags))
return SCAN_VMA_CHECK;
/* Anon VMA expected */
if (!vma->anon_vma || vma->vm_ops)
return SCAN_VMA_CHECK;
return 0;
}
/*
* Bring missing pages in from swap, to complete THP collapse.
* Only done if khugepaged_scan_pmd believes it is worthwhile.
*
* Called and returns without pte mapped or spinlocks held,
* but with mmap_lock held to protect against vma changes.
*/
static bool __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd,
int referenced)
{
int swapped_in = 0;
vm_fault_t ret = 0;
unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
for (address = haddr; address < end; address += PAGE_SIZE) {
struct vm_fault vmf = {
.vma = vma,
.address = address,
.pgoff = linear_page_index(vma, haddr),
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
.vma_flags = vma->vm_flags,
.vma_page_prot = vma->vm_page_prot,
};
vmf.pte = pte_offset_map(pmd, address);
vmf.orig_pte = *vmf.pte;
if (!is_swap_pte(vmf.orig_pte)) {
pte_unmap(vmf.pte);
continue;
}
swapped_in++;
ret = do_swap_page(&vmf);
/* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
if (ret & VM_FAULT_RETRY) {
mmap_read_lock(mm);
if (hugepage_vma_revalidate(mm, haddr, &vma)) {
/* vma is no longer available, don't continue to swapin */
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
/* check if the pmd is still valid */
if (mm_find_pmd(mm, haddr) != pmd) {
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
}
if (ret & VM_FAULT_ERROR) {
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
}
/* Drain LRU add pagevec to remove extra pin on the swapped in pages */
if (swapped_in)
lru_add_drain();
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
return true;
}
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
int node, int referenced, int unmapped)
{
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
pte_t *pte;
pgtable_t pgtable;
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
int isolated = 0, result = 0;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
gfp_t gfp;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
/* Only allocate from the target node */
gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
/*
* Before allocating the hugepage, release the mmap_lock read lock.
* The allocation can take potentially a long time if it involves
* sync compaction, and we do not need to hold the mmap_lock during
* that. We will recheck the vma after taking it again in write mode.
*/
mmap_read_unlock(mm);
new_page = khugepaged_alloc_page(hpage, gfp, node);
if (!new_page) {
result = SCAN_ALLOC_HUGE_PAGE_FAIL;
goto out_nolock;
}
if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out_nolock;
}
count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
mmap_read_lock(mm);
result = hugepage_vma_revalidate(mm, address, &vma);
if (result) {
mmap_read_unlock(mm);
goto out_nolock;
}
pmd = mm_find_pmd(mm, address);
if (!pmd) {
result = SCAN_PMD_NULL;
mmap_read_unlock(mm);
goto out_nolock;
}
/*
* __collapse_huge_page_swapin always returns with mmap_lock locked.
* If it fails, we release mmap_lock and jump out_nolock.
* Continuing to collapse causes inconsistency.
*/
if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
pmd, referenced)) {
mmap_read_unlock(mm);
goto out_nolock;
}
mmap_read_unlock(mm);
/*
* Prevent all access to pagetables with the exception of
* gup_fast later handled by the ptep_clear_flush and the VM
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
/* check if the pmd is still valid */
if (mm_find_pmd(mm, address) != pmd)
goto out;
vm_write_begin(vma);
anon_vma_lock_write(vma->anon_vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
address, address + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
pte = pte_offset_map(pmd, address);
pte_ptl = pte_lockptr(mm, pmd);
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
* any huge TLB entry from the CPU so we won't allow
* huge and small TLB entries for the same virtual address
* to avoid the risk of CPU bugs in that area.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
spin_lock(pte_ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte,
&compound_pagelist);
spin_unlock(pte_ptl);
if (unlikely(!isolated)) {
pte_unmap(pte);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
/*
* We can only use set_pmd_at when establishing
* hugepmds and never for establishing regular pmds that
* points to regular pagetables. Use pmd_populate for that
*/
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
vm_write_end(vma);
result = SCAN_FAIL;
goto out;
}
/*
* All pages are isolated and locked so anon_vma rmap
* can't run anymore.
*/
anon_vma_unlock_write(vma->anon_vma);
__collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
&compound_pagelist);
pte_unmap(pte);
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
/*
* spin_lock() below is not the equivalent of smp_wmb(), so
* this is needed to avoid the copy_huge_page writes to become
* visible after the set_pmd_at() write.
*/
smp_wmb();
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
lru_cache_add_inactive_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
spin_unlock(pmd_ptl);
vm_write_end(vma);
*hpage = NULL;
khugepaged_pages_collapsed++;
result = SCAN_SUCCEED;
out_up_write:
mmap_write_unlock(mm);
out_nolock:
if (!IS_ERR_OR_NULL(*hpage))
mem_cgroup_uncharge(*hpage);
trace_mm_collapse_huge_page(mm, isolated, result);
return;
out:
goto out_up_write;
}
static int khugepaged_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
struct page **hpage)
{
pmd_t *pmd;
pte_t *pte, *_pte;
int ret = 0, result = 0, referenced = 0;
int none_or_zero = 0, shared = 0;
struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
bool writable = false;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
pmd = mm_find_pmd(mm, address);
if (!pmd) {
result = SCAN_PMD_NULL;
goto out;
}
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (is_swap_pte(pteval)) {
if (++unmapped <= khugepaged_max_ptes_swap) {
/*
* Always be strict with uffd-wp
* enabled swap entries. Please see
* comment below for pte_uffd_wp().
*/
if (pte_swp_uffd_wp(pteval)) {
result = SCAN_PTE_UFFD_WP;
goto out_unmap;
}
continue;
} else {
result = SCAN_EXCEED_SWAP_PTE;
goto out_unmap;
}
}
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
++none_or_zero <= khugepaged_max_ptes_none) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
goto out_unmap;
}
}
if (!pte_present(pteval)) {
result = SCAN_PTE_NON_PRESENT;
goto out_unmap;
}
if (pte_uffd_wp(pteval)) {
/*
* Don't collapse the page if any of the small
* PTEs are armed with uffd write protection.
* Here we can also mark the new huge pmd as
* write protected if any of the small ones is
* marked but that could bring uknown
* userfault messages that falls outside of
* the registered range. So, just be simple.
*/
result = SCAN_PTE_UFFD_WP;
goto out_unmap;
}
if (pte_write(pteval))
writable = true;
page = vm_normal_page(vma, _address, pteval);
if (unlikely(!page)) {
result = SCAN_PAGE_NULL;
goto out_unmap;
}
if (page_mapcount(page) > 1 &&
++shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
goto out_unmap;
}
page = compound_head(page);
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
* Khupaged will allocate hugepage from the node has the max
* hit record.
*/
node = page_to_nid(page);
if (khugepaged_scan_abort(node)) {
result = SCAN_SCAN_ABORT;
goto out_unmap;
}
khugepaged_node_load[node]++;
if (!PageLRU(page)) {
result = SCAN_PAGE_LRU;
goto out_unmap;
}
if (PageLocked(page)) {
result = SCAN_PAGE_LOCK;
goto out_unmap;
}
if (!PageAnon(page)) {
result = SCAN_PAGE_ANON;
goto out_unmap;
}
/*
* Check if the page has any GUP (or other external) pins.
*
* Here the check is racy it may see totmal_mapcount > refcount
* in some cases.
* For example, one process with one forked child process.
* The parent has the PMD split due to MADV_DONTNEED, then
* the child is trying unmap the whole PMD, but khugepaged
* may be scanning the parent between the child has
* PageDoubleMap flag cleared and dec the mapcount. So
* khugepaged may see total_mapcount > refcount.
*
* But such case is ephemeral we could always retry collapse
* later. However it may report false positive if the page
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
if (!is_refcount_suitable(page)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
if (pte_young(pteval) ||
page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced++;
}
if (!writable) {
result = SCAN_PAGE_RO;
} else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
ret = 1;
}
out_unmap:
pte_unmap_unlock(pte, ptl);
if (ret) {
node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_lock released */
collapse_huge_page(mm, address, hpage, node,
referenced, unmapped);
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
none_or_zero, result, unmapped);
return ret;
}
static void collect_mm_slot(struct mm_slot *mm_slot)
{
struct mm_struct *mm = mm_slot->mm;
lockdep_assert_held(&khugepaged_mm_lock);
if (khugepaged_test_exit(mm)) {
/* free mm_slot */
hash_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
/*
* Not strictly needed because the mm exited already.
*
* clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
*/
/* khugepaged_mm_lock actually not necessary for the below */
free_mm_slot(mm_slot);
mmdrop(mm);
}
}
#ifdef CONFIG_SHMEM
/*
* Notify khugepaged that given addr of the mm is pte-mapped THP. Then
* khugepaged should try to collapse the page table.
*/
static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
unsigned long addr)
{
struct mm_slot *mm_slot;
VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
spin_lock(&khugepaged_mm_lock);
mm_slot = get_mm_slot(mm);
if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
spin_unlock(&khugepaged_mm_lock);
return 0;
}
/**
* Try to collapse a pte-mapped THP for mm at address haddr.
*
* This function checks whether all the PTEs in the PMD are pointing to the
* right THP. If so, retract the page table so the THP can refault in with
* as pmd-mapped.
*/
void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
{
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = find_vma(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
pmd_t *pmd, _pmd;
spinlock_t *ptl;
int count = 0;
int i;
if (!vma || !vma->vm_file ||
vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
return;
/*
* This vm_flags may not have VM_HUGEPAGE if the page was not
* collapsed by this mm. But we can still collapse if the page is
* the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
* will not fail the vma for missing VM_HUGEPAGE
*/
if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
return;
hpage = find_lock_page(vma->vm_file->f_mapping,
linear_page_index(vma, haddr));
if (!hpage)
return;
if (!PageHead(hpage))
goto drop_hpage;
pmd = mm_find_pmd(mm, haddr);
if (!pmd)
goto drop_hpage;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
struct page *page;
/* empty pte, skip */
if (pte_none(*pte))
continue;
/* page swapped out, abort */
if (!pte_present(*pte))
goto abort;
page = vm_normal_page(vma, addr, *pte);
/*
* Note that uprobe, debugger, or MAP_PRIVATE may change the
* page table, but the new page will not be a subpage of hpage.
*/
if (hpage + i != page)
goto abort;
count++;
}
/* step 2: adjust rmap */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
struct page *page;
if (pte_none(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
page_remove_rmap(page, false);
}
pte_unmap_unlock(start_pte, ptl);
/* step 3: set proper refcount and mm_counters. */
if (count) {
page_ref_sub(hpage, count);
add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
}
/* step 4: collapse pmd */
ptl = pmd_lock(vma->vm_mm, pmd);
_pmd = pmdp_collapse_flush(vma, haddr, pmd);
spin_unlock(ptl);
mm_dec_nr_ptes(mm);
pte_free(mm, pmd_pgtable(_pmd));
drop_hpage:
unlock_page(hpage);
put_page(hpage);
return;
abort:
pte_unmap_unlock(start_pte, ptl);
goto drop_hpage;
}
static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
{
struct mm_struct *mm = mm_slot->mm;
int i;
if (likely(mm_slot->nr_pte_mapped_thp == 0))
return 0;
if (!mmap_write_trylock(mm))
return -EBUSY;
if (unlikely(khugepaged_test_exit(mm)))
goto out;
for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
out:
mm_slot->nr_pte_mapped_thp = 0;
mmap_write_unlock(mm);
return 0;
}
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
* got written to. These VMAs are likely not worth investing
* mmap_write_lock(mm) as PMD-mapping is likely to be split
* later.
*
* Not that vma->anon_vma check is racy: it can be set up after
* the check but before we took mmap_lock by the fault path.
* But page lock would prevent establishing any new ptes of the
* page, so we are safe.
*
* An alternative would be drop the check, but check that page
* table is clear before calling pmdp_collapse_flush() under
* ptl. It has higher chance to recover THP for the VMA, but
* has higher cost too.
*/
if (vma->anon_vma)
continue;
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (addr & ~HPAGE_PMD_MASK)
continue;
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
continue;
mm = vma->vm_mm;
pmd = mm_find_pmd(mm, addr);
if (!pmd)
continue;
/*
* We need exclusive mmap_lock to retract page table.
*
* We use trylock due to lock inversion: we need to acquire
* mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
*/
if (mmap_write_trylock(mm)) {
if (!khugepaged_test_exit(mm)) {
spinlock_t *ptl = pmd_lock(mm, pmd);
/* assume page table is clear */
_pmd = pmdp_collapse_flush(vma, addr, pmd);
spin_unlock(ptl);
mm_dec_nr_ptes(mm);
pte_free(mm, pmd_pgtable(_pmd));
}
mmap_write_unlock(mm);
} else {
/* Try again later */
khugepaged_add_pte_mapped_thp(mm, addr);
}
}
i_mmap_unlock_write(mapping);
}
/**
* collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
*
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
* - scan page cache replacing old pages with the new one
* + swap/gup in pages if necessary;
* + fill in gaps;
* + keep old pages around in case rollback is required;
* - if replacing succeeds:
* + copy data over;
* + free old pages;
* + unlock huge page;
* - if replacing failed;
* + put all pages back and unfreeze them;
* + restore gaps in the page cache;
* + unlock and free huge page;
*/
static void collapse_file(struct mm_struct *mm,
struct file *file, pgoff_t start,
struct page **hpage, int node)
{
struct address_space *mapping = file->f_mapping;
gfp_t gfp;
struct page *new_page;
pgoff_t index, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
bool is_shmem = shmem_file(file);
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
/* Only allocate from the target node */
gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
new_page = khugepaged_alloc_page(hpage, gfp, node);
if (!new_page) {
result = SCAN_ALLOC_HUGE_PAGE_FAIL;
goto out;
}
if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out;
}
count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
/* This will be less messy when we use multi-index entries */
do {
xas_lock_irq(&xas);
xas_create_range(&xas);
if (!xas_error(&xas))
break;
xas_unlock_irq(&xas);
if (!xas_nomem(&xas, GFP_KERNEL)) {
result = SCAN_FAIL;
goto out;
}
} while (1);
__SetPageLocked(new_page);
if (is_shmem)
__SetPageSwapBacked(new_page);
new_page->index = start;
new_page->mapping = mapping;
/*
* At this point the new_page is locked and not up-to-date.
* It's safe to insert it into the page cache, because nobody would
* be able to map it or use it in another way until we unlock it.
*/
xas_set(&xas, start);
for (index = start; index < end; index++) {
struct page *page = xas_next(&xas);
VM_BUG_ON(index != xas.xa_index);
if (is_shmem) {
if (!page) {
/*
* Stop if extent has been truncated or
* hole-punched, and is now completely
* empty.
*/
if (index == start) {
if (!xas_next_entry(&xas, end - 1)) {
result = SCAN_TRUNCATED;
goto xa_locked;
}
xas_set(&xas, index);
}
if (!shmem_charge(mapping->host, 1)) {
result = SCAN_FAIL;
goto xa_locked;
}
xas_store(&xas, new_page);
nr_none++;
continue;
}
if (xa_is_value(page) || !PageUptodate(page)) {
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
if (shmem_getpage(mapping->host, index, &page,
SGP_NOHUGE)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
} else {
result = SCAN_PAGE_LOCK;
goto xa_locked;
}
} else { /* !is_shmem */
if (!page || xa_is_value(page)) {
xas_unlock_irq(&xas);
page_cache_sync_readahead(mapping, &file->f_ra,
file, index,
end - index);
/* drain pagevecs to help isolate_lru_page() */
lru_add_drain();
page = find_lock_page(mapping, index);
if (unlikely(page == NULL)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
} else if (PageDirty(page)) {
/*
* khugepaged only works on read-only fd,
* so this page is dirty because it hasn't
* been flushed since first write. There
* won't be new dirty pages.
*
* Trigger async flush here and hope the
* writeback is done when khugepaged
* revisits this page.
*
* This is a one-off situation. We are not
* forcing writeback in loop.
*/
xas_unlock_irq(&xas);
filemap_flush(mapping);
result = SCAN_FAIL;
goto xa_unlocked;
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
} else {
result = SCAN_PAGE_LOCK;
goto xa_locked;
}
}
/*
* The page must be locked, so we can drop the i_pages lock
* without racing with truncate.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
/* make sure the page is up to date */
if (unlikely(!PageUptodate(page))) {
result = SCAN_FAIL;
goto out_unlock;
}
/*
* If file was truncated then extended, or hole-punched, before
* we locked the first page, then a THP might be there already.
*/
if (PageTransCompound(page)) {
result = SCAN_PAGE_COMPOUND;
goto out_unlock;
}
if (page_mapping(page) != mapping) {
result = SCAN_TRUNCATED;
goto out_unlock;
}
if (!is_shmem && PageDirty(page)) {
/*
* khugepaged only works on read-only fd, so this
* page is dirty because it hasn't been flushed
* since first write.
*/
result = SCAN_FAIL;
goto out_unlock;
}
if (isolate_lru_page(page)) {
result = SCAN_DEL_PAGE_LRU;
goto out_unlock;
}
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL)) {
result = SCAN_PAGE_HAS_PRIVATE;
putback_lru_page(page);
goto out_unlock;
}
if (page_mapped(page))
unmap_mapping_pages(mapping, index, 1, false);
xas_lock_irq(&xas);
xas_set(&xas, index);
VM_BUG_ON_PAGE(page != xas_load(&xas), page);
VM_BUG_ON_PAGE(page_mapped(page), page);
/*
* The page is expected to have page_count() == 3:
* - we hold a pin on it;
* - one reference from page cache;
* - one from isolate_lru_page;
*/
if (!page_ref_freeze(page, 3)) {
result = SCAN_PAGE_COUNT;
xas_unlock_irq(&xas);
putback_lru_page(page);
goto out_unlock;
}
/*
* Add the page to the list to be able to undo the collapse if
* something go wrong.
*/
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
xas_store(&xas, new_page);
continue;
out_unlock:
unlock_page(page);
put_page(page);
goto xa_unlocked;
}
if (is_shmem)
__inc_node_page_state(new_page, NR_SHMEM_THPS);
else {
__inc_node_page_state(new_page, NR_FILE_THPS);
filemap_nr_thps_inc(mapping);
}
if (nr_none) {
__mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
if (is_shmem)
__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
}
xa_locked:
xas_unlock_irq(&xas);
xa_unlocked:
if (result == SCAN_SUCCEED) {
struct page *page, *tmp;
/*
* Replacing old pages with new one has succeeded, now we
* need to copy the content and free the old pages.
*/
index = start;
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
while (index < page->index) {
clear_highpage(new_page + (index % HPAGE_PMD_NR));
index++;
}
copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
page);
list_del(&page->lru);
page->mapping = NULL;
page_ref_unfreeze(page, 1);
ClearPageActive(page);
ClearPageUnevictable(page);
unlock_page(page);
put_page(page);
index++;
}
while (index < end) {
clear_highpage(new_page + (index % HPAGE_PMD_NR));
index++;
}
SetPageUptodate(new_page);
page_ref_add(new_page, HPAGE_PMD_NR - 1);
if (is_shmem)
set_page_dirty(new_page);
lru_cache_add(new_page);
/*
* Remove pte page tables, so we can re-fault the page as huge.
*/
retract_page_tables(mapping, start);
*hpage = NULL;
khugepaged_pages_collapsed++;
} else {
struct page *page;
/* Something went wrong: roll back page cache changes */
xas_lock_irq(&xas);
mapping->nrpages -= nr_none;
if (is_shmem)
shmem_uncharge(mapping->host, nr_none);
xas_set(&xas, start);
xas_for_each(&xas, page, end - 1) {
page = list_first_entry_or_null(&pagelist,
struct page, lru);
if (!page || xas.xa_index < page->index) {
if (!nr_none)
break;
nr_none--;
/* Put holes back where they were */
xas_store(&xas, NULL);
continue;
}
VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
/* Unfreeze the page. */
list_del(&page->lru);
page_ref_unfreeze(page, 2);
xas_store(&xas, page);
xas_pause(&xas);
xas_unlock_irq(&xas);
unlock_page(page);
putback_lru_page(page);
xas_lock_irq(&xas);
}
VM_BUG_ON(nr_none);
xas_unlock_irq(&xas);
new_page->mapping = NULL;
}
unlock_page(new_page);
out:
VM_BUG_ON(!list_empty(&pagelist));
if (!IS_ERR_OR_NULL(*hpage))
mem_cgroup_uncharge(*hpage);
/* TODO: tracepoints */
}
static void khugepaged_scan_file(struct mm_struct *mm,
struct file *file, pgoff_t start, struct page **hpage)
{
struct page *page = NULL;
struct address_space *mapping = file->f_mapping;
XA_STATE(xas, &mapping->i_pages, start);
int present, swap;
int node = NUMA_NO_NODE;
int result = SCAN_SUCCEED;
present = 0;
swap = 0;
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
rcu_read_lock();
xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
if (xas_retry(&xas, page))
continue;
if (xa_is_value(page)) {
if (++swap > khugepaged_max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
break;
}
continue;
}
if (PageTransCompound(page)) {
result = SCAN_PAGE_COMPOUND;
break;
}
node = page_to_nid(page);
if (khugepaged_scan_abort(node)) {
result = SCAN_SCAN_ABORT;
break;
}
khugepaged_node_load[node]++;
if (!PageLRU(page)) {
result = SCAN_PAGE_LRU;
break;
}
if (page_count(page) !=
1 + page_mapcount(page) + page_has_private(page)) {
result = SCAN_PAGE_COUNT;
break;
}
/*
* We probably should check if the page is referenced here, but
* nobody would transfer pte_young() to PageReferenced() for us.
* And rmap walk here is just too costly...
*/
present++;
if (need_resched()) {
xas_pause(&xas);
cond_resched_rcu();
}
}
rcu_read_unlock();
if (result == SCAN_SUCCEED) {
if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
} else {
node = khugepaged_find_target_node();
collapse_file(mm, file, start, hpage, node);
}
}
/* TODO: tracepoints */
}
#else
static void khugepaged_scan_file(struct mm_struct *mm,
struct file *file, pgoff_t start, struct page **hpage)
{
BUILD_BUG();
}
static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
{
return 0;
}
#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
struct page **hpage)
__releases(&khugepaged_mm_lock)
__acquires(&khugepaged_mm_lock)
{
struct mm_slot *mm_slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
int progress = 0;
VM_BUG_ON(!pages);
lockdep_assert_held(&khugepaged_mm_lock);
if (khugepaged_scan.mm_slot)
mm_slot = khugepaged_scan.mm_slot;
else {
mm_slot = list_entry(khugepaged_scan.mm_head.next,
struct mm_slot, mm_node);
khugepaged_scan.address = 0;
khugepaged_scan.mm_slot = mm_slot;
}
spin_unlock(&khugepaged_mm_lock);
khugepaged_collapse_pte_mapped_thps(mm_slot);
mm = mm_slot->mm;
/*
* Don't wait for semaphore (to avoid long wait times). Just move to
* the next mm on the list.
*/
vma = NULL;
if (unlikely(!mmap_read_trylock(mm)))
goto breakouterloop_mmap_lock;
if (likely(!khugepaged_test_exit(mm)))
vma = find_vma(mm, khugepaged_scan.address);
progress++;
for (; vma; vma = vma->vm_next) {
unsigned long hstart, hend;
cond_resched();
if (unlikely(khugepaged_test_exit(mm))) {
progress++;
break;
}
if (!hugepage_vma_check(vma, vma->vm_flags)) {
skip:
progress++;
continue;
}
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart >= hend)
goto skip;
if (khugepaged_scan.address > hend)
goto skip;
if (khugepaged_scan.address < hstart)
khugepaged_scan.address = hstart;
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
goto skip;
while (khugepaged_scan.address < hend) {
int ret;
cond_resched();
if (unlikely(khugepaged_test_exit(mm)))
goto breakouterloop;
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
mmap_read_unlock(mm);
ret = 1;
khugepaged_scan_file(mm, file, pgoff, hpage);
fput(file);
} else {
ret = khugepaged_scan_pmd(mm, vma,
khugepaged_scan.address,
hpage);
}
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
if (ret)
/* we released mmap_lock so break loop */
goto breakouterloop_mmap_lock;
if (progress >= pages)
goto breakouterloop;
}
}
breakouterloop:
mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
breakouterloop_mmap_lock:
spin_lock(&khugepaged_mm_lock);
VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
/*
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
*/
if (khugepaged_test_exit(mm) || !vma) {
/*
* Make sure that if mm_users is reaching zero while
* khugepaged runs here, khugepaged_exit will find
* mm_slot not pointing to the exiting mm.
*/
if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
khugepaged_scan.mm_slot = list_entry(
mm_slot->mm_node.next,
struct mm_slot, mm_node);
khugepaged_scan.address = 0;
} else {
khugepaged_scan.mm_slot = NULL;
khugepaged_full_scans++;
}
collect_mm_slot(mm_slot);
}
return progress;
}
static int khugepaged_has_work(void)
{
return !list_empty(&khugepaged_scan.mm_head) &&
khugepaged_enabled();
}
static int khugepaged_wait_event(void)
{
return !list_empty(&khugepaged_scan.mm_head) ||
kthread_should_stop();
}
static void khugepaged_do_scan(void)
{
struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
unsigned int pages = khugepaged_pages_to_scan;
bool wait = true;
barrier(); /* write khugepaged_pages_to_scan to local stack */
lru_add_drain_all();
while (progress < pages) {
if (!khugepaged_prealloc_page(&hpage, &wait))
break;
cond_resched();
if (unlikely(kthread_should_stop() || try_to_freeze()))
break;
spin_lock(&khugepaged_mm_lock);
if (!khugepaged_scan.mm_slot)
pass_through_head++;
if (khugepaged_has_work() &&
pass_through_head < 2)
progress += khugepaged_scan_mm_slot(pages - progress,
&hpage);
else
progress = pages;
spin_unlock(&khugepaged_mm_lock);
}
if (!IS_ERR_OR_NULL(hpage))
put_page(hpage);
}
static bool khugepaged_should_wakeup(void)
{
return kthread_should_stop() ||
time_after_eq(jiffies, khugepaged_sleep_expire);
}
static void khugepaged_wait_work(void)
{
if (khugepaged_has_work()) {
const unsigned long scan_sleep_jiffies =
msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
if (!scan_sleep_jiffies)
return;
khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
wait_event_freezable_timeout(khugepaged_wait,
khugepaged_should_wakeup(),
scan_sleep_jiffies);
return;
}
if (khugepaged_enabled())
wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
static int khugepaged(void *none)
{
struct mm_slot *mm_slot;
set_freezable();
set_user_nice(current, MAX_NICE);
while (!kthread_should_stop()) {
khugepaged_do_scan();
khugepaged_wait_work();
}
spin_lock(&khugepaged_mm_lock);
mm_slot = khugepaged_scan.mm_slot;
khugepaged_scan.mm_slot = NULL;
if (mm_slot)
collect_mm_slot(mm_slot);
spin_unlock(&khugepaged_mm_lock);
return 0;
}
static void set_recommended_min_free_kbytes(void)
{
struct zone *zone;
int nr_zones = 0;
unsigned long recommended_min;
for_each_populated_zone(zone) {
/*
* We don't need to worry about fragmentation of
* ZONE_MOVABLE since it only has movable pages.
*/
if (zone_idx(zone) > gfp_zone(GFP_USER))
continue;
nr_zones++;
}
/* Ensure 2 pageblocks are free to assist fragmentation avoidance */
recommended_min = pageblock_nr_pages * nr_zones * 2;
/*
* Make sure that on average at least two pageblocks are almost free
* of another type, one for a migratetype to fall back to and a
* second to avoid subsequent fallbacks of other types There are 3
* MIGRATE_TYPES we care about.
*/
recommended_min += pageblock_nr_pages * nr_zones *
MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
/* don't ever allow to reserve more than 5% of the lowmem */
recommended_min = min(recommended_min,
(unsigned long) nr_free_buffer_pages() / 20);
recommended_min <<= (PAGE_SHIFT-10);
if (recommended_min > min_free_kbytes) {
if (user_min_free_kbytes >= 0)
pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
min_free_kbytes, recommended_min);
min_free_kbytes = recommended_min;
}
setup_per_zone_wmarks();
}
int start_stop_khugepaged(void)
{
int err = 0;
mutex_lock(&khugepaged_mutex);
if (khugepaged_enabled()) {
if (!khugepaged_thread)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
if (IS_ERR(khugepaged_thread)) {
pr_err("khugepaged: kthread_run(khugepaged) failed\n");
err = PTR_ERR(khugepaged_thread);
khugepaged_thread = NULL;
goto fail;
}
if (!list_empty(&khugepaged_scan.mm_head))
wake_up_interruptible(&khugepaged_wait);
set_recommended_min_free_kbytes();
} else if (khugepaged_thread) {
kthread_stop(khugepaged_thread);
khugepaged_thread = NULL;
}
fail:
mutex_unlock(&khugepaged_mutex);
return err;
}
void khugepaged_min_free_kbytes_update(void)
{
mutex_lock(&khugepaged_mutex);
if (khugepaged_enabled() && khugepaged_thread)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}