android_kernel_xiaomi_sm8450/mm/madvise.c
Greg Kroah-Hartman 3ccfc59f82 This is the 5.10.24 stable release
-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAmBSKS0ACgkQONu9yGCS
 aT7Ngg//c4C1WnWC0sNWzP3xT2paCkLnUUyjQTmrkbPvLtr2DvehW5Bvp/32pGiS
 8mDMoTLq3QxNrfrU6SY3KavZRC9Pa+migAsVmuujygQwNphqv95/XxnFemFEAYTl
 b8b5OJPyomzMMEwHzx1Tr+7/d58czrqXo97QI0lmaDlHl+9JKTg2SMX9AkHkU8pK
 zYjbtzdhd9UZCTdVYY1ZFkQ1ik1iAWo3Xv0G2aMeQQpuGcZIh/Y66xBuyH+8g+Yz
 3mInhPQvhkb+c+m4ZJ9NhOUVEW4Fl0fq0mVrrYkfHqXe0D36Vj/yYvO/yTSBqb4+
 XQ5PLXX3KFVDFl1id94unXGgP3c0zBe30JZPqKdpSET+PzOlGiZTxMCfjPeTgu/Z
 7xc2qSX1zn273HMTRrT1daO4/NXQ85kE04mZMzq7cqDpum7ltfKrEMum/Gma+dJz
 Knn47oZHbSW4Er/WcAwHSeZpxvD7AahG/GlsQRy+IVPu/jMXJHmo2/Nv1fLJWp+G
 7VVWRXug69hywGr7hFiT3USG2C5g5cV3/dEO8NFFjGKRa5CbLbQD6B3+Dz3dXyBH
 jE3MGIoqoNk+SvJOAf2ogu7SS6wLynZWOchmAVvIQ4QEzcP2jroeFHKD49MYxDUE
 dKcq0dtfMc4nUaUZ/XRfWtS9fSm+T4XonmvEY4yXnAyfZ0aeEM8=
 =FdFm
 -----END PGP SIGNATURE-----

Merge 5.10.24 into android12-5.10-lts

Changes in 5.10.24
	uapi: nfnetlink_cthelper.h: fix userspace compilation error
	powerpc/perf: Fix handling of privilege level checks in perf interrupt context
	powerpc/pseries: Don't enforce MSI affinity with kdump
	ethernet: alx: fix order of calls on resume
	crypto: mips/poly1305 - enable for all MIPS processors
	ath9k: fix transmitting to stations in dynamic SMPS mode
	net: Fix gro aggregation for udp encaps with zero csum
	net: check if protocol extracted by virtio_net_hdr_set_proto is correct
	net: avoid infinite loop in mpls_gso_segment when mpls_hlen == 0
	net: l2tp: reduce log level of messages in receive path, add counter instead
	can: skb: can_skb_set_owner(): fix ref counting if socket was closed before setting skb ownership
	can: flexcan: assert FRZ bit in flexcan_chip_freeze()
	can: flexcan: enable RX FIFO after FRZ/HALT valid
	can: flexcan: invoke flexcan_chip_freeze() to enter freeze mode
	can: tcan4x5x: tcan4x5x_init(): fix initialization - clear MRAM before entering Normal Mode
	tcp: Fix sign comparison bug in getsockopt(TCP_ZEROCOPY_RECEIVE)
	tcp: add sanity tests to TCP_QUEUE_SEQ
	netfilter: nf_nat: undo erroneous tcp edemux lookup
	netfilter: x_tables: gpf inside xt_find_revision()
	net: always use icmp{,v6}_ndo_send from ndo_start_xmit
	net: phy: fix save wrong speed and duplex problem if autoneg is on
	selftests/bpf: Use the last page in test_snprintf_btf on s390
	selftests/bpf: No need to drop the packet when there is no geneve opt
	selftests/bpf: Mask bpf_csum_diff() return value to 16 bits in test_verifier
	samples, bpf: Add missing munmap in xdpsock
	libbpf: Clear map_info before each bpf_obj_get_info_by_fd
	ibmvnic: Fix possibly uninitialized old_num_tx_queues variable warning.
	ibmvnic: always store valid MAC address
	mt76: dma: do not report truncated frames to mac80211
	powerpc/603: Fix protection of user pages mapped with PROT_NONE
	mount: fix mounting of detached mounts onto targets that reside on shared mounts
	cifs: return proper error code in statfs(2)
	Revert "mm, slub: consider rest of partial list if acquire_slab() fails"
	docs: networking: drop special stable handling
	net: dsa: tag_rtl4_a: fix egress tags
	sh_eth: fix TRSCER mask for SH771x
	net: enetc: don't overwrite the RSS indirection table when initializing
	net: enetc: take the MDIO lock only once per NAPI poll cycle
	net: enetc: fix incorrect TPID when receiving 802.1ad tagged packets
	net: enetc: don't disable VLAN filtering in IFF_PROMISC mode
	net: enetc: force the RGMII speed and duplex instead of operating in inband mode
	net: enetc: remove bogus write to SIRXIDR from enetc_setup_rxbdr
	net: enetc: keep RX ring consumer index in sync with hardware
	net: ethernet: mtk-star-emac: fix wrong unmap in RX handling
	net/mlx4_en: update moderation when config reset
	net: stmmac: fix incorrect DMA channel intr enable setting of EQoS v4.10
	nexthop: Do not flush blackhole nexthops when loopback goes down
	net: sched: avoid duplicates in classes dump
	net: mscc: ocelot: properly reject destination IP keys in VCAP IS1
	net: dsa: sja1105: fix SGMII PCS being forced to SPEED_UNKNOWN instead of SPEED_10
	net: usb: qmi_wwan: allow qmimux add/del with master up
	netdevsim: init u64 stats for 32bit hardware
	cipso,calipso: resolve a number of problems with the DOI refcounts
	net: stmmac: Fix VLAN filter delete timeout issue in Intel mGBE SGMII
	stmmac: intel: Fixes clock registration error seen for multiple interfaces
	net: lapbether: Remove netif_start_queue / netif_stop_queue
	net: davicom: Fix regulator not turned off on failed probe
	net: davicom: Fix regulator not turned off on driver removal
	net: enetc: allow hardware timestamping on TX queues with tc-etf enabled
	net: qrtr: fix error return code of qrtr_sendmsg()
	s390/qeth: fix memory leak after failed TX Buffer allocation
	r8169: fix r8168fp_adjust_ocp_cmd function
	ixgbe: fail to create xfrm offload of IPsec tunnel mode SA
	tools/resolve_btfids: Fix build error with older host toolchains
	perf build: Fix ccache usage in $(CC) when generating arch errno table
	net: stmmac: stop each tx channel independently
	net: stmmac: fix watchdog timeout during suspend/resume stress test
	net: stmmac: fix wrongly set buffer2 valid when sph unsupport
	ethtool: fix the check logic of at least one channel for RX/TX
	net: phy: make mdio_bus_phy_suspend/resume as __maybe_unused
	selftests: forwarding: Fix race condition in mirror installation
	mlxsw: spectrum_ethtool: Add an external speed to PTYS register
	perf traceevent: Ensure read cmdlines are null terminated.
	perf report: Fix -F for branch & mem modes
	net: hns3: fix query vlan mask value error for flow director
	net: hns3: fix bug when calculating the TCAM table info
	s390/cio: return -EFAULT if copy_to_user() fails again
	bnxt_en: reliably allocate IRQ table on reset to avoid crash
	gpiolib: acpi: Add ACPI_GPIO_QUIRK_ABSOLUTE_NUMBER quirk
	gpiolib: acpi: Allow to find GpioInt() resource by name and index
	gpio: pca953x: Set IRQ type when handle Intel Galileo Gen 2
	gpio: fix gpio-device list corruption
	drm/compat: Clear bounce structures
	drm/amd/display: Add a backlight module option
	drm/amdgpu/display: use GFP_ATOMIC in dcn21_validate_bandwidth_fp()
	drm/amd/display: Fix nested FPU context in dcn21_validate_bandwidth()
	drm/amd/pm: bug fix for pcie dpm
	drm/amdgpu/display: simplify backlight setting
	drm/amdgpu/display: don't assert in set backlight function
	drm/amdgpu/display: handle aux backlight in backlight_get_brightness
	drm/shmem-helper: Check for purged buffers in fault handler
	drm/shmem-helper: Don't remove the offset in vm_area_struct pgoff
	drm: Use USB controller's DMA mask when importing dmabufs
	drm: meson_drv add shutdown function
	drm/shmem-helpers: vunmap: Don't put pages for dma-buf
	drm/i915: Wedge the GPU if command parser setup fails
	s390/cio: return -EFAULT if copy_to_user() fails
	s390/crypto: return -EFAULT if copy_to_user() fails
	qxl: Fix uninitialised struct field head.surface_id
	sh_eth: fix TRSCER mask for R7S9210
	media: usbtv: Fix deadlock on suspend
	media: rkisp1: params: fix wrong bits settings
	media: v4l: vsp1: Fix uif null pointer access
	media: v4l: vsp1: Fix bru null pointer access
	media: rc: compile rc-cec.c into rc-core
	cifs: fix credit accounting for extra channel
	net: hns3: fix error mask definition of flow director
	s390/qeth: don't replace a fully completed async TX buffer
	s390/qeth: remove QETH_QDIO_BUF_HANDLED_DELAYED state
	s390/qeth: improve completion of pending TX buffers
	s390/qeth: fix notification for pending buffers during teardown
	net: dsa: implement a central TX reallocation procedure
	net: dsa: tag_ksz: don't allocate additional memory for padding/tagging
	net: dsa: trailer: don't allocate additional memory for padding/tagging
	net: dsa: tag_qca: let DSA core deal with TX reallocation
	net: dsa: tag_ocelot: let DSA core deal with TX reallocation
	net: dsa: tag_mtk: let DSA core deal with TX reallocation
	net: dsa: tag_lan9303: let DSA core deal with TX reallocation
	net: dsa: tag_edsa: let DSA core deal with TX reallocation
	net: dsa: tag_brcm: let DSA core deal with TX reallocation
	net: dsa: tag_dsa: let DSA core deal with TX reallocation
	net: dsa: tag_gswip: let DSA core deal with TX reallocation
	net: dsa: tag_ar9331: let DSA core deal with TX reallocation
	net: dsa: tag_mtk: fix 802.1ad VLAN egress
	enetc: Fix unused var build warning for CONFIG_OF
	net: enetc: initialize RFS/RSS memories for unused ports too
	ath11k: peer delete synchronization with firmware
	ath11k: start vdev if a bss peer is already created
	ath11k: fix AP mode for QCA6390
	i2c: rcar: faster irq code to minimize HW race condition
	i2c: rcar: optimize cacheline to minimize HW race condition
	scsi: ufs: WB is only available on LUN #0 to #7
	udf: fix silent AED tagLocation corruption
	iommu/vt-d: Clear PRQ overflow only when PRQ is empty
	mmc: mxs-mmc: Fix a resource leak in an error handling path in 'mxs_mmc_probe()'
	mmc: mediatek: fix race condition between msdc_request_timeout and irq
	mmc: sdhci-iproc: Add ACPI bindings for the RPi
	Platform: OLPC: Fix probe error handling
	powerpc/pci: Add ppc_md.discover_phbs()
	spi: stm32: make spurious and overrun interrupts visible
	powerpc: improve handling of unrecoverable system reset
	powerpc/perf: Record counter overflow always if SAMPLE_IP is unset
	HID: logitech-dj: add support for the new lightspeed connection iteration
	powerpc/64: Fix stack trace not displaying final frame
	iommu/amd: Fix performance counter initialization
	clk: qcom: gdsc: Implement NO_RET_PERIPH flag
	sparc32: Limit memblock allocation to low memory
	sparc64: Use arch_validate_flags() to validate ADI flag
	Input: applespi - don't wait for responses to commands indefinitely.
	PCI: xgene-msi: Fix race in installing chained irq handler
	PCI: mediatek: Add missing of_node_put() to fix reference leak
	drivers/base: build kunit tests without structleak plugin
	PCI/LINK: Remove bandwidth notification
	ext4: don't try to processed freed blocks until mballoc is initialized
	kbuild: clamp SUBLEVEL to 255
	PCI: Fix pci_register_io_range() memory leak
	i40e: Fix memory leak in i40e_probe
	kasan: fix memory corruption in kasan_bitops_tags test
	s390/smp: __smp_rescan_cpus() - move cpumask away from stack
	drivers/base/memory: don't store phys_device in memory blocks
	sysctl.c: fix underflow value setting risk in vm_table
	scsi: libiscsi: Fix iscsi_prep_scsi_cmd_pdu() error handling
	scsi: target: core: Add cmd length set before cmd complete
	scsi: target: core: Prevent underflow for service actions
	clk: qcom: gpucc-msm8998: Add resets, cxc, fix flags on gpu_gx_gdsc
	mmc: sdhci: Update firmware interface API
	ARM: 9029/1: Make iwmmxt.S support Clang's integrated assembler
	ARM: assembler: introduce adr_l, ldr_l and str_l macros
	ARM: efistub: replace adrl pseudo-op with adr_l macro invocation
	ALSA: usb: Add Plantronics C320-M USB ctrl msg delay quirk
	ALSA: hda/hdmi: Cancel pending works before suspend
	ALSA: hda/conexant: Add quirk for mute LED control on HP ZBook G5
	ALSA: hda/ca0132: Add Sound BlasterX AE-5 Plus support
	ALSA: hda: Drop the BATCH workaround for AMD controllers
	ALSA: hda: Flush pending unsolicited events before suspend
	ALSA: hda: Avoid spurious unsol event handling during S3/S4
	ALSA: usb-audio: Fix "cannot get freq eq" errors on Dell AE515 sound bar
	ALSA: usb-audio: Apply the control quirk to Plantronics headsets
	ALSA: usb-audio: Disable USB autosuspend properly in setup_disable_autosuspend()
	ALSA: usb-audio: fix NULL ptr dereference in usb_audio_probe
	ALSA: usb-audio: fix use after free in usb_audio_disconnect
	Revert 95ebabde382c ("capabilities: Don't allow writing ambiguous v3 file capabilities")
	block: Discard page cache of zone reset target range
	block: Try to handle busy underlying device on discard
	arm64: kasan: fix page_alloc tagging with DEBUG_VIRTUAL
	arm64: mte: Map hotplugged memory as Normal Tagged
	arm64: perf: Fix 64-bit event counter read truncation
	s390/dasd: fix hanging DASD driver unbind
	s390/dasd: fix hanging IO request during DASD driver unbind
	software node: Fix node registration
	xen/events: reset affinity of 2-level event when tearing it down
	mmc: mmci: Add MMC_CAP_NEED_RSP_BUSY for the stm32 variants
	mmc: core: Fix partition switch time for eMMC
	mmc: cqhci: Fix random crash when remove mmc module/card
	cifs: do not send close in compound create+close requests
	Goodix Fingerprint device is not a modem
	USB: gadget: udc: s3c2410_udc: fix return value check in s3c2410_udc_probe()
	USB: gadget: u_ether: Fix a configfs return code
	usb: gadget: f_uac2: always increase endpoint max_packet_size by one audio slot
	usb: gadget: f_uac1: stop playback on function disable
	usb: dwc3: qcom: Add missing DWC3 OF node refcount decrement
	usb: dwc3: qcom: add URS Host support for sdm845 ACPI boot
	usb: dwc3: qcom: add ACPI device id for sc8180x
	usb: dwc3: qcom: Honor wakeup enabled/disabled state
	USB: usblp: fix a hang in poll() if disconnected
	usb: renesas_usbhs: Clear PIPECFG for re-enabling pipe with other EPNUM
	usb: xhci: do not perform Soft Retry for some xHCI hosts
	xhci: Improve detection of device initiated wake signal.
	usb: xhci: Fix ASMedia ASM1042A and ASM3242 DMA addressing
	xhci: Fix repeated xhci wake after suspend due to uncleared internal wake state
	USB: serial: io_edgeport: fix memory leak in edge_startup
	USB: serial: ch341: add new Product ID
	USB: serial: cp210x: add ID for Acuity Brands nLight Air Adapter
	USB: serial: cp210x: add some more GE USB IDs
	usbip: fix stub_dev to check for stream socket
	usbip: fix vhci_hcd to check for stream socket
	usbip: fix vudc to check for stream socket
	usbip: fix stub_dev usbip_sockfd_store() races leading to gpf
	usbip: fix vhci_hcd attach_store() races leading to gpf
	usbip: fix vudc usbip_sockfd_store races leading to gpf
	Revert "serial: max310x: rework RX interrupt handling"
	misc/pvpanic: Export module FDT device table
	misc: fastrpc: restrict user apps from sending kernel RPC messages
	staging: rtl8192u: fix ->ssid overflow in r8192_wx_set_scan()
	staging: rtl8188eu: prevent ->ssid overflow in rtw_wx_set_scan()
	staging: rtl8712: unterminated string leads to read overflow
	staging: rtl8188eu: fix potential memory corruption in rtw_check_beacon_data()
	staging: ks7010: prevent buffer overflow in ks_wlan_set_scan()
	staging: rtl8712: Fix possible buffer overflow in r8712_sitesurvey_cmd
	staging: rtl8192e: Fix possible buffer overflow in _rtl92e_wx_set_scan
	staging: comedi: addi_apci_1032: Fix endian problem for COS sample
	staging: comedi: addi_apci_1500: Fix endian problem for command sample
	staging: comedi: adv_pci1710: Fix endian problem for AI command data
	staging: comedi: das6402: Fix endian problem for AI command data
	staging: comedi: das800: Fix endian problem for AI command data
	staging: comedi: dmm32at: Fix endian problem for AI command data
	staging: comedi: me4000: Fix endian problem for AI command data
	staging: comedi: pcl711: Fix endian problem for AI command data
	staging: comedi: pcl818: Fix endian problem for AI command data
	sh_eth: fix TRSCER mask for R7S72100
	cpufreq: qcom-hw: fix dereferencing freed memory 'data'
	cpufreq: qcom-hw: Fix return value check in qcom_cpufreq_hw_cpu_init()
	arm64/mm: Fix pfn_valid() for ZONE_DEVICE based memory
	SUNRPC: Set memalloc_nofs_save() for sync tasks
	NFS: Don't revalidate the directory permissions on a lookup failure
	NFS: Don't gratuitously clear the inode cache when lookup failed
	NFSv4.2: fix return value of _nfs4_get_security_label()
	block: rsxx: fix error return code of rsxx_pci_probe()
	nvme-fc: fix racing controller reset and create association
	configfs: fix a use-after-free in __configfs_open_file
	arm64: mm: use a 48-bit ID map when possible on 52-bit VA builds
	perf/core: Flush PMU internal buffers for per-CPU events
	perf/x86/intel: Set PERF_ATTACH_SCHED_CB for large PEBS and LBR
	hrtimer: Update softirq_expires_next correctly after __hrtimer_get_next_event()
	powerpc/64s/exception: Clean up a missed SRR specifier
	seqlock,lockdep: Fix seqcount_latch_init()
	stop_machine: mark helpers __always_inline
	include/linux/sched/mm.h: use rcu_dereference in in_vfork()
	zram: fix return value on writeback_store
	linux/compiler-clang.h: define HAVE_BUILTIN_BSWAP*
	sched/membarrier: fix missing local execution of ipi_sync_rq_state()
	efi: stub: omit SetVirtualAddressMap() if marked unsupported in RT_PROP table
	powerpc/64s: Fix instruction encoding for lis in ppc_function_entry()
	powerpc: Fix inverted SET_FULL_REGS bitop
	powerpc: Fix missing declaration of [en/dis]able_kernel_vsx()
	binfmt_misc: fix possible deadlock in bm_register_write
	x86/unwind/orc: Disable KASAN checking in the ORC unwinder, part 2
	x86/sev-es: Introduce ip_within_syscall_gap() helper
	x86/sev-es: Check regs->sp is trusted before adjusting #VC IST stack
	x86/entry: Move nmi entry/exit into common code
	x86/sev-es: Correctly track IRQ states in runtime #VC handler
	x86/sev-es: Use __copy_from_user_inatomic()
	x86/entry: Fix entry/exit mismatch on failed fast 32-bit syscalls
	KVM: x86: Ensure deadline timer has truly expired before posting its IRQ
	KVM: kvmclock: Fix vCPUs > 64 can't be online/hotpluged
	KVM: arm64: Fix range alignment when walking page tables
	KVM: arm64: Avoid corrupting vCPU context register in guest exit
	KVM: arm64: nvhe: Save the SPE context early
	KVM: arm64: Reject VM creation when the default IPA size is unsupported
	KVM: arm64: Fix exclusive limit for IPA size
	mm/userfaultfd: fix memory corruption due to writeprotect
	mm/madvise: replace ptrace attach requirement for process_madvise
	KVM: arm64: Ensure I-cache isolation between vcpus of a same VM
	mm/page_alloc.c: refactor initialization of struct page for holes in memory layout
	xen/events: don't unmask an event channel when an eoi is pending
	xen/events: avoid handling the same event on two cpus at the same time
	KVM: arm64: Fix nVHE hyp panic host context restore
	RDMA/umem: Use ib_dma_max_seg_size instead of dma_get_max_seg_size
	Linux 5.10.24

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: Ie53a3c1963066a18d41357b6be41cff00690bd40
2021-03-19 09:42:56 +01:00

1254 lines
31 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* linux/mm/madvise.c
*
* Copyright (C) 1999 Linus Torvalds
* Copyright (C) 2002 Christoph Hellwig
*/
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
#include <asm/tlb.h>
#include "internal.h"
struct madvise_walk_private {
struct mmu_gather *tlb;
bool pageout;
};
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_lock for writing. Others, which simply traverse vmas, need
* to only take it for reading.
*/
static int madvise_need_mmap_write(int behavior)
{
switch (behavior) {
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
case MADV_COLD:
case MADV_PAGEOUT:
case MADV_FREE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
return 1;
}
}
/*
* We can potentially split a vm area into separate
* areas, each area with its own behavior.
*/
static long madvise_behavior(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
struct mm_struct *mm = vma->vm_mm;
int error = 0;
pgoff_t pgoff;
unsigned long new_flags = vma->vm_flags;
switch (behavior) {
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
case MADV_SEQUENTIAL:
new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
break;
case MADV_RANDOM:
new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
break;
case MADV_DONTFORK:
new_flags |= VM_DONTCOPY;
break;
case MADV_DOFORK:
if (vma->vm_flags & VM_IO) {
error = -EINVAL;
goto out;
}
new_flags &= ~VM_DONTCOPY;
break;
case MADV_WIPEONFORK:
/* MADV_WIPEONFORK is only supported on anonymous memory. */
if (vma->vm_file || vma->vm_flags & VM_SHARED) {
error = -EINVAL;
goto out;
}
new_flags |= VM_WIPEONFORK;
break;
case MADV_KEEPONFORK:
new_flags &= ~VM_WIPEONFORK;
break;
case MADV_DONTDUMP:
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
error = -EINVAL;
goto out;
}
new_flags &= ~VM_DONTDUMP;
break;
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
error = ksm_madvise(vma, start, end, behavior, &new_flags);
if (error)
goto out_convert_errno;
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
error = hugepage_madvise(vma, &new_flags, behavior);
if (error)
goto out_convert_errno;
break;
}
if (new_flags == vma->vm_flags) {
*prev = vma;
goto out;
}
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
if (*prev) {
vma = *prev;
goto success;
}
*prev = vma;
if (start != vma->vm_start) {
if (unlikely(mm->map_count >= sysctl_max_map_count)) {
error = -ENOMEM;
goto out;
}
error = __split_vma(mm, vma, start, 1);
if (error)
goto out_convert_errno;
}
if (end != vma->vm_end) {
if (unlikely(mm->map_count >= sysctl_max_map_count)) {
error = -ENOMEM;
goto out;
}
error = __split_vma(mm, vma, end, 0);
if (error)
goto out_convert_errno;
}
success:
/*
* vm_flags is protected by the mmap_lock held in write mode.
*/
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags, new_flags);
vm_write_end(vma);
out_convert_errno:
/*
* madvise() returns EAGAIN if kernel resources, such as
* slab, are temporarily unavailable.
*/
if (error == -ENOMEM)
error = -EAGAIN;
out:
return error;
}
#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)
{
pte_t *orig_pte;
struct vm_area_struct *vma = walk->private;
unsigned long index;
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
return 0;
for (index = start; index != end; index += PAGE_SIZE) {
pte_t pte;
swp_entry_t entry;
struct page *page;
spinlock_t *ptl;
orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
pte = *(orig_pte + ((index - start) / PAGE_SIZE));
pte_unmap_unlock(orig_pte, ptl);
if (pte_present(pte) || pte_none(pte))
continue;
entry = pte_to_swp_entry(pte);
if (unlikely(non_swap_entry(entry)))
continue;
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
vma, index, false);
if (page)
put_page(page);
}
return 0;
}
static const struct mm_walk_ops swapin_walk_ops = {
.pmd_entry = swapin_walk_pmd_entry,
};
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct address_space *mapping)
{
XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
struct page *page;
rcu_read_lock();
xas_for_each(&xas, page, end_index) {
swp_entry_t swap;
if (!xa_is_value(page))
continue;
xas_pause(&xas);
rcu_read_unlock();
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
NULL, 0, false);
if (page)
put_page(page);
rcu_read_lock();
}
rcu_read_unlock();
lru_add_drain(); /* Push any new pages onto the LRU now */
}
#endif /* CONFIG_SWAP */
/*
* Schedule all required I/O operations. Do not wait for completion.
*/
static long madvise_willneed(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
loff_t offset;
*prev = vma;
#ifdef CONFIG_SWAP
if (!file) {
walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
lru_add_drain(); /* Push any new pages onto the LRU now */
return 0;
}
if (shmem_mapping(file->f_mapping)) {
force_shm_swapin_readahead(vma, start, end,
file->f_mapping);
return 0;
}
#else
if (!file)
return -EBADF;
#endif
if (IS_DAX(file_inode(file))) {
/* no bad return value, but ignore advice */
return 0;
}
/*
* Filesystem's fadvise may need to take various locks. We need to
* explicitly grab a reference because the vma (and hence the
* vma's reference to the file) can go away as soon as we drop
* mmap_lock.
*/
*prev = NULL; /* tell sys_madvise we drop mmap_lock */
get_file(file);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
mmap_read_unlock(mm);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
mmap_read_lock(mm);
return 0;
}
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct madvise_walk_private *private = walk->private;
struct mmu_gather *tlb = private->tlb;
bool pageout = private->pageout;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
pte_t *orig_pte, *pte, ptent;
spinlock_t *ptl;
struct page *page = NULL;
LIST_HEAD(page_list);
if (fatal_signal_pending(current))
return -EINTR;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (pmd_trans_huge(*pmd)) {
pmd_t orig_pmd;
unsigned long next = pmd_addr_end(addr, end);
tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
ptl = pmd_trans_huge_lock(pmd, vma);
if (!ptl)
return 0;
orig_pmd = *pmd;
if (is_huge_zero_pmd(orig_pmd))
goto huge_unlock;
if (unlikely(!pmd_present(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
!is_pmd_migration_entry(orig_pmd));
goto huge_unlock;
}
page = pmd_page(orig_pmd);
/* Do not interfere with other mappings of this page */
if (page_mapcount(page) != 1)
goto huge_unlock;
if (next - addr != HPAGE_PMD_SIZE) {
int err;
get_page(page);
spin_unlock(ptl);
lock_page(page);
err = split_huge_page(page);
unlock_page(page);
put_page(page);
if (!err)
goto regular_page;
return 0;
}
if (pmd_young(orig_pmd)) {
pmdp_invalidate(vma, addr, pmd);
orig_pmd = pmd_mkold(orig_pmd);
set_pmd_at(mm, addr, pmd, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
ClearPageReferenced(page);
test_and_clear_page_young(page);
if (pageout) {
if (!isolate_lru_page(page)) {
if (PageUnevictable(page))
putback_lru_page(page);
else
list_add(&page->lru, &page_list);
}
} else
deactivate_page(page);
huge_unlock:
spin_unlock(ptl);
if (pageout)
reclaim_pages(&page_list);
return 0;
}
regular_page:
if (pmd_trans_unstable(pmd))
return 0;
#endif
tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr < end; pte++, addr += PAGE_SIZE) {
ptent = *pte;
if (pte_none(ptent))
continue;
if (!pte_present(ptent))
continue;
page = vm_normal_page(vma, addr, ptent);
if (!page)
continue;
/*
* Creating a THP page is expensive so split it only if we
* are sure it's worth. Split it if we are only owner.
*/
if (PageTransCompound(page)) {
if (page_mapcount(page) != 1)
break;
get_page(page);
if (!trylock_page(page)) {
put_page(page);
break;
}
pte_unmap_unlock(orig_pte, ptl);
if (split_huge_page(page)) {
unlock_page(page);
put_page(page);
pte_offset_map_lock(mm, pmd, addr, &ptl);
break;
}
unlock_page(page);
put_page(page);
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte--;
addr -= PAGE_SIZE;
continue;
}
/* Do not interfere with other mappings of this page */
if (page_mapcount(page) != 1)
continue;
VM_BUG_ON_PAGE(PageTransCompound(page), page);
if (pte_young(ptent)) {
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
ptent = pte_mkold(ptent);
set_pte_at(mm, addr, pte, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
}
/*
* We are deactivating a page for accelerating reclaiming.
* VM couldn't reclaim the page unless we clear PG_young.
* As a side effect, it makes confuse idle-page tracking
* because they will miss recent referenced history.
*/
ClearPageReferenced(page);
test_and_clear_page_young(page);
if (pageout) {
if (!isolate_lru_page(page)) {
if (PageUnevictable(page))
putback_lru_page(page);
else
list_add(&page->lru, &page_list);
}
} else
deactivate_page(page);
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
if (pageout)
reclaim_pages(&page_list);
cond_resched();
return 0;
}
static const struct mm_walk_ops cold_walk_ops = {
.pmd_entry = madvise_cold_or_pageout_pte_range,
};
static void madvise_cold_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
struct madvise_walk_private walk_private = {
.pageout = false,
.tlb = tlb,
};
vm_write_begin(vma);
tlb_start_vma(tlb, vma);
walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
tlb_end_vma(tlb, vma);
vm_write_end(vma);
}
static long madvise_cold(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
*prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
lru_add_drain();
tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
tlb_finish_mmu(&tlb, start_addr, end_addr);
return 0;
}
static void madvise_pageout_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
struct madvise_walk_private walk_private = {
.pageout = true,
.tlb = tlb,
};
vm_write_begin(vma);
tlb_start_vma(tlb, vma);
walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
tlb_end_vma(tlb, vma);
vm_write_end(vma);
}
static inline bool can_do_pageout(struct vm_area_struct *vma)
{
if (vma_is_anonymous(vma))
return true;
if (!vma->vm_file)
return false;
/*
* paging out pagecache only for non-anonymous mappings that correspond
* to the files the calling process could (if tried) open for writing;
* otherwise we'd be including shared non-exclusive mappings, which
* opens a side channel.
*/
return inode_owner_or_capable(file_inode(vma->vm_file)) ||
inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
}
static long madvise_pageout(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
*prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
if (!can_do_pageout(vma))
return 0;
lru_add_drain();
tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
tlb_finish_mmu(&tlb, start_addr, end_addr);
return 0;
}
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct mmu_gather *tlb = walk->private;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
pte_t *orig_pte, *pte, ptent;
struct page *page;
int nr_swap = 0;
unsigned long next;
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd))
if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
goto next;
if (pmd_trans_unstable(pmd))
return 0;
tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr != end; pte++, addr += PAGE_SIZE) {
ptent = *pte;
if (pte_none(ptent))
continue;
/*
* If the pte has swp_entry, just clear page table to
* prevent swap-in which is more expensive rather than
* (page allocation + zeroing).
*/
if (!pte_present(ptent)) {
swp_entry_t entry;
entry = pte_to_swp_entry(ptent);
if (non_swap_entry(entry))
continue;
nr_swap--;
free_swap_and_cache(entry);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
continue;
}
page = vm_normal_page(vma, addr, ptent);
if (!page)
continue;
/*
* If pmd isn't transhuge but the page is THP and
* is owned by only this process, split it and
* deactivate all pages.
*/
if (PageTransCompound(page)) {
if (page_mapcount(page) != 1)
goto out;
get_page(page);
if (!trylock_page(page)) {
put_page(page);
goto out;
}
pte_unmap_unlock(orig_pte, ptl);
if (split_huge_page(page)) {
unlock_page(page);
put_page(page);
pte_offset_map_lock(mm, pmd, addr, &ptl);
goto out;
}
unlock_page(page);
put_page(page);
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte--;
addr -= PAGE_SIZE;
continue;
}
VM_BUG_ON_PAGE(PageTransCompound(page), page);
if (PageSwapCache(page) || PageDirty(page)) {
if (!trylock_page(page))
continue;
/*
* If page is shared with others, we couldn't clear
* PG_dirty of the page.
*/
if (page_mapcount(page) != 1) {
unlock_page(page);
continue;
}
if (PageSwapCache(page) && !try_to_free_swap(page)) {
unlock_page(page);
continue;
}
ClearPageDirty(page);
unlock_page(page);
}
if (pte_young(ptent) || pte_dirty(ptent)) {
/*
* Some of architecture(ex, PPC) don't update TLB
* with set_pte_at and tlb_remove_tlb_entry so for
* the portability, remap the pte with old|clean
* after pte clearing.
*/
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
ptent = pte_mkold(ptent);
ptent = pte_mkclean(ptent);
set_pte_at(mm, addr, pte, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
}
mark_page_lazyfree(page);
}
out:
if (nr_swap) {
if (current->mm == mm)
sync_mm_rss(mm);
add_mm_counter(mm, MM_SWAPENTS, nr_swap);
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
cond_resched();
next:
return 0;
}
static const struct mm_walk_ops madvise_free_walk_ops = {
.pmd_entry = madvise_free_pte_range,
};
static int madvise_free_single_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
struct mmu_gather tlb;
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
return -EINVAL;
range.start = max(vma->vm_start, start_addr);
if (range.start >= vma->vm_end)
return -EINVAL;
range.end = min(vma->vm_end, end_addr);
if (range.end <= vma->vm_start)
return -EINVAL;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
range.start, range.end);
lru_add_drain();
tlb_gather_mmu(&tlb, mm, range.start, range.end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
vm_write_begin(vma);
tlb_start_vma(&tlb, vma);
walk_page_range(vma->vm_mm, range.start, range.end,
&madvise_free_walk_ops, &tlb);
tlb_end_vma(&tlb, vma);
vm_write_end(vma);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb, range.start, range.end);
return 0;
}
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
* data it wants to keep. Be sure to free swap resources too. The
* zap_page_range call sets things up for shrink_active_list to actually free
* these pages later if no one else has touched them in the meantime,
* although we could add these pages to a global reuse list for
* shrink_active_list to pick up before reclaiming other pages.
*
* NB: This interface discards data rather than pushes it out to swap,
* as some implementations do. This has performance implications for
* applications like large transactional databases which want to discard
* pages in anonymous maps after committing to backing store the data
* that was kept in them. There is no reason to write this data out to
* the swap area if the application is discarding it.
*
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
zap_page_range(vma, start, end - start);
return 0;
}
static long madvise_dontneed_free(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
int behavior)
{
struct mm_struct *mm = vma->vm_mm;
*prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_lock has been dropped, prev is stale */
mmap_read_lock(mm);
vma = find_vma(mm, start);
if (!vma)
return -ENOMEM;
if (start < vma->vm_start) {
/*
* This "vma" under revalidation is the one
* with the lowest vma->vm_start where start
* is also < vma->vm_end. If start <
* vma->vm_start it means an hole materialized
* in the user address space within the
* virtual range passed to MADV_DONTNEED
* or MADV_FREE.
*/
return -ENOMEM;
}
if (!can_madv_lru_vma(vma))
return -EINVAL;
if (end > vma->vm_end) {
/*
* Don't fail if end > vma->vm_end. If the old
* vma was splitted while the mmap_lock was
* released the effect of the concurrent
* operation may not cause madvise() to
* have an undefined result. There may be an
* adjacent next vma that we'll walk
* next. userfaultfd_remove() will generate an
* UFFD_EVENT_REMOVE repetition on the
* end-vma->vm_end range, but the manager can
* handle a repetition fine.
*/
end = vma->vm_end;
}
VM_WARN_ON(start >= end);
}
if (behavior == MADV_DONTNEED)
return madvise_dontneed_single_vma(vma, start, end);
else if (behavior == MADV_FREE)
return madvise_free_single_vma(vma, start, end);
else
return -EINVAL;
}
/*
* Application wants to free up the pages and associated backing store.
* This is effectively punching a hole into the middle of a file.
*/
static long madvise_remove(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
loff_t offset;
int error;
struct file *f;
struct mm_struct *mm = vma->vm_mm;
*prev = NULL; /* tell sys_madvise we drop mmap_lock */
if (vma->vm_flags & VM_LOCKED)
return -EINVAL;
f = vma->vm_file;
if (!f || !f->f_mapping || !f->f_mapping->host) {
return -EINVAL;
}
if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
return -EACCES;
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
/*
* Filesystem's fallocate may need to take i_mutex. We need to
* explicitly grab a reference because the vma (and hence the
* vma's reference to the file) can go away as soon as we drop
* mmap_lock.
*/
get_file(f);
if (userfaultfd_remove(vma, start, end)) {
/* mmap_lock was not released by userfaultfd_remove() */
mmap_read_unlock(mm);
}
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
mmap_read_lock(mm);
return error;
}
#ifdef CONFIG_MEMORY_FAILURE
/*
* Error injection support for memory error handling.
*/
static int madvise_inject_error(int behavior,
unsigned long start, unsigned long end)
{
struct zone *zone;
unsigned long size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
for (; start < end; start += size) {
unsigned long pfn;
struct page *page;
int ret;
ret = get_user_pages_fast(start, 1, 0, &page);
if (ret != 1)
return ret;
pfn = page_to_pfn(page);
/*
* When soft offlining hugepages, after migrating the page
* we dissolve it, therefore in the second loop "page" will
* no longer be a compound page.
*/
size = page_size(compound_head(page));
if (behavior == MADV_SOFT_OFFLINE) {
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
pfn, start);
ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
} else {
pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
pfn, start);
ret = memory_failure(pfn, MF_COUNT_INCREASED);
}
if (ret)
return ret;
}
/* Ensure that all poisoned pages are removed from per-cpu lists */
for_each_populated_zone(zone)
drain_all_pages(zone);
return 0;
}
#endif
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
switch (behavior) {
case MADV_REMOVE:
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
case MADV_COLD:
return madvise_cold(vma, prev, start, end);
case MADV_PAGEOUT:
return madvise_pageout(vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
default:
return madvise_behavior(vma, prev, start, end, behavior);
}
}
static bool
madvise_behavior_valid(int behavior)
{
switch (behavior) {
case MADV_DOFORK:
case MADV_DONTFORK:
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
case MADV_FREE:
case MADV_COLD:
case MADV_PAGEOUT:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
case MADV_WIPEONFORK:
case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
case MADV_SOFT_OFFLINE:
case MADV_HWPOISON:
#endif
return true;
default:
return false;
}
}
static bool
process_madvise_behavior_valid(int behavior)
{
switch (behavior) {
case MADV_COLD:
case MADV_PAGEOUT:
return true;
default:
return false;
}
}
/*
* The madvise(2) system call.
*
* Applications can use madvise() to advise the kernel how it should
* handle paging I/O in this VM area. The idea is to help the kernel
* use appropriate read-ahead and caching techniques. The information
* provided is advisory only, and can be safely disregarded by the
* kernel without affecting the correct operation of the application.
*
* behavior values:
* MADV_NORMAL - the default behavior is to read clusters. This
* results in some read-ahead and read-behind.
* MADV_RANDOM - the system should read the minimum amount of data
* on any access, since it is unlikely that the appli-
* cation will need more than what it asks for.
* MADV_SEQUENTIAL - pages in the given range will probably be accessed
* once, so they can be aggressively read ahead, and
* can be freed soon after they are accessed.
* MADV_WILLNEED - the application is notifying the system to read
* some pages ahead.
* MADV_DONTNEED - the application is finished with the given range,
* so the kernel can free resources associated with it.
* MADV_FREE - the application marks pages in the given range as lazy free,
* where actual purges are postponed until memory pressure happens.
* MADV_REMOVE - the application wants to free up the given range of
* pages and associated backing store.
* MADV_DONTFORK - omit this area from child's address space when forking:
* typically, to avoid COWing pages pinned by get_user_pages().
* MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
* MADV_WIPEONFORK - present the child process with zero-filled memory in this
* range after a fork.
* MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
* MADV_HWPOISON - trigger memory error handler as if the given memory range
* were corrupted by unrecoverable hardware memory failure.
* MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
* MADV_MERGEABLE - the application recommends that KSM try to merge pages in
* this area with pages of identical content from other such areas.
* MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
* MADV_HUGEPAGE - the application wants to back the given range by transparent
* huge pages in the future. Existing pages might be coalesced and
* new pages might be allocated as THP.
* MADV_NOHUGEPAGE - mark the given range as not worth being backed by
* transparent huge pages so the existing pages will not be
* coalesced into THP and new pages will not be allocated as THP.
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
* MADV_COLD - the application is not expected to use this memory soon,
* deactivate pages in this range so that they can be reclaimed
* easily if memory pressure hanppens.
* MADV_PAGEOUT - the application is not expected to use this memory soon,
* page out the pages in this range immediately.
*
* return values:
* zero - success
* -EINVAL - start + len < 0, start is not page-aligned,
* "behavior" is not a valid value, or application
* is attempting to release locked or shared pages,
* or the specified address range includes file, Huge TLB,
* MAP_SHARED or VMPFNMAP range.
* -ENOMEM - addresses in the specified range are not currently
* mapped, or are outside the AS of the process.
* -EIO - an I/O error occurred while paging in data.
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
int write;
size_t len;
struct blk_plug plug;
start = untagged_addr(start);
if (!madvise_behavior_valid(behavior))
return error;
if (!PAGE_ALIGNED(start))
return error;
len = PAGE_ALIGN(len_in);
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
return error;
end = start + len;
if (end < start)
return error;
error = 0;
if (end == start)
return error;
#ifdef CONFIG_MEMORY_FAILURE
if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
return madvise_inject_error(behavior, start, start + len_in);
#endif
write = madvise_need_mmap_write(behavior);
if (write) {
if (mmap_write_lock_killable(mm))
return -EINTR;
} else {
mmap_read_lock(mm);
}
/*
* If the interval [start,end) covers some unmapped address
* ranges, just ignore them, but return -ENOMEM at the end.
* - different from the way of handling in mlock etc.
*/
vma = find_vma_prev(mm, start, &prev);
if (vma && start > vma->vm_start)
prev = vma;
blk_start_plug(&plug);
for (;;) {
/* Still start < end. */
error = -ENOMEM;
if (!vma)
goto out;
/* Here start < (end|vma->vm_end). */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
start = vma->vm_start;
if (start >= end)
goto out;
}
/* Here vma->vm_start <= start < (end|vma->vm_end) */
tmp = vma->vm_end;
if (end < tmp)
tmp = end;
/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
error = madvise_vma(vma, &prev, start, tmp, behavior);
if (error)
goto out;
start = tmp;
if (prev && start < prev->vm_end)
start = prev->vm_end;
error = unmapped_error;
if (start >= end)
goto out;
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_lock */
vma = find_vma(mm, start);
}
out:
blk_finish_plug(&plug);
if (write)
mmap_write_unlock(mm);
else
mmap_read_unlock(mm);
return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
return do_madvise(current->mm, start, len_in, behavior);
}
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
size_t, vlen, int, behavior, unsigned int, flags)
{
ssize_t ret;
struct iovec iovstack[UIO_FASTIOV], iovec;
struct iovec *iov = iovstack;
struct iov_iter iter;
struct pid *pid;
struct task_struct *task;
struct mm_struct *mm;
size_t total_len;
unsigned int f_flags;
if (flags != 0) {
ret = -EINVAL;
goto out;
}
ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
if (ret < 0)
goto out;
pid = pidfd_get_pid(pidfd, &f_flags);
if (IS_ERR(pid)) {
ret = PTR_ERR(pid);
goto free_iov;
}
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
ret = -ESRCH;
goto put_pid;
}
if (!process_madvise_behavior_valid(behavior)) {
ret = -EINVAL;
goto release_task;
}
/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
if (IS_ERR_OR_NULL(mm)) {
ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
goto release_task;
}
/*
* Require CAP_SYS_NICE for influencing process performance. Note that
* only non-destructive hints are currently supported.
*/
if (!capable(CAP_SYS_NICE)) {
ret = -EPERM;
goto release_mm;
}
total_len = iov_iter_count(&iter);
while (iov_iter_count(&iter)) {
iovec = iov_iter_iovec(&iter);
ret = do_madvise(mm, (unsigned long)iovec.iov_base,
iovec.iov_len, behavior);
if (ret < 0)
break;
iov_iter_advance(&iter, iovec.iov_len);
}
if (ret == 0)
ret = total_len - iov_iter_count(&iter);
release_mm:
mmput(mm);
release_task:
put_task_struct(task);
put_pid:
put_pid(pid);
free_iov:
kfree(iov);
out:
return ret;
}