android_kernel_samsung_sm8650/kernel/futex/core.c
Greg Kroah-Hartman 2b3ea8bdef This is the 6.1.63 stable release
-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAmVbOmsACgkQONu9yGCS
 aT5m1RAAx7hgbFDnLHCGh4YVBbNy8JngItsUBaJcI/67Mk5toNi0x8pqcS8mq7ED
 GTwRnRcKaIR2bTyco5Ed2OZn4jMCyHC4oiyBZnHWg6AMuQjSCYzIgm7DzlTCVYZ7
 2r8uRbt/uXADTILJ2kwR2mtVpGcwrXa+lsHrMqvt+MvNwRoSVHBHVVYCrAc+JXwR
 GXCopzV/RFGS6w4SBsX0K+8pV7GO+bhpxJ1lPz1T/xeLYfT4C3EwSTWDbUXPbez7
 IpJ+5yKJXXT9Xn9m/pekwZ/aOirLqtEbDxneEctsjvw140lCoQiEZn6ZRscgNEns
 3H+J3Asgc2zXqPzfZFH02TebPj31B8HZ43Upu0okr0hr4A4/4JL9pjXEhm1bON/Z
 x3jlTF4dyay4vOGGIEYOAuJSUbn6AqpZ318uBWCd3BSPocihEDMJz2aoazVHcb6k
 83MVxfFfEL6s9utcoSXB8VjHa4FQmpMYsozegloUSJJCsizgdzmih0buJYhBB9sI
 HbEohW+YAh3cACSn6arXUJIMH5F5xsfD89od2Pj+6UrapdlPz5gCaggA1RZplCho
 bjGc1k61Rp2qSdfMEcx+h4ypgoOdhgqZI0YhYDCgBSRcWOXnGrDjFvnnumatcT+H
 6vqyX6zlNt6U1NpE56Jtf7gt1Ds6PeoadD0L6B8vjXrkdeXOlUU=
 =AZ9s
 -----END PGP SIGNATURE-----

Merge 6.1.63 into android14-6.1-lts

Changes in 6.1.63
	hwmon: (nct6775) Fix incorrect variable reuse in fan_div calculation
	sched/fair: Fix cfs_rq_is_decayed() on !SMP
	iov_iter, x86: Be consistent about the __user tag on copy_mc_to_user()
	sched/uclamp: Set max_spare_cap_cpu even if max_spare_cap is 0
	sched/uclamp: Ignore (util == 0) optimization in feec() when p_util_max = 0
	objtool: Propagate early errors
	sched: Fix stop_one_cpu_nowait() vs hotplug
	vfs: fix readahead(2) on block devices
	writeback, cgroup: switch inodes with dirty timestamps to release dying cgwbs
	x86/srso: Fix SBPB enablement for (possible) future fixed HW
	futex: Don't include process MM in futex key on no-MMU
	x86/numa: Introduce numa_fill_memblks()
	ACPI/NUMA: Apply SRAT proximity domain to entire CFMWS window
	x86/sev-es: Allow copy_from_kernel_nofault() in earlier boot
	x86/boot: Fix incorrect startup_gdt_descr.size
	drivers/clocksource/timer-ti-dm: Don't call clk_get_rate() in stop function
	pstore/platform: Add check for kstrdup
	string: Adjust strtomem() logic to allow for smaller sources
	genirq/matrix: Exclude managed interrupts in irq_matrix_allocated()
	wifi: cfg80211: add flush functions for wiphy work
	wifi: mac80211: move radar detect work to wiphy work
	wifi: mac80211: move scan work to wiphy work
	wifi: mac80211: move offchannel works to wiphy work
	wifi: mac80211: move sched-scan stop work to wiphy work
	wifi: mac80211: fix # of MSDU in A-MSDU calculation
	wifi: iwlwifi: honor the enable_ini value
	i40e: fix potential memory leaks in i40e_remove()
	iavf: Fix promiscuous mode configuration flow messages
	selftests/bpf: Correct map_fd to data_fd in tailcalls
	udp: add missing WRITE_ONCE() around up->encap_rcv
	tcp: call tcp_try_undo_recovery when an RTOd TFO SYNACK is ACKed
	gve: Use size_add() in call to struct_size()
	mlxsw: Use size_mul() in call to struct_size()
	tls: Only use data field in crypto completion function
	tls: Use size_add() in call to struct_size()
	tipc: Use size_add() in calls to struct_size()
	net: spider_net: Use size_add() in call to struct_size()
	net: ethernet: mtk_wed: fix EXT_INT_STATUS_RX_FBUF definitions for MT7986 SoC
	wifi: rtw88: debug: Fix the NULL vs IS_ERR() bug for debugfs_create_file()
	wifi: ath11k: fix boot failure with one MSI vector
	wifi: mt76: mt7603: rework/fix rx pse hang check
	wifi: mt76: mt7603: improve watchdog reset reliablity
	wifi: mt76: mt7603: improve stuck beacon handling
	wifi: mt76: mt7915: fix beamforming availability check
	wifi: ath: dfs_pattern_detector: Fix a memory initialization issue
	tcp_metrics: add missing barriers on delete
	tcp_metrics: properly set tp->snd_ssthresh in tcp_init_metrics()
	tcp_metrics: do not create an entry from tcp_init_metrics()
	wifi: rtlwifi: fix EDCA limit set by BT coexistence
	ACPI: property: Allow _DSD buffer data only for byte accessors
	ACPI: video: Add acpi_backlight=vendor quirk for Toshiba Portégé R100
	wifi: ath11k: fix Tx power value during active CAC
	can: dev: can_restart(): don't crash kernel if carrier is OK
	can: dev: can_restart(): fix race condition between controller restart and netif_carrier_on()
	can: dev: can_put_echo_skb(): don't crash kernel if can_priv::echo_skb is accessed out of bounds
	PM / devfreq: rockchip-dfi: Make pmu regmap mandatory
	wifi: wfx: fix case where rates are out of order
	netfilter: nf_tables: Drop pointless memset when dumping rules
	thermal: core: prevent potential string overflow
	r8169: use tp_to_dev instead of open code
	r8169: fix rare issue with broken rx after link-down on RTL8125
	selftests: netfilter: test for sctp collision processing in nf_conntrack
	net: skb_find_text: Ignore patterns extending past 'to'
	chtls: fix tp->rcv_tstamp initialization
	tcp: fix cookie_init_timestamp() overflows
	wifi: iwlwifi: call napi_synchronize() before freeing rx/tx queues
	wifi: iwlwifi: pcie: synchronize IRQs before NAPI
	wifi: iwlwifi: empty overflow queue during flush
	Bluetooth: hci_sync: Fix Opcode prints in bt_dev_dbg/err
	bpf: Fix unnecessary -EBUSY from htab_lock_bucket
	ACPI: sysfs: Fix create_pnp_modalias() and create_of_modalias()
	ipv6: avoid atomic fragment on GSO packets
	net: add DEV_STATS_READ() helper
	ipvlan: properly track tx_errors
	regmap: debugfs: Fix a erroneous check after snprintf()
	spi: tegra: Fix missing IRQ check in tegra_slink_probe()
	clk: qcom: gcc-msm8996: Remove RPM bus clocks
	clk: qcom: clk-rcg2: Fix clock rate overflow for high parent frequencies
	clk: qcom: mmcc-msm8998: Don't check halt bit on some branch clks
	clk: qcom: mmcc-msm8998: Fix the SMMU GDSC
	clk: qcom: gcc-sm8150: Fix gcc_sdcc2_apps_clk_src
	regulator: mt6358: Fail probe on unknown chip ID
	clk: imx: Select MXC_CLK for CLK_IMX8QXP
	clk: imx: imx8mq: correct error handling path
	clk: imx: imx8qxp: Fix elcdif_pll clock
	clk: renesas: rcar-gen3: Extend SDnH divider table
	clk: renesas: rzg2l: Wait for status bit of SD mux before continuing
	clk: renesas: rzg2l: Lock around writes to mux register
	clk: renesas: rzg2l: Trust value returned by hardware
	clk: renesas: rzg2l: Use FIELD_GET() for PLL register fields
	clk: renesas: rzg2l: Fix computation formula
	clk: linux/clk-provider.h: fix kernel-doc warnings and typos
	spi: nxp-fspi: use the correct ioremap function
	clk: keystone: pll: fix a couple NULL vs IS_ERR() checks
	clk: ti: change ti_clk_register[_omap_hw]() API
	clk: ti: fix double free in of_ti_divider_clk_setup()
	clk: npcm7xx: Fix incorrect kfree
	clk: mediatek: clk-mt6765: Add check for mtk_alloc_clk_data
	clk: mediatek: clk-mt6779: Add check for mtk_alloc_clk_data
	clk: mediatek: clk-mt6797: Add check for mtk_alloc_clk_data
	clk: mediatek: clk-mt7629-eth: Add check for mtk_alloc_clk_data
	clk: mediatek: clk-mt7629: Add check for mtk_alloc_clk_data
	clk: mediatek: clk-mt2701: Add check for mtk_alloc_clk_data
	clk: qcom: config IPQ_APSS_6018 should depend on QCOM_SMEM
	platform/x86: wmi: Fix probe failure when failing to register WMI devices
	platform/x86: wmi: Fix opening of char device
	hwmon: (axi-fan-control) Fix possible NULL pointer dereference
	hwmon: (coretemp) Fix potentially truncated sysfs attribute name
	Revert "hwmon: (sch56xx-common) Add DMI override table"
	Revert "hwmon: (sch56xx-common) Add automatic module loading on supported devices"
	hwmon: (sch5627) Use bit macros when accessing the control register
	hwmon: (sch5627) Disallow write access if virtual registers are locked
	hte: tegra: Fix missing error code in tegra_hte_test_probe()
	drm/rockchip: vop: Fix reset of state in duplicate state crtc funcs
	drm/rockchip: vop: Fix call to crtc reset helper
	drm/rockchip: vop2: Don't crash for invalid duplicate_state
	drm/rockchip: vop2: Add missing call to crtc reset helper
	drm/radeon: possible buffer overflow
	drm: bridge: it66121: Fix invalid connector dereference
	drm/bridge: lt8912b: Add hot plug detection
	drm/bridge: lt8912b: Fix bridge_detach
	drm/bridge: lt8912b: Fix crash on bridge detach
	drm/bridge: lt8912b: Manually disable HPD only if it was enabled
	drm/bridge: lt8912b: Add missing drm_bridge_attach call
	drm/bridge: tc358768: Fix use of uninitialized variable
	drm/bridge: tc358768: Fix bit updates
	drm/bridge: tc358768: remove unused variable
	drm/bridge: tc358768: Use struct videomode
	drm/bridge: tc358768: Print logical values, not raw register values
	drm/bridge: tc358768: Use dev for dbg prints, not priv->dev
	drm/bridge: tc358768: Rename dsibclk to hsbyteclk
	drm/bridge: tc358768: Clean up clock period code
	drm/bridge: tc358768: Fix tc358768_ns_to_cnt()
	drm/amdkfd: fix some race conditions in vram buffer alloc/free of svm code
	drm/amd/display: Check all enabled planes in dm_check_crtc_cursor
	drm/amd/display: Refactor dm_get_plane_scale helper
	drm/amd/display: Bail from dm_check_crtc_cursor if no relevant change
	io_uring/kbuf: Fix check of BID wrapping in provided buffers
	io_uring/kbuf: Allow the full buffer id space for provided buffers
	drm/mediatek: Fix iommu fault by swapping FBs after updating plane state
	drm/mediatek: Fix iommu fault during crtc enabling
	drm/rockchip: cdn-dp: Fix some error handling paths in cdn_dp_probe()
	gpu: host1x: Correct allocated size for contexts
	drm/bridge: lt9611uxc: fix the race in the error path
	arm64/arm: xen: enlighten: Fix KPTI checks
	drm/rockchip: Fix type promotion bug in rockchip_gem_iommu_map()
	xenbus: fix error exit in xenbus_init()
	xen-pciback: Consider INTx disabled when MSI/MSI-X is enabled
	drm/msm/dsi: use msm_gem_kernel_put to free TX buffer
	drm/msm/dsi: free TX buffer in unbind
	clocksource/drivers/arm_arch_timer: limit XGene-1 workaround
	drm: mediatek: mtk_dsi: Fix NO_EOT_PACKET settings/handling
	drivers/perf: hisi: use cpuhp_state_remove_instance_nocalls() for hisi_hns3_pmu uninit process
	perf/arm-cmn: Revamp model detection
	perf/arm-cmn: Fix DTC domain detection
	drivers/perf: hisi_pcie: Check the type first in pmu::event_init()
	perf: hisi: Fix use-after-free when register pmu fails
	ARM: dts: renesas: blanche: Fix typo in GP_11_2 pin name
	arm64: dts: qcom: sdm845: cheza doesn't support LMh node
	arm64: dts: qcom: sc7280: link usb3_phy_wrapper_gcc_usb30_pipe_clk
	arm64: dts: qcom: msm8916: Fix iommu local address range
	arm64: dts: qcom: msm8992-libra: drop duplicated reserved memory
	arm64: dts: qcom: sc7280: Add missing LMH interrupts
	arm64: dts: qcom: sm8150: add ref clock to PCIe PHYs
	arm64: dts: qcom: sm8350: fix pinctrl for UART18
	arm64: dts: qcom: sdm845-mtp: fix WiFi configuration
	ARM64: dts: marvell: cn9310: Use appropriate label for spi1 pins
	arm64: dts: qcom: apq8016-sbc: Add missing ADV7533 regulators
	ARM: dts: qcom: mdm9615: populate vsdcc fixed regulator
	soc: qcom: llcc: Handle a second device without data corruption
	kunit: Fix missed memory release in kunit_free_suite_set()
	firmware: ti_sci: Mark driver as non removable
	arm64: dts: ti: k3-am62a7-sk: Drop i2c-1 to 100Khz
	firmware: arm_ffa: Assign the missing IDR allocation ID to the FFA device
	firmware: arm_ffa: Allow the FF-A drivers to use 32bit mode of messaging
	ARM: dts: am3517-evm: Fix LED3/4 pinmux
	clk: scmi: Free scmi_clk allocated when the clocks with invalid info are skipped
	arm64: dts: imx8qm-ss-img: Fix jpegenc compatible entry
	arm64: dts: imx8mm: Add sound-dai-cells to micfil node
	arm64: dts: imx8mn: Add sound-dai-cells to micfil node
	arm64: tegra: Use correct interrupts for Tegra234 TKE
	selftests/pidfd: Fix ksft print formats
	selftests/resctrl: Ensure the benchmark commands fits to its array
	module/decompress: use vmalloc() for gzip decompression workspace
	ASoC: cs35l41: Verify PM runtime resume errors in IRQ handler
	ASoC: cs35l41: Undo runtime PM changes at driver exit time
	ALSA: hda: cs35l41: Fix unbalanced pm_runtime_get()
	ALSA: hda: cs35l41: Undo runtime PM changes at driver exit time
	KEYS: Include linux/errno.h in linux/verification.h
	crypto: hisilicon/hpre - Fix a erroneous check after snprintf()
	hwrng: bcm2835 - Fix hwrng throughput regression
	hwrng: geode - fix accessing registers
	RDMA/core: Use size_{add,sub,mul}() in calls to struct_size()
	crypto: qat - ignore subsequent state up commands
	crypto: qat - relocate bufferlist logic
	crypto: qat - rename bufferlist functions
	crypto: qat - change bufferlist logic interface
	crypto: qat - generalize crypto request buffers
	crypto: qat - extend buffer list interface
	crypto: qat - fix unregistration of crypto algorithms
	scsi: ibmvfc: Fix erroneous use of rtas_busy_delay with hcall return code
	libnvdimm/of_pmem: Use devm_kstrdup instead of kstrdup and check its return value
	nd_btt: Make BTT lanes preemptible
	crypto: caam/qi2 - fix Chacha20 + Poly1305 self test failure
	crypto: caam/jr - fix Chacha20 + Poly1305 self test failure
	crypto: qat - increase size of buffers
	PCI: vmd: Correct PCI Header Type Register's multi-function check
	hid: cp2112: Fix duplicate workqueue initialization
	crypto: hisilicon/qm - delete redundant null assignment operations
	crypto: hisilicon/qm - modify the process of regs dfx
	crypto: hisilicon/qm - split a debugfs.c from qm
	crypto: hisilicon/qm - fix PF queue parameter issue
	ARM: 9321/1: memset: cast the constant byte to unsigned char
	ext4: move 'ix' sanity check to corrent position
	ASoC: fsl: mpc5200_dma.c: Fix warning of Function parameter or member not described
	IB/mlx5: Fix rdma counter binding for RAW QP
	RDMA/hns: Fix printing level of asynchronous events
	RDMA/hns: Fix uninitialized ucmd in hns_roce_create_qp_common()
	RDMA/hns: Fix signed-unsigned mixed comparisons
	RDMA/hns: Add check for SL
	RDMA/hns: The UD mode can only be configured with DCQCN
	ASoC: SOF: core: Ensure sof_ops_free() is still called when probe never ran.
	ASoC: fsl: Fix PM disable depth imbalance in fsl_easrc_probe
	scsi: ufs: core: Leave space for '\0' in utf8 desc string
	RDMA/hfi1: Workaround truncation compilation error
	HID: cp2112: Make irq_chip immutable
	hid: cp2112: Fix IRQ shutdown stopping polling for all IRQs on chip
	sh: bios: Revive earlyprintk support
	Revert "HID: logitech-hidpp: add a module parameter to keep firmware gestures"
	HID: logitech-hidpp: Remove HIDPP_QUIRK_NO_HIDINPUT quirk
	HID: logitech-hidpp: Don't restart IO, instead defer hid_connect() only
	HID: logitech-hidpp: Revert "Don't restart communication if not necessary"
	HID: logitech-hidpp: Move get_wireless_feature_index() check to hidpp_connect_event()
	ASoC: Intel: Skylake: Fix mem leak when parsing UUIDs fails
	padata: Fix refcnt handling in padata_free_shell()
	crypto: qat - fix deadlock in backlog processing
	ASoC: ams-delta.c: use component after check
	IB/mlx5: Fix init stage error handling to avoid double free of same QP and UAF
	mfd: core: Un-constify mfd_cell.of_reg
	mfd: core: Ensure disabled devices are skipped without aborting
	mfd: dln2: Fix double put in dln2_probe
	dt-bindings: mfd: mt6397: Add binding for MT6357
	dt-bindings: mfd: mt6397: Split out compatible for MediaTek MT6366 PMIC
	mfd: arizona-spi: Set pdata.hpdet_channel for ACPI enumerated devs
	leds: turris-omnia: Drop unnecessary mutex locking
	leds: turris-omnia: Do not use SMBUS calls
	leds: pwm: Don't disable the PWM when the LED should be off
	leds: trigger: ledtrig-cpu:: Fix 'output may be truncated' issue for 'cpu'
	kunit: add macro to allow conditionally exposing static symbols to tests
	apparmor: test: make static symbols visible during kunit testing
	apparmor: fix invalid reference on profile->disconnected
	perf stat: Fix aggr mode initialization
	iio: frequency: adf4350: Use device managed functions and fix power down issue.
	perf kwork: Fix incorrect and missing free atom in work_push_atom()
	perf kwork: Add the supported subcommands to the document
	perf kwork: Set ordered_events to true in 'struct perf_tool'
	filemap: add filemap_get_folios_tag()
	f2fs: convert f2fs_write_cache_pages() to use filemap_get_folios_tag()
	f2fs: compress: fix deadloop in f2fs_write_cache_pages()
	f2fs: compress: fix to avoid use-after-free on dic
	f2fs: compress: fix to avoid redundant compress extension
	tty: tty_jobctrl: fix pid memleak in disassociate_ctty()
	livepatch: Fix missing newline character in klp_resolve_symbols()
	pinctrl: renesas: rzg2l: Make reverse order of enable() for disable()
	perf record: Fix BTF type checks in the off-cpu profiling
	dmaengine: idxd: Register dsa_bus_type before registering idxd sub-drivers
	usb: dwc2: fix possible NULL pointer dereference caused by driver concurrency
	usb: chipidea: Fix DMA overwrite for Tegra
	usb: chipidea: Simplify Tegra DMA alignment code
	dmaengine: ti: edma: handle irq_of_parse_and_map() errors
	misc: st_core: Do not call kfree_skb() under spin_lock_irqsave()
	tools: iio: iio_generic_buffer ensure alignment
	USB: usbip: fix stub_dev hub disconnect
	dmaengine: pxa_dma: Remove an erroneous BUG_ON() in pxad_free_desc()
	f2fs: fix to initialize map.m_pblk in f2fs_precache_extents()
	interconnect: qcom: sc7180: Retire DEFINE_QBCM
	interconnect: qcom: sc7180: Set ACV enable_mask
	interconnect: qcom: sc7280: Set ACV enable_mask
	interconnect: qcom: sc8180x: Set ACV enable_mask
	interconnect: qcom: sc8280xp: Set ACV enable_mask
	interconnect: qcom: sdm845: Retire DEFINE_QBCM
	interconnect: qcom: sdm845: Set ACV enable_mask
	interconnect: qcom: sm6350: Retire DEFINE_QBCM
	interconnect: qcom: sm6350: Set ACV enable_mask
	interconnect: move ignore_list out of of_count_icc_providers()
	interconnect: qcom: sm8150: Drop IP0 interconnects
	interconnect: qcom: sm8150: Retire DEFINE_QBCM
	interconnect: qcom: sm8150: Set ACV enable_mask
	interconnect: qcom: sm8350: Retire DEFINE_QBCM
	interconnect: qcom: sm8350: Set ACV enable_mask
	powerpc: Only define __parse_fpscr() when required
	modpost: fix tee MODULE_DEVICE_TABLE built on big-endian host
	modpost: fix ishtp MODULE_DEVICE_TABLE built on big-endian host
	powerpc/40x: Remove stale PTE_ATOMIC_UPDATES macro
	powerpc/xive: Fix endian conversion size
	powerpc/vas: Limit open window failure messages in log bufffer
	powerpc/imc-pmu: Use the correct spinlock initializer.
	powerpc/pseries: fix potential memory leak in init_cpu_associativity()
	xhci: Loosen RPM as default policy to cover for AMD xHC 1.1
	usb: host: xhci-plat: fix possible kernel oops while resuming
	perf machine: Avoid out of bounds LBR memory read
	perf hist: Add missing puts to hist__account_cycles
	9p/net: fix possible memory leak in p9_check_errors()
	i3c: Fix potential refcount leak in i3c_master_register_new_i3c_devs
	cxl/mem: Fix shutdown order
	crypto: ccp - Name -1 return value as SEV_RET_NO_FW_CALL
	x86/sev: Change snp_guest_issue_request()'s fw_err argument
	virt: sevguest: Fix passing a stack buffer as a scatterlist target
	rtc: pcf85363: fix wrong mask/val parameters in regmap_update_bits call
	pcmcia: cs: fix possible hung task and memory leak pccardd()
	pcmcia: ds: fix refcount leak in pcmcia_device_add()
	pcmcia: ds: fix possible name leak in error path in pcmcia_device_add()
	media: hantro: Check whether reset op is defined before use
	media: verisilicon: Do not enable G2 postproc downscale if source is narrower than destination
	media: ov5640: Drop dead code using frame_interval
	media: ov5640: fix vblank unchange issue when work at dvp mode
	media: i2c: max9286: Fix some redundant of_node_put() calls
	media: ov5640: Fix a memory leak when ov5640_probe fails
	media: bttv: fix use after free error due to btv->timeout timer
	media: amphion: handle firmware debug message
	media: mtk-jpegenc: Fix bug in JPEG encode quality selection
	media: s3c-camif: Avoid inappropriate kfree()
	media: vidtv: psi: Add check for kstrdup
	media: vidtv: mux: Add check and kfree for kstrdup
	media: cedrus: Fix clock/reset sequence
	media: cadence: csi2rx: Unregister v4l2 async notifier
	media: dvb-usb-v2: af9035: fix missing unlock
	media: cec: meson: always include meson sub-directory in Makefile
	regmap: prevent noinc writes from clobbering cache
	pwm: sti: Reduce number of allocations and drop usage of chip_data
	pwm: brcmstb: Utilize appropriate clock APIs in suspend/resume
	Input: synaptics-rmi4 - fix use after free in rmi_unregister_function()
	watchdog: ixp4xx: Make sure restart always works
	llc: verify mac len before reading mac header
	hsr: Prevent use after free in prp_create_tagged_frame()
	tipc: Change nla_policy for bearer-related names to NLA_NUL_STRING
	bpf: Check map->usercnt after timer->timer is assigned
	inet: shrink struct flowi_common
	octeontx2-pf: Fix error codes
	octeontx2-pf: Fix holes in error code
	net: page_pool: add missing free_percpu when page_pool_init fail
	dccp: Call security_inet_conn_request() after setting IPv4 addresses.
	dccp/tcp: Call security_inet_conn_request() after setting IPv6 addresses.
	net: r8169: Disable multicast filter for RTL8168H and RTL8107E
	Fix termination state for idr_for_each_entry_ul()
	net: stmmac: xgmac: Enable support for multiple Flexible PPS outputs
	selftests: pmtu.sh: fix result checking
	octeontx2-pf: Rename tot_tx_queues to non_qos_queues
	octeontx2-pf: qos send queues management
	octeontx2-pf: Free pending and dropped SQEs
	net/smc: fix dangling sock under state SMC_APPFINCLOSEWAIT
	net/smc: allow cdc msg send rather than drop it with NULL sndbuf_desc
	net/smc: put sk reference if close work was canceled
	nvme: fix error-handling for io_uring nvme-passthrough
	tg3: power down device only on SYSTEM_POWER_OFF
	nbd: fix uaf in nbd_open
	blk-core: use pr_warn_ratelimited() in bio_check_ro()
	virtio/vsock: replace virtio_vsock_pkt with sk_buff
	vsock/virtio: remove socket from connected/bound list on shutdown
	r8169: respect userspace disabling IFF_MULTICAST
	i2c: iproc: handle invalid slave state
	netfilter: xt_recent: fix (increase) ipv6 literal buffer length
	netfilter: nft_redir: use `struct nf_nat_range2` throughout and deduplicate eval call-backs
	netfilter: nat: fix ipv6 nat redirect with mapped and scoped addresses
	RISC-V: Don't fail in riscv_of_parent_hartid() for disabled HARTs
	drm/syncobj: fix DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE
	ASoC: mediatek: mt8186_mt6366_rt1019_rt5682s: trivial: fix error messages
	ASoC: hdmi-codec: register hpd callback on component probe
	ASoC: dapm: fix clock get name
	spi: spi-zynq-qspi: add spi-mem to driver kconfig dependencies
	fbdev: imsttfb: Fix error path of imsttfb_probe()
	fbdev: imsttfb: fix a resource leak in probe
	fbdev: fsl-diu-fb: mark wr_reg_wa() static
	tracing/kprobes: Fix the order of argument descriptions
	io_uring/net: ensure socket is marked connected on connect retry
	x86/amd_nb: Use Family 19h Models 60h-7Fh Function 4 IDs
	Revert "mmc: core: Capture correct oemid-bits for eMMC cards"
	btrfs: use u64 for buffer sizes in the tree search ioctls
	wifi: cfg80211: fix kernel-doc for wiphy_delayed_work_flush()
	virtio/vsock: don't use skbuff state to account credit
	virtio/vsock: remove redundant 'skb_pull()' call
	virtio/vsock: don't drop skbuff on copy failure
	vsock/loopback: use only sk_buff_head.lock to protect the packet queue
	virtio/vsock: fix leaks due to missing skb owner
	virtio/vsock: Fix uninit-value in virtio_transport_recv_pkt()
	virtio/vsock: fix header length on skb merging
	Linux 6.1.63

Change-Id: I87b7a539b11c90cfaf16edb07d613f74d54458a4
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
2023-11-27 16:59:46 +00:00

1164 lines
32 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Fast Userspace Mutexes (which I call "Futexes!").
* (C) Rusty Russell, IBM 2002
*
* Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
* (C) Copyright 2003 Red Hat Inc, All Rights Reserved
*
* Removed page pinning, fix privately mapped COW pages and other cleanups
* (C) Copyright 2003, 2004 Jamie Lokier
*
* Robust futex support started by Ingo Molnar
* (C) Copyright 2006 Red Hat Inc, All Rights Reserved
* Thanks to Thomas Gleixner for suggestions, analysis and fixes.
*
* PI-futex support started by Ingo Molnar and Thomas Gleixner
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
*
* PRIVATE futexes by Eric Dumazet
* Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
*
* Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
* Copyright (C) IBM Corporation, 2009
* Thanks to Thomas Gleixner for conceptual design and careful reviews.
*
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
*
* "The futexes are also cursed."
* "But they come in a choice of three flavours!"
*/
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
#include "futex.h"
#include "../locking/rtmutex_common.h"
#include <trace/hooks/futex.h>
/*
* The base of the bucket array and its size are always used together
* (after initialization only in futex_hash()), so ensure that they
* reside in the same cacheline.
*/
static struct {
/* Base of the hash bucket array that futex_hash() indexes into. */
struct futex_hash_bucket *queues;
/*
* Size of the bucket array. futex_hash() reduces the hash with
* (hashsize - 1), so this is presumably a power of two -- confirm
* against the initialization code.
*/
unsigned long hashsize;
} __futex_data __read_mostly __aligned(2*sizeof(long));
/* Shorthand accessors for the two members of __futex_data. */
#define futex_queues (__futex_data.queues)
#define futex_hashsize (__futex_data.hashsize)
/*
* Fault injections for futexes.
*/
#ifdef CONFIG_FAIL_FUTEX
static struct {
/* Fault attributes handed to setup_fault_attr() and should_fail(). */
struct fault_attr attr;
/*
* When true, should_fail_futex() never injects a failure for callers
* that pass fshared == false. Exposed via debugfs as "ignore-private".
*/
bool ignore_private;
} fail_futex = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_private = false,
};
/*
 * Handler registered through __setup() for the "fail_futex=" string.
 * Forwards @str to setup_fault_attr() for the futex fault attributes and
 * returns whatever that helper returns.
 */
static int __init setup_fail_futex(char *str)
{
	struct fault_attr *attr = &fail_futex.attr;

	return setup_fault_attr(attr, str);
}
__setup("fail_futex=", setup_fail_futex);
/*
 * Decide whether a futex operation should fail artificially.
 * @fshared: true when the caller is operating on a shared futex.
 *
 * Private futexes are exempt while fail_futex.ignore_private is set;
 * in every other case the decision is delegated to should_fail().
 */
bool should_fail_futex(bool fshared)
{
	const bool exempt = !fshared && fail_futex.ignore_private;

	return exempt ? false : should_fail(&fail_futex.attr, 1);
}
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
/*
 * Create the debugfs controls for futex fault injection: the "fail_futex"
 * attribute entry plus an owner read/write "ignore-private" boolean that
 * is backed by fail_futex.ignore_private.
 *
 * Return: 0 on success, otherwise the error encoded in the pointer
 * returned by fault_create_debugfs_attr().
 */
static int __init fail_futex_debugfs(void)
{
	struct dentry *parent = fault_create_debugfs_attr("fail_futex", NULL,
							  &fail_futex.attr);

	if (IS_ERR(parent))
		return PTR_ERR(parent);

	debugfs_create_bool("ignore-private", S_IFREG | S_IRUSR | S_IWUSR,
			    parent, &fail_futex.ignore_private);
	return 0;
}
late_initcall(fail_futex_debugfs);
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
#endif /* CONFIG_FAIL_FUTEX */
/**
* futex_hash - Return the hash bucket in the global hash
* @key: Pointer to the futex key for which the hash is calculated
*
* We hash on the keys returned from get_futex_key (see below) and return the
* corresponding hash bucket in the global hash.
*/
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
key->both.offset);
return &futex_queues[hash & (futex_hashsize - 1)];
}
/**
 * futex_setup_timer - Prepare the on-stack hrtimer used for a timed sleep
 * @time:	Absolute expiry time, or NULL when no timeout was requested
 * @timeout:	The hrtimer_sleeper structure to initialize
 * @flags:	Futex flags; FLAGS_CLOCKRT selects CLOCK_REALTIME, otherwise
 *		CLOCK_MONOTONIC is used
 * @range_ns:	Optional expiry range in ns
 *
 * Return: @timeout after initialization, or NULL if @time is NULL (in
 * which case @timeout is left untouched).
 */
struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
		  int flags, u64 range_ns)
{
	const clockid_t clock = (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME :
							  CLOCK_MONOTONIC;

	if (!time)
		return NULL;

	hrtimer_init_sleeper_on_stack(timeout, clock, HRTIMER_MODE_ABS);

	/*
	 * A range_ns of 0 makes hrtimer_set_expires_range_ns() effectively
	 * equivalent to hrtimer_set_expires(), so no special case is needed.
	 */
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

	return timeout;
}
/*
 * Return a machine wide unique identifier for @inode, assigning one on
 * first use.
 *
 * Identifiers are drawn from a global 64-bit counter. This relies on u64
 * not wrapping during the life-time of the machine; even at 1ns resolution
 * that takes almost 585 years.
 *
 * It further relies on the fact that a well formed program will not unmap
 * the file while it has a (shared) futex waiting on it. That mapping holds
 * a file reference which pins the mount and the inode.
 *
 * Should an inode nevertheless get evicted and read back in, it receives a
 * new sequence number and will _NOT_ match, even though it is the exact
 * same file.
 *
 * It is important that futex_match() never yields a false-positive, esp.
 * for PI futexes where that can mess up the state. Per the above,
 * false-negatives are only possible for malformed programs.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
	static atomic64_t i_seq;
	u64 cur, fresh;

	/* Fast path: the inode already carries a sequence number. */
	cur = atomic64_read(&inode->i_sequence);
	if (likely(cur))
		return cur;

	/* 0 means "unassigned", so skip it (and warn once) if it shows up. */
	do {
		fresh = atomic64_add_return(1, &i_seq);
	} while (WARN_ON_ONCE(!fresh));

	/*
	 * Install our number only if nobody else did in the meantime; when
	 * the cmpxchg finds a non-zero value, that earlier value wins.
	 */
	cur = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, fresh);

	return cur ? cur : fresh;
}
/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
* @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
* @key: address where result is stored.
* @rw: mapping needs to be read/write (values: FUTEX_READ,
* FUTEX_WRITE)
*
* Return: 0 on success or a negative error code:
* -EINVAL if @uaddr is not aligned to sizeof(u32);
* -EFAULT if @uaddr fails access_ok(), if fault injection fires, if the
* page has no mapping that can be keyed on, or if only read-only access
* to an anonymous page could be obtained;
* otherwise the negative value returned by get_user_pages_fast().
*
* The key words are stored in @key on success. Note that key->both.offset
* is written before any check is made, i.e. also on failure.
*
* For shared mappings (when @fshared), the key is:
*
* ( inode->i_sequence, page->index, offset_within_page )
*
* [ also see get_inode_sequence_number() ]
*
* For private mappings (or when !@fshared), the key is:
*
* ( current->mm, address, 0 )
*
* [ for !@fshared on !CONFIG_MMU the mm word is NULL instead ]
*
* This allows (cross process, where applicable) identification of the futex
* without keeping the page pinned for the duration of the FUTEX_WAIT.
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
enum futex_access rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
struct page *page, *tail;
struct address_space *mapping;
int err, ro = 0;
/*
* The futex address must be "naturally" aligned.
*/
key->both.offset = address % PAGE_SIZE;
if (unlikely((address % sizeof(u32)) != 0))
return -EINVAL;
/* Round address down to the start of its page. */
address -= key->both.offset;
if (unlikely(!access_ok(uaddr, sizeof(u32))))
return -EFAULT;
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
* virtual address, we dont even have to find the underlying vma.
* Note : We do have to check 'uaddr' is a valid user address,
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
/*
* On no-MMU, shared futexes are treated as private, therefore
* we must not include the current process in the key. Since
* there is only one address space, the address is a unique key
* on its own.
*/
if (IS_ENABLED(CONFIG_MMU))
key->private.mm = mm;
else
key->private.mm = NULL;
key->private.address = address;
return 0;
}
/*
* Shared path. Control comes back to 'again' whenever the page's mapping
* is found to have changed (or its host inode is gone) after the page
* was looked up.
*/
again:
/* Ignore any VERIFY_READ mapping (futex common case) */
if (unlikely(should_fail_futex(true)))
return -EFAULT;
/* First attempt: look the page up with write access. */
err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
/*
* If write access is not required (eg. FUTEX_WAIT), try
* and get read-only access.
*/
if (err == -EFAULT && rw == FUTEX_READ) {
err = get_user_pages_fast(address, 1, 0, &page);
ro = 1;
}
/*
* Only the sign of the result matters from here on: a negative value is
* returned as is, anything else is normalised to 0 so that 'err' can be
* returned directly at 'out'.
*/
if (err < 0)
return err;
else
err = 0;
/*
* The treatment of mapping from this point on is critical. The page
* lock protects many things but in this context the page lock
* stabilizes mapping, prevents inode freeing in the shared
* file-backed region case and guards against movement to swap cache.
*
* Strictly speaking the page lock is not needed in all cases being
* considered here and page lock forces unnecessarily serialization
* From this point on, mapping will be re-verified if necessary and
* page lock will be acquired only if it is unavoidable
*
* Mapping checks require the head page for any compound page so the
* head page and mapping is looked up now. For anonymous pages, it
* does not matter if the page splits in the future as the key is
* based on the address. For filesystem-backed pages, the tail is
* required as the index of the page determines the key. For
* base pages, there is no tail page and tail == page.
*/
tail = page;
page = compound_head(page);
mapping = READ_ONCE(page->mapping);
/*
* If page->mapping is NULL, then it cannot be a PageAnon
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
* found it, but truncated or holepunched or subjected to
* invalidate_complete_page2 before we got the page lock (also
* cases which we are happy to fail). And we hold a reference,
* so refcount care in invalidate_inode_page's remove_mapping
* prevents drop_caches from setting mapping to NULL beneath us.
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
* an unlikely race, but we do need to retry for page->mapping.
*/
if (unlikely(!mapping)) {
int shmem_swizzled;
/*
* Page lock is required to identify which special case above
* applies. If this is really a shmem page then the page lock
* will prevent unexpected transitions.
*/
lock_page(page);
shmem_swizzled = PageSwapCache(page) || page->mapping;
unlock_page(page);
put_page(page);
/* Retry only for the shmem/swapcache race; every other case fails. */
if (shmem_swizzled)
goto again;
return -EFAULT;
}
/*
* Private mappings are handled in a simple way.
*
* If the futex key is stored on an anonymous page, then the associated
* object is the mm which is implicitly pinned by the calling process.
*
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
if (PageAnon(page)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
*/
if (unlikely(should_fail_futex(true)) || ro) {
err = -EFAULT;
goto out;
}
key->both.offset |= FUT_OFF_MMSHARED; /* mm-based key; no mm reference is taken in this function */
key->private.mm = mm;
key->private.address = address;
} else {
struct inode *inode;
/*
* The associated futex object in this case is the inode and
* the page->mapping must be traversed. Ordinarily this should
* be stabilised under page lock but it's not strictly
* necessary in this case as we just want to pin the inode, not
* update the radix tree or anything like that.
*
* The RCU read lock is taken as the inode is finally freed
* under RCU. If the mapping still matches expectations then the
* mapping->host can be safely accessed as being a valid inode.
*/
rcu_read_lock();
/* The mapping changed since it was sampled above: start over. */
if (READ_ONCE(page->mapping) != mapping) {
rcu_read_unlock();
put_page(page);
goto again;
}
inode = READ_ONCE(mapping->host);
/* No host inode behind the mapping: start over as well. */
if (!inode) {
rcu_read_unlock();
put_page(page);
goto again;
}
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.i_seq = get_inode_sequence_number(inode);
/* Use the tail page here: its index is what determines the key. */
key->shared.pgoff = page_to_pgoff(tail);
rcu_read_unlock();
}
out:
/* Drop the page reference held since get_user_pages_fast(). */
put_page(page);
return err;
}
/**
* fault_in_user_writeable() - Fault in user address and verify RW access
* @uaddr: pointer to faulting user space address
*
* Slow path to fixup the fault we just took in the atomic write
* access to @uaddr.
*
* We have no generic implementation of a non-destructive write to the
* user address. We know that we faulted in the atomic pagefault
* disabled section so we can as well avoid the #PF overhead by
* calling get_user_pages() right away.
*/
int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	unsigned long address = (unsigned long)uaddr;
	int err;

	/* Resolve the write fault while holding the mmap lock for reading. */
	mmap_read_lock(mm);
	err = fixup_user_fault(mm, address, FAULT_FLAG_WRITE, NULL);
	mmap_read_unlock(mm);

	/* Negative errors are propagated; any other result is reported as 0. */
	if (err < 0)
		return err;
	return 0;
}
/**
* futex_top_waiter() - Return the highest priority waiter on a futex
* @hb: the hash bucket the futex_q's reside in
* @key: the futex key (to distinguish it from other futexes' futex_q's)
*
* Must be called with the hb lock held.
*/
struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
{
struct futex_q *this;
plist_for_each_entry(this, &hb->chain, list) {
if (futex_match(&this->key, key))
return this;
}
return NULL;
}
/*
 * Run futex_atomic_cmpxchg_inatomic() on @uaddr with page faults disabled
 * and hand its result back to the caller unchanged.
 */
int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;

	pagefault_disable();
	err = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
	pagefault_enable();

	return err;
}
/*
 * Read the u32 at @from into @dest with page faults disabled.
 *
 * Returns 0 on success and -EFAULT when __get_user() reports a failure.
 */
int futex_get_value_locked(u32 *dest, u32 __user *from)
{
	int failed;

	pagefault_disable();
	failed = __get_user(*dest, from);
	pagefault_enable();

	if (failed)
		return -EFAULT;
	return 0;
}
/**
* wait_for_owner_exiting - Block until the owner has exited
* @ret: owner's current futex lock status
* @exiting: Pointer to the exiting task
*
* Caller must hold a refcount on @exiting.
*/
void wait_for_owner_exiting(int ret, struct task_struct *exiting)
{
if (ret != -EBUSY) {
WARN_ON_ONCE(exiting);
return;
}
if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
return;
mutex_lock(&exiting->futex_exit_mutex);
/*
* No point in doing state checking here. If the waiter got here
* while the task was in exec()->exec_futex_release() then it can
* have any FUTEX_STATE_* value when the waiter has acquired the
* mutex. OK, if running, EXITING or DEAD if it reached exit()
* already. Highly unlikely and not a problem. Just one more round
* through the futex maze.
*/
mutex_unlock(&exiting->futex_exit_mutex);
put_task_struct(exiting);
}
/**
* __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
* @q: The futex_q to unqueue
*
* The q->lock_ptr must not be NULL and must be held by the caller.
*/
void __futex_unqueue(struct futex_q *q)
{
	struct futex_hash_bucket *hb;

	/* Refuse to touch a futex_q that has no lock pointer ... */
	if (WARN_ON_SMP(!q->lock_ptr))
		return;
	/* ... or whose list node is not on any chain. */
	if (WARN_ON(plist_node_empty(&q->list)))
		return;

	lockdep_assert_held(q->lock_ptr);

	/* lock_ptr points at the bucket's embedded lock; recover the bucket. */
	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);

	plist_del(&q->list, &hb->chain);
	futex_hb_waiters_dec(hb);
}
/* The key must be already stored in q->key. */
struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
	__acquires(&hb->lock)
{
	struct futex_hash_bucket *hb = futex_hash(&q->key);
	spinlock_t *lock = &hb->lock;

	/*
	 * The waiter count is bumped before the lock is taken so that a
	 * potential waker cannot miss a task that is about to sleep but is
	 * still spinning on the bucket lock. This is safe because every
	 * futex_q_lock() user ends up calling futex_queue(). When an error
	 * prevents the task from being added to the list, futex_q_unlock()
	 * takes the count back down again.
	 */
	futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */

	q->lock_ptr = lock;
	spin_lock(lock);

	return hb;
}
/*
 * Release the bucket lock and take the bucket's waiter count back down.
 */
void futex_q_unlock(struct futex_hash_bucket *hb)
	__releases(&hb->lock)
{
	spinlock_t *lock = &hb->lock;

	spin_unlock(lock);
	/* Balances the increment done in futex_q_lock(). */
	futex_hb_waiters_dec(hb);
}
/*
 * Put @q on @hb's chain on behalf of the current task.
 */
void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
{
	bool already_on_hb = false;
	/*
	 * Priority used to register this element:
	 *  - the real thread priority for real-time threads (those whose
	 *    priority is lower than MAX_RT_PRIO),
	 *  - MAX_RT_PRIO for everything else.
	 * RT threads are therefore woken first, in priority order, and all
	 * other threads afterwards in FIFO order.
	 */
	int prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(&q->list, prio);

	/*
	 * The vendor hook is handed the node, the chain and a flag;
	 * plist_add() is skipped when the hook sets the flag.
	 */
	trace_android_vh_alter_futex_plist_add(&q->list, &hb->chain, &already_on_hb);
	if (!already_on_hb)
		plist_add(&q->list, &hb->chain);

	q->task = current;
}
/**
* futex_unqueue() - Remove the futex_q from its futex_hash_bucket
* @q: The futex_q to unqueue
*
* The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
* be paired with exactly one earlier call to futex_queue().
*
* Return:
* - 1 - if the futex_q was still queued (and we unqueued it);
* - 0 - if the futex_q was already removed by the waking thread
*/
int futex_unqueue(struct futex_q *q)
{
	spinlock_t *lock_ptr;

	/* In the common case we don't take the spinlock, which is nice. */
	for (;;) {
		/*
		 * q->lock_ptr may change between this read and the
		 * spin_lock() below. READ_ONCE keeps the compiler from
		 * reloading q->lock_ptr and optimizing the local copy away.
		 */
		lock_ptr = READ_ONCE(q->lock_ptr);
		if (lock_ptr == NULL)
			return 0;

		spin_lock(lock_ptr);

		/*
		 * We may have locked the wrong bucket. If so, q->lock_ptr
		 * must have changed (possibly several times) between the
		 * read above and the spin_lock(). It can still change after
		 * the spin_lock(), but only if it had already changed
		 * before it, and it can never return to the original value.
		 * Comparing the two therefore tells us reliably whether the
		 * lock we hold is the right one.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			continue;
		}
		break;
	}

	/* Correct bucket lock held: take the futex_q off the chain. */
	__futex_unqueue(q);

	BUG_ON(q->pi_state);

	spin_unlock(lock_ptr);
	return 1;
}
/*
* PI futexes can not be requeued and must remove themselves from the
* hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
*/
void futex_unqueue_pi(struct futex_q *q)
{
	struct futex_pi_state *pi_state;

	__futex_unqueue(q);

	/* A PI waiter without pi_state is a fatal inconsistency. */
	pi_state = q->pi_state;
	BUG_ON(!pi_state);

	/* Drop this waiter's reference and forget the pointer. */
	put_pi_state(pi_state);
	q->pi_state = NULL;
}
/*
* Constants for the pending_op argument of handle_futex_death:
* HANDLE_DEATH_PENDING is passed for the robust list's list_op_pending
* entry, HANDLE_DEATH_LIST for entries reached by walking the list itself.
*/
#define HANDLE_DEATH_PENDING true
#define HANDLE_DEATH_LIST false
/*
* Process a futex-list entry, check whether it's owned by the
* dying task, and do notification if so:
*/
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
bool pi, bool pending_op)
{
/*
* Return: 0 when the entry was handled or is not owned by @curr;
* -1 when @uaddr is misaligned or cannot be accessed; otherwise the
* unexpected error code handed back by futex_cmpxchg_value_locked().
*/
u32 uval, nval, mval;
pid_t owner;
int err;
/* Futex address must be 32bit aligned */
if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
return -1;
retry:
/* Every retry re-reads the user value so the cmpxchg below compares against it. */
if (get_user(uval, uaddr))
return -1;
/*
* Special case for regular (non PI) futexes. The unlock path in
* user space has two race scenarios:
*
* 1. The unlock path releases the user space futex value and
* before it can execute the futex() syscall to wake up
* waiters it is killed.
*
* 2. A woken up waiter is killed before it can acquire the
* futex in user space.
*
* In the second case, the wake up notification could be generated
* by the unlock path in user space after setting the futex value
* to zero or by the kernel after setting the OWNER_DIED bit below.
*
* In both cases the TID validation below prevents a wakeup of
* potential waiters which can cause these waiters to block
* forever.
*
* In both cases the following conditions are met:
*
* 1) task->robust_list->list_op_pending != NULL
* @pending_op == true
* 2) The owner part of user space futex value == 0
* 3) Regular futex: @pi == false
*
* If these conditions are met, it is safe to attempt waking up a
* potential waiter without touching the user space futex value and
* trying to set the OWNER_DIED bit. If the futex value is zero,
* the rest of the user space mutex state is consistent, so a woken
* waiter will just take over the uncontended futex. Setting the
* OWNER_DIED bit would create inconsistent state and malfunction
* of the user space owner died handling. Otherwise, the OWNER_DIED
* bit is already set, and the woken waiter is expected to deal with
* this.
*/
owner = uval & FUTEX_TID_MASK;
if (pending_op && !pi && !owner) {
futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
return 0;
}
/* The futex is not owned by the dying task: nothing to do for this entry. */
if (owner != task_pid_vnr(curr))
return 0;
/*
* Ok, this dying thread is truly holding a futex
* of interest. Set the OWNER_DIED bit atomically
* via cmpxchg, and if the value had FUTEX_WAITERS
* set, wake up a waiter (if any). (We have to do a
* futex_wake() even if OWNER_DIED is already set -
* to handle the rare but possible case of recursive
* thread-death.) The rest of the cleanup is done in
* userspace.
*/
/* New value: keep only the WAITERS bit and add OWNER_DIED (the TID is cleared). */
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
/*
* We are not holding a lock here, but we want to have
* the pagefault_disable/enable() protection because
* we want to handle the fault gracefully. If the
* access fails we try to fault in the futex with R/W
* verification via get_user_pages. get_user() above
* does not guarantee R/W access. If that fails we
* give up and leave the futex locked.
*/
if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
switch (err) {
case -EFAULT:
/* Fault the word in writable and retry; give up if that fails. */
if (fault_in_user_writeable(uaddr))
return -1;
goto retry;
case -EAGAIN:
/* Offer a reschedule point before trying again. */
cond_resched();
goto retry;
default:
/* Unexpected error code: warn once and pass it back. */
WARN_ON_ONCE(1);
return err;
}
}
/* The cmpxchg saw a value other than the one read above: start over. */
if (nval != uval)
goto retry;
/*
* Wake robust non-PI futexes here. The wakeup of
* PI futexes happens in exit_pi_state():
*/
if (!pi && (uval & FUTEX_WAITERS))
futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
return 0;
}
/*
* Fetch a robust-list pointer. Bit 0 signals PI futexes:
*/
static inline int fetch_robust_entry(struct robust_list __user **entry,
				     struct robust_list __user * __user *head,
				     unsigned int *pi)
{
	unsigned long raw;

	if (get_user(raw, (unsigned long __user *)head))
		return -EFAULT;

	/* Bit 0 is the PI marker; the remaining bits form the entry pointer. */
	*pi = raw & 1;
	*entry = (void __user *)(raw & ~1UL);

	return 0;
}
/*
* Walk curr->robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
*
* We silently return on any sign of list-walking problem.
*/
static void exit_robust_list(struct task_struct *curr)
{
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *next_entry, *pending;
/* limit caps the number of entries walked; pi, pip and next_pi hold the bit-0 PI flags */
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
unsigned int next_pi;
unsigned long futex_offset;
int rc;
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
if (fetch_robust_entry(&entry, &head->list.next, &pi))
return;
/*
* Fetch the relative futex offset:
*/
if (get_user(futex_offset, &head->futex_offset))
return;
/*
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
return;
next_entry = NULL; /* avoid warning with gcc */
/* The walk ends when an entry points back at the list head. */
while (entry != &head->list) {
/*
* Fetch the next entry in the list before calling
* handle_futex_death:
*/
rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
/*
* A pending lock might already be on the list, so
* don't process it twice:
*/
if (entry != pending) {
/* The futex word lives futex_offset bytes from the list entry. */
if (handle_futex_death((void __user *)entry + futex_offset,
curr, pi, HANDLE_DEATH_LIST))
return;
}
/*
* The fetch of the next entry failed: stop here, now that the
* current entry has been handled. The pending entry is skipped too.
*/
if (rc)
return;
entry = next_entry;
pi = next_pi;
/*
* Avoid excessively long or circular lists:
*/
if (!--limit)
break;
cond_resched();
}
/* Finally handle the pending entry, which the loop above deliberately skipped. */
if (pending) {
handle_futex_death((void __user *)pending + futex_offset,
curr, pip, HANDLE_DEATH_PENDING);
}
}
#ifdef CONFIG_COMPAT
/*
 * Compute the user address of the futex word that belongs to a compat
 * robust list entry: the entry's compat pointer value plus @futex_offset.
 */
static void __user *futex_uaddr(struct robust_list __user *entry,
				compat_long_t futex_offset)
{
	compat_uptr_t base = ptr_to_compat(entry);

	return compat_ptr(base + futex_offset);
}
/*
* Fetch a robust-list pointer. Bit 0 signals PI futexes:
*/
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
			  compat_uptr_t __user *head, unsigned int *pi)
{
	compat_uptr_t raw;

	/* The raw user value is stored straight into *uentry for the caller. */
	if (get_user(*uentry, head))
		return -EFAULT;

	raw = *uentry;

	/* Bit 0 is the PI marker; masking it off yields the entry pointer. */
	*pi = (unsigned int)raw & 1;
	*entry = compat_ptr(raw & ~1);

	return 0;
}
/*
* Walk curr->robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
*
* We silently return on any sign of list-walking problem.
*/
static void compat_exit_robust_list(struct task_struct *curr)
{
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *next_entry, *pending;
/* limit caps the number of entries walked; pi, pip and next_pi hold the bit-0 PI flags */
unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
unsigned int next_pi;
compat_uptr_t uentry, next_uentry, upending;
compat_long_t futex_offset;
int rc;
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
return;
/*
* Fetch the relative futex offset:
*/
if (get_user(futex_offset, &head->futex_offset))
return;
/*
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
if (compat_fetch_robust_entry(&upending, &pending,
&head->list_op_pending, &pip))
return;
next_entry = NULL; /* avoid warning with gcc */
/* The walk ends when an entry points back at the list head. */
while (entry != (struct robust_list __user *) &head->list) {
/*
* Fetch the next entry in the list before calling
* handle_futex_death:
*/
rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
(compat_uptr_t __user *)&entry->next, &next_pi);
/*
* A pending lock might already be on the list, so
* don't process it twice:
*/
if (entry != pending) {
void __user *uaddr = futex_uaddr(entry, futex_offset);
if (handle_futex_death(uaddr, curr, pi,
HANDLE_DEATH_LIST))
return;
}
/*
* The fetch of the next entry failed: stop here, now that the
* current entry has been handled. The pending entry is skipped too.
*/
if (rc)
return;
uentry = next_uentry;
entry = next_entry;
pi = next_pi;
/*
* Avoid excessively long or circular lists:
*/
if (!--limit)
break;
cond_resched();
}
/* Finally handle the pending entry, which the loop above deliberately skipped. */
if (pending) {
void __user *uaddr = futex_uaddr(pending, futex_offset);
handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
}
}
#endif
#ifdef CONFIG_FUTEX_PI
/*
* This task is holding PI mutexes at exit time => bad.
* Kernel cleans up PI-state, but userspace is likely hosed.
* (Robust-futex cleanup is separate and might save the day for userspace.)
*/
static void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
struct futex_hash_bucket *hb;
union futex_key key = FUTEX_KEY_INIT;
/*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
* versus waiters unqueueing themselves:
*/
/* Loop invariant: curr->pi_lock is held (irqs off) at the top of every iteration. */
raw_spin_lock_irq(&curr->pi_lock);
while (!list_empty(head)) {
next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list);
/* Copy the key while pi_lock is held; it selects the hash bucket locked below. */
key = pi_state->key;
hb = futex_hash(&key);
/*
* We can race against put_pi_state() removing itself from the
* list (a waiter going away). put_pi_state() will first
* decrement the reference count and then modify the list, so
* it's possible to see the list entry but fail this reference
* acquire.
*
* In that case; drop the locks to let put_pi_state() make
* progress and retry the loop.
*/
if (!refcount_inc_not_zero(&pi_state->refcount)) {
raw_spin_unlock_irq(&curr->pi_lock);
cpu_relax();
raw_spin_lock_irq(&curr->pi_lock);
continue;
}
raw_spin_unlock_irq(&curr->pi_lock);
/* Lock order used here: hb->lock, then pi_mutex.wait_lock, then curr->pi_lock. */
spin_lock(&hb->lock);
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
raw_spin_lock(&curr->pi_lock);
/*
* We dropped the pi-lock, so re-check whether this
* task still owns the PI-state:
*/
if (head->next != next) {
/* retain curr->pi_lock for the loop invariant */
/* Plain (non-irq) unlock: interrupts stay off while pi_lock remains held. */
raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
/* Drop the reference taken by refcount_inc_not_zero() above. */
put_pi_state(pi_state);
continue;
}
WARN_ON(pi_state->owner != curr);
WARN_ON(list_empty(&pi_state->list));
/* Detach the pi_state from this task and clear its owner. */
list_del_init(&pi_state->list);
pi_state->owner = NULL;
raw_spin_unlock(&curr->pi_lock);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
/* With all three locks dropped, release the rt_mutex and our reference. */
rt_mutex_futex_unlock(&pi_state->pi_mutex);
put_pi_state(pi_state);
/* Re-establish the loop invariant before testing the list again. */
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
}
#else
/* Empty stub used when CONFIG_FUTEX_PI is not set. */
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif
/*
 * Release the futex state @tsk still holds: process the native robust
 * list, the compat robust list (when CONFIG_COMPAT is set), and finally
 * any PI state left on the task's pi_state_list.
 */
static void futex_cleanup(struct task_struct *tsk)
{
	/* Native robust list: walk it once, then forget the pointer. */
	if (unlikely(tsk->robust_list != NULL)) {
		exit_robust_list(tsk);
		tsk->robust_list = NULL;
	}

#ifdef CONFIG_COMPAT
	/* Same treatment for a list registered through the compat ABI. */
	if (unlikely(tsk->compat_robust_list != NULL)) {
		compat_exit_robust_list(tsk);
		tsk->compat_robust_list = NULL;
	}
#endif

	/* PI state is only touched when something is actually queued. */
	if (unlikely(!list_empty(&tsk->pi_state_list)))
		exit_pi_state_list(tsk);
}
/**
* futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
* @tsk: task to set the state on
*
* Set the futex exit state of the task locklessly. The futex waiter code
* observes that state when a task is exiting and loops until the task has
* actually finished the futex cleanup. The worst case for this is that the
* waiter runs through the wait loop until the state becomes visible.
*
* This is called from the recursive fault handling path in make_task_dead().
*
* This is best effort. Either the futex exit code has run already or
* not. If the OWNER_DIED bit has been set on the futex then the waiter can
* take it over. If not, the problem is pushed back to user space. If the
* futex exit code did not run yet, then an already queued waiter might
* block forever, but there is nothing which can be done about that.
*/
void futex_exit_recursive(struct task_struct *tsk)
{
/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
if (tsk->futex_state == FUTEX_STATE_EXITING)
mutex_unlock(&tsk->futex_exit_mutex);
tsk->futex_state = FUTEX_STATE_DEAD;
}
/*
 * Enter the futex cleanup phase for @tsk: take futex_exit_mutex and flip
 * the task's futex state to FUTEX_STATE_EXITING.
 */
static void futex_cleanup_begin(struct task_struct *tsk)
{
	/*
	 * Holding tsk->futex_exit_mutex makes a concurrent incoming waiter
	 * that sees FUTEX_STATE_EXITING in attach_to_pi_owner() block on
	 * the mutex, which prevents a range of races including live locks.
	 */
	mutex_lock(&tsk->futex_exit_mutex);

	/*
	 * The state change happens under tsk->pi_lock.
	 *
	 * Every later check of tsk->futex_state in attach_to_pi_owner(),
	 * made with tsk->pi_lock held, is thereby guaranteed to observe
	 * FUTEX_STATE_EXITING.
	 *
	 * It likewise guarantees that a pi_state which a concurrent waiter
	 * queued under tsk->pi_lock just before the change will be seen by
	 * exit_pi_state_list().
	 */
	raw_spin_lock_irq(&tsk->pi_lock);
	tsk->futex_state = FUTEX_STATE_EXITING;
	raw_spin_unlock_irq(&tsk->pi_lock);
}
/*
 * Leave the futex cleanup phase: publish @state as the task's futex state
 * and release futex_exit_mutex.
 */
static void futex_cleanup_end(struct task_struct *tsk, int state)
{
	/*
	 * Plain store without a lock. At worst an observer goes around its
	 * loop once more before the new state becomes visible to it.
	 */
	tsk->futex_state = state;

	/*
	 * Releasing the mutex lets waiters that blocked on it after seeing
	 * FUTEX_STATE_EXITING run again and reevaluate the state.
	 */
	mutex_unlock(&tsk->futex_exit_mutex);
}
/*
 * Run the futex cleanup for a task that is exec()ing and return its
 * futex state to FUTEX_STATE_OK afterwards.
 */
void futex_exec_release(struct task_struct *tsk)
{
	/*
	 * The state transitions are performed for consistency only. On
	 * exec() the PID stays the same, so further damage cannot really be
	 * prevented. For the unlikely and arguably buggy case of a futex
	 * being held across exec(), this still gives as much state
	 * consistency protection as is possible.
	 */
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);

	/*
	 * Back to FUTEX_STATE_OK: the task is alive and about to exec a
	 * new binary.
	 */
	futex_cleanup_end(tsk, FUTEX_STATE_OK);
}
/*
 * Run the futex cleanup for an exiting task and leave its futex state at
 * FUTEX_STATE_DEAD.
 */
void futex_exit_release(struct task_struct *tsk)
{
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	/* Unlike the exec() path, the state ends up DEAD rather than OK. */
	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
/*
 * Boot-time setup: size and allocate the futex hash table, then
 * initialize every bucket (waiter count, priority list and lock).
 */
static int __init futex_init(void)
{
	unsigned long i;
	unsigned int shift;
	int hash_flags;

	/* Requested table size: fixed for small configs, else scaled by CPU count. */
#if CONFIG_BASE_SMALL
	futex_hashsize = 16;
#else
	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif

	hash_flags = futex_hashsize < 256 ? HASH_SMALL : 0;

	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
					       futex_hashsize, 0, hash_flags,
					       &shift, NULL,
					       futex_hashsize, futex_hashsize);

	/* Adopt the size implied by the shift the allocator reported back. */
	futex_hashsize = 1UL << shift;

	for (i = 0; i < futex_hashsize; i++) {
		atomic_set(&futex_queues[i].waiters, 0);
		plist_head_init(&futex_queues[i].chain);
		spin_lock_init(&futex_queues[i].lock);
	}

	return 0;
}
core_initcall(futex_init);