https://source.android.com/docs/security/bulletin/2023-02-01 CVE-2022-39189 CVE-2022-39842 CVE-2022-41222 CVE-2023-20937 CVE-2023-20938 CVE-2022-0850 * tag 'ASB-2023-02-05_11-5.4' of https://android.googlesource.com/kernel/common: ANDROID: ABI: Cuttlefish Symbol update UPSTREAM: media: dvb-core: Fix UAF due to refcount races at releasing ANDROID: abi_gki_aarch64_qcom: Add hrtimer_sleeper_start_expires UPSTREAM: ALSA: pcm: Move rwsem lock inside snd_ctl_elem_read to prevent UAF ANDROID: Revert "tracing/ring-buffer: Have polling block on watermark" UPSTREAM: usb: gadget: f_hid: fix f_hidg lifetime vs cdev UPSTREAM: usb: gadget: f_hid: optional SETUP/SET_REPORT mode ANDROID: add TEST_MAPPING for net/, include/net UPSTREAM: nfp: fix use-after-free in area_cache_get() UPSTREAM: proc: avoid integer type confusion in get_proc_long UPSTREAM: proc: proc_skip_spaces() shouldn't think it is working on C strings ANDROID: usb: f_accessory: Check buffer size when initialised via composite BACKPORT: mm: don't be stuck to rmap lock on reclaim path ANDROID: Add more hvc devices for virtio-console. 
Revert "mmc: sdhci: Fix voltage switch delay" ANDROID: gki_defconfig: add CONFIG_FUNCTION_ERROR_INJECTION Linux 5.4.226 ipc/sem: Fix dangling sem_array access in semtimedop race v4l2: don't fall back to follow_pfn() if pin_user_pages_fast() fails proc: proc_skip_spaces() shouldn't think it is working on C strings proc: avoid integer type confusion in get_proc_long mmc: sdhci: Fix voltage switch delay mmc: sdhci: use FIELD_GET for preset value bit masks char: tpm: Protect tpm_pm_suspend with locks Revert "clocksource/drivers/riscv: Events are stopped during CPU suspend" x86/ioremap: Fix page aligned size calculation in __ioremap_caller() Bluetooth: L2CAP: Fix accepting connection request for invalid SPSM x86/pm: Add enumeration check before spec MSRs save/restore setup x86/tsx: Add a feature bit for TSX control MSR support nvme: ensure subsystem reset is single threaded nvme: restrict management ioctls to admin epoll: check for events when removing a timed out thread from the wait queue epoll: call final ep_events_available() check under the lock tracing/ring-buffer: Have polling block on watermark ipv4: Fix route deletion when nexthop info is not specified ipv4: Handle attempt to delete multipath route when fib_info contains an nh reference selftests: net: fix nexthop warning cleanup double ip typo selftests: net: add delete nexthop route warning test Kconfig.debug: provide a little extra FRAME_WARN leeway when KASAN is enabled parisc: Increase FRAME_WARN to 2048 bytes on parisc xtensa: increase size of gcc stack frame check parisc: Increase size of gcc stack frame check iommu/vt-d: Fix PCI device refcount leak in dmar_dev_scope_init() pinctrl: single: Fix potential division by zero ASoC: ops: Fix bounds check for _sx controls mm: Fix '.data.once' orphan section warning arm64: errata: Fix KVM Spectre-v2 mitigation selection for Cortex-A57/A72 arm64: Fix panic() when Spectre-v2 causes Spectre-BHB to re-allocate KVM vectors tracing: Free buffers when a used dynamic 
event is removed mmc: sdhci-sprd: Fix no reset data and command after voltage switch mmc: sdhci-esdhc-imx: correct CQHCI exit halt state check mmc: core: Fix ambiguous TRIM and DISCARD arg mmc: mmc_test: Fix removal of debugfs file pinctrl: intel: Save and restore pins in "direct IRQ" mode x86/bugs: Make sure MSR_SPEC_CTRL is updated properly upon resume from S3 nilfs2: fix NULL pointer dereference in nilfs_palloc_commit_free_entry() tools/vm/slabinfo-gnuplot: use "grep -E" instead of "egrep" error-injection: Add prompt for function error injection net/mlx5: DR, Fix uninitialized var warning hwmon: (coretemp) fix pci device refcount leak in nv1a_ram_new() hwmon: (coretemp) Check for null before removing sysfs attrs net: ethernet: renesas: ravb: Fix promiscuous mode after system resumed sctp: fix memory leak in sctp_stream_outq_migrate() packet: do not set TP_STATUS_CSUM_VALID on CHECKSUM_COMPLETE net: tun: Fix use-after-free in tun_detach() afs: Fix fileserver probe RTT handling net: hsr: Fix potential use-after-free dsa: lan9303: Correct stat name net: ethernet: nixge: fix NULL dereference net/9p: Fix a potential socket leak in p9_socket_open net: net_netdev: Fix error handling in ntb_netdev_init_module() net: phy: fix null-ptr-deref while probe() failed wifi: cfg80211: fix buffer overflow in elem comparison qlcnic: fix sleep-in-atomic-context bugs caused by msleep can: cc770: cc770_isa_probe(): add missing free_cc770dev() can: sja1000_isa: sja1000_isa_probe(): add missing free_sja1000dev() net/mlx5e: Fix use-after-free when reverting termination table net/mlx5: Fix uninitialized variable bug in outlen_write() of: property: decrement node refcount in of_fwnode_get_reference_args() hwmon: (ibmpex) Fix possible UAF when ibmpex_register_bmc() fails hwmon: (i5500_temp) fix missing pci_disable_device() scripts/faddr2line: Fix regression in name resolution on ppc64le iio: light: rpr0521: add missing Kconfig dependencies iio: health:afe4404
: Fix oob read in afe4404_[read|write]_raw iio: health: afe4403: Fix oob read in afe4403_read_raw btrfs: qgroup: fix sleep from invalid context bug in btrfs_qgroup_inherit() drm/amdgpu: Partially revert "drm/amdgpu: update drm_display_info correctly when the edid is read" drm/amdgpu: update drm_display_info correctly when the edid is read btrfs: move QUOTA_ENABLED check to rescan_should_stop from btrfs_qgroup_rescan_worker spi: spi-imx: Fix spi_bus_clk if requested clock is higher than input clock btrfs: free btrfs_path before copying inodes to userspace fuse: lock inode unconditionally in fuse_fallocate() drm/i915: fix TLB invalidation for Gen12 video and compute engines drm/amdgpu: always register an MMU notifier for userptr drm/amd/dc/dce120: Fix audio register mapping, stop triggering KASAN btrfs: sysfs: normalize the error handling branch in btrfs_init_sysfs() btrfs: free btrfs_path before copying subvol info to userspace btrfs: free btrfs_path before copying fspath to userspace btrfs: free btrfs_path before copying root refs to userspace binder: Gracefully handle BINDER_TYPE_FDA objects with num_fds=0 binder: Address corner cases in deferred copy and fixup binder: fix pointer cast warning binder: defer copies of pre-patched txn data binder: read pre-translated fds from sender buffer binder: avoid potential data leakage when copying txn dm integrity: flush the journal on suspend net: usb: qmi_wwan: add Telit 0x103a composition tcp: configurable source port perturb table size platform/x86: hp-wmi: Ignore Smart Experience App event platform/x86: acer-wmi: Enable SW_TABLET_MODE on Switch V 10 (SW5-017) platform/x86: asus-wmi: add missing pci_dev_put() in asus_wmi_set_xusb2pr() xen/platform-pci: add missing free_irq() in error path serial: 8250: 8250_omap: Avoid RS485 RTS glitch on ->set_termios() ASoC: Intel: bytcht_es8316: Add quirk for the Nanote UMPC-01 Input: synaptics - switch touchpad on HP Laptop 15-da3001TU to RMI mode gcov: clang: fix the buffer overflow 
issue nilfs2: fix nilfs_sufile_mark_dirty() not set segment usage as dirty firmware: coreboot: Register bus in module init firmware: google: Release devices before unregistering the bus ceph: avoid putting the realm twice when decoding snaps fails ceph: do not update snapshot context when there is no new snapshot iio: pressure: ms5611: fixed value compensation bug iio: ms5611: Simplify IO callback parameters nios2: add FORCE for vmlinuz.gz init/Kconfig: fix CC_HAS_ASM_GOTO_TIED_OUTPUT test with dash iio: core: Fix entry not deleted when iio_register_sw_trigger_type() fails iio: light: apds9960: fix wrong register for gesture gain arm64: dts: rockchip: lower rk3399-puma-haikou SD controller clock frequency usb: dwc3: exynos: Fix remove() function lib/vdso: use "grep -E" instead of "egrep" s390/crashdump: fix TOD programmable field size net: thunderx: Fix the ACPI memory leak nfc: st-nci: fix memory leaks in EVT_TRANSACTION nfc: st-nci: fix incorrect validating logic in EVT_TRANSACTION s390/dasd: fix no record found for raw_track_access dccp/tcp: Reset saddr on failure after inet6?_hash_connect(). 
bnx2x: fix pci device refcount leak in bnx2x_vf_is_pcie_pending() regulator: twl6030: re-add TWL6032_SUBCLASS NFC: nci: fix memory leak in nci_rx_data_packet() xfrm: Fix ignored return value in xfrm6_init() tipc: check skb_linearize() return value in tipc_disc_rcv() tipc: add an extra conn_get in tipc_conn_alloc tipc: set con sock in tipc_conn_alloc net/mlx5: Fix FW tracer timestamp calculation Drivers: hv: vmbus: fix possible memory leak in vmbus_device_register() Drivers: hv: vmbus: fix double free in the error path of vmbus_add_channel_work() nfp: add port from netdev validation for EEPROM access net: pch_gbe: fix pci device refcount leak while module exiting net/qla3xxx: fix potential memleak in ql3xxx_send() net/mlx4: Check retval of mlx4_bitmap_init ARM: mxs: fix memory leak in mxs_machine_init() 9p/fd: fix issue of list_del corruption in p9_fd_cancel() net: pch_gbe: fix potential memleak in pch_gbe_tx_queue() nfc/nci: fix race with opening and closing net: liquidio: simplify if expression ARM: dts: at91: sam9g20ek: enable udc vbus gpio pinctrl tee: optee: fix possible memory leak in optee_register_device() bus: sunxi-rsb: Support atomic transfers regulator: core: fix UAF in destroy_regulator() regulator: core: fix kobject release warning and memory leak in regulator_register() ASoC: sgtl5000: Reset the CHIP_CLK_CTRL reg on remove ARM: dts: am335x-pcm-953: Define fixed regulators in root node af_key: Fix send_acquire race with pfkey_register MIPS: pic32: treat port as signed integer RISC-V: vdso: Do not add missing symbols to version section in linker script arm64/syscall: Include asm/ptrace.h in syscall_wrapper header. 
block, bfq: fix null pointer dereference in bfq_bio_bfqg() drm: panel-orientation-quirks: Add quirk for Acer Switch V 10 (SW5-017) spi: stm32: fix stm32_spi_prepare_mbr() that halves spi clk for every run wifi: mac80211: Fix ack frame idr leak when mesh has no route audit: fix undefined behavior in bit shift for AUDIT_BIT wifi: mac80211_hwsim: fix debugfs attribute ps with rc table support wifi: mac80211: fix memory free error when registering wiphy fail Revert "can: af_can: fix NULL pointer dereference in can_rx_register()" Linux 5.4.225 ntfs: check overflow when iterating ATTR_RECORDs ntfs: fix out-of-bounds read in ntfs_attr_find() ntfs: fix use-after-free in ntfs_attr_find() mm: fs: initialize fsdata passed to write_begin/write_end interface 9p/trans_fd: always use O_NONBLOCK read/write gfs2: Switch from strlcpy to strscpy gfs2: Check sb_bsize_shift after reading superblock 9p: trans_fd/p9_conn_cancel: drop client lock earlier kcm: close race conditions on sk_receive_queue bpf, test_run: Fix alignment problem in bpf_prog_test_run_skb() kcm: avoid potential race in kcm_tx_work tcp: cdg: allow tcp_cdg_release() to be called multiple times macvlan: enforce a consistent minimal mtu Input: i8042 - fix leaking of platform device on module removal kprobes: Skip clearing aggrprobe's post_handler in kprobe-on-ftrace case scsi: target: tcm_loop: Fix possible name leak in tcm_loop_setup_hba_bus() ring-buffer: Include dropped pages in counting dirty patches serial: 8250: Flush DMA Rx on RLSI misc/vmw_vmci: fix an infoleak in vmci_host_do_receive_datagram() docs: update mediator contact information in CoC doc mmc: sdhci-pci: Fix possible memory leak caused by missing pci_dev_put() mmc: sdhci-pci-o2micro: fix card detect fail issue caused by CD# debounce timeout mmc: core: properly select voltage range without power cycle scsi: zfcp: Fix double free of FSF request when qdio send fails Input: iforce - invert valid length check when fetching device IDs serial: 8250_lpss: 
Configure DMA also w/o DMA filter serial: 8250: Fall back to non-DMA Rx if IIR_RDI occurs dm ioctl: fix misbehavior if list_versions races with module loading iio: pressure: ms5611: changed hardcoded SPI speed to value limited iio: trigger: sysfs: fix possible memory leak in iio_sysfs_trig_init() iio: adc: at91_adc: fix possible memory leak in at91_adc_allocate_trigger() usb: chipidea: fix deadlock in ci_otg_del_timer usb: add NO_LPM quirk for Realforce 87U Keyboard USB: serial: option: add Fibocom FM160 0x0111 composition USB: serial: option: add u-blox LARA-L6 modem USB: serial: option: add u-blox LARA-R6 00B modem USB: serial: option: remove old LARA-R6 PID USB: serial: option: add Sierra Wireless EM9191 speakup: fix a segfault caused by switching consoles slimbus: stream: correct presence rate frequencies Revert "usb: dwc3: disable USB core PHY management" ALSA: usb-audio: Drop snd_BUG_ON() from snd_usbmidi_output_open() ring_buffer: Do not deactivate non-existant pages ftrace: Fix null pointer dereference in ftrace_add_mod() ftrace: Optimize the allocation for mcount entries ftrace: Fix the possible incorrect kernel message cifs: add check for returning value of SMB2_set_info_init net: thunderbolt: Fix error handling in tbnet_init() cifs: Fix wrong return value checking when GETFLAGS net/x25: Fix skb leak in x25_lapb_receive_frame() platform/x86/intel: pmc: Don't unconditionally attach Intel PMC when virtualized drbd: use after free in drbd_create_device() xen/pcpu: fix possible memory leak in register_pcpu() bnxt_en: Remove debugfs when pci_register_driver failed net: caif: fix double disconnect client in chnl_net_open() net: macvlan: Use built-in RCU list checking mISDN: fix misuse of put_device() in mISDN_register_device() net: liquidio: release resources when liquidio driver open failed mISDN: fix possible memory leak in mISDN_dsp_element_register() net: bgmac: Drop free_netdev() from bgmac_enet_remove() ata: libata-transport: fix double ata_host_put() in 
ata_tport_add() arm64: dts: imx8mn: Fix NAND controller size-cells arm64: dts: imx8mm: Fix NAND controller size-cells pinctrl: devicetree: fix null pointer dereferencing in pinctrl_dt_to_map parport_pc: Avoid FIFO port location truncation siox: fix possible memory leak in siox_device_add() block: sed-opal: kmalloc the cmd/resp buffers ASoC: soc-utils: Remove __exit for snd_soc_util_exit() tty: n_gsm: fix sleep-in-atomic-context bug in gsm_control_send serial: imx: Add missing .thaw_noirq hook serial: 8250: omap: Flush PM QOS work on remove serial: 8250: omap: Fix unpaired pm_runtime_put_sync() in omap8250_remove() serial: 8250_omap: remove wait loop from Errata i202 workaround ASoC: core: Fix use-after-free in snd_soc_exit() spi: stm32: Print summary 'callbacks suppressed' message ASoC: codecs: jz4725b: Fix spelling mistake "Sourc" -> "Source", "Routee" -> "Route" Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm btrfs: remove pointless and double ulist frees in error paths of qgroup tests drm/imx: imx-tve: Fix return type of imx_tve_connector_mode_valid i2c: i801: add lis3lv02d's I2C address for Vostro 5568 NFSv4: Retry LOCK on OLD_STATEID during delegation return selftests/intel_pstate: fix build for ARCH=x86_64 selftests/futex: fix build for clang ASoC: codecs: jz4725b: fix capture selector naming ASoC: codecs: jz4725b: use right control for Capture Volume ASoC: codecs: jz4725b: fix reported volume for Master ctl ASoC: codecs: jz4725b: add missed Line In power control bit spi: intel: Fix the offset to get the 64K erase opcode ASoC: wm8962: Add an event handler for TEMP_HP and TEMP_SPK ASoC: wm8997: Revert "ASoC: wm8997: Fix PM disable depth imbalance in wm8997_probe" ASoC: wm5110: Revert "ASoC: wm5110: Fix PM disable depth imbalance in wm5110_probe" ASoC: wm5102: Revert "ASoC: wm5102: Fix PM disable depth imbalance in wm5102_probe" x86/cpu: Restore AMD's DE_CFG MSR after resume net: tun: call napi_schedule_prep() to ensure we own a napi dmaengine: at_hdmac: Check 
return code of dma_async_device_register dmaengine: at_hdmac: Fix impossible condition dmaengine: at_hdmac: Don't allow CPU to reorder channel enable dmaengine: at_hdmac: Fix completion of unissued descriptor in case of errors dmaengine: at_hdmac: Don't start transactions at tx_submit level dmaengine: at_hdmac: Fix at_lli struct definition cert host tools: Stop complaining about deprecated OpenSSL functions can: j1939: j1939_send_one(): fix missing CAN header initialization udf: Fix a slab-out-of-bounds write bug in udf_find_entry() btrfs: selftests: fix wrong error check in btrfs_free_dummy_root() platform/x86: hp_wmi: Fix rfkill causing soft blocked wifi drm/i915/dmabuf: fix sg_table handling in map_dma_buf nilfs2: fix use-after-free bug of ns_writer on remount nilfs2: fix deadlock in nilfs_count_free_blocks() vmlinux.lds.h: Fix placement of '.data..decrypted' section ALSA: usb-audio: Add DSD support for Accuphase DAC-60 ALSA: usb-audio: Add quirk entry for M-Audio Micro ALSA: hda: fix potential memleak in 'add_widget_node' ALSA: hda/ca0132: add quirk for EVGA Z390 DARK mmc: sdhci-tegra: Fix SDHCI_RESET_ALL for CQHCI mmc: sdhci-of-arasan: Fix SDHCI_RESET_ALL for CQHCI mmc: cqhci: Provide helper for resetting both SDHCI and CQHCI MIPS: jump_label: Fix compat branch range check arm64: efi: Fix handling of misaligned runtime regions and drop warning riscv: process: fix kernel info leakage net: macvlan: fix memory leaks of macvlan_common_newlink ethernet: tundra: free irq when alloc ring failed in tsi108_open() net: mv643xx_eth: disable napi when init rxq or txq failed in mv643xx_eth_open() ethernet: s2io: disable napi when start nic failed in s2io_card_up() cxgb4vf: shut down the adapter when t4vf_update_port_info() failed in cxgb4vf_open() net: cxgb3_main: disable napi when bind qsets failed in cxgb_up() net: cpsw: disable napi in cpsw_ndo_open() net/mlx5: Allow async trigger completion execution on single CPU systems net: nixge: disable napi when enable interrupts 
failed in nixge_open() perf stat: Fix printing os->prefix in CSV metrics output drivers: net: xgene: disable napi when register irq failed in xgene_enet_open() dmaengine: mv_xor_v2: Fix a resource leak in mv_xor_v2_remove() dmaengine: pxa_dma: use platform_get_irq_optional tipc: fix the msg->req tlv len check in tipc_nl_compat_name_table_dump_header can: af_can: fix NULL pointer dereference in can_rx_register() ipv6: addrlabel: fix infoleak when sending struct ifaddrlblmsg to network drm/vc4: Fix missing platform_unregister_drivers() call in vc4_drm_register() hamradio: fix issue of dev reference count leakage in bpq_device_event() net: lapbether: fix issue of dev reference count leakage in lapbeth_device_event() capabilities: fix undefined behavior in bit shift for CAP_TO_MASK net: fman: Unregister ethernet device on removal bnxt_en: fix potentially incorrect return value for ndo_rx_flow_steer bnxt_en: Fix possible crash in bnxt_hwrm_set_coal() net: tun: Fix memory leaks of napi_get_frags net: gso: fix panic on frag_list with mixed head alloc types HID: hyperv: fix possible memory leak in mousevsc_probe() bpf, sockmap: Fix the sk->sk_forward_alloc warning of sk_stream_kill_queues wifi: cfg80211: fix memory leak in query_regdb_file() wifi: cfg80211: silence a sparse RCU warning phy: stm32: fix an error code in probe xfs: drain the buf delwri queue before xfsaild idles xfs: preserve inode versioning across remounts xfs: use MMAPLOCK around filemap_map_pages() xfs: redesign the reflink remap loop to fix blkres depletion crash xfs: rename xfs_bmap_is_real_extent to is_written_extent xfs: preserve rmapbt swapext block reservation from freed blocks ANDROID: properly copy the scm_io_uring field in struct sk_buff Linux 5.4.224 ipc: remove memcg accounting for sops objects in do_semtimedop() wifi: brcmfmac: Fix potential buffer overflow in brcmf_fweh_event_worker() drm/i915/sdvo: Setup DDC fully before output init drm/i915/sdvo: Filter out invalid outputs more sensibly 
drm/rockchip: dsi: Force synchronous probe mtd: rawnand: gpmi: Set WAIT_FOR_READY timeout based on program/erase times KVM: x86: emulator: update the emulation mode after CR0 write KVM: x86: emulator: introduce emulator_recalc_and_set_mode KVM: x86: emulator: em_sysexit should update ctxt->mode KVM: x86: Mask off reserved bits in CPUID.80000008H KVM: x86: Mask off reserved bits in CPUID.8000001AH ext4: fix BUG_ON() when directory entry has invalid rec_len ext4: fix warning in 'ext4_da_release_space' parisc: Avoid printing the hardware path twice parisc: Export iosapic_serial_irq() symbol for serial port driver parisc: Make 8250_gsc driver dependend on CONFIG_PARISC ALSA: usb-audio: Add quirks for MacroSilicon MS2100/MS2106 devices perf/x86/intel: Add Cooper Lake stepping to isolation_ucodes[] perf/x86/intel: Fix pebs event constraints for ICL efi: random: reduce seed size to 32 bytes fuse: add file_modified() to fallocate capabilities: fix potential memleak on error path from vfs_getxattr_alloc() tracing/histogram: Update document for KEYS_MAX size tools/nolibc/string: Fix memcmp() implementation kprobe: reverse kp->flags when arm_kprobe failed tcp/udp: Make early_demux back namespacified. btrfs: fix type of parameter generation in btrfs_get_dentry binder: fix UAF of alloc->vma in race with munmap() memcg: enable accounting of ipc resources tcp/udp: Fix memory leak in ipv6_renew_options(). 
block, bfq: protect 'bfqd->queued' by 'bfqd->lock' Bluetooth: L2CAP: Fix attempting to access uninitialized memory xfs: Add the missed xfs_perag_put() for xfs_ifree_cluster() xfs: don't fail unwritten extent conversion on writeback due to edquot xfs: group quota should return EDQUOT when prj quota enabled xfs: gut error handling in xfs_trans_unreserve_and_mod_sb() xfs: use ordered buffers to initialize dquot buffers during quotacheck xfs: don't fail verifier on empty attr3 leaf block i2c: xiic: Add platform module alias HID: saitek: add madcatz variant of MMO7 mouse device ID scsi: core: Restrict legal sdev_state transitions via sysfs media: meson: vdec: fix possible refcount leak in vdec_probe() media: dvb-frontends/drxk: initialize err to 0 media: cros-ec-cec: limit msg.len to CEC_MAX_MSG_SIZE media: s5p_cec: limit msg.len to CEC_MAX_MSG_SIZE ipv6: fix WARNING in ip6_route_net_exit_late() net, neigh: Fix null-ptr-deref in neigh_table_clear() net: mdio: fix undefined behavior in bit shift for __mdiobus_register Bluetooth: L2CAP: fix use-after-free in l2cap_conn_del() Bluetooth: L2CAP: Fix use-after-free caused by l2cap_reassemble_sdu btrfs: fix ulist leaks in error paths of qgroup self tests btrfs: fix inode list leak during backref walking at find_parent_nodes() btrfs: fix inode list leak during backref walking at resolve_indirect_refs() isdn: mISDN: netjet: fix wrong check of device registration mISDN: fix possible memory leak in mISDN_register_device() rose: Fix NULL pointer dereference in rose_send_frame() ipvs: fix WARNING in ip_vs_app_net_cleanup() ipvs: fix WARNING in __ip_vs_cleanup_batch() ipvs: use explicitly signed chars netfilter: nf_tables: release flow rule object from commit path net: tun: fix bugs for oversize packet when napi frags enabled net: sched: Fix use after free in red_enqueue() ata: pata_legacy: fix pdc20230_set_piomode() net: fec: fix improper use of NETDEV_TX_BUSY nfc: nfcmrvl: Fix potential memory leak in nfcmrvl_i2c_nci_send() nfc: 
s3fwrn5: Fix potential memory leak in s3fwrn5_nci_send() RDMA/qedr: clean up work queue on failure in qedr_alloc_resources() RDMA/core: Fix null-ptr-deref in ib_core_cleanup() net: dsa: Fix possible memory leaks in dsa_loop_init() nfs4: Fix kmemleak when allocate slot failed NFSv4.1: We must always send RECLAIM_COMPLETE after a reboot NFSv4.1: Handle RECLAIM_COMPLETE trunking errors IB/hfi1: Correctly move list in sc_disable() RDMA/cma: Use output interface for net_dev check Linux 5.4.223 can: rcar_canfd: rcar_canfd_handle_global_receive(): fix IRQ storm on global FIFO receive net: enetc: survive memory pressure without crashing net/mlx5: Fix possible use-after-free in async command interface net/mlx5e: Do not increment ESN when updating IPsec ESN state nh: fix scope used to find saddr when adding non gw nh net: ehea: fix possible memory leak in ehea_register_port() openvswitch: switch from WARN to pr_warn ALSA: aoa: Fix I2S device accounting ALSA: aoa: i2sbus: fix possible memory leak in i2sbus_add_dev() PM: domains: Fix handling of unavailable/disabled idle states net: ksz884x: fix missing pci_disable_device() on error in pcidev_init() i40e: Fix flow-type by setting GL_HASH_INSET registers i40e: Fix VF hang when reset is triggered on another VF i40e: Fix ethtool rx-flow-hash setting for X722 media: videodev2.h: V4L2_DV_BT_BLANKING_HEIGHT should check 'interlaced' media: v4l2-dv-timings: add sanity checks for blanking values media: vivid: dev->bitmap_cap wasn't freed in all cases media: vivid: s_fbuf: add more sanity checks PM: hibernate: Allow hybrid sleep to work with s2idle can: mscan: mpc5xxx: mpc5xxx_can_probe(): add missing put_clock() in error path tcp: fix indefinite deferral of RTO with SACK reneging net: lantiq_etop: don't free skb when returning NETDEV_TX_BUSY net: fix UAF issue in nfqnl_nf_hook_drop() when ops_init() failed kcm: annotate data-races around kcm->rx_wait kcm: annotate data-races around kcm->rx_psock amd-xgbe: add the bit rate quirk for 
Molex cables amd-xgbe: fix the SFP compliance codes check for DAC cables x86/unwind/orc: Fix unreliable stack dump with gcov net: netsec: fix error handling in netsec_register_mdio() tipc: fix a null-ptr-deref in tipc_topsrv_accept ALSA: ac97: fix possible memory leak in snd_ac97_dev_register() arc: iounmap() arg is volatile drm/msm: Fix return type of mdp4_lvds_connector_mode_valid media: v4l2: Fix v4l2_i2c_subdev_set_name function documentation net: ieee802154: fix error return code in dgram_bind() mm,hugetlb: take hugetlb_lock before decrementing h->resv_huge_pages cgroup-v1: add disabled controller check in cgroup1_parse_param() xen/gntdev: Prevent leaking grants Xen/gntdev: don't ignore kernel unmapping error xfs: force the log after remapping a synchronous-writes file xfs: clear XFS_DQ_FREEING if we can't lock the dquot buffer to flush xfs: finish dfops on every insert range shift iteration s390/pci: add missing EX_TABLE entries to __pcistg_mio_inuser()/__pcilg_mio_inuser() s390/futex: add missing EX_TABLE entry to __futex_atomic_op() perf auxtrace: Fix address filter symbol name match for modules kernfs: fix use-after-free in __kernfs_remove mmc: core: Fix kernel panic when remove non-standard SDIO card drm/msm/hdmi: fix memory corruption with too many bridges drm/msm/dsi: fix memory corruption with too many bridges mac802154: Fix LQI recording fbdev: smscufx: Fix several use-after-free bugs iio: light: tsl2583: Fix module unloading tools: iio: iio_utils: fix digit calculation xhci: Remove device endpoints from bandwidth list when freeing the device mtd: rawnand: marvell: Use correct logic for nand-keep-config usb: xhci: add XHCI_SPURIOUS_SUCCESS to ASM1042 despite being a V0.96 controller usb: bdc: change state when port disconnected usb: dwc3: gadget: Don't set IMI for no_interrupt usb: dwc3: gadget: Stop processing more requests on IMI USB: add RESET_RESUME quirk for NVIDIA Jetson devices in RCM ALSA: au88x0: use explicitly signed char ALSA: Use 
del_timer_sync() before freeing timer can: kvaser_usb: Fix possible completions during init_completion can: j1939: transport: j1939_session_skb_drop_old(): spin_unlock_irqrestore() before kfree_skb() UPSTREAM: once: fix section mismatch on clang builds ANDROID: fix up struct sk_buf ABI breakage ANDROID: fix up CRC issue with struct tcp_sock Linux 5.4.222 once: fix section mismatch on clang builds ANDROID: fix up 131287ff83
("once: add DO_ONCE_SLOW() for sleepable contexts") Revert "serial: 8250: Fix restoring termios speed after suspend" Linux 5.4.221 mm: /proc/pid/smaps_rollup: fix no vma's null-deref hv_netvsc: Fix race between VF offering and VF association message from host Makefile.debug: re-enable debug info for .S files ACPI: video: Force backlight native for more TongFang devices riscv: topology: fix default topology reporting arm64: topology: move store_cpu_topology() to shared code iommu/vt-d: Clean up si_domain in the init_dmars() error path net: hns: fix possible memory leak in hnae_ae_register() net: sched: cake: fix null pointer access issue when cake_init() fails net: phy: dp83867: Extend RX strap quirk for SGMII mode net/atm: fix proc_mpc_write incorrect return value HID: magicmouse: Do not set BTN_MOUSE on double report tipc: fix an information leak in tipc_topsrv_kern_subscr tipc: Fix recognition of trial period ACPI: extlog: Handle multiple records btrfs: fix processing of delayed tree block refs during backref walking btrfs: fix processing of delayed data refs during backref walking r8152: add PID for the Lenovo OneLink+ Dock arm64: errata: Remove AES hwcap for COMPAT tasks media: venus: dec: Handle the case where find_format fails KVM: arm64: vgic: Fix exit condition in scan_its_table() ata: ahci: Match EM_MAX_SLOTS with SATA_PMP_MAX_PORTS ata: ahci-imx: Fix MODULE_ALIAS hwmon/coretemp: Handle large core ID value x86/microcode/AMD: Apply the patch early on every logical thread ocfs2: fix BUG when iput after ocfs2_mknod fails ocfs2: clear dinode links count in case of error xfs: fix use-after-free on CIL context on shutdown xfs: move inode flush to the sync workqueue xfs: reflink should force the log out if mounted with wsync xfs: factor out a new xfs_log_force_inode helper xfs: trylock underlying buffer on dquot flush xfs: don't write a corrupt unmount record to force summary counter recalc xfs: tail updates only need to occur when LSN changes xfs: factor common 
AIL item deletion code xfs: Throttle commits on delayed background CIL push xfs: Lower CIL flush limit for large logs xfs: preserve default grace interval during quotacheck xfs: fix unmount hang and memory leak on shutdown during quotaoff xfs: factor out quotaoff intent AIL removal and memory free xfs: Replace function declaration by actual definition xfs: remove the xfs_qoff_logitem_t typedef xfs: remove the xfs_dq_logitem_t typedef xfs: remove the xfs_disk_dquot_t and xfs_dquot_t xfs: Use scnprintf() for avoiding potential buffer overflow xfs: check owner of dir3 blocks xfs: check owner of dir3 data blocks xfs: fix buffer corruption reporting when xfs_dir3_free_header_check fails xfs: xfs_buf_corruption_error should take __this_address xfs: add a function to deal with corrupt buffers post-verifiers xfs: rework collapse range into an atomic operation xfs: rework insert range into an atomic operation xfs: open code insert range extent split helper Linux 5.4.220 thermal: intel_powerclamp: Use first online CPU as control_cpu inet: fully convert sk->sk_rx_dst to RCU rules efi: libstub: drop pointless get_memory_map() call md: Replace snprintf with scnprintf ext4: continue to expand file system when the target size doesn't reach net/ieee802154: don't warn zero-sized raw_sendmsg() Revert "net/ieee802154: reject zero-sized raw_sendmsg()" net: ieee802154: return -EINVAL for unknown addr type io_uring/af_unix: defer registered files gc to io_uring release perf intel-pt: Fix segfault in intel_pt_print_info() with uClibc clk: bcm2835: Make peripheral PLLC critical usb: idmouse: fix an uninit-value in idmouse_open nvmet-tcp: add bounds check on Transfer Tag nvme: copy firmware_rev on each init staging: rtl8723bs: fix a potential memory leak in rtw_init_cmd_priv() Revert "usb: storage: Add quirk for Samsung Fit flash" usb: musb: Fix musb_gadget.c rxstate overflow bug usb: host: xhci: Fix potential memory leak in xhci_alloc_stream_info() md/raid5: Wait for MD_SB_CHANGE_PENDING 
in raid5d HID: roccat: Fix use-after-free in roccat_read() bcache: fix set_at_max_writeback_rate() for multiple attached devices ata: libahci_platform: Sanity check the DT child nodes number staging: vt6655: fix potential memory leak power: supply: adp5061: fix out-of-bounds read in adp5061_get_chg_type() nbd: Fix hung when signal interrupts nbd_start_device_ioctl() scsi: 3w-9xxx: Avoid disabling device if failing to enable it clk: zynqmp: pll: rectify rate rounding in zynqmp_pll_round_rate media: cx88: Fix a null-ptr-deref bug in buffer_prepare() clk: zynqmp: Fix stack-out-of-bounds in strncpy` btrfs: scrub: try to fix super block errors ARM: dts: imx6sx: add missing properties for sram ARM: dts: imx6sll: add missing properties for sram ARM: dts: imx6sl: add missing properties for sram ARM: dts: imx6qp: add missing properties for sram ARM: dts: imx6dl: add missing properties for sram ARM: dts: imx6q: add missing properties for sram ARM: dts: imx7d-sdb: config the max pressure for tsc2046 mmc: sdhci-msm: add compatible string check for sdm670 drm/amdgpu: fix initial connector audio value platform/x86: msi-laptop: Change DMI match / alias strings to fix module autoloading drm: panel-orientation-quirks: Add quirk for Anbernic Win600 drm/vc4: vec: Fix timings for VEC modes drm/amd/display: fix overflow on MIN_I64 definition drm: Prevent drm_copy_field() to attempt copying a NULL pointer drm: Use size_t type for len variable in drm_copy_field() drm/nouveau/nouveau_bo: fix potential memory leak in nouveau_bo_alloc() r8152: Rate limit overflow messages Bluetooth: L2CAP: Fix user-after-free net: If sock is dead don't access sock's sk_wq in sk_stream_wait_memory wifi: rt2x00: correctly set BBP register 86 for MT7620 wifi: rt2x00: set SoC wmac clock register wifi: rt2x00: set VGC gain for both chains of MT7620 wifi: rt2x00: set correct TX_SW_CFG1 MAC register for MT7620 wifi: rt2x00: don't run Rt5592 IQ calibration on MT7620 can: bcm: check the result of can_send() in 
bcm_can_tx() Bluetooth: hci_sysfs: Fix attempting to call device_add multiple times Bluetooth: L2CAP: initialize delayed works at l2cap_chan_create() wifi: brcmfmac: fix use-after-free bug in brcmf_netdev_start_xmit() xfrm: Update ipcomp_scratches with NULL when freed wifi: ath9k: avoid uninit memory read in ath9k_htc_rx_msg() tcp: annotate data-race around tcp_md5sig_pool_populated openvswitch: Fix overreporting of drops in dropwatch openvswitch: Fix double reporting of drops in dropwatch bpftool: Clear errno after libcap's checks wifi: brcmfmac: fix invalid address access when enabling SCAN log level NFSD: Return nfserr_serverfault if splice_ok but buf->pages have data thermal: intel_powerclamp: Use get_cpu() instead of smp_processor_id() to avoid crash powercap: intel_rapl: fix UBSAN shift-out-of-bounds issue MIPS: BCM47XX: Cast memcmp() of function to (void *) ACPI: video: Add Toshiba Satellite/Portege Z830 quirk f2fs: fix race condition on setting FI_NO_EXTENT flag crypto: cavium - prevent integer overflow loading firmware kbuild: remove the target in signal traps when interrupted iommu/iova: Fix module config properly crypto: ccp - Release dma channels before dmaengine unrgister crypto: akcipher - default implementation for setting a private key iommu/omap: Fix buffer overflow in debugfs cgroup/cpuset: Enable update_tasks_cpumask() on top_cpuset powerpc: Fix SPE Power ISA properties for e500v1 platforms powerpc/64s: Fix GENERIC_CPU build flags for PPC970 / G5 x86/hyperv: Fix 'struct hv_enlightened_vmcs' definition powerpc/powernv: add missing of_node_put() in opal_export_attrs() powerpc/pci_dn: Add missing of_node_put() powerpc/sysdev/fsl_msi: Add missing of_node_put() powerpc/math_emu/efp: Include module.h mailbox: bcm-ferxrm-mailbox: Fix error check for dma_map_sg clk: ast2600: BCLK comes from EPLL clk: ti: dra7-atl: Fix reference leak in of_dra7_atl_clk_probe clk: bcm2835: fix bcm2835_clock_rate_from_divisor declaration spmi: pmic-arb: correct duplicate 
APID to PPID mapping logic dmaengine: ioat: stop mod_timer from resurrecting deleted timer in __cleanup() clk: mediatek: mt8183: mfgcfg: Propagate rate changes to parent mfd: sm501: Add check for platform_driver_register() mfd: fsl-imx25: Fix check for platform_get_irq() errors mfd: lp8788: Fix an error handling path in lp8788_irq_init() and lp8788_irq_init() mfd: lp8788: Fix an error handling path in lp8788_probe() mfd: fsl-imx25: Fix an error handling path in mx25_tsadc_setup_irq() mfd: intel_soc_pmic: Fix an error handling path in intel_soc_pmic_i2c_probe() fsi: core: Check error number after calling ida_simple_get scsi: libsas: Fix use-after-free bug in smp_execute_task_sg() serial: 8250: Fix restoring termios speed after suspend firmware: google: Test spinlock on panic path to avoid lockups staging: vt6655: fix some erroneous memory clean-up loops phy: qualcomm: call clk_disable_unprepare in the error handling tty: serial: fsl_lpuart: disable dma rx/tx use flags in lpuart_dma_shutdown drivers: serial: jsm: fix some leaks in probe usb: gadget: function: fix dangling pnp_string in f_printer.c xhci: Don't show warning for reinit on known broken suspend md/raid5: Ensure stripe_fill happens on non-read IO with journal mtd: rawnand: meson: fix bit map use in meson_nfc_ecc_correct() ata: fix ata_id_has_dipm() ata: fix ata_id_has_ncq_autosense() ata: fix ata_id_has_devslp() ata: fix ata_id_sense_reporting_enabled() and ata_id_has_sense_reporting() RDMA/siw: Always consume all skbuf data in sk_data_ready() upcall. 
mtd: devices: docg3: check the return value of devm_ioremap() in the probe dyndbg: let query-modname override actual module name dyndbg: fix module.dyndbg handling misc: ocxl: fix possible refcount leak in afu_ioctl() RDMA/rxe: Fix the error caused by qp->sk RDMA/rxe: Fix "kernel NULL pointer dereference" error media: xilinx: vipp: Fix refcount leak in xvip_graph_dma_init tty: xilinx_uartps: Fix the ignore_status media: exynos4-is: fimc-is: Add of_node_put() when breaking out of loop HSI: omap_ssi_port: Fix dma_map_sg error check HSI: omap_ssi: Fix refcount leak in ssi_probe clk: tegra20: Fix refcount leak in tegra20_clock_init clk: tegra: Fix refcount leak in tegra114_clock_init clk: tegra: Fix refcount leak in tegra210_clock_init clk: berlin: Add of_node_put() for of_get_parent() clk: oxnas: Hold reference returned by of_get_parent() clk: meson: Hold reference returned by of_get_parent() iio: ABI: Fix wrong format of differential capacitance channel ABI. iio: inkern: only release the device node when done with it iio: adc: at91-sama5d2_adc: lock around oversampling and sample freq iio: adc: at91-sama5d2_adc: check return status for pressure and touch iio: adc: at91-sama5d2_adc: fix AT91_SAMA5D2_MR_TRACKTIM_MAX ARM: dts: exynos: fix polarity of VBUS GPIO of Origen ARM: Drop CMDLINE_* dependency on ATAGS ARM: dts: exynos: correct s5k6a3 reset polarity on Midas family ARM: dts: kirkwood: lsxl: remove first ethernet port ARM: dts: kirkwood: lsxl: fix serial line ARM: dts: turris-omnia: Fix mpp26 pin name and comment soc: qcom: smem_state: Add refcounting for the 'state->of_node' soc: qcom: smsm: Fix refcount leak bugs in qcom_smsm_probe() memory: of: Fix refcount leak bug in of_get_ddr_timings() memory: pl353-smc: Fix refcount leak bug in pl353_smc_probe() ALSA: hda/hdmi: Don't skip notification handling during PM operation ASoC: wm5102: Fix PM disable depth imbalance in wm5102_probe ASoC: wm5110: Fix PM disable depth imbalance in wm5110_probe ASoC: wm8997: Fix PM 
disable depth imbalance in wm8997_probe mmc: wmt-sdmmc: Fix an error handling path in wmt_mci_probe() ALSA: dmaengine: increment buffer pointer atomically drm/msm/dpu: index dpu_kms->hw_vbif using vbif_idx ASoC: eureka-tlv320: Hold reference returned from of_find_xxx API mmc: au1xmmc: Fix an error handling path in au1xmmc_probe() drm/omap: dss: Fix refcount leak bugs ALSA: hda: beep: Simplify keep-power-at-enable behavior ASoC: rsnd: Add check for rsnd_mod_power_on drm/bridge: megachips: Fix a null pointer dereference bug drm: fix drm_mipi_dbi build errors platform/x86: msi-laptop: Fix resource cleanup platform/x86: msi-laptop: Fix old-ec check for backlight registering platform/chrome: fix memory corruption in ioctl platform/chrome: fix double-free in chromeos_laptop_prepare() drm/mipi-dsi: Detach devices when removing the host drm: bridge: adv7511: fix CEC power down control register offset net: mvpp2: fix mvpp2 debugfs leak once: add DO_ONCE_SLOW() for sleepable contexts net/ieee802154: reject zero-sized raw_sendmsg() bnx2x: fix potential memory leak in bnx2x_tpa_stop() net: rds: don't hold sock lock when cancelling work from rds_tcp_reset_callbacks() tcp: fix tcp_cwnd_validate() to not forget is_cwnd_limited sctp: handle the error returned from sctp_auth_asoc_init_active_key mISDN: fix use-after-free bugs in l1oip timer handlers vhost/vsock: Use kvmalloc/kvfree for larger packets. 
spi: s3c64xx: Fix large transfers with DMA netfilter: nft_fib: Fix for rpath check with VRF devices spi/omap100k:Fix PM disable depth imbalance in omap1_spi100k_probe x86/microcode/AMD: Track patch allocation size explicitly bpf: Ensure correct locking around vulnerable function find_vpid() net: fs_enet: Fix wrong check in do_pd_setup wifi: rtl8xxxu: gen2: Fix mistake in path B IQ calibration bpf: btf: fix truncated last_member_type_id in btf_struct_resolve wifi: rtl8xxxu: Fix skb misuse in TX queue selection spi: qup: add missing clk_disable_unprepare on error in spi_qup_pm_resume_runtime() spi: qup: add missing clk_disable_unprepare on error in spi_qup_resume() wifi: rtl8xxxu: tighten bounds checking in rtl8xxxu_read_efuse() x86/resctrl: Fix to restore to original value when re-enabling hardware prefetch register bpftool: Fix a wrong type cast in btf_dumper_int wifi: mac80211: allow bw change during channel switch in mesh wifi: ath10k: add peer map clean up for peer delete in ath10k_sta_state() nfsd: Fix a memory leak in an error handling path ARM: 9247/1: mm: set readonly for MT_MEMORY_RO with ARM_LPAE sh: machvec: Use char[] for section boundaries userfaultfd: open userfaultfds with O_RDONLY tracing: Disable interrupt or preemption before acquiring arch_spinlock_t selinux: use "grep -E" instead of "egrep" drm/nouveau: fix a use-after-free in nouveau_gem_prime_import_sg_table() gcov: support GCC 12.1 and newer compilers KVM: VMX: Drop bits 31:16 when shoving exception error code into VMCS KVM: nVMX: Unconditionally purge queued/injected events on nested "exit" KVM: x86/emulator: Fix handing of POP SS to correctly set interruptibility media: cedrus: Set the platform driver data earlier ring-buffer: Fix race between reset page and reading page ring-buffer: Check pending waiters when doing wake ups as well ring-buffer: Have the shortest_full queue be the shortest not longest ring-buffer: Allow splice to read previous partially read pages ftrace: Properly unset 
FTRACE_HASH_FL_MOD livepatch: fix race between fork and KLP transition ext4: place buffer head allocation before handle start ext4: make ext4_lazyinit_thread freezable ext4: fix null-ptr-deref in ext4_write_info ext4: avoid crash when inline data creation follows DIO write jbd2: wake up journal waiters in FIFO order, not LIFO nilfs2: fix use-after-free bug of struct nilfs_root f2fs: fix to do sanity check on summary info f2fs: fix to do sanity check on destination blkaddr during recovery f2fs: increase the limit for reserve_root btrfs: fix race between quota enable and quota rescan ioctl fbdev: smscufx: Fix use-after-free in ufx_ops_open() powerpc/boot: Explicitly disable usage of SPE instructions PCI: Sanitise firmware BAR assignments behind a PCI-PCI bridge UM: cpuinfo: Fix a warning for CONFIG_CPUMASK_OFFSTACK riscv: Pass -mno-relax only on lld < 15.0.0 riscv: Allow PROT_WRITE-only mmap() parisc: fbdev/stifb: Align graphics memory size to 4MB RISC-V: Make port I/O string accessors actually work regulator: qcom_rpm: Fix circular deferral regression ASoC: wcd9335: fix order of Slimbus unprepare/disable quota: Check next/prev free block number after reading from quota file HID: multitouch: Add memory barriers fs: dlm: handle -EBUSY first in lock arg validation fs: dlm: fix race between test_bit() and queue_work() mmc: sdhci-sprd: Fix minimum clock limit can: kvaser_usb_leaf: Fix CAN state after restart can: kvaser_usb_leaf: Fix TX queue out of sync after restart can: kvaser_usb_leaf: Fix overread with an invalid command can: kvaser_usb: Fix use of uninitialized completion usb: add quirks for Lenovo OneLink+ Dock iio: pressure: dps310: Reset chip after timeout iio: pressure: dps310: Refactor startup procedure iio: dac: ad5593r: Fix i2c read protocol requirements cifs: Fix the error length of VALIDATE_NEGOTIATE_INFO message cifs: destage dirty pages before re-reading them for cache=none mtd: rawnand: atmel: Unmap streaming DMA mappings ALSA: hda/realtek: Add Intel 
Reference SSID to support headset keys ALSA: hda/realtek: Add quirk for ASUS GV601R laptop ALSA: hda/realtek: Correct pin configs for ASUS G533Z ALSA: hda/realtek: remove ALC289_FIXUP_DUAL_SPK for Dell 5530 ALSA: usb-audio: Fix NULL dererence at error path ALSA: usb-audio: Fix potential memory leaks ALSA: rawmidi: Drop register_mutex in snd_rawmidi_free() ALSA: oss: Fix potential deadlock at unregistration Conflicts: drivers/android/binder.c include/linux/rmap.h kernel/cgroup/cpuset.c mm/rmap.c Change-Id: I34fe2d5c9b0d5844560de9c983867511b5d57265
2795 lines
73 KiB
C
2795 lines
73 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/*
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
* operating system. INET is implemented using the BSD Socket
|
|
* interface as the means of communication with the user level.
|
|
*
|
|
* Implementation of the Transmission Control Protocol(TCP).
|
|
*
|
|
* IPv4 specific functions
|
|
*
|
|
* code split from:
|
|
* linux/ipv4/tcp.c
|
|
* linux/ipv4/tcp_input.c
|
|
* linux/ipv4/tcp_output.c
|
|
*
|
|
* See tcp.c for author information
|
|
*/
|
|
|
|
/*
|
|
* Changes:
|
|
* David S. Miller : New socket lookup architecture.
|
|
* This code is dedicated to John Dyson.
|
|
* David S. Miller : Change semantics of established hash,
|
|
* half is devoted to TIME_WAIT sockets
|
|
* and the rest go in the other half.
|
|
* Andi Kleen : Add support for syncookies and fixed
|
|
* some bugs: ip options weren't passed to
|
|
* the TCP layer, missed a check for an
|
|
* ACK bit.
|
|
* Andi Kleen : Implemented fast path mtu discovery.
|
|
* Fixed many serious bugs in the
|
|
* request_sock handling and moved
|
|
* most of it into the af independent code.
|
|
* Added tail drop and some other bugfixes.
|
|
* Added new listen semantics.
|
|
* Mike McLagan : Routing by source
|
|
* Juan Jose Ciarlante: ip_dynaddr bits
|
|
* Andi Kleen: various fixes.
|
|
* Vitaly E. Lavrov : Transparent proxy revived after year
|
|
* coma.
|
|
* Andi Kleen : Fix new listen.
|
|
* Andi Kleen : Fix accept error reporting.
|
|
* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
|
|
* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
|
|
* a single port at the same time.
|
|
*/
|
|
|
|
#define pr_fmt(fmt) "TCP: " fmt
|
|
|
|
#include <linux/bottom_half.h>
|
|
#include <linux/types.h>
|
|
#include <linux/fcntl.h>
|
|
#include <linux/module.h>
|
|
#include <linux/random.h>
|
|
#include <linux/cache.h>
|
|
#include <linux/jhash.h>
|
|
#include <linux/init.h>
|
|
#include <linux/times.h>
|
|
#include <linux/slab.h>
|
|
|
|
#include <net/net_namespace.h>
|
|
#include <net/icmp.h>
|
|
#include <net/inet_hashtables.h>
|
|
#include <net/tcp.h>
|
|
#include <net/transp_v6.h>
|
|
#include <net/ipv6.h>
|
|
#include <net/inet_common.h>
|
|
#include <net/timewait_sock.h>
|
|
#include <net/xfrm.h>
|
|
#include <net/secure_seq.h>
|
|
#include <net/busy_poll.h>
|
|
|
|
#include <linux/inet.h>
|
|
#include <linux/ipv6.h>
|
|
#include <linux/stddef.h>
|
|
#include <linux/proc_fs.h>
|
|
#include <linux/seq_file.h>
|
|
#include <linux/inetdevice.h>
|
|
|
|
#include <crypto/hash.h>
|
|
#include <linux/scatterlist.h>
|
|
|
|
#include <trace/events/tcp.h>
|
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
|
|
__be32 daddr, __be32 saddr, const struct tcphdr *th);
|
|
#endif
|
|
|
|
struct inet_hashinfo tcp_hashinfo;
|
|
EXPORT_SYMBOL(tcp_hashinfo);
|
|
|
|
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
|
|
{
|
|
return secure_tcp_seq(ip_hdr(skb)->daddr,
|
|
ip_hdr(skb)->saddr,
|
|
tcp_hdr(skb)->dest,
|
|
tcp_hdr(skb)->source);
|
|
}
|
|
|
|
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
|
|
{
|
|
return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
|
|
}
|
|
|
|
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
|
|
{
|
|
int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
|
|
const struct inet_timewait_sock *tw = inet_twsk(sktw);
|
|
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
if (reuse == 2) {
|
|
/* Still does not detect *everything* that goes through
|
|
* lo, since we require a loopback src or dst address
|
|
* or direct binding to 'lo' interface.
|
|
*/
|
|
bool loopback = false;
|
|
if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
|
|
loopback = true;
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
if (tw->tw_family == AF_INET6) {
|
|
if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
|
|
(ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
|
|
(tw->tw_v6_daddr.s6_addr[12] == 127)) ||
|
|
ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
|
|
(ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
|
|
(tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
|
|
loopback = true;
|
|
} else
|
|
#endif
|
|
{
|
|
if (ipv4_is_loopback(tw->tw_daddr) ||
|
|
ipv4_is_loopback(tw->tw_rcv_saddr))
|
|
loopback = true;
|
|
}
|
|
if (!loopback)
|
|
reuse = 0;
|
|
}
|
|
|
|
/* With PAWS, it is safe from the viewpoint
|
|
of data integrity. Even without PAWS it is safe provided sequence
|
|
spaces do not overlap i.e. at data rates <= 80Mbit/sec.
|
|
|
|
Actually, the idea is close to VJ's one, only timestamp cache is
|
|
held not per host, but per port pair and TW bucket is used as state
|
|
holder.
|
|
|
|
If TW bucket has been already destroyed we fall back to VJ's scheme
|
|
and use initial timestamp retrieved from peer table.
|
|
*/
|
|
if (tcptw->tw_ts_recent_stamp &&
|
|
(!twp || (reuse && time_after32(ktime_get_seconds(),
|
|
tcptw->tw_ts_recent_stamp)))) {
|
|
/* In case of repair and re-using TIME-WAIT sockets we still
|
|
* want to be sure that it is safe as above but honor the
|
|
* sequence numbers and time stamps set as part of the repair
|
|
* process.
|
|
*
|
|
* Without this check re-using a TIME-WAIT socket with TCP
|
|
* repair would accumulate a -1 on the repair assigned
|
|
* sequence number. The first time it is reused the sequence
|
|
* is -1, the second time -2, etc. This fixes that issue
|
|
* without appearing to create any others.
|
|
*/
|
|
if (likely(!tp->repair)) {
|
|
u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
|
|
|
|
if (!seq)
|
|
seq = 1;
|
|
WRITE_ONCE(tp->write_seq, seq);
|
|
tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
|
|
tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
|
|
}
|
|
sock_hold(sktw);
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
|
|
|
|
static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
|
|
int addr_len)
|
|
{
|
|
/* This check is replicated from tcp_v4_connect() and intended to
|
|
* prevent BPF program called below from accessing bytes that are out
|
|
* of the bound specified by user in addr_len.
|
|
*/
|
|
if (addr_len < sizeof(struct sockaddr_in))
|
|
return -EINVAL;
|
|
|
|
sock_owned_by_me(sk);
|
|
|
|
return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
|
|
}
|
|
|
|
/* This will initiate an outgoing connection. */
|
|
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
|
|
{
|
|
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
__be16 orig_sport, orig_dport;
|
|
__be32 daddr, nexthop;
|
|
struct flowi4 *fl4;
|
|
struct rtable *rt;
|
|
int err;
|
|
struct ip_options_rcu *inet_opt;
|
|
struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
|
|
|
|
if (addr_len < sizeof(struct sockaddr_in))
|
|
return -EINVAL;
|
|
|
|
if (usin->sin_family != AF_INET)
|
|
return -EAFNOSUPPORT;
|
|
|
|
nexthop = daddr = usin->sin_addr.s_addr;
|
|
inet_opt = rcu_dereference_protected(inet->inet_opt,
|
|
lockdep_sock_is_held(sk));
|
|
if (inet_opt && inet_opt->opt.srr) {
|
|
if (!daddr)
|
|
return -EINVAL;
|
|
nexthop = inet_opt->opt.faddr;
|
|
}
|
|
|
|
orig_sport = inet->inet_sport;
|
|
orig_dport = usin->sin_port;
|
|
fl4 = &inet->cork.fl.u.ip4;
|
|
rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
|
|
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
|
|
IPPROTO_TCP,
|
|
orig_sport, orig_dport, sk);
|
|
if (IS_ERR(rt)) {
|
|
err = PTR_ERR(rt);
|
|
if (err == -ENETUNREACH)
|
|
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
|
|
return err;
|
|
}
|
|
|
|
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
|
|
ip_rt_put(rt);
|
|
return -ENETUNREACH;
|
|
}
|
|
|
|
if (!inet_opt || !inet_opt->opt.srr)
|
|
daddr = fl4->daddr;
|
|
|
|
if (!inet->inet_saddr)
|
|
inet->inet_saddr = fl4->saddr;
|
|
sk_rcv_saddr_set(sk, inet->inet_saddr);
|
|
|
|
if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
|
|
/* Reset inherited state */
|
|
tp->rx_opt.ts_recent = 0;
|
|
tp->rx_opt.ts_recent_stamp = 0;
|
|
if (likely(!tp->repair))
|
|
WRITE_ONCE(tp->write_seq, 0);
|
|
}
|
|
|
|
inet->inet_dport = usin->sin_port;
|
|
sk_daddr_set(sk, daddr);
|
|
|
|
inet_csk(sk)->icsk_ext_hdr_len = 0;
|
|
if (inet_opt)
|
|
inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
|
|
|
|
tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
|
|
|
|
/* Socket identity is still unknown (sport may be zero).
|
|
* However we set state to SYN-SENT and not releasing socket
|
|
* lock select source port, enter ourselves into the hash tables and
|
|
* complete initialization after this.
|
|
*/
|
|
tcp_set_state(sk, TCP_SYN_SENT);
|
|
err = inet_hash_connect(tcp_death_row, sk);
|
|
if (err)
|
|
goto failure;
|
|
|
|
sk_set_txhash(sk);
|
|
|
|
rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
|
|
inet->inet_sport, inet->inet_dport, sk);
|
|
if (IS_ERR(rt)) {
|
|
err = PTR_ERR(rt);
|
|
rt = NULL;
|
|
goto failure;
|
|
}
|
|
/* OK, now commit destination to socket. */
|
|
sk->sk_gso_type = SKB_GSO_TCPV4;
|
|
sk_setup_caps(sk, &rt->dst);
|
|
rt = NULL;
|
|
|
|
if (likely(!tp->repair)) {
|
|
if (!tp->write_seq)
|
|
WRITE_ONCE(tp->write_seq,
|
|
secure_tcp_seq(inet->inet_saddr,
|
|
inet->inet_daddr,
|
|
inet->inet_sport,
|
|
usin->sin_port));
|
|
tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
|
|
inet->inet_saddr,
|
|
inet->inet_daddr);
|
|
}
|
|
|
|
inet->inet_id = prandom_u32();
|
|
|
|
if (tcp_fastopen_defer_connect(sk, &err))
|
|
return err;
|
|
if (err)
|
|
goto failure;
|
|
|
|
err = tcp_connect(sk);
|
|
|
|
if (err)
|
|
goto failure;
|
|
|
|
return 0;
|
|
|
|
failure:
|
|
/*
|
|
* This unhashes the socket and releases the local port,
|
|
* if necessary.
|
|
*/
|
|
tcp_set_state(sk, TCP_CLOSE);
|
|
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
|
|
inet_reset_saddr(sk);
|
|
ip_rt_put(rt);
|
|
sk->sk_route_caps = 0;
|
|
inet->inet_dport = 0;
|
|
return err;
|
|
}
|
|
EXPORT_SYMBOL(tcp_v4_connect);
|
|
|
|
/*
|
|
* This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
|
|
* It can be called through tcp_release_cb() if socket was owned by user
|
|
* at the time tcp_v4_err() was called to handle ICMP message.
|
|
*/
|
|
void tcp_v4_mtu_reduced(struct sock *sk)
|
|
{
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
struct dst_entry *dst;
|
|
u32 mtu;
|
|
|
|
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
|
|
return;
|
|
mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
|
|
dst = inet_csk_update_pmtu(sk, mtu);
|
|
if (!dst)
|
|
return;
|
|
|
|
/* Something is about to be wrong... Remember soft error
|
|
* for the case, if this connection will not able to recover.
|
|
*/
|
|
if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
|
|
sk->sk_err_soft = EMSGSIZE;
|
|
|
|
mtu = dst_mtu(dst);
|
|
|
|
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
|
|
ip_sk_accept_pmtu(sk) &&
|
|
inet_csk(sk)->icsk_pmtu_cookie > mtu) {
|
|
tcp_sync_mss(sk, mtu);
|
|
|
|
/* Resend the TCP packet because it's
|
|
* clear that the old packet has been
|
|
* dropped. This is the new "fast" path mtu
|
|
* discovery.
|
|
*/
|
|
tcp_simple_retransmit(sk);
|
|
} /* else let the usual retransmit timer handle it */
|
|
}
|
|
EXPORT_SYMBOL(tcp_v4_mtu_reduced);
|
|
|
|
static void do_redirect(struct sk_buff *skb, struct sock *sk)
|
|
{
|
|
struct dst_entry *dst = __sk_dst_check(sk, 0);
|
|
|
|
if (dst)
|
|
dst->ops->redirect(dst, sk, skb);
|
|
}
|
|
|
|
|
|
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
|
|
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
|
|
{
|
|
struct request_sock *req = inet_reqsk(sk);
|
|
struct net *net = sock_net(sk);
|
|
|
|
/* ICMPs are not backlogged, hence we cannot get
|
|
* an established socket here.
|
|
*/
|
|
if (seq != tcp_rsk(req)->snt_isn) {
|
|
__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
|
|
} else if (abort) {
|
|
/*
|
|
* Still in SYN_RECV, just remove it silently.
|
|
* There is no good way to pass the error to the newly
|
|
* created socket, and POSIX does not want network
|
|
* errors returned from accept().
|
|
*/
|
|
inet_csk_reqsk_queue_drop(req->rsk_listener, req);
|
|
tcp_listendrop(req->rsk_listener);
|
|
}
|
|
reqsk_put(req);
|
|
}
|
|
EXPORT_SYMBOL(tcp_req_err);
|
|
|
|
/*
|
|
* This routine is called by the ICMP module when it gets some
|
|
* sort of error condition. If err < 0 then the socket should
|
|
* be closed and the error returned to the user. If err > 0
|
|
* it's just the icmp type << 8 | icmp code. After adjustment
|
|
* header points to the first 8 bytes of the tcp header. We need
|
|
* to find the appropriate port.
|
|
*
|
|
* The locking strategy used here is very "optimistic". When
|
|
* someone else accesses the socket the ICMP is just dropped
|
|
* and for some paths there is no check at all.
|
|
* A more general error queue to queue errors for later handling
|
|
* is probably better.
|
|
*
|
|
*/
|
|
|
|
int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
|
|
{
|
|
const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
|
|
struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
|
|
struct inet_connection_sock *icsk;
|
|
struct tcp_sock *tp;
|
|
struct inet_sock *inet;
|
|
const int type = icmp_hdr(icmp_skb)->type;
|
|
const int code = icmp_hdr(icmp_skb)->code;
|
|
struct sock *sk;
|
|
struct sk_buff *skb;
|
|
struct request_sock *fastopen;
|
|
u32 seq, snd_una;
|
|
s32 remaining;
|
|
u32 delta_us;
|
|
int err;
|
|
struct net *net = dev_net(icmp_skb->dev);
|
|
|
|
sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
|
|
th->dest, iph->saddr, ntohs(th->source),
|
|
inet_iif(icmp_skb), 0);
|
|
if (!sk) {
|
|
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
|
|
return -ENOENT;
|
|
}
|
|
if (sk->sk_state == TCP_TIME_WAIT) {
|
|
inet_twsk_put(inet_twsk(sk));
|
|
return 0;
|
|
}
|
|
seq = ntohl(th->seq);
|
|
if (sk->sk_state == TCP_NEW_SYN_RECV) {
|
|
tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
|
|
type == ICMP_TIME_EXCEEDED ||
|
|
(type == ICMP_DEST_UNREACH &&
|
|
(code == ICMP_NET_UNREACH ||
|
|
code == ICMP_HOST_UNREACH)));
|
|
return 0;
|
|
}
|
|
|
|
bh_lock_sock(sk);
|
|
/* If too many ICMPs get dropped on busy
|
|
* servers this needs to be solved differently.
|
|
* We do take care of PMTU discovery (RFC1191) special case :
|
|
* we can receive locally generated ICMP messages while socket is held.
|
|
*/
|
|
if (sock_owned_by_user(sk)) {
|
|
if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
|
|
__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
|
|
}
|
|
if (sk->sk_state == TCP_CLOSE)
|
|
goto out;
|
|
|
|
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
|
|
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
|
|
goto out;
|
|
}
|
|
|
|
icsk = inet_csk(sk);
|
|
tp = tcp_sk(sk);
|
|
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
|
|
fastopen = rcu_dereference(tp->fastopen_rsk);
|
|
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
|
|
if (sk->sk_state != TCP_LISTEN &&
|
|
!between(seq, snd_una, tp->snd_nxt)) {
|
|
__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
|
|
goto out;
|
|
}
|
|
|
|
switch (type) {
|
|
case ICMP_REDIRECT:
|
|
if (!sock_owned_by_user(sk))
|
|
do_redirect(icmp_skb, sk);
|
|
goto out;
|
|
case ICMP_SOURCE_QUENCH:
|
|
/* Just silently ignore these. */
|
|
goto out;
|
|
case ICMP_PARAMETERPROB:
|
|
err = EPROTO;
|
|
break;
|
|
case ICMP_DEST_UNREACH:
|
|
if (code > NR_ICMP_UNREACH)
|
|
goto out;
|
|
|
|
if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
|
|
/* We are not interested in TCP_LISTEN and open_requests
|
|
* (SYN-ACKs send out by Linux are always <576bytes so
|
|
* they should go through unfragmented).
|
|
*/
|
|
if (sk->sk_state == TCP_LISTEN)
|
|
goto out;
|
|
|
|
WRITE_ONCE(tp->mtu_info, info);
|
|
if (!sock_owned_by_user(sk)) {
|
|
tcp_v4_mtu_reduced(sk);
|
|
} else {
|
|
if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
|
|
sock_hold(sk);
|
|
}
|
|
goto out;
|
|
}
|
|
|
|
err = icmp_err_convert[code].errno;
|
|
/* check if icmp_skb allows revert of backoff
|
|
* (see draft-zimmermann-tcp-lcd) */
|
|
if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
|
|
break;
|
|
if (seq != tp->snd_una || !icsk->icsk_retransmits ||
|
|
!icsk->icsk_backoff || fastopen)
|
|
break;
|
|
|
|
if (sock_owned_by_user(sk))
|
|
break;
|
|
|
|
skb = tcp_rtx_queue_head(sk);
|
|
if (WARN_ON_ONCE(!skb))
|
|
break;
|
|
|
|
icsk->icsk_backoff--;
|
|
icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
|
|
TCP_TIMEOUT_INIT;
|
|
icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
|
|
|
|
|
|
tcp_mstamp_refresh(tp);
|
|
delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
|
|
remaining = icsk->icsk_rto -
|
|
usecs_to_jiffies(delta_us);
|
|
|
|
if (remaining > 0) {
|
|
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
|
|
remaining, TCP_RTO_MAX);
|
|
} else {
|
|
/* RTO revert clocked out retransmission.
|
|
* Will retransmit now */
|
|
tcp_retransmit_timer(sk);
|
|
}
|
|
|
|
break;
|
|
case ICMP_TIME_EXCEEDED:
|
|
err = EHOSTUNREACH;
|
|
break;
|
|
default:
|
|
goto out;
|
|
}
|
|
|
|
switch (sk->sk_state) {
|
|
case TCP_SYN_SENT:
|
|
case TCP_SYN_RECV:
|
|
/* Only in fast or simultaneous open. If a fast open socket is
|
|
* is already accepted it is treated as a connected one below.
|
|
*/
|
|
if (fastopen && !fastopen->sk)
|
|
break;
|
|
|
|
if (!sock_owned_by_user(sk)) {
|
|
sk->sk_err = err;
|
|
|
|
sk->sk_error_report(sk);
|
|
|
|
tcp_done(sk);
|
|
} else {
|
|
sk->sk_err_soft = err;
|
|
}
|
|
goto out;
|
|
}
|
|
|
|
/* If we've already connected we will keep trying
|
|
* until we time out, or the user gives up.
|
|
*
|
|
* rfc1122 4.2.3.9 allows to consider as hard errors
|
|
* only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
|
|
* but it is obsoleted by pmtu discovery).
|
|
*
|
|
* Note, that in modern internet, where routing is unreliable
|
|
* and in each dark corner broken firewalls sit, sending random
|
|
* errors ordered by their masters even this two messages finally lose
|
|
* their original sense (even Linux sends invalid PORT_UNREACHs)
|
|
*
|
|
* Now we are in compliance with RFCs.
|
|
* --ANK (980905)
|
|
*/
|
|
|
|
inet = inet_sk(sk);
|
|
if (!sock_owned_by_user(sk) && inet->recverr) {
|
|
sk->sk_err = err;
|
|
sk->sk_error_report(sk);
|
|
} else { /* Only an error on timeout */
|
|
sk->sk_err_soft = err;
|
|
}
|
|
|
|
out:
|
|
bh_unlock_sock(sk);
|
|
sock_put(sk);
|
|
return 0;
|
|
}
|
|
|
|
/* Set up the checksum fields of @skb for a segment sent from @saddr to
 * @daddr: the TCP header's check field receives the complement of
 * tcp_v4_check() computed over skb->len and both addresses with a zero
 * base, and csum_start/csum_offset are pointed at that field.
 */
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);

	tcp_hdr(skb)->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
}
|
|
|
|
/* This routine computes an IPv4 TCP checksum. */
|
|
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
const struct inet_sock *inet = inet_sk(sk);
|
|
|
|
__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
|
|
}
|
|
EXPORT_SYMBOL(tcp_v4_send_check);
|
|
|
|
/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for the reset?
 *	Answer: if a packet caused an RST, it was not for a socket
 *		existing in our system; if it did match a socket,
 *		it is just a duplicate segment or a bug in the other
 *		side's TCP. So we build the reply based only on the
 *		parameters that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */
|
|
|
|
static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
struct {
|
|
struct tcphdr th;
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
|
|
#endif
|
|
} rep;
|
|
struct ip_reply_arg arg;
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
struct tcp_md5sig_key *key = NULL;
|
|
const __u8 *hash_location = NULL;
|
|
unsigned char newhash[16];
|
|
int genhash;
|
|
struct sock *sk1 = NULL;
|
|
#endif
|
|
u64 transmit_time = 0;
|
|
struct sock *ctl_sk;
|
|
struct net *net;
|
|
|
|
/* Never send a reset in response to a reset. */
|
|
if (th->rst)
|
|
return;
|
|
|
|
/* If sk not NULL, it means we did a successful lookup and incoming
|
|
* route had to be correct. prequeue might have dropped our dst.
|
|
*/
|
|
if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
|
|
return;
|
|
|
|
/* Swap the send and the receive. */
|
|
memset(&rep, 0, sizeof(rep));
|
|
rep.th.dest = th->source;
|
|
rep.th.source = th->dest;
|
|
rep.th.doff = sizeof(struct tcphdr) / 4;
|
|
rep.th.rst = 1;
|
|
|
|
if (th->ack) {
|
|
rep.th.seq = th->ack_seq;
|
|
} else {
|
|
rep.th.ack = 1;
|
|
rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
|
|
skb->len - (th->doff << 2));
|
|
}
|
|
|
|
memset(&arg, 0, sizeof(arg));
|
|
arg.iov[0].iov_base = (unsigned char *)&rep;
|
|
arg.iov[0].iov_len = sizeof(rep.th);
|
|
|
|
net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
rcu_read_lock();
|
|
hash_location = tcp_parse_md5sig_option(th);
|
|
if (sk && sk_fullsock(sk)) {
|
|
key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
|
|
&ip_hdr(skb)->saddr, AF_INET);
|
|
} else if (hash_location) {
|
|
/*
|
|
* active side is lost. Try to find listening socket through
|
|
* source port, and then find md5 key through listening socket.
|
|
* we are not loose security here:
|
|
* Incoming packet is checked with md5 hash with finding key,
|
|
* no RST generated if md5 hash doesn't match.
|
|
*/
|
|
sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
|
|
ip_hdr(skb)->saddr,
|
|
th->source, ip_hdr(skb)->daddr,
|
|
ntohs(th->source), inet_iif(skb),
|
|
tcp_v4_sdif(skb));
|
|
/* don't send rst if it can't find key */
|
|
if (!sk1)
|
|
goto out;
|
|
|
|
key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
|
|
&ip_hdr(skb)->saddr, AF_INET);
|
|
if (!key)
|
|
goto out;
|
|
|
|
|
|
genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
|
|
if (genhash || memcmp(hash_location, newhash, 16) != 0)
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (key) {
|
|
rep.opt[0] = htonl((TCPOPT_NOP << 24) |
|
|
(TCPOPT_NOP << 16) |
|
|
(TCPOPT_MD5SIG << 8) |
|
|
TCPOLEN_MD5SIG);
|
|
/* Update length and the length the header thinks exists */
|
|
arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
|
|
rep.th.doff = arg.iov[0].iov_len / 4;
|
|
|
|
tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
|
|
key, ip_hdr(skb)->saddr,
|
|
ip_hdr(skb)->daddr, &rep.th);
|
|
}
|
|
#endif
|
|
arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
|
|
ip_hdr(skb)->saddr, /* XXX */
|
|
arg.iov[0].iov_len, IPPROTO_TCP, 0);
|
|
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
|
|
arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
|
|
|
|
/* When socket is gone, all binding information is lost.
|
|
* routing might fail in this case. No choice here, if we choose to force
|
|
* input interface, we will misroute in case of asymmetric route.
|
|
*/
|
|
if (sk) {
|
|
arg.bound_dev_if = sk->sk_bound_dev_if;
|
|
if (sk_fullsock(sk))
|
|
trace_tcp_send_reset(sk, skb);
|
|
}
|
|
|
|
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
|
|
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
|
|
|
|
arg.tos = ip_hdr(skb)->tos;
|
|
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
|
|
local_bh_disable();
|
|
ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
|
|
if (sk) {
|
|
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
|
|
inet_twsk(sk)->tw_mark : sk->sk_mark;
|
|
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
|
|
inet_twsk(sk)->tw_priority : sk->sk_priority;
|
|
transmit_time = tcp_transmit_time(sk);
|
|
}
|
|
ip_send_unicast_reply(ctl_sk,
|
|
skb, &TCP_SKB_CB(skb)->header.h4.opt,
|
|
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
|
|
&arg, arg.iov[0].iov_len,
|
|
transmit_time);
|
|
|
|
ctl_sk->sk_mark = 0;
|
|
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
|
|
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
|
|
local_bh_enable();
|
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
out:
|
|
rcu_read_unlock();
|
|
#endif
|
|
}
|
|
|
|
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
|
|
outside socket context is ugly, certainly. What can I do?
|
|
*/
|
|
|
|
static void tcp_v4_send_ack(const struct sock *sk,
|
|
struct sk_buff *skb, u32 seq, u32 ack,
|
|
u32 win, u32 tsval, u32 tsecr, int oif,
|
|
struct tcp_md5sig_key *key,
|
|
int reply_flags, u8 tos)
|
|
{
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
struct {
|
|
struct tcphdr th;
|
|
__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
+ (TCPOLEN_MD5SIG_ALIGNED >> 2)
|
|
#endif
|
|
];
|
|
} rep;
|
|
struct net *net = sock_net(sk);
|
|
struct ip_reply_arg arg;
|
|
struct sock *ctl_sk;
|
|
u64 transmit_time;
|
|
|
|
memset(&rep.th, 0, sizeof(struct tcphdr));
|
|
memset(&arg, 0, sizeof(arg));
|
|
|
|
arg.iov[0].iov_base = (unsigned char *)&rep;
|
|
arg.iov[0].iov_len = sizeof(rep.th);
|
|
if (tsecr) {
|
|
rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
|
|
(TCPOPT_TIMESTAMP << 8) |
|
|
TCPOLEN_TIMESTAMP);
|
|
rep.opt[1] = htonl(tsval);
|
|
rep.opt[2] = htonl(tsecr);
|
|
arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
|
|
}
|
|
|
|
/* Swap the send and the receive. */
|
|
rep.th.dest = th->source;
|
|
rep.th.source = th->dest;
|
|
rep.th.doff = arg.iov[0].iov_len / 4;
|
|
rep.th.seq = htonl(seq);
|
|
rep.th.ack_seq = htonl(ack);
|
|
rep.th.ack = 1;
|
|
rep.th.window = htons(win);
|
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
if (key) {
|
|
int offset = (tsecr) ? 3 : 0;
|
|
|
|
rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
|
|
(TCPOPT_NOP << 16) |
|
|
(TCPOPT_MD5SIG << 8) |
|
|
TCPOLEN_MD5SIG);
|
|
arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
|
|
rep.th.doff = arg.iov[0].iov_len/4;
|
|
|
|
tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
|
|
key, ip_hdr(skb)->saddr,
|
|
ip_hdr(skb)->daddr, &rep.th);
|
|
}
|
|
#endif
|
|
arg.flags = reply_flags;
|
|
arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
|
|
ip_hdr(skb)->saddr, /* XXX */
|
|
arg.iov[0].iov_len, IPPROTO_TCP, 0);
|
|
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
|
|
if (oif)
|
|
arg.bound_dev_if = oif;
|
|
arg.tos = tos;
|
|
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
|
|
local_bh_disable();
|
|
ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
|
|
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
|
|
inet_twsk(sk)->tw_mark : sk->sk_mark;
|
|
ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
|
|
inet_twsk(sk)->tw_priority : sk->sk_priority;
|
|
transmit_time = tcp_transmit_time(sk);
|
|
ip_send_unicast_reply(ctl_sk,
|
|
skb, &TCP_SKB_CB(skb)->header.h4.opt,
|
|
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
|
|
&arg, arg.iov[0].iov_len,
|
|
transmit_time);
|
|
|
|
ctl_sk->sk_mark = 0;
|
|
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
|
|
local_bh_enable();
|
|
}
|
|
|
|
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct inet_timewait_sock *tw = inet_twsk(sk);
|
|
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
|
|
|
|
tcp_v4_send_ack(sk, skb,
|
|
tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
|
|
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
|
|
tcp_time_stamp_raw() + tcptw->tw_ts_offset,
|
|
tcptw->tw_ts_recent,
|
|
tw->tw_bound_dev_if,
|
|
tcp_twsk_md5_key(tcptw),
|
|
tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
|
|
tw->tw_tos
|
|
);
|
|
|
|
inet_twsk_put(tw);
|
|
}
|
|
|
|
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
|
|
struct request_sock *req)
|
|
{
|
|
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
|
|
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
|
|
*/
|
|
u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
|
|
tcp_sk(sk)->snd_nxt;
|
|
|
|
/* RFC 7323 2.3
|
|
* The window field (SEG.WND) of every outgoing segment, with the
|
|
* exception of <SYN> segments, MUST be right-shifted by
|
|
* Rcv.Wind.Shift bits:
|
|
*/
|
|
tcp_v4_send_ack(sk, skb, seq,
|
|
tcp_rsk(req)->rcv_nxt,
|
|
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
|
|
tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
|
|
req->ts_recent,
|
|
0,
|
|
tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
|
|
AF_INET),
|
|
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
|
|
ip_hdr(skb)->tos);
|
|
}
|
|
|
|
/*
|
|
* Send a SYN-ACK after having received a SYN.
|
|
* This still operates on a request_sock only, not on a big
|
|
* socket.
|
|
*/
|
|
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
|
|
struct flowi *fl,
|
|
struct request_sock *req,
|
|
struct tcp_fastopen_cookie *foc,
|
|
enum tcp_synack_type synack_type)
|
|
{
|
|
const struct inet_request_sock *ireq = inet_rsk(req);
|
|
struct flowi4 fl4;
|
|
int err = -1;
|
|
struct sk_buff *skb;
|
|
|
|
/* First, grab a route. */
|
|
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
|
|
return -1;
|
|
|
|
skb = tcp_make_synack(sk, dst, req, foc, synack_type);
|
|
|
|
if (skb) {
|
|
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
|
|
|
|
rcu_read_lock();
|
|
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
|
|
ireq->ir_rmt_addr,
|
|
rcu_dereference(ireq->ireq_opt));
|
|
rcu_read_unlock();
|
|
err = net_xmit_eval(err);
|
|
}
|
|
|
|
return err;
|
|
}
|
|
|
|
/*
|
|
* IPv4 request_sock destructor.
|
|
*/
|
|
static void tcp_v4_reqsk_destructor(struct request_sock *req)
|
|
{
|
|
kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
|
|
}
|
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
/*
 * RFC2385 MD5 checksumming needs a mapping of
 *   IP address -> MD5 key
 * and that mapping is kept in the sk structure.
 *
 * tcp_md5_needed is a static key that starts out false.
 */
DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
EXPORT_SYMBOL(tcp_md5_needed);
|
|
|
|
/* Find the Key structure for an address. */
|
|
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
|
|
const union tcp_md5_addr *addr,
|
|
int family)
|
|
{
|
|
const struct tcp_sock *tp = tcp_sk(sk);
|
|
struct tcp_md5sig_key *key;
|
|
const struct tcp_md5sig_info *md5sig;
|
|
__be32 mask;
|
|
struct tcp_md5sig_key *best_match = NULL;
|
|
bool match;
|
|
|
|
/* caller either holds rcu_read_lock() or socket lock */
|
|
md5sig = rcu_dereference_check(tp->md5sig_info,
|
|
lockdep_sock_is_held(sk));
|
|
if (!md5sig)
|
|
return NULL;
|
|
|
|
hlist_for_each_entry_rcu(key, &md5sig->head, node) {
|
|
if (key->family != family)
|
|
continue;
|
|
|
|
if (family == AF_INET) {
|
|
mask = inet_make_mask(key->prefixlen);
|
|
match = (key->addr.a4.s_addr & mask) ==
|
|
(addr->a4.s_addr & mask);
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
} else if (family == AF_INET6) {
|
|
match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
|
|
key->prefixlen);
|
|
#endif
|
|
} else {
|
|
match = false;
|
|
}
|
|
|
|
if (match && (!best_match ||
|
|
key->prefixlen > best_match->prefixlen))
|
|
best_match = key;
|
|
}
|
|
return best_match;
|
|
}
|
|
EXPORT_SYMBOL(__tcp_md5_do_lookup);
|
|
|
|
/* Find the key on @sk whose family, address bytes and prefix length are
 * all exactly equal to the ones given. Returns NULL if there is none.
 */
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct tcp_md5sig_info *md5sig;
	unsigned int size = sizeof(struct in_addr);
	struct tcp_md5sig_key *key;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	/* Compare the full IPv6 address for AF_INET6 keys. */
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
		if (key->family != family || key->prefixlen != prefixlen)
			continue;
		if (!memcmp(&key->addr, addr, size))
			return key;
	}
	return NULL;
}
|
|
|
|
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
|
|
const struct sock *addr_sk)
|
|
{
|
|
const union tcp_md5_addr *addr;
|
|
|
|
addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
|
|
return tcp_md5_do_lookup(sk, addr, AF_INET);
|
|
}
|
|
EXPORT_SYMBOL(tcp_v4_md5_lookup);
|
|
|
|
/* This can be called on a newly created socket, from other files */
|
|
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
|
|
int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
|
|
gfp_t gfp)
|
|
{
|
|
/* Add Key to the list */
|
|
struct tcp_md5sig_key *key;
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
struct tcp_md5sig_info *md5sig;
|
|
|
|
key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
|
|
if (key) {
|
|
/* Pre-existing entry - just update that one.
|
|
* Note that the key might be used concurrently.
|
|
*/
|
|
memcpy(key->key, newkey, newkeylen);
|
|
|
|
/* Pairs with READ_ONCE() in tcp_md5_hash_key().
|
|
* Also note that a reader could catch new key->keylen value
|
|
* but old key->key[], this is the reason we use __GFP_ZERO
|
|
* at sock_kmalloc() time below these lines.
|
|
*/
|
|
WRITE_ONCE(key->keylen, newkeylen);
|
|
|
|
return 0;
|
|
}
|
|
|
|
md5sig = rcu_dereference_protected(tp->md5sig_info,
|
|
lockdep_sock_is_held(sk));
|
|
if (!md5sig) {
|
|
md5sig = kmalloc(sizeof(*md5sig), gfp);
|
|
if (!md5sig)
|
|
return -ENOMEM;
|
|
|
|
sk_nocaps_add(sk, NETIF_F_GSO_MASK);
|
|
INIT_HLIST_HEAD(&md5sig->head);
|
|
rcu_assign_pointer(tp->md5sig_info, md5sig);
|
|
}
|
|
|
|
key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
|
|
if (!key)
|
|
return -ENOMEM;
|
|
if (!tcp_alloc_md5sig_pool()) {
|
|
sock_kfree_s(sk, key, sizeof(*key));
|
|
return -ENOMEM;
|
|
}
|
|
|
|
memcpy(key->key, newkey, newkeylen);
|
|
key->keylen = newkeylen;
|
|
key->family = family;
|
|
key->prefixlen = prefixlen;
|
|
memcpy(&key->addr, addr,
|
|
(family == AF_INET6) ? sizeof(struct in6_addr) :
|
|
sizeof(struct in_addr));
|
|
hlist_add_head_rcu(&key->node, &md5sig->head);
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL(tcp_md5_do_add);
|
|
|
|
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
|
|
u8 prefixlen)
|
|
{
|
|
struct tcp_md5sig_key *key;
|
|
|
|
key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
|
|
if (!key)
|
|
return -ENOENT;
|
|
hlist_del_rcu(&key->node);
|
|
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
|
|
kfree_rcu(key, rcu);
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL(tcp_md5_do_del);
|
|
|
|
static void tcp_clear_md5_list(struct sock *sk)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
struct tcp_md5sig_key *key;
|
|
struct hlist_node *n;
|
|
struct tcp_md5sig_info *md5sig;
|
|
|
|
md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
|
|
|
|
hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
|
|
hlist_del_rcu(&key->node);
|
|
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
|
|
kfree_rcu(key, rcu);
|
|
}
|
|
}
|
|
|
|
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
|
|
char __user *optval, int optlen)
|
|
{
|
|
struct tcp_md5sig cmd;
|
|
struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
|
|
u8 prefixlen = 32;
|
|
|
|
if (optlen < sizeof(cmd))
|
|
return -EINVAL;
|
|
|
|
if (copy_from_user(&cmd, optval, sizeof(cmd)))
|
|
return -EFAULT;
|
|
|
|
if (sin->sin_family != AF_INET)
|
|
return -EINVAL;
|
|
|
|
if (optname == TCP_MD5SIG_EXT &&
|
|
cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
|
|
prefixlen = cmd.tcpm_prefixlen;
|
|
if (prefixlen > 32)
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (!cmd.tcpm_keylen)
|
|
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
|
|
AF_INET, prefixlen);
|
|
|
|
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
|
|
return -EINVAL;
|
|
|
|
return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
|
|
AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
|
|
GFP_KERNEL);
|
|
}
|
|
|
|
/* Feed the IPv4 pseudo-header followed by a copy of @th (with its check
 * field zeroed) into the running MD5 request of @hp. Both are assembled
 * in the pool's scratch buffer. @nbytes goes into the pseudo-header
 * length field. Returns the result of crypto_ahash_update().
 */
static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp = hp->scratch;
	struct tcphdr *_th = (struct tcphdr *)(bp + 1);
	const unsigned int total = sizeof(*bp) + sizeof(*th);
	struct scatterlist sg;

	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	/* The TCP header copy sits right behind the pseudo-header. */
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, total);
	ahash_request_set_crypt(hp->md5_req, &sg, NULL, total);
	return crypto_ahash_update(hp->md5_req);
}
|
|
|
|
/* Compute the MD5 signature over the pseudo-header, the TCP header @th
 * and @key, and write it to @md5_hash.
 *
 * Returns 0 on success. On any failure the 16 bytes at @md5_hash are
 * zeroed and 1 is returned.
 */
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
	struct ahash_request *req;

	if (!hp)
		goto clear_hash_noput;
	req = hp->md5_req;

	/* The steps run in order and stop at the first one that fails. */
	if (crypto_ahash_init(req) ||
	    tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2) ||
	    tcp_md5_hash_key(hp, key))
		goto clear_hash;

	ahash_request_set_crypt(req, NULL, md5_hash, 0);
	if (crypto_ahash_final(req))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
|
|
|
|
int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
|
|
const struct sock *sk,
|
|
const struct sk_buff *skb)
|
|
{
|
|
struct tcp_md5sig_pool *hp;
|
|
struct ahash_request *req;
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
__be32 saddr, daddr;
|
|
|
|
if (sk) { /* valid for establish/request sockets */
|
|
saddr = sk->sk_rcv_saddr;
|
|
daddr = sk->sk_daddr;
|
|
} else {
|
|
const struct iphdr *iph = ip_hdr(skb);
|
|
saddr = iph->saddr;
|
|
daddr = iph->daddr;
|
|
}
|
|
|
|
hp = tcp_get_md5sig_pool();
|
|
if (!hp)
|
|
goto clear_hash_noput;
|
|
req = hp->md5_req;
|
|
|
|
if (crypto_ahash_init(req))
|
|
goto clear_hash;
|
|
|
|
if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
|
|
goto clear_hash;
|
|
if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
|
|
goto clear_hash;
|
|
if (tcp_md5_hash_key(hp, key))
|
|
goto clear_hash;
|
|
ahash_request_set_crypt(req, NULL, md5_hash, 0);
|
|
if (crypto_ahash_final(req))
|
|
goto clear_hash;
|
|
|
|
tcp_put_md5sig_pool();
|
|
return 0;
|
|
|
|
clear_hash:
|
|
tcp_put_md5sig_pool();
|
|
clear_hash_noput:
|
|
memset(md5_hash, 0, 16);
|
|
return 1;
|
|
}
|
|
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
|
|
|
|
#endif
|
|
|
|
/* Called with rcu_read_lock() */
|
|
static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
|
|
const struct sk_buff *skb)
|
|
{
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
/*
|
|
* This gets called for each TCP segment that arrives
|
|
* so we want to be efficient.
|
|
* We have 3 drop cases:
|
|
* o No MD5 hash and one expected.
|
|
* o MD5 hash and we're not expecting one.
|
|
* o MD5 hash and its wrong.
|
|
*/
|
|
const __u8 *hash_location = NULL;
|
|
struct tcp_md5sig_key *hash_expected;
|
|
const struct iphdr *iph = ip_hdr(skb);
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
int genhash;
|
|
unsigned char newhash[16];
|
|
|
|
hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
|
|
AF_INET);
|
|
hash_location = tcp_parse_md5sig_option(th);
|
|
|
|
/* We've parsed the options - do we have a hash? */
|
|
if (!hash_expected && !hash_location)
|
|
return false;
|
|
|
|
if (hash_expected && !hash_location) {
|
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
|
|
return true;
|
|
}
|
|
|
|
if (!hash_expected && hash_location) {
|
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
|
|
return true;
|
|
}
|
|
|
|
/* Okay, so this is hash_expected and hash_location -
|
|
* so we need to calculate the checksum.
|
|
*/
|
|
genhash = tcp_v4_md5_hash_skb(newhash,
|
|
hash_expected,
|
|
NULL, skb);
|
|
|
|
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
|
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
|
|
net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
|
|
&iph->saddr, ntohs(th->source),
|
|
&iph->daddr, ntohs(th->dest),
|
|
genhash ? " tcp_v4_calc_md5_hash failed"
|
|
: "");
|
|
return true;
|
|
}
|
|
return false;
|
|
#endif
|
|
return false;
|
|
}
|
|
|
|
static void tcp_v4_init_req(struct request_sock *req,
|
|
const struct sock *sk_listener,
|
|
struct sk_buff *skb)
|
|
{
|
|
struct inet_request_sock *ireq = inet_rsk(req);
|
|
struct net *net = sock_net(sk_listener);
|
|
|
|
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
|
|
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
|
|
RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
|
|
}
|
|
|
|
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
|
|
struct flowi *fl,
|
|
const struct request_sock *req)
|
|
{
|
|
return inet_csk_route_req(sk, &fl->u.ip4, req);
|
|
}
|
|
|
|
/* Generic request_sock operations for IPv4 TCP. */
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		 = PF_INET,
	.obj_size	 = sizeof(struct tcp_request_sock),
	.rtx_syn_ack	 = tcp_rtx_synack,
	.send_ack	 = tcp_v4_reqsk_send_ack,
	.destructor	 = tcp_v4_reqsk_destructor,
	.send_reset	 = tcp_v4_send_reset,
	.syn_ack_timeout = tcp_syn_ack_timeout,
};
|
|
|
|
const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
|
|
.mss_clamp = TCP_MSS_DEFAULT,
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
.req_md5_lookup = tcp_v4_md5_lookup,
|
|
.calc_md5_hash = tcp_v4_md5_hash_skb,
|
|
#endif
|
|
.init_req = tcp_v4_init_req,
|
|
#ifdef CONFIG_SYN_COOKIES
|
|
.cookie_init_seq = cookie_v4_init_sequence,
|
|
#endif
|
|
.route_req = tcp_v4_route_req,
|
|
.init_seq = tcp_v4_init_seq,
|
|
.init_ts_off = tcp_v4_init_ts_off,
|
|
.send_synack = tcp_v4_send_synack,
|
|
};
|
|
|
|
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
/* Never answer to SYNs send to broadcast or multicast */
|
|
if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
|
|
goto drop;
|
|
|
|
return tcp_conn_request(&tcp_request_sock_ops,
|
|
&tcp_request_sock_ipv4_ops, sk, skb);
|
|
|
|
drop:
|
|
tcp_listendrop(sk);
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL(tcp_v4_conn_request);
|
|
|
|
|
|
/*
|
|
* The three way handshake has completed - we got a valid synack -
|
|
* now create the new socket.
|
|
*/
|
|
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
|
|
struct request_sock *req,
|
|
struct dst_entry *dst,
|
|
struct request_sock *req_unhash,
|
|
bool *own_req)
|
|
{
|
|
struct inet_request_sock *ireq;
|
|
bool found_dup_sk = false;
|
|
struct inet_sock *newinet;
|
|
struct tcp_sock *newtp;
|
|
struct sock *newsk;
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
struct tcp_md5sig_key *key;
|
|
#endif
|
|
struct ip_options_rcu *inet_opt;
|
|
|
|
if (sk_acceptq_is_full(sk))
|
|
goto exit_overflow;
|
|
|
|
newsk = tcp_create_openreq_child(sk, req, skb);
|
|
if (!newsk)
|
|
goto exit_nonewsk;
|
|
|
|
newsk->sk_gso_type = SKB_GSO_TCPV4;
|
|
inet_sk_rx_dst_set(newsk, skb);
|
|
|
|
newtp = tcp_sk(newsk);
|
|
newinet = inet_sk(newsk);
|
|
ireq = inet_rsk(req);
|
|
sk_daddr_set(newsk, ireq->ir_rmt_addr);
|
|
sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
|
|
newsk->sk_bound_dev_if = ireq->ir_iif;
|
|
newinet->inet_saddr = ireq->ir_loc_addr;
|
|
inet_opt = rcu_dereference(ireq->ireq_opt);
|
|
RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
|
|
newinet->mc_index = inet_iif(skb);
|
|
newinet->mc_ttl = ip_hdr(skb)->ttl;
|
|
newinet->rcv_tos = ip_hdr(skb)->tos;
|
|
inet_csk(newsk)->icsk_ext_hdr_len = 0;
|
|
if (inet_opt)
|
|
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
|
|
newinet->inet_id = prandom_u32();
|
|
|
|
if (!dst) {
|
|
dst = inet_csk_route_child_sock(sk, newsk, req);
|
|
if (!dst)
|
|
goto put_and_exit;
|
|
} else {
|
|
/* syncookie case : see end of cookie_v4_check() */
|
|
}
|
|
sk_setup_caps(newsk, dst);
|
|
|
|
tcp_ca_openreq_child(newsk, dst);
|
|
|
|
tcp_sync_mss(newsk, dst_mtu(dst));
|
|
newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
|
|
|
|
tcp_initialize_rcv_mss(newsk);
|
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
/* Copy over the MD5 key from the original socket */
|
|
key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
|
|
AF_INET);
|
|
if (key) {
|
|
/*
|
|
* We're using one, so create a matching key
|
|
* on the newsk structure. If we fail to get
|
|
* memory, then we end up not copying the key
|
|
* across. Shucks.
|
|
*/
|
|
tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
|
|
AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
|
|
sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
|
|
}
|
|
#endif
|
|
|
|
if (__inet_inherit_port(sk, newsk) < 0)
|
|
goto put_and_exit;
|
|
*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
|
|
&found_dup_sk);
|
|
if (likely(*own_req)) {
|
|
tcp_move_syn(newtp, req);
|
|
ireq->ireq_opt = NULL;
|
|
} else {
|
|
newinet->inet_opt = NULL;
|
|
|
|
if (!req_unhash && found_dup_sk) {
|
|
/* This code path should only be executed in the
|
|
* syncookie case only
|
|
*/
|
|
bh_unlock_sock(newsk);
|
|
sock_put(newsk);
|
|
newsk = NULL;
|
|
}
|
|
}
|
|
return newsk;
|
|
|
|
exit_overflow:
|
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
|
|
exit_nonewsk:
|
|
dst_release(dst);
|
|
exit:
|
|
tcp_listendrop(sk);
|
|
return NULL;
|
|
put_and_exit:
|
|
newinet->inet_opt = NULL;
|
|
inet_csk_prepare_forced_close(newsk);
|
|
tcp_done(newsk);
|
|
goto exit;
|
|
}
|
|
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
|
|
|
|
/* With CONFIG_SYN_COOKIES, pass every non-SYN segment to
 * cookie_v4_check() and return its result; otherwise return @sk
 * unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
|
|
|
|
/* Generate a syncookie for the SYN described by @iph/@th on @sk.
 *
 * Returns the MSS (after __cookie_v4_init_sequence() has had a chance
 * to adjust it) and stores the cookie in *@cookie, or returns 0 with
 * *@cookie untouched when tcp_get_syncookie_mss() yields 0 or
 * CONFIG_SYN_COOKIES is off.
 */
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
			 struct tcphdr *th, u32 *cookie)
{
#ifdef CONFIG_SYN_COOKIES
	u16 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
					&tcp_request_sock_ipv4_ops, sk, th);

	if (!mss)
		return 0;

	*cookie = __cookie_v4_init_sequence(iph, th, &mss);
	tcp_synq_overflow(sk);
	return mss;
#else
	return 0;
#endif
}
|
|
|
|
/* The socket must have it's spinlock held when we get
|
|
* here, unless it is a TCP_LISTEN socket.
|
|
*
|
|
* We have a potential double-lock case here, so even when
|
|
* doing backlog processing we use the BH locking scheme.
|
|
* This is because we cannot sleep with the original spinlock
|
|
* held.
|
|
*/
|
|
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct sock *rsk;
|
|
|
|
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
|
|
struct dst_entry *dst;
|
|
|
|
dst = rcu_dereference_protected(sk->sk_rx_dst,
|
|
lockdep_sock_is_held(sk));
|
|
|
|
sock_rps_save_rxhash(sk, skb);
|
|
sk_mark_napi_id(sk, skb);
|
|
if (dst) {
|
|
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
|
|
!dst->ops->check(dst, 0)) {
|
|
RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
|
|
dst_release(dst);
|
|
}
|
|
}
|
|
tcp_rcv_established(sk, skb);
|
|
return 0;
|
|
}
|
|
|
|
if (tcp_checksum_complete(skb))
|
|
goto csum_err;
|
|
|
|
if (sk->sk_state == TCP_LISTEN) {
|
|
struct sock *nsk = tcp_v4_cookie_check(sk, skb);
|
|
|
|
if (!nsk)
|
|
goto discard;
|
|
if (nsk != sk) {
|
|
if (tcp_child_process(sk, nsk, skb)) {
|
|
rsk = nsk;
|
|
goto reset;
|
|
}
|
|
return 0;
|
|
}
|
|
} else
|
|
sock_rps_save_rxhash(sk, skb);
|
|
|
|
if (tcp_rcv_state_process(sk, skb)) {
|
|
rsk = sk;
|
|
goto reset;
|
|
}
|
|
return 0;
|
|
|
|
reset:
|
|
tcp_v4_send_reset(rsk, skb);
|
|
discard:
|
|
kfree_skb(skb);
|
|
/* Be careful here. If this function gets more complicated and
|
|
* gcc suffers from register pressure on the x86, sk (in %ebx)
|
|
* might be destroyed here. This current version compiles correctly,
|
|
* but you have been warned.
|
|
*/
|
|
return 0;
|
|
|
|
csum_err:
|
|
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
|
|
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
|
|
goto discard;
|
|
}
|
|
EXPORT_SYMBOL(tcp_v4_do_rcv);
|
|
|
|
int tcp_v4_early_demux(struct sk_buff *skb)
|
|
{
|
|
const struct iphdr *iph;
|
|
const struct tcphdr *th;
|
|
struct sock *sk;
|
|
|
|
if (skb->pkt_type != PACKET_HOST)
|
|
return 0;
|
|
|
|
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
|
|
return 0;
|
|
|
|
iph = ip_hdr(skb);
|
|
th = tcp_hdr(skb);
|
|
|
|
if (th->doff < sizeof(struct tcphdr) / 4)
|
|
return 0;
|
|
|
|
sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
|
|
iph->saddr, th->source,
|
|
iph->daddr, ntohs(th->dest),
|
|
skb->skb_iif, inet_sdif(skb));
|
|
if (sk) {
|
|
skb->sk = sk;
|
|
skb->destructor = sock_edemux;
|
|
if (sk_fullsock(sk)) {
|
|
struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
|
|
|
|
if (dst)
|
|
dst = dst_check(dst, 0);
|
|
if (dst &&
|
|
inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
|
|
skb_dst_set_noref(skb, dst);
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
|
|
u32 tail_gso_size, tail_gso_segs;
|
|
struct skb_shared_info *shinfo;
|
|
const struct tcphdr *th;
|
|
struct tcphdr *thtail;
|
|
struct sk_buff *tail;
|
|
unsigned int hdrlen;
|
|
bool fragstolen;
|
|
u32 gso_segs;
|
|
u32 gso_size;
|
|
int delta;
|
|
|
|
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
|
|
* we can fix skb->truesize to its real value to avoid future drops.
|
|
* This is valid because skb is not yet charged to the socket.
|
|
* It has been noticed pure SACK packets were sometimes dropped
|
|
* (if cooked by drivers without copybreak feature).
|
|
*/
|
|
skb_condense(skb);
|
|
|
|
skb_dst_drop(skb);
|
|
|
|
if (unlikely(tcp_checksum_complete(skb))) {
|
|
bh_unlock_sock(sk);
|
|
__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
|
|
__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
|
|
return true;
|
|
}
|
|
|
|
/* Attempt coalescing to last skb in backlog, even if we are
|
|
* above the limits.
|
|
* This is okay because skb capacity is limited to MAX_SKB_FRAGS.
|
|
*/
|
|
th = (const struct tcphdr *)skb->data;
|
|
hdrlen = th->doff * 4;
|
|
|
|
tail = sk->sk_backlog.tail;
|
|
if (!tail)
|
|
goto no_coalesce;
|
|
thtail = (struct tcphdr *)tail->data;
|
|
|
|
if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
|
|
TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
|
|
((TCP_SKB_CB(tail)->tcp_flags |
|
|
TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
|
|
!((TCP_SKB_CB(tail)->tcp_flags &
|
|
TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
|
|
((TCP_SKB_CB(tail)->tcp_flags ^
|
|
TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
|
|
#ifdef CONFIG_TLS_DEVICE
|
|
tail->decrypted != skb->decrypted ||
|
|
#endif
|
|
thtail->doff != th->doff ||
|
|
memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
|
|
goto no_coalesce;
|
|
|
|
__skb_pull(skb, hdrlen);
|
|
|
|
shinfo = skb_shinfo(skb);
|
|
gso_size = shinfo->gso_size ?: skb->len;
|
|
gso_segs = shinfo->gso_segs ?: 1;
|
|
|
|
shinfo = skb_shinfo(tail);
|
|
tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
|
|
tail_gso_segs = shinfo->gso_segs ?: 1;
|
|
|
|
if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
|
|
TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
|
|
|
|
if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
|
|
TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
|
|
thtail->window = th->window;
|
|
}
|
|
|
|
/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
|
|
* thtail->fin, so that the fast path in tcp_rcv_established()
|
|
* is not entered if we append a packet with a FIN.
|
|
* SYN, RST, URG are not present.
|
|
* ACK is set on both packets.
|
|
* PSH : we do not really care in TCP stack,
|
|
* at least for 'GRO' packets.
|
|
*/
|
|
thtail->fin |= th->fin;
|
|
TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
|
|
|
|
if (TCP_SKB_CB(skb)->has_rxtstamp) {
|
|
TCP_SKB_CB(tail)->has_rxtstamp = true;
|
|
tail->tstamp = skb->tstamp;
|
|
skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
|
|
}
|
|
|
|
/* Not as strict as GRO. We only need to carry mss max value */
|
|
shinfo->gso_size = max(gso_size, tail_gso_size);
|
|
shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
|
|
|
|
sk->sk_backlog.len += delta;
|
|
__NET_INC_STATS(sock_net(sk),
|
|
LINUX_MIB_TCPBACKLOGCOALESCE);
|
|
kfree_skb_partial(skb, fragstolen);
|
|
return false;
|
|
}
|
|
__skb_push(skb, hdrlen);
|
|
|
|
no_coalesce:
|
|
/* Only socket owner can try to collapse/prune rx queues
|
|
* to reduce memory overhead, so add a little headroom here.
|
|
* Few sockets backlog are possibly concurrently non empty.
|
|
*/
|
|
limit += 64*1024;
|
|
|
|
if (unlikely(sk_add_backlog(sk, skb, limit))) {
|
|
bh_unlock_sock(sk);
|
|
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
EXPORT_SYMBOL(tcp_add_backlog);
|
|
|
|
int tcp_filter(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct tcphdr *th = (struct tcphdr *)skb->data;
|
|
|
|
return sk_filter_trim_cap(sk, skb, th->doff * 4);
|
|
}
|
|
EXPORT_SYMBOL(tcp_filter);
|
|
|
|
static void tcp_v4_restore_cb(struct sk_buff *skb)
|
|
{
|
|
memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
|
|
sizeof(struct inet_skb_parm));
|
|
}
|
|
|
|
static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
|
|
const struct tcphdr *th)
|
|
{
|
|
/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
|
|
* barrier() makes sure compiler wont play fool^Waliasing games.
|
|
*/
|
|
memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
|
|
sizeof(struct inet_skb_parm));
|
|
barrier();
|
|
|
|
TCP_SKB_CB(skb)->seq = ntohl(th->seq);
|
|
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
|
|
skb->len - th->doff * 4);
|
|
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
|
|
TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
|
|
TCP_SKB_CB(skb)->tcp_tw_isn = 0;
|
|
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
|
|
TCP_SKB_CB(skb)->sacked = 0;
|
|
TCP_SKB_CB(skb)->has_rxtstamp =
|
|
skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
|
|
}
|
|
|
|
/*
|
|
* From tcp_input.c
|
|
*/
|
|
|
|
int tcp_v4_rcv(struct sk_buff *skb)
|
|
{
|
|
struct net *net = dev_net(skb->dev);
|
|
struct sk_buff *skb_to_free;
|
|
int sdif = inet_sdif(skb);
|
|
const struct iphdr *iph;
|
|
const struct tcphdr *th;
|
|
bool refcounted;
|
|
struct sock *sk;
|
|
int ret;
|
|
|
|
if (skb->pkt_type != PACKET_HOST)
|
|
goto discard_it;
|
|
|
|
/* Count it even if it's bad */
|
|
__TCP_INC_STATS(net, TCP_MIB_INSEGS);
|
|
|
|
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
|
|
goto discard_it;
|
|
|
|
th = (const struct tcphdr *)skb->data;
|
|
|
|
if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
|
|
goto bad_packet;
|
|
if (!pskb_may_pull(skb, th->doff * 4))
|
|
goto discard_it;
|
|
|
|
/* An explanation is required here, I think.
|
|
* Packet length and doff are validated by header prediction,
|
|
* provided case of th->doff==0 is eliminated.
|
|
* So, we defer the checks. */
|
|
|
|
if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
|
|
goto csum_error;
|
|
|
|
th = (const struct tcphdr *)skb->data;
|
|
iph = ip_hdr(skb);
|
|
lookup:
|
|
sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
|
|
th->dest, sdif, &refcounted);
|
|
if (!sk)
|
|
goto no_tcp_socket;
|
|
|
|
process:
|
|
if (sk->sk_state == TCP_TIME_WAIT)
|
|
goto do_time_wait;
|
|
|
|
if (sk->sk_state == TCP_NEW_SYN_RECV) {
|
|
struct request_sock *req = inet_reqsk(sk);
|
|
bool req_stolen = false;
|
|
struct sock *nsk;
|
|
|
|
sk = req->rsk_listener;
|
|
if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
|
|
sk_drops_add(sk, skb);
|
|
reqsk_put(req);
|
|
goto discard_it;
|
|
}
|
|
if (tcp_checksum_complete(skb)) {
|
|
reqsk_put(req);
|
|
goto csum_error;
|
|
}
|
|
if (unlikely(sk->sk_state != TCP_LISTEN)) {
|
|
inet_csk_reqsk_queue_drop_and_put(sk, req);
|
|
goto lookup;
|
|
}
|
|
/* We own a reference on the listener, increase it again
|
|
* as we might lose it too soon.
|
|
*/
|
|
sock_hold(sk);
|
|
refcounted = true;
|
|
nsk = NULL;
|
|
if (!tcp_filter(sk, skb)) {
|
|
th = (const struct tcphdr *)skb->data;
|
|
iph = ip_hdr(skb);
|
|
tcp_v4_fill_cb(skb, iph, th);
|
|
nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
|
|
}
|
|
if (!nsk) {
|
|
reqsk_put(req);
|
|
if (req_stolen) {
|
|
/* Another cpu got exclusive access to req
|
|
* and created a full blown socket.
|
|
* Try to feed this packet to this socket
|
|
* instead of discarding it.
|
|
*/
|
|
tcp_v4_restore_cb(skb);
|
|
sock_put(sk);
|
|
goto lookup;
|
|
}
|
|
goto discard_and_relse;
|
|
}
|
|
if (nsk == sk) {
|
|
reqsk_put(req);
|
|
tcp_v4_restore_cb(skb);
|
|
} else if (tcp_child_process(sk, nsk, skb)) {
|
|
tcp_v4_send_reset(nsk, skb);
|
|
goto discard_and_relse;
|
|
} else {
|
|
sock_put(sk);
|
|
return 0;
|
|
}
|
|
}
|
|
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
|
|
__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
|
|
goto discard_and_relse;
|
|
}
|
|
|
|
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
|
|
goto discard_and_relse;
|
|
|
|
if (tcp_v4_inbound_md5_hash(sk, skb))
|
|
goto discard_and_relse;
|
|
|
|
nf_reset_ct(skb);
|
|
|
|
if (tcp_filter(sk, skb))
|
|
goto discard_and_relse;
|
|
th = (const struct tcphdr *)skb->data;
|
|
iph = ip_hdr(skb);
|
|
tcp_v4_fill_cb(skb, iph, th);
|
|
|
|
skb->dev = NULL;
|
|
|
|
if (sk->sk_state == TCP_LISTEN) {
|
|
ret = tcp_v4_do_rcv(sk, skb);
|
|
goto put_and_return;
|
|
}
|
|
|
|
sk_incoming_cpu_update(sk);
|
|
|
|
bh_lock_sock_nested(sk);
|
|
tcp_segs_in(tcp_sk(sk), skb);
|
|
ret = 0;
|
|
if (!sock_owned_by_user(sk)) {
|
|
skb_to_free = sk->sk_rx_skb_cache;
|
|
sk->sk_rx_skb_cache = NULL;
|
|
ret = tcp_v4_do_rcv(sk, skb);
|
|
} else {
|
|
if (tcp_add_backlog(sk, skb))
|
|
goto discard_and_relse;
|
|
skb_to_free = NULL;
|
|
}
|
|
bh_unlock_sock(sk);
|
|
if (skb_to_free)
|
|
__kfree_skb(skb_to_free);
|
|
|
|
put_and_return:
|
|
if (refcounted)
|
|
sock_put(sk);
|
|
|
|
return ret;
|
|
|
|
no_tcp_socket:
|
|
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
|
|
goto discard_it;
|
|
|
|
tcp_v4_fill_cb(skb, iph, th);
|
|
|
|
if (tcp_checksum_complete(skb)) {
|
|
csum_error:
|
|
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
|
|
bad_packet:
|
|
__TCP_INC_STATS(net, TCP_MIB_INERRS);
|
|
} else {
|
|
tcp_v4_send_reset(NULL, skb);
|
|
}
|
|
|
|
discard_it:
|
|
/* Discard frame. */
|
|
kfree_skb(skb);
|
|
return 0;
|
|
|
|
discard_and_relse:
|
|
sk_drops_add(sk, skb);
|
|
if (refcounted)
|
|
sock_put(sk);
|
|
goto discard_it;
|
|
|
|
do_time_wait:
|
|
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
|
|
inet_twsk_put(inet_twsk(sk));
|
|
goto discard_it;
|
|
}
|
|
|
|
tcp_v4_fill_cb(skb, iph, th);
|
|
|
|
if (tcp_checksum_complete(skb)) {
|
|
inet_twsk_put(inet_twsk(sk));
|
|
goto csum_error;
|
|
}
|
|
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
|
|
case TCP_TW_SYN: {
|
|
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
|
|
&tcp_hashinfo, skb,
|
|
__tcp_hdrlen(th),
|
|
iph->saddr, th->source,
|
|
iph->daddr, th->dest,
|
|
inet_iif(skb),
|
|
sdif);
|
|
if (sk2) {
|
|
inet_twsk_deschedule_put(inet_twsk(sk));
|
|
sk = sk2;
|
|
tcp_v4_restore_cb(skb);
|
|
refcounted = false;
|
|
goto process;
|
|
}
|
|
}
|
|
/* to ACK */
|
|
/* fall through */
|
|
case TCP_TW_ACK:
|
|
tcp_v4_timewait_ack(sk, skb);
|
|
break;
|
|
case TCP_TW_RST:
|
|
tcp_v4_send_reset(sk, skb);
|
|
inet_twsk_deschedule_put(inet_twsk(sk));
|
|
goto discard_it;
|
|
case TCP_TW_SUCCESS:;
|
|
}
|
|
goto discard_it;
|
|
}
|
|
|
|
static struct timewait_sock_ops tcp_timewait_sock_ops = {
|
|
.twsk_obj_size = sizeof(struct tcp_timewait_sock),
|
|
.twsk_unique = tcp_twsk_unique,
|
|
.twsk_destructor= tcp_twsk_destructor,
|
|
};
|
|
|
|
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
|
|
{
|
|
struct dst_entry *dst = skb_dst(skb);
|
|
|
|
if (dst && dst_hold_safe(dst)) {
|
|
rcu_assign_pointer(sk->sk_rx_dst, dst);
|
|
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
|
|
}
|
|
}
|
|
EXPORT_SYMBOL(inet_sk_rx_dst_set);
|
|
|
|
const struct inet_connection_sock_af_ops ipv4_specific = {
|
|
.queue_xmit = ip_queue_xmit,
|
|
.send_check = tcp_v4_send_check,
|
|
.rebuild_header = inet_sk_rebuild_header,
|
|
.sk_rx_dst_set = inet_sk_rx_dst_set,
|
|
.conn_request = tcp_v4_conn_request,
|
|
.syn_recv_sock = tcp_v4_syn_recv_sock,
|
|
.net_header_len = sizeof(struct iphdr),
|
|
.setsockopt = ip_setsockopt,
|
|
.getsockopt = ip_getsockopt,
|
|
.addr2sockaddr = inet_csk_addr2sockaddr,
|
|
.sockaddr_len = sizeof(struct sockaddr_in),
|
|
#ifdef CONFIG_COMPAT
|
|
.compat_setsockopt = compat_ip_setsockopt,
|
|
.compat_getsockopt = compat_ip_getsockopt,
|
|
#endif
|
|
.mtu_reduced = tcp_v4_mtu_reduced,
|
|
};
|
|
EXPORT_SYMBOL(ipv4_specific);
|
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
|
|
.md5_lookup = tcp_v4_md5_lookup,
|
|
.calc_md5_hash = tcp_v4_md5_hash_skb,
|
|
.md5_parse = tcp_v4_parse_md5_keys,
|
|
};
|
|
#endif
|
|
|
|
/* NOTE: A lot of things set to zero explicitly by call to
|
|
* sk_alloc() so need not be done here.
|
|
*/
|
|
static int tcp_v4_init_sock(struct sock *sk)
|
|
{
|
|
struct inet_connection_sock *icsk = inet_csk(sk);
|
|
|
|
tcp_init_sock(sk);
|
|
|
|
icsk->icsk_af_ops = &ipv4_specific;
|
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
|
|
#endif
|
|
|
|
return 0;
|
|
}
|
|
|
|
void tcp_v4_destroy_sock(struct sock *sk)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
trace_tcp_destroy_sock(sk);
|
|
|
|
tcp_clear_xmit_timers(sk);
|
|
|
|
tcp_cleanup_congestion_control(sk);
|
|
|
|
tcp_cleanup_ulp(sk);
|
|
|
|
/* Cleanup up the write buffer. */
|
|
tcp_write_queue_purge(sk);
|
|
|
|
/* Check if we want to disable active TFO */
|
|
tcp_fastopen_active_disable_ofo_check(sk);
|
|
|
|
/* Cleans up our, hopefully empty, out_of_order_queue. */
|
|
skb_rbtree_purge(&tp->out_of_order_queue);
|
|
|
|
#ifdef CONFIG_TCP_MD5SIG
|
|
/* Clean up the MD5 key list, if any */
|
|
if (tp->md5sig_info) {
|
|
tcp_clear_md5_list(sk);
|
|
kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
|
|
tp->md5sig_info = NULL;
|
|
}
|
|
#endif
|
|
|
|
/* Clean up a referenced TCP bind bucket. */
|
|
if (inet_csk(sk)->icsk_bind_hash)
|
|
inet_put_port(sk);
|
|
|
|
BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
|
|
|
|
/* If socket is aborted during connect operation */
|
|
tcp_free_fastopen_req(tp);
|
|
tcp_fastopen_destroy_cipher(sk);
|
|
tcp_saved_syn_free(tp);
|
|
|
|
sk_sockets_allocated_dec(sk);
|
|
}
|
|
EXPORT_SYMBOL(tcp_v4_destroy_sock);
|
|
|
|
#ifdef CONFIG_PROC_FS
|
|
/* Proc filesystem TCP sock list dumping. */
|
|
|
|
/*
|
|
* Get next listener socket follow cur. If cur is NULL, get first socket
|
|
* starting from bucket given in st->bucket; when st->bucket is zero the
|
|
* very first socket in the hash table is returned.
|
|
*/
|
|
static void *listening_get_next(struct seq_file *seq, void *cur)
|
|
{
|
|
struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
|
|
struct tcp_iter_state *st = seq->private;
|
|
struct net *net = seq_file_net(seq);
|
|
struct inet_listen_hashbucket *ilb;
|
|
struct hlist_nulls_node *node;
|
|
struct sock *sk = cur;
|
|
|
|
if (!sk) {
|
|
get_head:
|
|
ilb = &tcp_hashinfo.listening_hash[st->bucket];
|
|
spin_lock(&ilb->lock);
|
|
sk = sk_nulls_head(&ilb->nulls_head);
|
|
st->offset = 0;
|
|
goto get_sk;
|
|
}
|
|
ilb = &tcp_hashinfo.listening_hash[st->bucket];
|
|
++st->num;
|
|
++st->offset;
|
|
|
|
sk = sk_nulls_next(sk);
|
|
get_sk:
|
|
sk_nulls_for_each_from(sk, node) {
|
|
if (!net_eq(sock_net(sk), net))
|
|
continue;
|
|
if (sk->sk_family == afinfo->family)
|
|
return sk;
|
|
}
|
|
spin_unlock(&ilb->lock);
|
|
st->offset = 0;
|
|
if (++st->bucket < INET_LHTABLE_SIZE)
|
|
goto get_head;
|
|
return NULL;
|
|
}
|
|
|
|
/* Position the iterator on the listening socket with index *@pos,
 * counting from the first bucket.  *@pos is decremented once per socket
 * stepped over, so on a NULL return it holds the number of entries
 * still to be skipped.
 */
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;

	for (rc = listening_get_next(seq, NULL); rc && *pos; --*pos)
		rc = listening_get_next(seq, rc);

	return rc;
}
|
|
|
|
static inline bool empty_bucket(const struct tcp_iter_state *st)
|
|
{
|
|
return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
|
|
}
|
|
|
|
/*
|
|
* Get first established socket starting from bucket given in st->bucket.
|
|
* If st->bucket is zero, the very first socket in the hash is returned.
|
|
*/
|
|
static void *established_get_first(struct seq_file *seq)
|
|
{
|
|
struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
|
|
struct tcp_iter_state *st = seq->private;
|
|
struct net *net = seq_file_net(seq);
|
|
void *rc = NULL;
|
|
|
|
st->offset = 0;
|
|
for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
|
|
struct sock *sk;
|
|
struct hlist_nulls_node *node;
|
|
spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
|
|
|
|
/* Lockless fast path for the common case of empty buckets */
|
|
if (empty_bucket(st))
|
|
continue;
|
|
|
|
spin_lock_bh(lock);
|
|
sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
|
|
if (sk->sk_family != afinfo->family ||
|
|
!net_eq(sock_net(sk), net)) {
|
|
continue;
|
|
}
|
|
rc = sk;
|
|
goto out;
|
|
}
|
|
spin_unlock_bh(lock);
|
|
}
|
|
out:
|
|
return rc;
|
|
}
|
|
|
|
static void *established_get_next(struct seq_file *seq, void *cur)
|
|
{
|
|
struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
|
|
struct sock *sk = cur;
|
|
struct hlist_nulls_node *node;
|
|
struct tcp_iter_state *st = seq->private;
|
|
struct net *net = seq_file_net(seq);
|
|
|
|
++st->num;
|
|
++st->offset;
|
|
|
|
sk = sk_nulls_next(sk);
|
|
|
|
sk_nulls_for_each_from(sk, node) {
|
|
if (sk->sk_family == afinfo->family &&
|
|
net_eq(sock_net(sk), net))
|
|
return sk;
|
|
}
|
|
|
|
spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
|
|
++st->bucket;
|
|
return established_get_first(seq);
|
|
}
|
|
|
|
/* Return the established-hash socket with index @pos, counting from
 * bucket 0, or NULL when fewer sockets exist.
 */
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;

	for (rc = established_get_first(seq); rc && pos; --pos)
		rc = established_get_next(seq, rc);

	return rc;
}
|
|
|
|
/* Return the socket with index @pos in the combined listing: listening
 * sockets come first, followed by the established hash.  st->state is
 * set to the table the returned socket was found in.
 */
static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc = listening_get_idx(seq, &pos);
	if (rc)
		return rc;

	/* @pos now holds what is left after skipping all listeners. */
	st->state = TCP_SEQ_STATE_ESTABLISHED;
	return established_get_idx(seq, pos);
}
|
|
|
|
static void *tcp_seek_last_pos(struct seq_file *seq)
|
|
{
|
|
struct tcp_iter_state *st = seq->private;
|
|
int bucket = st->bucket;
|
|
int offset = st->offset;
|
|
int orig_num = st->num;
|
|
void *rc = NULL;
|
|
|
|
switch (st->state) {
|
|
case TCP_SEQ_STATE_LISTENING:
|
|
if (st->bucket >= INET_LHTABLE_SIZE)
|
|
break;
|
|
st->state = TCP_SEQ_STATE_LISTENING;
|
|
rc = listening_get_next(seq, NULL);
|
|
while (offset-- && rc && bucket == st->bucket)
|
|
rc = listening_get_next(seq, rc);
|
|
if (rc)
|
|
break;
|
|
st->bucket = 0;
|
|
st->state = TCP_SEQ_STATE_ESTABLISHED;
|
|
/* Fallthrough */
|
|
case TCP_SEQ_STATE_ESTABLISHED:
|
|
if (st->bucket > tcp_hashinfo.ehash_mask)
|
|
break;
|
|
rc = established_get_first(seq);
|
|
while (offset-- && rc && bucket == st->bucket)
|
|
rc = established_get_next(seq, rc);
|
|
}
|
|
|
|
st->num = orig_num;
|
|
|
|
return rc;
|
|
}
|
|
|
|
/* seq_file ->start callback for the TCP socket listing.
 *
 * When *@pos is non-zero and equals the position remembered from the
 * previous call, the walk is resumed via tcp_seek_last_pos().
 * Otherwise, or when resuming fails, the iterator state is reset and
 * the entry is looked up by index; position 0 yields SEQ_START_TOKEN.
 * The position is remembered in st->last_pos before returning.
 */
void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (*pos && *pos == st->last_pos)
		rc = tcp_seek_last_pos(seq);

	if (!rc) {
		st->state = TCP_SEQ_STATE_LISTENING;
		st->num = 0;
		st->bucket = 0;
		st->offset = 0;
		if (*pos)
			rc = tcp_get_idx(seq, *pos - 1);
		else
			rc = SEQ_START_TOKEN;
	}

	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_start);
|
|
|
|
/* seq_file ->next callback for the TCP socket listing.
 *
 * Advances from @v to the following entry.  After the last listening
 * socket the walk switches to the established hash, starting at its
 * first bucket.  *@pos is incremented and remembered in st->last_pos.
 */
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
	} else if (st->state == TCP_SEQ_STATE_LISTENING) {
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc = established_get_first(seq);
		}
	} else if (st->state == TCP_SEQ_STATE_ESTABLISHED) {
		rc = established_get_next(seq, v);
	}

	++*pos;
	st->last_pos = *pos;
	return rc;
}
EXPORT_SYMBOL(tcp_seq_next);
|
|
|
|
void tcp_seq_stop(struct seq_file *seq, void *v)
|
|
{
|
|
struct tcp_iter_state *st = seq->private;
|
|
|
|
switch (st->state) {
|
|
case TCP_SEQ_STATE_LISTENING:
|
|
if (v != SEQ_START_TOKEN)
|
|
spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
|
|
break;
|
|
case TCP_SEQ_STATE_ESTABLISHED:
|
|
if (v)
|
|
spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
|
|
break;
|
|
}
|
|
}
|
|
EXPORT_SYMBOL(tcp_seq_stop);
|
|
|
|
static void get_openreq4(const struct request_sock *req,
|
|
struct seq_file *f, int i)
|
|
{
|
|
const struct inet_request_sock *ireq = inet_rsk(req);
|
|
long delta = req->rsk_timer.expires - jiffies;
|
|
|
|
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
|
|
" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
|
|
i,
|
|
ireq->ir_loc_addr,
|
|
ireq->ir_num,
|
|
ireq->ir_rmt_addr,
|
|
ntohs(ireq->ir_rmt_port),
|
|
TCP_SYN_RECV,
|
|
0, 0, /* could print option size, but that is af dependent. */
|
|
1, /* timers active (only the expire timer) */
|
|
jiffies_delta_to_clock_t(delta),
|
|
req->num_timeout,
|
|
from_kuid_munged(seq_user_ns(f),
|
|
sock_i_uid(req->rsk_listener)),
|
|
0, /* non standard timer */
|
|
0, /* open_requests have no inode */
|
|
0,
|
|
req);
|
|
}
|
|
|
|
static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
|
|
{
|
|
int timer_active;
|
|
unsigned long timer_expires;
|
|
const struct tcp_sock *tp = tcp_sk(sk);
|
|
const struct inet_connection_sock *icsk = inet_csk(sk);
|
|
const struct inet_sock *inet = inet_sk(sk);
|
|
const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
|
|
__be32 dest = inet->inet_daddr;
|
|
__be32 src = inet->inet_rcv_saddr;
|
|
__u16 destp = ntohs(inet->inet_dport);
|
|
__u16 srcp = ntohs(inet->inet_sport);
|
|
__u8 seq_state = sk->sk_state;
|
|
int rx_queue;
|
|
int state;
|
|
|
|
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
|
|
icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
|
|
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
|
|
timer_active = 1;
|
|
timer_expires = icsk->icsk_timeout;
|
|
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
|
|
timer_active = 4;
|
|
timer_expires = icsk->icsk_timeout;
|
|
} else if (timer_pending(&sk->sk_timer)) {
|
|
timer_active = 2;
|
|
timer_expires = sk->sk_timer.expires;
|
|
} else {
|
|
timer_active = 0;
|
|
timer_expires = jiffies;
|
|
}
|
|
|
|
state = inet_sk_state_load(sk);
|
|
if (inet->transparent)
|
|
seq_state |= 0x80;
|
|
|
|
if (state == TCP_LISTEN)
|
|
rx_queue = sk->sk_ack_backlog;
|
|
else
|
|
/* Because we don't lock the socket,
|
|
* we might find a transient negative value.
|
|
*/
|
|
rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
|
|
READ_ONCE(tp->copied_seq), 0);
|
|
|
|
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
|
|
"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
|
|
i, src, srcp, dest, destp, seq_state,
|
|
READ_ONCE(tp->write_seq) - tp->snd_una,
|
|
rx_queue,
|
|
timer_active,
|
|
jiffies_delta_to_clock_t(timer_expires - jiffies),
|
|
icsk->icsk_retransmits,
|
|
from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
|
|
icsk->icsk_probes_out,
|
|
sock_i_ino(sk),
|
|
refcount_read(&sk->sk_refcnt), sk,
|
|
jiffies_to_clock_t(icsk->icsk_rto),
|
|
jiffies_to_clock_t(icsk->icsk_ack.ato),
|
|
(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
|
|
tp->snd_cwnd,
|
|
state == TCP_LISTEN ?
|
|
fastopenq->max_qlen :
|
|
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
|
|
}
|
|
|
|
static void get_timewait4_sock(const struct inet_timewait_sock *tw,
|
|
struct seq_file *f, int i)
|
|
{
|
|
long delta = tw->tw_timer.expires - jiffies;
|
|
__be32 dest, src;
|
|
__u16 destp, srcp;
|
|
|
|
dest = tw->tw_daddr;
|
|
src = tw->tw_rcv_saddr;
|
|
destp = ntohs(tw->tw_dport);
|
|
srcp = ntohs(tw->tw_sport);
|
|
|
|
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
|
|
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
|
|
i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
|
|
3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
|
|
refcount_read(&tw->tw_refcnt), tw);
|
|
}
|
|
|
|
#define TMPSZ 150
|
|
|
|
static int tcp4_seq_show(struct seq_file *seq, void *v)
|
|
{
|
|
struct tcp_iter_state *st;
|
|
struct sock *sk = v;
|
|
|
|
seq_setwidth(seq, TMPSZ - 1);
|
|
if (v == SEQ_START_TOKEN) {
|
|
seq_puts(seq, " sl local_address rem_address st tx_queue "
|
|
"rx_queue tr tm->when retrnsmt uid timeout "
|
|
"inode");
|
|
goto out;
|
|
}
|
|
st = seq->private;
|
|
|
|
if (sk->sk_state == TCP_TIME_WAIT)
|
|
get_timewait4_sock(v, seq, st->num);
|
|
else if (sk->sk_state == TCP_NEW_SYN_RECV)
|
|
get_openreq4(v, seq, st->num);
|
|
else
|
|
get_tcp4_sock(v, seq, st->num);
|
|
out:
|
|
seq_pad(seq, '\n');
|
|
return 0;
|
|
}
|
|
|
|
static const struct seq_operations tcp4_seq_ops = {
|
|
.show = tcp4_seq_show,
|
|
.start = tcp_seq_start,
|
|
.next = tcp_seq_next,
|
|
.stop = tcp_seq_stop,
|
|
};
|
|
|
|
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
|
|
.family = AF_INET,
|
|
};
|
|
|
|
/* Create the "tcp" entry (mode 0444) under @net's proc_net directory.
 * Returns 0 on success and -ENOMEM when the entry cannot be created.
 */
static int __net_init tcp4_proc_init_net(struct net *net)
{
	if (proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
				 sizeof(struct tcp_iter_state),
				 &tcp4_seq_afinfo))
		return 0;

	return -ENOMEM;
}
|
|
|
|
/* Remove the "tcp" entry from @net's proc_net directory. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	struct proc_dir_entry *parent = net->proc_net;

	remove_proc_entry("tcp", parent);
}
|
|
|
|
static struct pernet_operations tcp4_net_ops = {
|
|
.init = tcp4_proc_init_net,
|
|
.exit = tcp4_proc_exit_net,
|
|
};
|
|
|
|
/* Register the per-namespace proc hooks; returns the result of
 * register_pernet_subsys().
 */
int __init tcp4_proc_init(void)
{
	int err = register_pernet_subsys(&tcp4_net_ops);

	return err;
}
|
|
|
|
void tcp4_proc_exit(void)
|
|
{
|
|
unregister_pernet_subsys(&tcp4_net_ops);
|
|
}
|
|
#endif /* CONFIG_PROC_FS */
|
|
|
|
struct proto tcp_prot = {
|
|
.name = "TCP",
|
|
.owner = THIS_MODULE,
|
|
.close = tcp_close,
|
|
.pre_connect = tcp_v4_pre_connect,
|
|
.connect = tcp_v4_connect,
|
|
.disconnect = tcp_disconnect,
|
|
.accept = inet_csk_accept,
|
|
.ioctl = tcp_ioctl,
|
|
.init = tcp_v4_init_sock,
|
|
.destroy = tcp_v4_destroy_sock,
|
|
.shutdown = tcp_shutdown,
|
|
.setsockopt = tcp_setsockopt,
|
|
.getsockopt = tcp_getsockopt,
|
|
.keepalive = tcp_set_keepalive,
|
|
.recvmsg = tcp_recvmsg,
|
|
.sendmsg = tcp_sendmsg,
|
|
.sendpage = tcp_sendpage,
|
|
.backlog_rcv = tcp_v4_do_rcv,
|
|
.release_cb = tcp_release_cb,
|
|
.hash = inet_hash,
|
|
.unhash = inet_unhash,
|
|
.get_port = inet_csk_get_port,
|
|
.enter_memory_pressure = tcp_enter_memory_pressure,
|
|
.leave_memory_pressure = tcp_leave_memory_pressure,
|
|
.stream_memory_free = tcp_stream_memory_free,
|
|
.sockets_allocated = &tcp_sockets_allocated,
|
|
.orphan_count = &tcp_orphan_count,
|
|
.memory_allocated = &tcp_memory_allocated,
|
|
.memory_pressure = &tcp_memory_pressure,
|
|
.sysctl_mem = sysctl_tcp_mem,
|
|
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
|
|
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
|
|
.max_header = MAX_TCP_HEADER,
|
|
.obj_size = sizeof(struct tcp_sock),
|
|
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
|
.twsk_prot = &tcp_timewait_sock_ops,
|
|
.rsk_prot = &tcp_request_sock_ops,
|
|
.h.hashinfo = &tcp_hashinfo,
|
|
.no_autobind = true,
|
|
#ifdef CONFIG_COMPAT
|
|
.compat_setsockopt = compat_tcp_setsockopt,
|
|
.compat_getsockopt = compat_tcp_getsockopt,
|
|
#endif
|
|
.diag_destroy = tcp_abort,
|
|
};
|
|
EXPORT_SYMBOL(tcp_prot);
|
|
|
|
/* Per-namespace teardown: drop the module reference held on the
 * namespace's congestion control (if one is set), destroy the control
 * socket of every possible CPU and free the per-CPU pointer array.
 * Also used by tcp_sk_init() to unwind a partially set up namespace.
 */
static void __net_exit tcp_sk_exit(struct net *net)
{
	int cpu;

	if (net->ipv4.tcp_congestion_control)
		module_put(net->ipv4.tcp_congestion_control->owner);

	for_each_possible_cpu(cpu) {
		struct sock *ctl_sk = *per_cpu_ptr(net->ipv4.tcp_sk, cpu);

		inet_ctl_sock_destroy(ctl_sk);
	}
	free_percpu(net->ipv4.tcp_sk);
}
|
|
|
|
/* Per-namespace setup of TCP over IPv4.
 *
 * Creates one raw IPPROTO_TCP control socket per possible CPU, fills in
 * the default values of the namespace's TCP sysctls and selects the
 * congestion control.  Returns 0 on success, -ENOMEM when the per-CPU
 * array cannot be allocated, or the error from inet_ctl_sock_create()
 * after undoing the partial setup through tcp_sk_exit().
 */
static int __net_init tcp_sk_init(struct net *net)
{
	struct netns_ipv4 *ipv4 = &net->ipv4;
	int res, cpu, cnt;

	ipv4->tcp_sk = alloc_percpu(struct sock *);
	if (!ipv4->tcp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_TCP, net);
		if (res)
			goto fail;
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

		/* Enforce IP_DF and IPID==0 for the RST and ACK sent in
		 * SYN-RECV and TIME-WAIT state.
		 */
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;

		*per_cpu_ptr(ipv4->tcp_sk, cpu) = sk;
	}

	ipv4->sysctl_tcp_ecn = 2;
	ipv4->sysctl_tcp_ecn_fallback = 1;

	ipv4->sysctl_tcp_base_mss = TCP_BASE_MSS;
	ipv4->sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
	ipv4->sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
	ipv4->sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
	ipv4->sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;

	ipv4->sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
	ipv4->sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
	ipv4->sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

	ipv4->sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
	ipv4->sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
	ipv4->sysctl_tcp_syncookies = 1;
	ipv4->sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
	ipv4->sysctl_tcp_retries1 = TCP_RETR1;
	ipv4->sysctl_tcp_retries2 = TCP_RETR2;
	ipv4->sysctl_tcp_orphan_retries = 0;
	ipv4->sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
	ipv4->sysctl_tcp_notsent_lowat = UINT_MAX;
	ipv4->sysctl_tcp_tw_reuse = 2;

	/* Limits derived from the size of the established hash table. */
	cnt = tcp_hashinfo.ehash_mask + 1;
	ipv4->tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	ipv4->tcp_death_row.hashinfo = &tcp_hashinfo;

	ipv4->sysctl_max_syn_backlog = max(128, cnt / 128);
	ipv4->sysctl_tcp_sack = 1;
	ipv4->sysctl_tcp_window_scaling = 1;
	ipv4->sysctl_tcp_timestamps = 1;
	ipv4->sysctl_tcp_early_retrans = 3;
	ipv4->sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
	ipv4->sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
	ipv4->sysctl_tcp_retrans_collapse = 1;
	ipv4->sysctl_tcp_max_reordering = 300;
	ipv4->sysctl_tcp_dsack = 1;
	ipv4->sysctl_tcp_app_win = 31;
	ipv4->sysctl_tcp_adv_win_scale = 1;
	ipv4->sysctl_tcp_frto = 2;
	ipv4->sysctl_tcp_moderate_rcvbuf = 1;
	/* Caps the share of the congestion window a single TSO frame may
	 * consume.  Overly large TSO frames can make TCP streams bursty.
	 */
	ipv4->sysctl_tcp_tso_win_divisor = 3;
	/* Default TSQ limit of 16 TSO segments */
	ipv4->sysctl_tcp_limit_output_bytes = 16 * 65536;
	/* rfc5961 challenge ack rate limiting */
	ipv4->sysctl_tcp_challenge_ack_limit = 1000;
	ipv4->sysctl_tcp_min_tso_segs = 2;
	ipv4->sysctl_tcp_min_rtt_wlen = 300;
	ipv4->sysctl_tcp_autocorking = 1;
	ipv4->sysctl_tcp_invalid_ratelimit = HZ/2;
	ipv4->sysctl_tcp_pacing_ss_ratio = 200;
	ipv4->sysctl_tcp_pacing_ca_ratio = 120;
	/* Other namespaces start from init_net's current buffer limits. */
	if (net != &init_net) {
		memcpy(ipv4->sysctl_tcp_rmem,
		       init_net.ipv4.sysctl_tcp_rmem,
		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
		memcpy(ipv4->sysctl_tcp_wmem,
		       init_net.ipv4.sysctl_tcp_wmem,
		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
	}
	ipv4->sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
	ipv4->sysctl_tcp_comp_sack_nr = 44;
	ipv4->sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
	spin_lock_init(&ipv4->tcp_fastopen_ctx_lock);
	ipv4->sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
	atomic_set(&ipv4->tfo_active_disable_times, 0);

	/* Inherit init_net's congestion control when a reference on its
	 * module can be taken; otherwise use Reno, which is always built in.
	 */
	if (!net_eq(net, &init_net) &&
	    try_module_get(init_net.ipv4.tcp_congestion_control->owner))
		ipv4->tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
	else
		ipv4->tcp_congestion_control = &tcp_reno;

	return 0;
fail:
	tcp_sk_exit(net);

	return res;
}
|
|
|
|
/* Batched namespace teardown: purge the AF_INET time-wait sockets from
 * tcp_hashinfo once, then destroy the fast open context of every
 * namespace on @net_exit_list.
 */
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	struct net *net;

	inet_twsk_purge(&tcp_hashinfo, AF_INET);

	list_for_each_entry(net, net_exit_list, exit_list) {
		tcp_fastopen_ctx_destroy(net);
	}
}
|
|
|
|
/* Per-namespace hooks for the TCP control sockets and sysctl defaults. */
static struct pernet_operations __net_initdata tcp_sk_ops = {
	.init		= tcp_sk_init,
	.exit		= tcp_sk_exit,
	.exit_batch	= tcp_sk_exit_batch,
};
|
|
|
|
/* Boot-time initialisation: register the per-namespace TCP hooks and
 * panic if that fails.
 */
void __init tcp_v4_init(void)
{
	int err = register_pernet_subsys(&tcp_sk_ops);

	if (err)
		panic("Failed to create the TCP control socket.\n");
}
|