diff --git a/Documentation/ABI/testing/sysfs-class-backlight b/Documentation/ABI/testing/sysfs-class-backlight new file mode 100644 index 000000000000..3ab175a3f5cb --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-backlight @@ -0,0 +1,26 @@ +What: /sys/class/backlight/<backlight>/scale +Date: July 2019 +KernelVersion: 5.4 +Contact: Daniel Thompson +Description: + Description of the scale of the brightness curve. + + The human eye senses brightness approximately logarithmically, + hence linear changes in brightness are perceived as being + non-linear. To achieve a linear perception of brightness changes, + controls like sliders need to apply a logarithmic mapping for + backlights with a linear brightness curve. + + Possible values of the attribute are: + + unknown + The scale of the brightness curve is unknown. + + linear + The brightness changes linearly with each step. Brightness + controls should apply a logarithmic mapping for a linear + perception. + + non-linear + The brightness changes non-linearly with each step. Brightness + controls should use a linear mapping for a linear perception. diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index 29601d93a1c2..ed35833ad7f0 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -429,10 +429,15 @@ KernelVersion: 2.6.22 Contact: Pekka Enberg , Christoph Lameter Description: - The shrink file is written when memory should be reclaimed from - a cache. Empty partial slabs are freed and the partial list is - sorted so the slabs with the fewest available objects are used - first. + The shrink file is used to reclaim unused slab cache + memory from a cache. Empty per-cpu or partial slabs + are freed and the partial list is sorted so the slabs + with the fewest available objects are used first. + It only accepts a value of "1" on write for shrinking + the cache. Other input values are considered invalid. + Shrinking slab caches might be expensive and can + adversely impact other running applications, so it + should be used with care. What: /sys/kernel/slab/cache/slab_size Date: May 2007 diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 41bdc038dad9..0ae4f564c2d6 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -85,8 +85,10 @@ Brief summary of control files. memory.oom_control set/show oom controls. memory.numa_stat show the number of memory usage per numa node - memory.kmem.limit_in_bytes set/show hard limit for kernel memory + memory.kmem.limit_in_bytes set/show hard limit for kernel memory + This knob is deprecated and shouldn't be + used. It is planned to be removed in + the foreseeable future. memory.kmem.usage_in_bytes show current kernel memory allocation memory.kmem.failcnt show the number of kernel memory usage hits limits diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d3814789304f..944e03e29f65 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -809,6 +809,8 @@ enables the feature at boot time. By default, it is disabled and the system will work mostly the same as a kernel built without CONFIG_DEBUG_PAGEALLOC. + Note: to get the most out of debug_pagealloc error reports, it's + useful to also enable the page_owner functionality. 
on: enable the feature debugpat [X86] Enable PAT debugging @@ -3465,12 +3467,13 @@ specify the device is described above. If is not specified, PAGE_SIZE is used as alignment. - PCI-PCI bridge can be specified, if resource + A PCI-PCI bridge can be specified if resource windows need to be expanded. To specify the alignment for several instances of a device, the PCI vendor, device, subvendor, and subdevice may be - specified, e.g., 4096@pci:8086:9c22:103c:198f + specified, e.g., 12@pci:8086:9c22:103c:198f + for 4096-byte alignment. ecrc= Enable/disable PCIe ECRC (transaction layer end-to-end CRC checking). bios: Use BIOS/firmware settings. This is the diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 08af5caf036d..f77de49b1d51 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -42,6 +42,9 @@ String Manipulation .. kernel-doc:: lib/string.c :export: +.. kernel-doc:: include/linux/string.h + :internal: + .. kernel-doc:: mm/util.c :functions: kstrdup kstrdup_const kstrndup kmemdup kmemdup_nul memdup_user vmemdup_user strndup_user memdup_user_nul diff --git a/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt b/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt index e9de3756752b..c9a6587fe4bb 100644 --- a/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt +++ b/Documentation/devicetree/bindings/i2c/brcm,bcm2835-i2c.txt @@ -1,7 +1,9 @@ Broadcom BCM2835 I2C controller Required properties: -- compatible : Should be "brcm,bcm2835-i2c". +- compatible : Should be one of: + "brcm,bcm2711-i2c" + "brcm,bcm2835-i2c" - reg: Should contain register location and length. - interrupts: Should contain interrupt. - clocks : The clock feeding the I2C controller. diff --git a/Documentation/devicetree/bindings/i2c/i2c-rcar.txt b/Documentation/devicetree/bindings/i2c/renesas,i2c.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-rcar.txt rename to Documentation/devicetree/bindings/i2c/renesas,i2c.txt diff --git a/Documentation/devicetree/bindings/i2c/i2c-emev2.txt b/Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-emev2.txt rename to Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt diff --git a/Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt b/Documentation/devicetree/bindings/i2c/renesas,iic.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt rename to Documentation/devicetree/bindings/i2c/renesas,iic.txt diff --git a/Documentation/devicetree/bindings/i2c/i2c-riic.txt b/Documentation/devicetree/bindings/i2c/renesas,riic.txt similarity index 100% rename from Documentation/devicetree/bindings/i2c/i2c-riic.txt rename to Documentation/devicetree/bindings/i2c/renesas,riic.txt diff --git a/Documentation/devicetree/bindings/mfd/mt6397.txt b/Documentation/devicetree/bindings/mfd/mt6397.txt index 0ebd08af777d..a9b105ac00a8 100644 --- a/Documentation/devicetree/bindings/mfd/mt6397.txt +++ b/Documentation/devicetree/bindings/mfd/mt6397.txt @@ -8,11 +8,12 @@ MT6397/MT6323 is a multifunction device with the following sub modules: - Clock - LED - Keys +- Power controller It is interfaced to host controller using SPI interface by a proprietary hardware called PMIC wrapper or pwrap. MT6397/MT6323 MFD is a child device of pwrap. 
See the following for pwrap node definitions: -Documentation/devicetree/bindings/soc/mediatek/pwrap.txt +../soc/mediatek/pwrap.txt This document describes the binding for MFD device and its sub module. @@ -22,14 +23,16 @@ compatible: "mediatek,mt6397" or "mediatek,mt6323" Optional subnodes: - rtc - Required properties: + Required properties: Should be one of the following + - compatible: "mediatek,mt6323-rtc" - compatible: "mediatek,mt6397-rtc" + For details, see ../rtc/rtc-mt6397.txt - regulators Required properties: - compatible: "mediatek,mt6397-regulator" - see Documentation/devicetree/bindings/regulator/mt6397-regulator.txt + see ../regulator/mt6397-regulator.txt - compatible: "mediatek,mt6323-regulator" - see Documentation/devicetree/bindings/regulator/mt6323-regulator.txt + see ../regulator/mt6323-regulator.txt - codec Required properties: - compatible: "mediatek,mt6397-codec" @@ -39,12 +42,17 @@ Optional subnodes: - led Required properties: - compatible: "mediatek,mt6323-led" - see Documentation/devicetree/bindings/leds/leds-mt6323.txt + see ../leds/leds-mt6323.txt - keys Required properties: - compatible: "mediatek,mt6397-keys" or "mediatek,mt6323-keys" - see Documentation/devicetree/bindings/input/mtk-pmic-keys.txt + see ../input/mtk-pmic-keys.txt + +- power-controller + Required properties: + - compatible: "mediatek,mt6323-pwrc" + For details, see ../power/reset/mt6323-poweroff.txt Example: pwrap: pwrap@1000f000 { diff --git a/Documentation/devicetree/bindings/mfd/rn5t618.txt b/Documentation/devicetree/bindings/mfd/rn5t618.txt index 65c23263cc54..b74e5e94d1cb 100644 --- a/Documentation/devicetree/bindings/mfd/rn5t618.txt +++ b/Documentation/devicetree/bindings/mfd/rn5t618.txt @@ -14,6 +14,10 @@ Required properties: "ricoh,rc5t619" - reg: the I2C slave address of the device +Optional properties: + - system-power-controller: + See Documentation/devicetree/bindings/power/power-controller.txt + Sub-nodes: - regulators: the node is required if the regulator functionality is needed. The valid regulator names are: DCDC1, DCDC2, DCDC3, DCDC4 @@ -28,6 +32,7 @@ Example: pmic@32 { compatible = "ricoh,rn5t618"; reg = <0x32>; + system-power-controller; regulators { DCDC1 { diff --git a/Documentation/devicetree/bindings/pci/designware-pcie.txt b/Documentation/devicetree/bindings/pci/designware-pcie.txt index 5561a1c060d0..78494c4050f7 100644 --- a/Documentation/devicetree/bindings/pci/designware-pcie.txt +++ b/Documentation/devicetree/bindings/pci/designware-pcie.txt @@ -11,7 +11,6 @@ Required properties: the ATU address space. (The old way of getting the configuration address space from "ranges" is deprecated and should be avoided.) -- num-lanes: number of lanes to use RC mode: - #address-cells: set to <3> - #size-cells: set to <2> @@ -34,6 +33,11 @@ Optional properties: - clock-names: Must include the following entries: - "pcie" - "pcie_bus" +- snps,enable-cdm-check: This is a boolean property and, if present, enables + automatic checking of CDM (Configuration Dependent Module) registers + for data corruption. CDM registers include standard PCIe configuration + space registers, Port Logic registers, DMA and iATU (internal Address + Translation Unit) registers. RC mode: - num-viewport: number of view ports configured in hardware. If a platform does not specify it, the driver assumes 2. 
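As an illustration of how a DesignWare-based host driver might consume the new snps,enable-cdm-check property above, here is a minimal sketch. It assumes struct dw_pcie and the dw_pcie_{readl,writel}_dbi() accessors from the dwc driver; the register offset and bit names are placeholders modeled on the DesignWare Port Logic layout, not a verbatim quote of the driver change:

#include <linux/bits.h>
#include <linux/of.h>

#include "pcie-designware.h"	/* struct dw_pcie, dw_pcie_{readl,writel}_dbi() */

/* Placeholder names for the CDM check control register and its bits. */
#define PL_CHK_REG_CONTROL_STATUS	0xb20
#define PL_CHK_REG_CHK_REG_START	BIT(0)
#define PL_CHK_REG_CHK_REG_CONTINUOUS	BIT(1)

static void dw_pcie_enable_cdm_check_if_requested(struct dw_pcie *pci)
{
	u32 val;

	/* The property is optional; leave CDM checking off unless present. */
	if (!of_property_read_bool(pci->dev->of_node, "snps,enable-cdm-check"))
		return;

	/* Start continuous checking of the CDM registers. */
	val = dw_pcie_readl_dbi(pci, PL_CHK_REG_CONTROL_STATUS);
	val |= PL_CHK_REG_CHK_REG_CONTINUOUS | PL_CHK_REG_CHK_REG_START;
	dw_pcie_writel_dbi(pci, PL_CHK_REG_CONTROL_STATUS, val);
}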
diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt index a7f5f5afa0e6..de4b2baf91e8 100644 --- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt +++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt @@ -50,7 +50,7 @@ Additional required properties for imx7d-pcie and imx8mq-pcie: - power-domains: Must be set to a phandle pointing to PCIE_PHY power domain - resets: Must contain phandles to PCIe-related reset lines exposed by SRC IP block -- reset-names: Must contain the following entires: +- reset-names: Must contain the following entries: - "pciephy" - "apps" - "turnoff" diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie.txt b/Documentation/devicetree/bindings/pci/mediatek-pcie.txt index 92437a366e5f..7468d666763a 100644 --- a/Documentation/devicetree/bindings/pci/mediatek-pcie.txt +++ b/Documentation/devicetree/bindings/pci/mediatek-pcie.txt @@ -6,6 +6,7 @@ Required properties: "mediatek,mt2712-pcie" "mediatek,mt7622-pcie" "mediatek,mt7623-pcie" + "mediatek,mt7629-pcie" - device_type: Must be "pci" - reg: Base addresses and lengths of the PCIe subsys and root ports. - reg-names: Names of the above areas to use during resource lookup. diff --git a/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt new file mode 100644 index 000000000000..b739f92da58e --- /dev/null +++ b/Documentation/devicetree/bindings/pci/nvidia,tegra194-pcie.txt @@ -0,0 +1,171 @@ +NVIDIA Tegra PCIe controller (Synopsys DesignWare Core based) + +This PCIe host controller is based on the Synopsys DesignWare PCIe IP +and thus inherits all the common properties defined in designware-pcie.txt. + +Required properties: +- compatible: For Tegra19x, must contain "nvidia,tegra194-pcie". +- device_type: Must be "pci" +- power-domains: A phandle to the node that controls power to the respective + PCIe controller and a specifier name for the PCIe controller. Following are + the specifiers for the different PCIe controllers: + TEGRA194_POWER_DOMAIN_PCIEX8B: C0 + TEGRA194_POWER_DOMAIN_PCIEX1A: C1 + TEGRA194_POWER_DOMAIN_PCIEX1A: C2 + TEGRA194_POWER_DOMAIN_PCIEX1A: C3 + TEGRA194_POWER_DOMAIN_PCIEX4A: C4 + TEGRA194_POWER_DOMAIN_PCIEX8A: C5 + These specifiers are defined in the + "include/dt-bindings/power/tegra194-powergate.h" file. +- reg: A list of physical base address and length pairs for each set of + controller registers. Must contain an entry for each entry in the reg-names + property. +- reg-names: Must include the following entries: + "appl": Controller's application logic registers + "config": As per the definition in designware-pcie.txt + "atu_dma": iATU and DMA registers. This is where the iATU (internal Address + Translation Unit) registers of the PCIe core are made available + for SW access. + "dbi": The aperture where root port's own configuration registers are + available +- interrupts: A list of interrupt outputs of the controller. Must contain an + entry for each entry in the interrupt-names property. 
+- interrupt-names: Must include the following entries: + "intr": The Tegra interrupt that is asserted for controller interrupts + "msi": The Tegra interrupt that is asserted when an MSI is received +- bus-range: Range of bus numbers associated with this controller +- #address-cells: Address representation for root ports (must be 3) + - cell 0 specifies the bus and device numbers of the root port: + [23:16]: bus number + [15:11]: device number + - cell 1 denotes the upper 32 address bits and should be 0 + - cell 2 contains the lower 32 address bits and is used to translate to the + CPU address space +- #size-cells: Size representation for root ports (must be 2) +- ranges: Describes the translation of addresses for root ports and standard + PCI regions. The entries must be 7 cells each, where the first three cells + correspond to the address as described for the #address-cells property + above, the fourth and fifth cells are for the physical CPU address to + translate to and the sixth and seventh cells are as described for the + #size-cells property above. + - Entries set up the mapping for the standard I/O, memory and + prefetchable PCI regions. The first cell determines the type of region + that is set up: + - 0x81000000: I/O memory region + - 0x82000000: non-prefetchable memory region + - 0xc2000000: prefetchable memory region + Please refer to the standard PCI bus binding document for a more detailed + explanation. +- #interrupt-cells: Size representation for interrupts (must be 1) +- interrupt-map-mask and interrupt-map: Standard PCI IRQ mapping properties + Please refer to the standard PCI bus binding document for a more detailed + explanation. +- clocks: Must contain an entry for each entry in clock-names. + See ../clocks/clock-bindings.txt for details. +- clock-names: Must include the following entries: + - core +- resets: Must contain an entry for each entry in reset-names. + See ../reset/reset.txt for details. +- reset-names: Must include the following entries: + - apb + - core +- phys: Must contain a phandle to P2U PHY for each entry in phy-names. +- phy-names: Must include an entry for each active lane. + "p2u-N": where N ranges from 0 to one less than the total number of lanes +- nvidia,bpmp: Must contain a phandle to the BPMP controller node followed + by a controller id. Following are the controller ids for each controller: + 0: C0 + 1: C1 + 2: C2 + 3: C3 + 4: C4 + 5: C5 +- vddio-pex-ctl-supply: Regulator supply for PCIe side band signals + +Optional properties: +- pinctrl-names: A list of pinctrl state names. + It is mandatory for the C5 controller and optional for other controllers. + - "default": Configures PCIe I/O for proper operation. +- pinctrl-0: phandle for the 'default' state of pin configuration. + It is mandatory for the C5 controller and optional for other controllers. +- supports-clkreq: Refer to Documentation/devicetree/bindings/pci/pci.txt +- nvidia,update-fc-fixup: This is a boolean property and needs to be present to + improve performance when a platform is designed in such a way that it + satisfies at least one of the following conditions, thereby enabling the root + port to exchange the optimum number of FC (Flow Control) credits with + downstream devices: + 1. If C0/C4/C5 run at x1/x2 link widths (irrespective of speed and MPS) + 2. 
If C0/C1/C2/C3/C4/C5 operate at their respective max link widths and + a) speed is Gen-2 and MPS is 256B + b) speed is >= Gen-3 with any MPS +- nvidia,aspm-cmrt-us: Common Mode Restore Time for proper operation of ASPM + to be specified in microseconds +- nvidia,aspm-pwr-on-t-us: Power On time for proper operation of ASPM to be + specified in microseconds +- nvidia,aspm-l0s-entrance-latency-us: ASPM L0s entrance latency to be + specified in microseconds +- vpcie3v3-supply: A phandle to the regulator node that supplies 3.3V to the slot + if the platform has one such slot (e.g., the x16 slot owned by the C5 + controller in the p2972-0000 platform). +- vpcie12v-supply: A phandle to the regulator node that supplies 12V to the slot + if the platform has one such slot (e.g., the x16 slot owned by the C5 + controller in the p2972-0000 platform). + +Examples: +========= + +Tegra194: +-------- + + pcie@14180000 { + compatible = "nvidia,tegra194-pcie", "snps,dw-pcie"; + power-domains = <&bpmp TEGRA194_POWER_DOMAIN_PCIEX8B>; + reg = <0x00 0x14180000 0x0 0x00020000 /* appl registers (128K) */ + 0x00 0x38000000 0x0 0x00040000 /* configuration space (256K) */ + 0x00 0x38040000 0x0 0x00040000>; /* iATU_DMA reg space (256K) */ + reg-names = "appl", "config", "atu_dma"; + + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + num-lanes = <8>; + linux,pci-domain = <0>; + + pinctrl-names = "default"; + pinctrl-0 = <&pex_rst_c5_out_state>, <&clkreq_c5_bi_dir_state>; + + clocks = <&bpmp TEGRA194_CLK_PEX0_CORE_0>; + clock-names = "core"; + + resets = <&bpmp TEGRA194_RESET_PEX0_CORE_0_APB>, + <&bpmp TEGRA194_RESET_PEX0_CORE_0>; + reset-names = "apb", "core"; + + interrupts = <GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>, /* controller interrupt */ + <GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>; /* MSI interrupt */ + interrupt-names = "intr", "msi"; + + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0>; + interrupt-map = <0 0 0 0 &gic GIC_SPI 72 IRQ_TYPE_LEVEL_HIGH>; + + nvidia,bpmp = <&bpmp 0>; + + supports-clkreq; + nvidia,aspm-cmrt-us = <60>; + nvidia,aspm-pwr-on-t-us = <20>; + nvidia,aspm-l0s-entrance-latency-us = <3>; + + bus-range = <0x0 0xff>; + ranges = <0x81000000 0x0 0x38100000 0x0 0x38100000 0x0 0x00100000 /* downstream I/O (1MB) */ + 0x82000000 0x0 0x38200000 0x0 0x38200000 0x0 0x01E00000 /* non-prefetchable memory (30MB) */ + 0xc2000000 0x18 0x00000000 0x18 0x00000000 0x4 0x00000000>; /* prefetchable memory (16GB) */ + + vddio-pex-ctl-supply = <&vdd_1v8ao>; + vpcie3v3-supply = <&vdd_3v3_pcie>; + vpcie12v-supply = <&vdd_12v_pcie>; + + phys = <&p2u_hsio_2>, <&p2u_hsio_3>, <&p2u_hsio_4>, + <&p2u_hsio_5>; + phy-names = "p2u-0", "p2u-1", "p2u-2", "p2u-3"; + }; diff --git a/Documentation/devicetree/bindings/pci/pci-armada8k.txt b/Documentation/devicetree/bindings/pci/pci-armada8k.txt index 8324a4ee6f06..7a813d0e6d63 100644 --- a/Documentation/devicetree/bindings/pci/pci-armada8k.txt +++ b/Documentation/devicetree/bindings/pci/pci-armada8k.txt @@ -11,7 +11,7 @@ Required properties: - reg-names: - "ctrl" for the control register region - "config" for the config space region -- interrupts: Interrupt specifier for the PCIe controler +- interrupts: Interrupt specifier for the PCIe controller - clocks: reference to the PCIe controller clocks - clock-names: mandatory if there is a second clock, in this case the name must be "core" for the first clock and "reg" for the second diff --git a/Documentation/devicetree/bindings/pci/pci.txt b/Documentation/devicetree/bindings/pci/pci.txt index 2a5d91024059..29bcbd88f457 100644 --- a/Documentation/devicetree/bindings/pci/pci.txt +++ 
b/Documentation/devicetree/bindings/pci/pci.txt @@ -27,6 +27,11 @@ driver implementation may support the following properties: - reset-gpios: If present this property specifies PERST# GPIO. Host drivers can parse the GPIO and apply fundamental reset to endpoints. +- supports-clkreq: + If present, this property specifies that CLKREQ signal routing exists from + the root port to the downstream device, and host bridge drivers can perform + programming that depends on the presence of the CLKREQ signal; for example, + programming the root port not to advertise ASPM L1 Sub-States support if + there is no CLKREQ signal. PCI-PCI Bridge properties ------------------------- diff --git a/Documentation/devicetree/bindings/pci/pcie-al.txt b/Documentation/devicetree/bindings/pci/pcie-al.txt new file mode 100644 index 000000000000..557a5089229d --- /dev/null +++ b/Documentation/devicetree/bindings/pci/pcie-al.txt @@ -0,0 +1,46 @@ +* Amazon Annapurna Labs PCIe host bridge + +Amazon's Annapurna Labs PCIe Host Controller is based on the Synopsys DesignWare +PCI core. It inherits common properties defined in +Documentation/devicetree/bindings/pci/designware-pcie.txt. + +Properties of the host controller node that differ from it are: + +- compatible: + Usage: required + Value type: <stringlist> + Definition: Value should contain + - "amazon,al-alpine-v2-pcie" for alpine_v2 + - "amazon,al-alpine-v3-pcie" for alpine_v3 + +- reg: + Usage: required + Value type: <prop-encoded-array> + Definition: Register ranges as listed in the reg-names property + +- reg-names: + Usage: required + Value type: <stringlist> + Definition: Must include the following entries + - "config" PCIe ECAM space + - "controller" AL proprietary registers + - "dbi" DesignWare PCIe registers + +Example: + + pcie-external0: pcie@fb600000 { + compatible = "amazon,al-alpine-v3-pcie"; + reg = <0x0 0xfb600000 0x0 0x00100000 + 0x0 0xfd800000 0x0 0x00010000 + 0x0 0xfd810000 0x0 0x00001000>; + reg-names = "config", "controller", "dbi"; + bus-range = <0 255>; + device_type = "pci"; + #address-cells = <3>; + #size-cells = <2>; + #interrupt-cells = <1>; + interrupts = ; + interrupt-map-mask = <0x00 0 0 7>; + interrupt-map = <0x0000 0 0 1 &gic GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>; /* INTa */ + ranges = <0x02000000 0x0 0xc0010000 0x0 0xc0010000 0x0 0x07ff0000>; + }; diff --git a/Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt b/Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt new file mode 100644 index 000000000000..d23ff90baad5 --- /dev/null +++ b/Documentation/devicetree/bindings/phy/phy-tegra194-p2u.txt @@ -0,0 +1,28 @@ +NVIDIA Tegra194 P2U binding + +Tegra194 has two PHY bricks, namely HSIO (High Speed IO) and NVHS (NVIDIA High +Speed), each interfacing with 12 and 8 P2U instances respectively. +A P2U instance is glue logic between the Synopsys DesignWare Core PCIe IP's PIPE +interface and the PHY of the HSIO/NVHS bricks. Each P2U instance represents one +PCIe lane. + +Required properties: +- compatible: For Tegra19x, must contain "nvidia,tegra194-p2u". +- reg: Should be the physical address space and length of each respective P2U + instance. +- reg-names: Must include the entry "ctl". + +Required properties for PHY port node: +- #phy-cells: Defined by generic PHY bindings. Must be 0. + +Refer to phy/phy-bindings.txt for the generic PHY binding properties. 
+ +Example: + +p2u_hsio_0: phy@3e10000 { + compatible = "nvidia,tegra194-p2u"; + reg = <0x03e10000 0x10000>; + reg-names = "ctl"; + + #phy-cells = <0>; +}; diff --git a/Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt b/Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt new file mode 100644 index 000000000000..933f0c48e887 --- /dev/null +++ b/Documentation/devicetree/bindings/power/reset/mt6323-poweroff.txt @@ -0,0 +1,20 @@ +Device Tree Bindings for Power Controller on MediaTek PMIC + +The power controller found on the PMIC is responsible for externally +powering the remote MediaTek SoC off or on through the BBPU circuit. + +Required properties: +- compatible: Should be one of the following: + "mediatek,mt6323-pwrc": for MT6323 PMIC + +Example: + + pmic { + compatible = "mediatek,mt6323"; + + ... + + power-controller { + compatible = "mediatek,mt6323-pwrc"; + }; + } diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index d2c6a5ccf0f5..b19b6a03f91c 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -158,6 +158,20 @@ Mount Options copies. Currently, it's only used in copy_file_range, which will revert to the default VFS implementation if this option is used. + recover_session= + Set auto reconnect mode in the case where the client is blacklisted. The + available modes are "no" and "clean". The default is "no". + + * no: never attempt to reconnect when the client detects that it has been + blacklisted. Operations will generally fail after being blacklisted. + + * clean: the client reconnects to the ceph cluster automatically when it + detects that it has been blacklisted. During reconnect, the client drops + dirty data/metadata, invalidates page caches and writable file handles. + After reconnect, file locks become stale because the MDS loses track + of them. If an inode contains any stale file locks, read/write on the + inode is not allowed until applications release all stale file locks. + More Information ================ diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/vm/split_page_table_lock.rst index 889b00be469f..ff51f4a5494d 100644 --- a/Documentation/vm/split_page_table_lock.rst +++ b/Documentation/vm/split_page_table_lock.rst @@ -54,9 +54,9 @@ Hugetlb-specific helpers: Support of split page table lock by an architecture =================================================== -There's no need in special enabling of PTE split page table lock: -everything required is done by pgtable_page_ctor() and pgtable_page_dtor(), -which must be called on PTE table allocation / freeing. +There's no need for special enabling of PTE split page table lock: everything +required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which +must be called on PTE table allocation / freeing. Make sure the architecture doesn't use slab allocator for page table allocation: slab uses page->slab_cache for its pages. @@ -74,7 +74,7 @@ paths: i.e X86_PAE preallocate few PMDs on pgd_alloc(). With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. -NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must +NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must be handled properly. 
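The failure handling the NOTE above asks for is the pattern the converted per-arch allocators in this patch (e.g. the arc and m68k hunks) already follow. A generic sketch of it, with the GFP flags and allocation order being illustrative rather than mandated:

#include <linux/gfp.h>
#include <linux/mm.h>

static pgtable_t pte_alloc_one_sketch(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);

	if (!page)
		return NULL;

	/*
	 * pgtable_pte_page_ctor() can fail (e.g. when allocating the
	 * split ptlock); on failure the page must be freed, never used
	 * as a PTE table.
	 */
	if (!pgtable_pte_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	return page;
}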
page->ptl @@ -94,7 +94,7 @@ trick: split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs one more cache line for indirect access; -The spinlock_t allocated in pgtable_page_ctor() for PTE table and in +The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in pgtable_pmd_page_ctor() for PMD table. Please, never access page->ptl directly -- use appropriate helper. diff --git a/MAINTAINERS b/MAINTAINERS index a400af0501c9..a97f1be63b9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -728,7 +728,7 @@ ALTERA SYSTEM MANAGER DRIVER M: Thor Thayer S: Maintained F: drivers/mfd/altera-sysmgr.c -F: include/linux/mfd/altera-sysgmr.h +F: include/linux/mfd/altera-sysmgr.h ALTERA SYSTEM RESOURCE DRIVER FOR ARRIA10 DEVKIT M: Thor Thayer @@ -2921,6 +2921,8 @@ F: drivers/video/backlight/ F: include/linux/backlight.h F: include/linux/pwm_backlight.h F: Documentation/devicetree/bindings/leds/backlight +F: Documentation/ABI/stable/sysfs-class-backlight +F: Documentation/ABI/testing/sysfs-class-backlight BATMAN ADVANCED M: Marek Lindner @@ -4338,6 +4340,12 @@ S: Maintained F: Documentation/filesystems/cramfs.txt F: fs/cramfs/ +CREATIVE SB0540 +M: Bastien Nocera +L: linux-input@vger.kernel.org +S: Maintained +F: drivers/hid/hid-creative-sb0540.c + CRYPTO API M: Herbert Xu M: "David S. Miller" @@ -7857,6 +7865,12 @@ S: Maintained F: drivers/mfd/lpc_ich.c F: drivers/gpio/gpio-ich.c +ICY I2C DRIVER +M: Max Staudt +L: linux-i2c@vger.kernel.org +S: Maintained +F: drivers/i2c/busses/i2c-icy.c + IDE SUBSYSTEM M: "David S. Miller" L: linux-ide@vger.kernel.org @@ -9042,7 +9056,7 @@ S: Supported F: Documentation/security/keys/trusted-encrypted.rst F: include/keys/trusted-type.h F: security/keys/trusted.c -F: security/keys/trusted.h +F: include/keys/trusted.h KEYS/KEYRINGS: M: David Howells @@ -12574,16 +12588,18 @@ F: arch/x86/kernel/early-quirks.c PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS M: Lorenzo Pieralisi +R: Andrew Murray L: linux-pci@vger.kernel.org Q: http://patchwork.ozlabs.org/project/linux-pci/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/lpieralisi/pci.git/ S: Supported F: drivers/pci/controller/ -PCIE DRIVER FOR ANNAPURNA LABS +PCIE DRIVER FOR AMAZON ANNAPURNA LABS M: Jonathan Chocron L: linux-pci@vger.kernel.org S: Maintained +F: Documentation/devicetree/bindings/pci/pcie-al.txt F: drivers/pci/controller/dwc/pcie-al.c PCIE DRIVER FOR AMLOGIC MESON @@ -13781,7 +13797,7 @@ F: drivers/clk/renesas/ RENESAS EMEV2 I2C DRIVER M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-emev2.txt +F: Documentation/devicetree/bindings/i2c/renesas,iic-emev2.txt F: drivers/i2c/busses/i2c-emev2.c RENESAS ETHERNET DRIVERS @@ -13803,15 +13819,15 @@ F: drivers/iio/adc/rcar-gyroadc.c RENESAS R-CAR I2C DRIVERS M: Wolfram Sang S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-rcar.txt -F: Documentation/devicetree/bindings/i2c/i2c-sh_mobile.txt +F: Documentation/devicetree/bindings/i2c/renesas,i2c.txt +F: Documentation/devicetree/bindings/i2c/renesas,iic.txt F: drivers/i2c/busses/i2c-rcar.c F: drivers/i2c/busses/i2c-sh_mobile.c RENESAS RIIC DRIVER M: Chris Brandt S: Supported -F: Documentation/devicetree/bindings/i2c/i2c-riic.txt +F: Documentation/devicetree/bindings/i2c/renesas,riic.txt F: drivers/i2c/busses/i2c-riic.c RENESAS USB PHY DRIVER diff --git a/arch/Kconfig b/arch/Kconfig index 0fcf8ec1e098..5f8a5d84dbbe 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -706,6 +706,17 @@ config HAVE_ARCH_COMPAT_MMAP_BASES and vice-versa 32-bit applications to call 
64-bit mmap(). Required for applications doing different bitness syscalls. +# This allows to use a set of generic functions to determine mmap base +# address by giving priority to top-down scheme only if the process +# is not in legacy mode (compat task, unlimited stack size or +# sysctl_legacy_va_layout). +# Architecture that selects this option can provide its own version of: +# - STACK_RND_MASK +config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT + bool + depends on MMU + select ARCH_HAS_ELF_RANDOMIZE + config HAVE_COPY_THREAD_TLS bool help diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h index 71ded3b7d82d..eb91f1e85629 100644 --- a/arch/alpha/include/asm/pgalloc.h +++ b/arch/alpha/include/asm/pgalloc.h @@ -53,6 +53,4 @@ pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -#define check_pgt_cache() do { } while (0) - #endif /* _ALPHA_PGALLOC_H */ diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 89c2032f9960..065b57f408c3 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -359,11 +359,6 @@ extern void paging_init(void); #include -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - /* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index ac23379b7a87..a18ec7f63888 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -68,6 +68,9 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index 9bdb8ed5b0db..b747f2ec2928 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -108,7 +108,7 @@ pte_alloc_one(struct mm_struct *mm) return 0; memzero((void *)pte_pg, PTRS_PER_PTE * sizeof(pte_t)); page = virt_to_page(pte_pg); - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return 0; } @@ -123,13 +123,12 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, pgtable_t ptep) { - pgtable_page_dtor(virt_to_page(ptep)); + pgtable_pte_page_dtor(virt_to_page(ptep)); free_pages((unsigned long)ptep, __get_order_pte()); } #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) -#define check_pgt_cache() do { } while (0) #define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd)) #endif /* _ASM_ARC_PGALLOC_H */ diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 1d87c18a2976..7addd0301c51 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -395,11 +395,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, /* to cope with aliasing VIPT cache */ #define HAVE_ARCH_UNMAPPED_AREA -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 229f2cdd81ca..8a50efb559f3 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -34,6 +34,7 @@ config ARM select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF + 
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_IPC_PARSE_VERSION select BINFMT_FLAT_ARGVP_ENVP_ON_STACK select BUILDTIME_EXTABLE_SORT if MMU diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi index 464df4290ffc..2f6977ada447 100644 --- a/arch/arm/boot/dts/ls1021a.dtsi +++ b/arch/arm/boot/dts/ls1021a.dtsi @@ -874,7 +874,6 @@ #address-cells = <3>; #size-cells = <2>; device_type = "pci"; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -899,7 +898,6 @@ #address-cells = <3>; #size-cells = <2>; device_type = "pci"; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index a2a68b751971..069da393110c 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -15,8 +15,6 @@ #include #include -#define check_pgt_cache() do { } while (0) - #ifdef CONFIG_MMU #define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER)) diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h index d0de24f06724..010fa1a35a68 100644 --- a/arch/arm/include/asm/pgtable-nommu.h +++ b/arch/arm/include/asm/pgtable-nommu.h @@ -70,11 +70,6 @@ typedef pte_t *pte_addr_t; */ extern unsigned int kobjsize(const void *objp); -/* - * No page table caches to initialise. - */ -#define pgtable_cache_init() do { } while (0) - /* * All 32bit addresses are effectively valid for vmalloc... * Sort of meaningless for non-VM targets. diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index f2e990dc27e7..3ae120cd1715 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -368,8 +368,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define pgtable_cache_init() do { } while (0) - #endif /* !__ASSEMBLY__ */ #endif /* CONFIG_MMU */ diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 20c2f42454b8..614bf829e454 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -140,8 +140,6 @@ static inline void prefetchw(const void *ptr) #endif #endif -#define HAVE_ARCH_PICK_MMAP_LAYOUT - #endif #endif /* __ASM_ARM_PROCESSOR_H */ diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index b75ea15b85c0..669474add486 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -44,7 +44,7 @@ static inline void __tlb_remove_table(void *_table) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); #ifndef CONFIG_ARM_LPAE /* diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index f934a6739fc0..9485acc520a4 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -319,11 +319,6 @@ unsigned long get_wchan(struct task_struct *p) return 0; } -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - return randomize_page(mm->brk, 0x02000000); -} - #ifdef CONFIG_MMU #ifdef CONFIG_KUSER_HELPERS /* diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 6ecbda87ee46..6d89db7895d1 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -204,18 +204,17 @@ void __flush_dcache_page(struct address_space 
*mapping, struct page *page) * coherent with the kernels mapping. */ if (!PageHighMem(page)) { - size_t page_size = PAGE_SIZE << compound_order(page); - __cpuc_flush_dcache_area(page_address(page), page_size); + __cpuc_flush_dcache_area(page_address(page), page_size(page)); } else { unsigned long i; if (cache_is_vipt_nonaliasing()) { - for (i = 0; i < (1 << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { void *addr = kmap_atomic(page + i); __cpuc_flush_dcache_area(addr, PAGE_SIZE); kunmap_atomic(addr); } } else { - for (i = 0; i < (1 << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { void *addr = kmap_high_get(page + i); if (addr) { __cpuc_flush_dcache_area(addr, PAGE_SIZE); diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c index f866870db749..b8d912ac9e61 100644 --- a/arch/arm/mm/mmap.c +++ b/arch/arm/mm/mmap.c @@ -17,33 +17,6 @@ ((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \ (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1))) -/* gap between mmap and stack */ -#define MIN_GAP (128*1024*1024UL) -#define MAX_GAP ((TASK_SIZE)/6*5) - -static int mmap_is_legacy(struct rlimit *rlim_stack) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlim_stack->rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) -{ - unsigned long gap = rlim_stack->rlim_cur; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(TASK_SIZE - gap - rnd); -} - /* * We need to ensure that shared mappings are correctly aligned to * avoid aliasing issues with VIPT caches. We need to ensure that @@ -171,31 +144,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } -unsigned long arch_mmap_rnd(void) -{ - unsigned long rnd; - - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); - - return rnd << PAGE_SHIFT; -} - -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} - /* * You really shouldn't be using read() or write() on /dev/mem. This * might go away in the future. 
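The per-arch arch_pick_mmap_layout() copy deleted above (and the arm64 one removed later in this patch) is replaced by a single generic implementation in common mm code, selected via ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT. Its shape, reconstructed from the removed arm code (mmap_base() being the deleted static helper shown above), is roughly:

static int mmap_is_legacy(struct rlimit *rlim_stack)
{
	if (current->personality & ADDR_COMPAT_LAYOUT)
		return 1;

	if (rlim_stack->rlim_cur == RLIM_INFINITY)
		return 1;

	return sysctl_legacy_va_layout;
}

void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	unsigned long random_factor = 0UL;

	if (current->flags & PF_RANDOMIZE)
		random_factor = arch_mmap_rnd();

	if (mmap_is_legacy(rlim_stack)) {
		/* Legacy layout: bottom-up from TASK_UNMAPPED_BASE. */
		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
		mm->get_unmapped_area = arch_get_unmapped_area;
	} else {
		/* Default layout: top-down from just below the stack gap. */
		mm->mmap_base = mmap_base(random_factor, rlim_stack);
		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
	}
}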
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 25da9b2d9610..48c2888297dd 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -731,7 +731,7 @@ static void *__init late_alloc(unsigned long sz) { void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz)); - if (!ptr || !pgtable_page_ctor(virt_to_page(ptr))) + if (!ptr || !pgtable_pte_page_ctor(virt_to_page(ptr))) BUG(); return ptr; } diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 72236a906ee7..c04dba1c3e81 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -15,7 +15,6 @@ config ARM64 select ARCH_HAS_DMA_COHERENT_TO_PFN select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI - select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL @@ -71,6 +70,7 @@ config ARM64 select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi index 124a7e2d8442..337919366dc8 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi @@ -486,7 +486,6 @@ #address-cells = <3>; #size-cells = <2>; device_type = "pci"; - num-lanes = <4>; num-viewport = <2>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 71d9ed9ff985..c084c7a4b6a6 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -677,7 +677,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -704,7 +703,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <2>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -731,7 +729,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <2>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000 /* downstream I/O */ diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index b0ef08b090dd..d4c1da3d4bde 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -649,7 +649,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <8>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x40 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -671,7 +670,6 @@ reg-names = "regs", "addr_space"; num-ib-windows = <6>; num-ob-windows = <8>; - num-lanes = <2>; status = "disabled"; }; @@ -687,7 +685,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <2>; num-viewport = <8>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x48 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -709,7 +706,6 @@ reg-names = "regs", "addr_space"; num-ib-windows 
= <6>; num-ob-windows = <8>; - num-lanes = <2>; status = "disabled"; }; @@ -725,7 +721,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <2>; num-viewport = <8>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x50 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -747,7 +742,6 @@ reg-names = "regs", "addr_space"; num-ib-windows = <6>; num-ob-windows = <8>; - num-lanes = <2>; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi index d1469b0747c7..c676d0771762 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi @@ -469,7 +469,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <256>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x20 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -495,7 +494,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x28 0x00010000 0x0 0x00010000 /* downstream I/O */ @@ -521,7 +519,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <8>; num-viewport = <6>; bus-range = <0x0 0xff>; ranges = <0x81000000 0x0 0x00000000 0x30 0x00010000 0x0 0x00010000 /* downstream I/O */ diff --git a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi index 64101c9962ce..7a0be8eaa84a 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls208xa.dtsi @@ -639,7 +639,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; msi-parent = <&its>; @@ -661,7 +660,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; msi-parent = <&its>; @@ -683,7 +681,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <8>; num-viewport = <256>; bus-range = <0x0 0xff>; msi-parent = <&its>; @@ -705,7 +702,6 @@ #size-cells = <2>; device_type = "pci"; dma-coherent; - num-lanes = <4>; num-viewport = <6>; bus-range = <0x0 0xff>; msi-parent = <&its>; diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi index 62e07e1197cc..4c38426a6969 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194-p2888.dtsi @@ -289,5 +289,29 @@ gpio = <&gpio TEGRA194_MAIN_GPIO(A, 3) GPIO_ACTIVE_HIGH>; enable-active-high; }; + + vdd_3v3_pcie: regulator@2 { + compatible = "regulator-fixed"; + reg = <2>; + + regulator-name = "PEX_3V3"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + gpio = <&gpio TEGRA194_MAIN_GPIO(Z, 2) GPIO_ACTIVE_HIGH>; + regulator-boot-on; + enable-active-high; + }; + + vdd_12v_pcie: regulator@3 { + compatible = "regulator-fixed"; + reg = <3>; + + regulator-name = "VDD_12V"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <1200000>; + gpio = <&gpio TEGRA194_MAIN_GPIO(A, 1) GPIO_ACTIVE_LOW>; + regulator-boot-on; + enable-active-low; + }; }; }; diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts b/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts index 23597d53c9c9..d47cd8c4dd24 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts +++ b/arch/arm64/boot/dts/nvidia/tegra194-p2972-0000.dts @@ -93,9 +93,11 @@ }; pcie@141a0000 { - status = "disabled"; + status 
= "okay"; vddio-pex-ctl-supply = <&vdd_1v8ao>; + vpcie3v3-supply = <&vdd_3v3_pcie>; + vpcie12v-supply = <&vdd_12v_pcie>; phys = <&p2u_nvhs_0>, <&p2u_nvhs_1>, <&p2u_nvhs_2>, <&p2u_nvhs_3>, <&p2u_nvhs_4>, <&p2u_nvhs_5>, diff --git a/arch/arm64/boot/dts/nvidia/tegra194.dtsi b/arch/arm64/boot/dts/nvidia/tegra194.dtsi index adebbbf36bd0..3c0cf54f0aab 100644 --- a/arch/arm64/boot/dts/nvidia/tegra194.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra194.dtsi @@ -3,8 +3,9 @@ #include #include #include -#include +#include #include +#include #include / { @@ -130,6 +131,38 @@ }; }; + pinmux: pinmux@2430000 { + compatible = "nvidia,tegra194-pinmux"; + reg = <0x2430000 0x17000 + 0xc300000 0x4000>; + + status = "okay"; + + pex_rst_c5_out_state: pex_rst_c5_out { + pex_rst { + nvidia,pins = "pex_l5_rst_n_pgg1"; + nvidia,schmitt = ; + nvidia,lpdr = ; + nvidia,enable-input = ; + nvidia,io-high-voltage = ; + nvidia,tristate = ; + nvidia,pull = ; + }; + }; + + clkreq_c5_bi_dir_state: clkreq_c5_bi_dir { + clkreq { + nvidia,pins = "pex_l5_clkreq_n_pgg0"; + nvidia,schmitt = ; + nvidia,lpdr = ; + nvidia,enable-input = ; + nvidia,io-high-voltage = ; + nvidia,tristate = ; + nvidia,pull = ; + }; + }; + }; + uarta: serial@3100000 { compatible = "nvidia,tegra194-uart", "nvidia,tegra20-uart"; reg = <0x03100000 0x40>; @@ -1365,6 +1398,9 @@ num-viewport = <8>; linux,pci-domain = <5>; + pinctrl-names = "default"; + pinctrl-0 = <&pex_rst_c5_out_state>, <&clkreq_c5_bi_dir_state>; + clocks = <&bpmp TEGRA194_CLK_PEX1_CORE_5>, <&bpmp TEGRA194_CLK_PEX1_CORE_5M>; clock-names = "core", "core_m"; diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 14d0bc44d451..172d76fa0245 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -15,8 +15,6 @@ #include /* for pte_{alloc,free}_one */ -#define check_pgt_cache() do { } while (0) - #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #if CONFIG_PGTABLE_LEVELS > 2 diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 57427d17580e..7576df00eb50 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -861,8 +861,6 @@ extern int kern_addr_valid(unsigned long addr); #include -static inline void pgtable_cache_init(void) { } - /* * On AArch64, the cache coherency is handled via the set_pte_at() function. 
*/ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index c67848c55009..5623685c7d13 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -280,8 +280,6 @@ static inline void spin_lock_prefetch(const void *ptr) "nop") : : "p" (ptr)); } -#define HAVE_ARCH_PICK_MMAP_LAYOUT - extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */ extern void __init minsigstksz_setup(void); diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index a95d1fcb7e21..b76df828e6b7 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -44,7 +44,7 @@ static inline void tlb_flush(struct mmu_gather *tlb) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); tlb_remove_table(tlb, pte); } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 03689c0beb34..a47462def04b 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -557,14 +557,6 @@ unsigned long arch_align_stack(unsigned long sp) return sp & ~0xf; } -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - if (is_compat_task()) - return randomize_page(mm->brk, SZ_32M); - else - return randomize_page(mm->brk, SZ_1G); -} - /* * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY. */ diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index dc19300309d2..ac485163a4a7 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -56,8 +56,7 @@ void __sync_icache_dcache(pte_t pte) struct page *page = pte_page(pte); if (!test_and_set_bit(PG_dcache_clean, &page->flags)) - sync_icache_aliases(page_address(page), - PAGE_SIZE << compound_order(page)); + sync_icache_aliases(page_address(page), page_size(page)); } EXPORT_SYMBOL_GPL(__sync_icache_dcache); diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index b050641b5139..3028bacbc4e9 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -20,78 +20,6 @@ #include -/* - * Leave enough space between the mmap area and the stack to honour ulimit in - * the face of randomisation. - */ -#define MIN_GAP (SZ_128M) -#define MAX_GAP (STACK_TOP/6*5) - -static int mmap_is_legacy(struct rlimit *rlim_stack) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlim_stack->rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -unsigned long arch_mmap_rnd(void) -{ - unsigned long rnd; - -#ifdef CONFIG_COMPAT - if (test_thread_flag(TIF_32BIT)) - rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); - else -#endif - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); - return rnd << PAGE_SHIFT; -} - -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) -{ - unsigned long gap = rlim_stack->rlim_cur; - unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap; - - /* Values close to RLIM_INFINITY can overflow. 
*/ - if (gap + pad > gap) - gap += pad; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(STACK_TOP - gap - rnd); -} - -/* - * This function, called very early during the creation of a new process VM - * image, sets up which VM layout function to use: - */ -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - /* - * Fall back to the standard layout if the personality bit is set, or - * if the expected stack growth is unlimited: - */ - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} - /* * You really shouldn't be using read() or write() on /dev/mem. This might go * away in the future. diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 53dc6f24cfb7..60c929f3683b 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -384,7 +384,7 @@ static phys_addr_t pgd_pgtable_alloc(int shift) * folded, and if so pgtable_pmd_page_ctor() becomes nop. */ if (shift == PAGE_SHIFT) - BUG_ON(!pgtable_page_ctor(phys_to_page(pa))); + BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa))); else if (shift == PMD_SHIFT) BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa))); diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 7548f9ca1f11..4a64089e5771 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -35,7 +35,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } -void __init pgd_cache_init(void) +void __init pgtable_cache_init(void) { if (PGD_SIZE == PAGE_SIZE) return; diff --git a/arch/c6x/include/asm/pgtable.h b/arch/c6x/include/asm/pgtable.h index 0bd805964ea6..0b6919c00413 100644 --- a/arch/c6x/include/asm/pgtable.h +++ b/arch/c6x/include/asm/pgtable.h @@ -59,11 +59,6 @@ extern unsigned long empty_zero_page; #define swapper_pg_dir ((pgd_t *) 0) -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - /* * c6x is !MMU, so define the simpliest implementation */ diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index 98c5716708d6..c7c1ed27e348 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -71,12 +71,10 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #define __pte_free_tlb(tlb, pte, address) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page(tlb, pte); \ } while (0) -#define check_pgt_cache() do {} while (0) - extern void pagetable_init(void); extern void pre_mmu_init(void); extern void pre_trap_init(void); diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index c429a6f347de..0040b3a05b61 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -296,11 +296,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ #define kern_addr_valid(addr) (1) -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do {} while (0) - #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ remap_pfn_range(vma, vaddr, pfn, size, prot) diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h index 
a99caa49d265..4d00152fab58 100644 --- a/arch/h8300/include/asm/pgtable.h +++ b/arch/h8300/include/asm/pgtable.h @@ -4,7 +4,6 @@ #define __ARCH_USE_5LEVEL_HACK #include #include -#define pgtable_cache_init() do { } while (0) extern void paging_init(void); #define PAGE_NONE __pgprot(0) /* these mean nothing to NO_MM */ #define PAGE_SHARED __pgprot(0) /* these mean nothing to NO_MM */ @@ -34,11 +33,6 @@ static inline int pte_file(pte_t pte) { return 0; } extern unsigned int kobjsize(const void *objp); extern int is_in_rom(unsigned long); -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - /* * All 32bit addresses are effectively valid for vmalloc... * Sort of meaningless for non-VM targets. diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index d6544dc71258..cc9be514a676 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -13,8 +13,6 @@ #include /* for pte_{alloc,free}_one */ -#define check_pgt_cache() do {} while (0) - extern unsigned long long kmap_generation; /* @@ -96,7 +94,7 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor((pte)); \ + pgtable_pte_page_dtor((pte)); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index a3ff6d24c09e..2fec20ad939e 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -431,9 +431,6 @@ static inline int pte_exec(pte_t pte) #define __pte_offset(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -/* I think this is in case we have page table caches; needed by init/main.c */ -#define pgtable_cache_init() do { } while (0) - /* * Swap/file PTE definitions. If _PAGE_PRESENT is zero, the rest of the PTE is * interpreted as swap information. The remaining free bits are interpreted as diff --git a/arch/hexagon/mm/Makefile b/arch/hexagon/mm/Makefile index 1894263ae5bc..893838499591 100644 --- a/arch/hexagon/mm/Makefile +++ b/arch/hexagon/mm/Makefile @@ -3,5 +3,5 @@ # Makefile for Hexagon memory management subsystem # -obj-y := init.o pgalloc.o ioremap.o uaccess.o vm_fault.o cache.o +obj-y := init.o ioremap.o uaccess.o vm_fault.o cache.o obj-y += copy_to_user.o copy_from_user.o strnlen_user.o vm_tlb.o diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index f1f6ebd537b7..c961773a6fff 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -71,19 +71,6 @@ void __init mem_init(void) init_mm.context.ptbase = __pa(init_mm.pgd); } -/* - * free_initrd_mem - frees... initrd memory. - * @start - start of init memory - * @end - end of init memory - * - * Apparently has to be passed the address of the initrd memory. - * - * Wrapped by #ifdef CONFIG_BLKDEV_INITRD - */ -void free_initrd_mem(unsigned long start, unsigned long end) -{ -} - void sync_icache_dcache(pte_t pte) { unsigned long addr; diff --git a/arch/hexagon/mm/pgalloc.c b/arch/hexagon/mm/pgalloc.c deleted file mode 100644 index 4d4316140237..000000000000 --- a/arch/hexagon/mm/pgalloc.c +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved. 
- */ - -#include - -void __init pgtable_cache_init(void) -{ -} diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 685a3df126ca..16714477eef4 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -72,10 +72,6 @@ config 64BIT config ZONE_DMA32 def_bool y -config QUICKLIST - bool - default y - config MMU bool default y diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index c9e481023c25..f4c491044882 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -19,18 +19,19 @@ #include #include #include -#include + +#include #include static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - quicklist_free(0, NULL, pgd); + free_page((unsigned long)pgd); } #if CONFIG_PGTABLE_LEVELS == 4 @@ -42,12 +43,12 @@ pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return (pud_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) { - quicklist_free(0, NULL, pud); + free_page((unsigned long)pud); } #define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud) #endif /* CONFIG_PGTABLE_LEVELS == 4 */ @@ -60,12 +61,12 @@ pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return quicklist_alloc(0, GFP_KERNEL, NULL); + return (pmd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { - quicklist_free(0, NULL, pmd); + free_page((unsigned long)pmd); } #define __pmd_free_tlb(tlb, pmd, address) pmd_free((tlb)->mm, pmd) @@ -83,43 +84,6 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte) pmd_val(*pmd_entry) = __pa(pte); } -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - struct page *page; - void *pg; - - pg = quicklist_alloc(0, GFP_KERNEL, NULL); - if (!pg) - return NULL; - page = virt_to_page(pg); - if (!pgtable_page_ctor(page)) { - quicklist_free(0, NULL, pg); - return NULL; - } - return page; -} - -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return quicklist_alloc(0, GFP_KERNEL, NULL); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - quicklist_free_page(0, NULL, pte); -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - quicklist_free(0, NULL, pte); -} - -static inline void check_pgt_cache(void) -{ - quicklist_trim(0, NULL, 25, 16); -} - #define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) #endif /* _ASM_IA64_PGALLOC_H */ diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index b1e7468eb65a..d602e7c622db 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -566,11 +566,6 @@ extern struct page *zero_page_memmap_ptr; #define KERNEL_TR_PAGE_SHIFT _PAGE_SIZE_64M #define KERNEL_TR_PAGE_SIZE (1 << KERNEL_TR_PAGE_SHIFT) -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - /* These tell get_user_pages() that the first gate page is accessible from user-level. 
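With the ia64 quicklist paths gone, page tables come straight from the page allocator as zeroed pages, and the deleted pte_alloc_one()/pte_free() pair is covered by generic helpers (the added #include's target was lost in formatting here, but <asm-generic/pgalloc.h> is the likely candidate, given that the microblaze hunk below names __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL). A simplified sketch of the ctor-paired pattern those helpers follow; details such as GFP flags and accounting differ in the real code:

#include <linux/mm.h>

static inline pgtable_t pte_alloc_one_sketch(struct mm_struct *mm)
{
	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (!page)
		return NULL;
	/* sets up the split page-table lock and page-table accounting */
	if (!pgtable_pte_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	return page;
}

static inline void pte_free_sketch(struct mm_struct *mm, pgtable_t page)
{
	pgtable_pte_page_dtor(page);
	__free_page(page);
}

The many pgtable_page_ctor() to pgtable_pte_page_ctor() renames throughout this series are the same pair, renamed to make the PTE-level scope explicit.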
*/ #define FIXADDR_USER_START GATE_ADDR #ifdef HAVE_BUGGY_SEGREL diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index f10208478131..8e91c86e8072 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -583,6 +583,7 @@ void ia64_process_pending_intr(void) static irqreturn_t dummy_handler (int irq, void *dev_id) { BUG(); + return IRQ_NONE; } static struct irqaction ipi_irqaction = { diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index db09a693f094..5b00dc3898e1 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -108,7 +108,6 @@ setup_per_cpu_areas(void) struct pcpu_group_info *gi; unsigned int cpu; ssize_t static_size, reserved_size, dyn_size; - int rc; ai = pcpu_alloc_alloc_info(1, num_possible_cpus()); if (!ai) diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 219fc640414b..4f33f6e7e206 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -186,7 +186,7 @@ void __init setup_per_cpu_areas(void) unsigned long base_offset; unsigned int cpu; ssize_t static_size, reserved_size, dyn_size; - int node, prev_node, unit, nr_units, rc; + int node, prev_node, unit, nr_units; ai = pcpu_alloc_alloc_info(MAX_NUMNODES, nr_cpu_ids); if (!ai) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 678b98a09c85..bf9df2625bc8 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -64,7 +64,7 @@ __ia64_sync_icache_dcache (pte_t pte) if (test_bit(PG_arch_1, &page->flags)) return; /* i-cache is already coherent with d-cache */ - flush_icache_range(addr, addr + (PAGE_SIZE << compound_order(page))); + flush_icache_range(addr, addr + page_size(page)); set_bit(PG_arch_1, &page->flags); /* mark page as clean */ } diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index 4399d712f6db..b34d44d666a4 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -41,7 +41,7 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page, unsigned long address) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } @@ -54,7 +54,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -73,7 +73,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) static inline void pte_free(struct mm_struct *mm, struct page *page) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index d04d9ba9b976..acab315c851f 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -36,7 +36,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) page = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0); if(!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -51,7 +51,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) static inline void pte_free(struct mm_struct *mm, pgtable_t page) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); cache_page(kmap(page)); kunmap(page); __free_page(page); @@ -60,7 +60,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t page) static inline void __pte_free_tlb(struct mmu_gather *tlb, 
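The __ia64_sync_icache_dcache() change swaps the open-coded "PAGE_SIZE << compound_order(page)" for page_size(). A sketch of what that helper reduces to, simplified from its real definition in include/linux/mm.h:

/* bytes covered by a (possibly compound) page; PAGE_SIZE for order 0 */
static inline unsigned long page_size_sketch(struct page *page)
{
	return PAGE_SIZE << compound_order(page);
}

The powerpc hunks further down lean on the sibling helpers compound_nr() and page_shift() in the same way.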
pgtable_t page, unsigned long address) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); cache_page(kmap(page)); kunmap(page); __free_page(page); diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index fde4534b974f..646c174fff99 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -176,11 +176,4 @@ pgprot_t pgprot_dmacoherent(pgprot_t prot); #include #endif /* !__ASSEMBLY__ */ -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - -#define check_pgt_cache() do { } while (0) - #endif /* _M68K_PGTABLE_H */ diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index fc3a96c77bd8..c18165b0d904 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -44,11 +44,6 @@ extern void paging_init(void); */ #define ZERO_PAGE(vaddr) (virt_to_page(0)) -/* - * No page table caches to initialise. - */ -#define pgtable_cache_init() do { } while (0) - /* * All 32bit addresses are effectively valid for vmalloc... * Sort of meaningless for non-VM targets. @@ -60,6 +55,4 @@ extern void paging_init(void); #include -#define check_pgt_cache() do { } while (0) - #endif /* _M68KNOMMU_PGTABLE_H */ diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 1a8ddbd0d23c..856121122b91 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -21,7 +21,7 @@ extern const char bad_pmd_string[]; #define __pte_free_tlb(tlb,pte,addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 632c9477a0f6..c9c4be822456 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -5,15 +5,18 @@ config MICROBLAZE select ARCH_NO_SWAP select ARCH_HAS_BINFMT_FLAT if !MMU select ARCH_HAS_DMA_COHERENT_TO_PFN if MMU + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_UNCACHED_SEGMENT if !MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_EXTABLE_SORT select TIMER_OF select CLONE_BACKWARDS3 select COMMON_CLK + select DMA_DIRECT_REMAP if MMU select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS select GENERIC_CPU_DEVICES diff --git a/arch/microblaze/boot/dts/system.dts b/arch/microblaze/boot/dts/system.dts index 5a8a9d090c37..5b236527176e 100644 --- a/arch/microblaze/boot/dts/system.dts +++ b/arch/microblaze/boot/dts/system.dts @@ -18,7 +18,6 @@ #address-cells = <1>; #size-cells = <1>; compatible = "xlnx,microblaze"; - hard-reset-gpios = <&LEDs_8Bit 2 1>; model = "testing"; DDR2_SDRAM: memory@90000000 { device_type = "memory"; @@ -281,6 +280,21 @@ gpios = <&LEDs_8Bit 7 1>; }; } ; + + gpio-restart { + compatible = "gpio-restart"; + /* + * FIXME: is this active low or active high? + * the current flag (1) indicates active low. + * delay measures are templates, should be adjusted + * to datasheet or trial-and-error with real hardware. 
+ */ + gpios = <&LEDs_8Bit 2 1>; + active-delay = <100>; + inactive-delay = <10>; + wait-delay = <100>; + }; + RS232_Uart_1: serial@84000000 { clock-frequency = <125000000>; compatible = "xlnx,xps-uartlite-1.00.a"; diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index 92fd4e95b488..654edfdc7867 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -5,15 +5,10 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_SYSFS_DEPRECATED=y CONFIG_SYSFS_DEPRECATED_V2=y -CONFIG_KALLSYMS_ALL=y # CONFIG_BASE_FULL is not set +CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_PARTITION_ADVANCED=y -# CONFIG_EFI_PARTITION is not set CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_BARREL=1 @@ -25,14 +20,19 @@ CONFIG_MMU=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE_FORCE=y CONFIG_HIGHMEM=y -CONFIG_PCI=y CONFIG_PCI_XILINX=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_PARTITION_ADVANCED=y +# CONFIG_EFI_PARTITION is not set CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set CONFIG_BRIDGE=m +CONFIG_PCI=y CONFIG_MTD=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y @@ -41,6 +41,7 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_NETDEVICES=y CONFIG_XILINX_EMACLITE=y +CONFIG_XILINX_AXI_EMAC=y CONFIG_XILINX_LL_TEMAC=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set @@ -59,6 +60,8 @@ CONFIG_SPI_XILINX=y CONFIG_GPIOLIB=y CONFIG_GPIO_SYSFS=y CONFIG_GPIO_XILINX=y +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_GPIO_RESTART=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_XILINX_WATCHDOG=y @@ -74,8 +77,8 @@ CONFIG_CRAMFS=y CONFIG_ROMFS_FS=y CONFIG_NFS_FS=y CONFIG_CIFS=y -CONFIG_CIFS_STATS=y CONFIG_CIFS_STATS2=y +CONFIG_ENCRYPTED_KEYS=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_SLAB=y CONFIG_DETECT_HUNG_TASK=y @@ -83,6 +86,3 @@ CONFIG_DEBUG_SPINLOCK=y CONFIG_KGDB=y CONFIG_KGDB_TESTS=y CONFIG_KGDB_KDB=y -CONFIG_EARLY_PRINTK=y -CONFIG_KEYS=y -CONFIG_ENCRYPTED_KEYS=y diff --git a/arch/microblaze/configs/nommu_defconfig b/arch/microblaze/configs/nommu_defconfig index 06d69a6e192d..377de39ccb8c 100644 --- a/arch/microblaze/configs/nommu_defconfig +++ b/arch/microblaze/configs/nommu_defconfig @@ -7,15 +7,10 @@ CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_SYSFS_DEPRECATED=y CONFIG_SYSFS_DEPRECATED_V2=y -CONFIG_KALLSYMS_ALL=y # CONFIG_BASE_FULL is not set +CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_SLAB=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_PARTITION_ADVANCED=y -# CONFIG_EFI_PARTITION is not set CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR=1 CONFIG_XILINX_MICROBLAZE0_USE_BARREL=1 @@ -25,13 +20,18 @@ CONFIG_XILINX_MICROBLAZE0_USE_FPU=2 CONFIG_HZ_100=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE_FORCE=y -CONFIG_PCI=y CONFIG_PCI_XILINX=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_PARTITION_ADVANCED=y +# CONFIG_EFI_PARTITION is not set CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set +CONFIG_PCI=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y @@ -62,6 +62,8 @@ CONFIG_SPI_XILINX=y CONFIG_GPIOLIB=y CONFIG_GPIO_SYSFS=y CONFIG_GPIO_XILINX=y +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_GPIO_RESTART=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_XILINX_WATCHDOG=y @@ -75,11 
+77,6 @@ CONFIG_ROMFS_FS=y CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_NLS=y -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_SLAB=y -CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEBUG_SPINLOCK=y -CONFIG_EARLY_PRINTK=y CONFIG_KEYS=y CONFIG_ENCRYPTED_KEYS=y CONFIG_CRYPTO_ECB=y @@ -87,3 +84,7 @@ CONFIG_CRYPTO_MD4=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_ARC4=y CONFIG_CRYPTO_DES=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_SLAB=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_DEBUG_SPINLOCK=y diff --git a/arch/microblaze/include/asm/io.h b/arch/microblaze/include/asm/io.h index c7968139486f..86c95b2a1ce1 100644 --- a/arch/microblaze/include/asm/io.h +++ b/arch/microblaze/include/asm/io.h @@ -40,7 +40,6 @@ extern void iounmap(volatile void __iomem *addr); extern void __iomem *ioremap(phys_addr_t address, unsigned long size); #define ioremap_nocache(addr, size) ioremap((addr), (size)) -#define ioremap_fullcache(addr, size) ioremap((addr), (size)) #define ioremap_wc(addr, size) ioremap((addr), (size)) #define ioremap_wt(addr, size) ioremap((addr), (size)) diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h index 21ddba9188b2..7c4dc5d85f53 100644 --- a/arch/microblaze/include/asm/pci.h +++ b/arch/microblaze/include/asm/pci.h @@ -66,8 +66,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, unsigned long size, pgprot_t prot); -#define HAVE_ARCH_PCI_RESOURCE_TO_USER - /* This part of code was originally in xilinx-pci.h */ #ifdef CONFIG_PCI_XILINX extern void __init xilinx_pci_init(void); diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index f4cc9ffc449e..7ecb05baa601 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -21,83 +21,23 @@ #include #include -#define PGDIR_ORDER 0 - -/* - * This is handled very differently on MicroBlaze since out page tables - * are all 0's and I want to be able to use these zero'd pages elsewhere - * as well - it gives us quite a speedup. 
- * -- Cort - */ -extern struct pgtable_cache_struct { - unsigned long *pgd_cache; - unsigned long *pte_cache; - unsigned long pgtable_cache_sz; -} quicklists; - -#define pgd_quicklist (quicklists.pgd_cache) -#define pmd_quicklist ((unsigned long *)0) -#define pte_quicklist (quicklists.pte_cache) -#define pgtable_cache_size (quicklists.pgtable_cache_sz) - -extern unsigned long *zero_cache; /* head linked list of pre-zero'd pages */ -extern atomic_t zero_sz; /* # currently pre-zero'd pages */ -extern atomic_t zeropage_hits; /* # zero'd pages request that we've done */ -extern atomic_t zeropage_calls; /* # zero'd pages request that've been made */ -extern atomic_t zerototal; /* # pages zero'd over time */ - -#define zero_quicklist (zero_cache) -#define zero_cache_sz (zero_sz) -#define zero_cache_calls (zeropage_calls) -#define zero_cache_hits (zeropage_hits) -#define zero_cache_total (zerototal) - -/* - * return a pre-zero'd page from the list, - * return NULL if none available -- Cort - */ -extern unsigned long get_zero_page_fast(void); +#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL +#include extern void __bad_pte(pmd_t *pmd); -static inline pgd_t *get_pgd_slow(void) +static inline pgd_t *get_pgd(void) { - pgd_t *ret; - - ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER); - if (ret != NULL) - clear_page(ret); - return ret; + return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0); } -static inline pgd_t *get_pgd_fast(void) -{ - unsigned long *ret; - - ret = pgd_quicklist; - if (ret != NULL) { - pgd_quicklist = (unsigned long *)(*ret); - ret[0] = 0; - pgtable_cache_size--; - } else - ret = (unsigned long *)get_pgd_slow(); - return (pgd_t *)ret; -} - -static inline void free_pgd_fast(pgd_t *pgd) -{ - *(unsigned long **)pgd = pgd_quicklist; - pgd_quicklist = (unsigned long *) pgd; - pgtable_cache_size++; -} - -static inline void free_pgd_slow(pgd_t *pgd) +static inline void free_pgd(pgd_t *pgd) { free_page((unsigned long)pgd); } -#define pgd_free(mm, pgd) free_pgd_fast(pgd) -#define pgd_alloc(mm) get_pgd_fast() +#define pgd_free(mm, pgd) free_pgd(pgd) +#define pgd_alloc(mm) get_pgd() #define pmd_pgtable(pmd) pmd_page(pmd) @@ -110,50 +50,6 @@ static inline void free_pgd_slow(pgd_t *pgd) extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -static inline struct page *pte_alloc_one(struct mm_struct *mm) -{ - struct page *ptepage; - -#ifdef CONFIG_HIGHPTE - int flags = GFP_KERNEL | __GFP_HIGHMEM; -#else - int flags = GFP_KERNEL; -#endif - - ptepage = alloc_pages(flags, 0); - if (!ptepage) - return NULL; - clear_highpage(ptepage); - if (!pgtable_page_ctor(ptepage)) { - __free_page(ptepage); - return NULL; - } - return ptepage; -} - -static inline void pte_free_fast(pte_t *pte) -{ - *(unsigned long **)pte = pte_quicklist; - pte_quicklist = (unsigned long *) pte; - pgtable_cache_size++; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free_slow(struct page *ptepage) -{ - __free_page(ptepage); -} - -static inline void pte_free(struct mm_struct *mm, struct page *ptepage) -{ - pgtable_page_dtor(ptepage); - __free_page(ptepage); -} - #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, (pte)) #define pmd_populate(mm, pmd, pte) \ @@ -171,10 +67,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *ptepage) #define __pmd_free_tlb(tlb, x, addr) pmd_free((tlb)->mm, x) #define pgd_populate(mm, pmd, pte) BUG() -extern int do_check_pgt_cache(int, int); - #endif /* CONFIG_MMU */ -#define 
check_pgt_cache() do { } while (0) - #endif /* _ASM_MICROBLAZE_PGALLOC_H */ diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 142d3f004848..954b69af451f 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -46,8 +46,6 @@ extern int mem_init_done; #define swapper_pg_dir ((pgd_t *) NULL) -#define pgtable_cache_init() do {} while (0) - #define arch_enter_lazy_cpu_mode() do {} while (0) #define pgprot_noncached_wc(prot) prot @@ -526,11 +524,6 @@ extern unsigned long iopa(unsigned long addr); /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ #define kern_addr_valid(addr) (1) -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - void do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code); diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h index bff2a71c828a..a1f206b90753 100644 --- a/arch/microblaze/include/asm/uaccess.h +++ b/arch/microblaze/include/asm/uaccess.h @@ -163,44 +163,15 @@ extern long __user_bad(void); * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ -#define get_user(x, ptr) \ - __get_user_check((x), (ptr), sizeof(*(ptr))) - -#define __get_user_check(x, ptr, size) \ -({ \ - unsigned long __gu_val = 0; \ - const typeof(*(ptr)) __user *__gu_addr = (ptr); \ - int __gu_err = 0; \ - \ - if (access_ok(__gu_addr, size)) { \ - switch (size) { \ - case 1: \ - __get_user_asm("lbu", __gu_addr, __gu_val, \ - __gu_err); \ - break; \ - case 2: \ - __get_user_asm("lhu", __gu_addr, __gu_val, \ - __gu_err); \ - break; \ - case 4: \ - __get_user_asm("lw", __gu_addr, __gu_val, \ - __gu_err); \ - break; \ - default: \ - __gu_err = __user_bad(); \ - break; \ - } \ - } else { \ - __gu_err = -EFAULT; \ - } \ - x = (__force typeof(*(ptr)))__gu_val; \ - __gu_err; \ +#define get_user(x, ptr) ({ \ + const typeof(*(ptr)) __user *__gu_ptr = (ptr); \ + access_ok(__gu_ptr, sizeof(*__gu_ptr)) ? 
\ + __get_user(x, __gu_ptr) : -EFAULT; \ }) #define __get_user(x, ptr) \ ({ \ unsigned long __gu_val = 0; \ - /*unsigned long __gu_ptr = (unsigned long)(ptr);*/ \ long __gu_err; \ switch (sizeof(*(ptr))) { \ case 1: \ @@ -212,6 +183,11 @@ extern long __user_bad(void); case 4: \ __get_user_asm("lw", (ptr), __gu_val, __gu_err); \ break; \ + case 8: \ + __gu_err = __copy_from_user(&__gu_val, ptr, 8); \ + if (__gu_err) \ + __gu_err = -EFAULT; \ + break; \ default: \ /* __gu_val = 0; __gu_err = -EINVAL;*/ __gu_err = __user_bad();\ } \ diff --git a/arch/microblaze/kernel/reset.c b/arch/microblaze/kernel/reset.c index fcbe1daf6316..5f4722908164 100644 --- a/arch/microblaze/kernel/reset.c +++ b/arch/microblaze/kernel/reset.c @@ -8,83 +8,9 @@ */ #include +#include #include - -/* Trigger specific functions */ -#ifdef CONFIG_GPIOLIB - -#include - -static int handle; /* reset pin handle */ -static unsigned int reset_val; - -static int of_platform_reset_gpio_probe(void) -{ - int ret; - handle = of_get_named_gpio(of_find_node_by_path("/"), - "hard-reset-gpios", 0); - - if (!gpio_is_valid(handle)) { - pr_info("Skipping unavailable RESET gpio %d (%s)\n", - handle, "reset"); - return -ENODEV; - } - - ret = gpio_request(handle, "reset"); - if (ret < 0) { - pr_info("GPIO pin is already allocated\n"); - return ret; - } - - /* get current setup value */ - reset_val = gpio_get_value(handle); - /* FIXME maybe worth to perform any action */ - pr_debug("Reset: Gpio output state: 0x%x\n", reset_val); - - /* Setup GPIO as output */ - ret = gpio_direction_output(handle, 0); - if (ret < 0) - goto err; - - /* Setup output direction */ - gpio_set_value(handle, 0); - - pr_info("RESET: Registered gpio device: %d, current val: %d\n", - handle, reset_val); - return 0; -err: - gpio_free(handle); - return ret; -} -device_initcall(of_platform_reset_gpio_probe); - - -static void gpio_system_reset(void) -{ - if (gpio_is_valid(handle)) - gpio_set_value(handle, 1 - reset_val); - else - pr_notice("Reset GPIO unavailable - halting!\n"); -} -#else -static void gpio_system_reset(void) -{ - pr_notice("No reset GPIO present - halting!\n"); -} - -void of_platform_reset_gpio_probe(void) -{ - return; -} -#endif - -void machine_restart(char *cmd) -{ - pr_notice("Machine restart...\n"); - gpio_system_reset(); - while (1) - ; -} +#include void machine_shutdown(void) { @@ -106,3 +32,12 @@ void machine_power_off(void) while (1) ; } + +void machine_restart(char *cmd) +{ + do_kernel_restart(cmd); + /* Give the restart hook 1 s to take us down */ + mdelay(1000); + pr_emerg("Reboot failed -- System halted\n"); + while (1); +} diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index bc7042209c57..8c5f0c332d8b 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -4,217 +4,56 @@ * Copyright (C) 2010 Michal Simek * Copyright (C) 2010 PetaLogix * Copyright (C) 2005 John Williams - * - * Based on PowerPC version derived from arch/arm/mm/consistent.c - * Copyright (C) 2001 Dan Malek (dmalek@jlc.net) - * Copyright (C) 2000 Russell King */ -#include -#include -#include #include -#include #include #include -#include -#include #include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include #include - -#include -#include -#include -#include -#include -#include -#include #include -#include +#include -#ifndef CONFIG_MMU -/* I have to use dcache values because I can't relate on ram size */ -# define UNCACHED_SHADOW_MASK (cpuinfo.dcache_high - 
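machine_restart() on microblaze now walks the restart-handler chain via do_kernel_restart() instead of the board-specific GPIO code deleted above; the gpio-restart node added to system.dts plugs the same board into that chain through the generic driver. A hedged sketch of how any driver participates (the real gpio-restart driver also honours the active-delay/inactive-delay/wait-delay properties from the node):

#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_restart_handler(struct notifier_block *nb,
			      unsigned long mode, void *cmd)
{
	/* assert the board's reset line here; returning lets the
	 * chain try the next handler */
	return NOTIFY_DONE;
}

static struct notifier_block my_restart_nb = {
	.notifier_call	= my_restart_handler,
	.priority	= 128,	/* the conventional default band */
};

static int __init my_restart_init(void)
{
	return register_restart_handler(&my_restart_nb);
}
device_initcall(my_restart_init);

If no handler brings the machine down, the new machine_restart() falls through to the pr_emerg() message and halt loop after one second.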
cpuinfo.dcache_base + 1) -#endif - -/* - * Consistent memory allocators. Used for DMA devices that want to - * share uncached memory with the processor core. - * My crufty no-MMU approach is simple. In the HW platform we can optionally - * mirror the DDR up above the processor cacheable region. So, memory accessed - * in this mirror region will not be cached. It's alloced from the same - * pool as normal memory, but the handle we return is shifted up into the - * uncached region. This will no doubt cause big problems if memory allocated - * here is not also freed properly. -- JW - */ -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) +void arch_dma_prep_coherent(struct page *page, size_t size) { - unsigned long order, vaddr; - void *ret; - unsigned int i, err = 0; - struct page *page, *end; + phys_addr_t paddr = page_to_phys(page); -#ifdef CONFIG_MMU - phys_addr_t pa; - struct vm_struct *area; - unsigned long va; -#endif - - if (in_interrupt()) - BUG(); - - /* Only allocate page size areas. */ - size = PAGE_ALIGN(size); - order = get_order(size); - - vaddr = __get_free_pages(gfp | __GFP_ZERO, order); - if (!vaddr) - return NULL; - - /* - * we need to ensure that there are no cachelines in use, - * or worse dirty in this area. - */ - flush_dcache_range(virt_to_phys((void *)vaddr), - virt_to_phys((void *)vaddr) + size); + flush_dcache_range(paddr, paddr + size); +} #ifndef CONFIG_MMU - ret = (void *)vaddr; - /* - * Here's the magic! Note if the uncached shadow is not implemented, - * it's up to the calling code to also test that condition and make - * other arranegments, such as manually flushing the cache and so on. - */ -# ifdef CONFIG_XILINX_UNCACHED_SHADOW - ret = (void *)((unsigned) ret | UNCACHED_SHADOW_MASK); -# endif - if ((unsigned int)ret > cpuinfo.dcache_base && - (unsigned int)ret < cpuinfo.dcache_high) +/* + * Consistent memory allocators. Used for DMA devices that want to share + * uncached memory with the processor core. My crufty no-MMU approach is + * simple. In the HW platform we can optionally mirror the DDR up above the + * processor cacheable region. So, memory accessed in this mirror region will + * not be cached. It's alloced from the same pool as normal memory, but the + * handle we return is shifted up into the uncached region. This will no doubt + * cause big problems if memory allocated here is not also freed properly. -- JW + * + * I have to use dcache values because I can't rely on ram size: + */ +#ifdef CONFIG_XILINX_UNCACHED_SHADOW +#define UNCACHED_SHADOW_MASK (cpuinfo.dcache_high - cpuinfo.dcache_base + 1) +#else +#define UNCACHED_SHADOW_MASK 0 +#endif /* CONFIG_XILINX_UNCACHED_SHADOW */ + +void *uncached_kernel_address(void *ptr) +{ + unsigned long addr = (unsigned long)ptr; + + addr |= UNCACHED_SHADOW_MASK; + if (addr > cpuinfo.dcache_base && addr < cpuinfo.dcache_high) pr_warn("ERROR: Your cache coherent area is CACHED!!!\n"); - - /* dma_handle is same as physical (shadowed) address */ - *dma_handle = (dma_addr_t)ret; -#else - /* Allocate some common virtual space to map the new pages. */ - area = get_vm_area(size, VM_ALLOC); - if (!area) { - free_pages(vaddr, order); - return NULL; - } - va = (unsigned long) area->addr; - ret = (void *)va; - - /* This gives us the real physical address of the first page. */ - *dma_handle = pa = __virt_to_phys(vaddr); -#endif - - /* - * free wasted pages. We skip the first page since we know - * that it will have count = 1 and won't require freeing.
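uncached_kernel_address() and cached_kernel_address() above are pure address arithmetic: set the shadow-mask bits to reach the uncached mirror, clear them to get back. A self-contained round-trip illustration with an assumed mask value (the real mask is derived from the dcache window, as the new UNCACHED_SHADOW_MASK definition shows):

#include <assert.h>
#include <stdint.h>

#define SHADOW_MASK 0x80000000UL	/* assumed mirror offset */

static uintptr_t to_uncached(uintptr_t a) { return a | SHADOW_MASK; }
static uintptr_t to_cached(uintptr_t a)   { return a & ~SHADOW_MASK; }

int main(void)
{
	uintptr_t buf = 0x10002000UL;	/* mask bits clear, as required */

	assert(to_cached(to_uncached(buf)) == buf);
	return 0;
}

The round trip only holds when the cached address has the mask bits clear, which is why the new code warns if a supposedly uncached address still falls inside the dcache window.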
- * We also mark the pages in use as reserved so that - * remap_page_range works. - */ - page = virt_to_page(vaddr); - end = page + (1 << order); - - split_page(page, order); - - for (i = 0; i < size && err == 0; i += PAGE_SIZE) { -#ifdef CONFIG_MMU - /* MS: This is the whole magic - use cache inhibit pages */ - err = map_page(va + i, pa + i, _PAGE_KERNEL | _PAGE_NO_CACHE); -#endif - - SetPageReserved(page); - page++; - } - - /* Free the otherwise unused pages. */ - while (page < end) { - __free_page(page); - page++; - } - - if (err) { - free_pages(vaddr, order); - return NULL; - } - - return ret; + return (void *)addr; } -#ifdef CONFIG_MMU -static pte_t *consistent_virt_to_pte(void *vaddr) +void *cached_kernel_address(void *ptr) { - unsigned long addr = (unsigned long)vaddr; + unsigned long addr = (unsigned long)ptr; - return pte_offset_kernel(pmd_offset(pgd_offset_k(addr), addr), addr); -} - -long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, - dma_addr_t dma_addr) -{ - pte_t *ptep = consistent_virt_to_pte(vaddr); - - if (pte_none(*ptep) || !pte_present(*ptep)) - return 0; - - return pte_pfn(*ptep); -} -#endif - -/* - * free page(s) as defined by the above mapping. - */ -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, unsigned long attrs) -{ - struct page *page; - - if (in_interrupt()) - BUG(); - - size = PAGE_ALIGN(size); - -#ifndef CONFIG_MMU - /* Clear SHADOW_MASK bit in address, and free as per usual */ -# ifdef CONFIG_XILINX_UNCACHED_SHADOW - vaddr = (void *)((unsigned)vaddr & ~UNCACHED_SHADOW_MASK); -# endif - page = virt_to_page(vaddr); - - do { - __free_reserved_page(page); - page++; - } while (size -= PAGE_SIZE); -#else - do { - pte_t *ptep = consistent_virt_to_pte(vaddr); - unsigned long pfn; - - if (!pte_none(*ptep) && pte_present(*ptep)) { - pfn = pte_pfn(*ptep); - pte_clear(&init_mm, (unsigned int)vaddr, ptep); - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - __free_reserved_page(page); - } - } - vaddr += PAGE_SIZE; - } while (size -= PAGE_SIZE); - - /* flush tlb */ - flush_tlb_all(); -#endif + return (void *)(addr & ~UNCACHED_SHADOW_MASK); } +#endif /* CONFIG_MMU */ diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index 8fe54fda31dc..010bb9cee2e4 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -44,10 +44,6 @@ unsigned long ioremap_base; unsigned long ioremap_bot; EXPORT_SYMBOL(ioremap_bot); -#ifndef CONFIG_SMP -struct pgtable_cache_struct quicklists; -#endif - static void __iomem *__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) { diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index cc8e2b1032a5..a0bd9bdb5f83 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -5,7 +5,6 @@ config MIPS select ARCH_32BIT_OFF_T if !64BIT select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT select ARCH_CLOCKSOURCE_DATA - select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_SUPPORTS_UPROBES @@ -13,6 +12,7 @@ config MIPS select ARCH_USE_CMPXCHG_LOCKREF if 64BIT select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h index 436099883022..6f48649201c5 100644 --- a/arch/mips/include/asm/pci.h +++ b/arch/mips/include/asm/pci.h @@ -108,7 +108,6 @@ extern unsigned 
long PCIBIOS_MIN_MEM; #define HAVE_PCI_MMAP #define ARCH_GENERIC_PCI_MMAP_RESOURCE -#define HAVE_ARCH_PCI_RESOURCE_TO_USER /* * Dynamic DMA mapping stuff. diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index aa16b85ddffc..166842337eb2 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -54,7 +54,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #define __pte_free_tlb(tlb,pte,address) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) @@ -105,8 +105,6 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) #endif /* __PAGETABLE_PUD_FOLDED */ -#define check_pgt_cache() do { } while (0) - extern void pagetable_init(void); #endif /* _ASM_PGALLOC_H */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 4dca733d5076..f85bd5b15f51 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -661,9 +661,4 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - #endif /* _ASM_PGTABLE_H */ diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index aca909bd7841..fba18d4a9190 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -29,11 +29,6 @@ extern unsigned int vced_count, vcei_count; -/* - * MIPS does have an arch_pick_mmap_layout() - */ -#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 - #ifdef CONFIG_32BIT #ifdef CONFIG_KVM_GUEST /* User space process size is limited to 1GB in KVM Guest Mode */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index c2b40969eb1f..57dc2ac4f8bd 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -95,6 +95,9 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index d79f2b432318..00fe90c6db3e 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c @@ -20,33 +20,6 @@ unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ EXPORT_SYMBOL(shm_align_mask); -/* gap between mmap and stack */ -#define MIN_GAP (128*1024*1024UL) -#define MAX_GAP ((TASK_SIZE)/6*5) - -static int mmap_is_legacy(struct rlimit *rlim_stack) -{ - if (current->personality & ADDR_COMPAT_LAYOUT) - return 1; - - if (rlim_stack->rlim_cur == RLIM_INFINITY) - return 1; - - return sysctl_legacy_va_layout; -} - -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) -{ - unsigned long gap = rlim_stack->rlim_cur; - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(TASK_SIZE - gap - rnd); -} - #define COLOUR_ALIGN(addr, pgoff) \ ((((addr) + shm_align_mask) & ~shm_align_mask) + \ (((pgoff) << PAGE_SHIFT) & shm_align_mask)) @@ -144,63 +117,6 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, addr0, len, pgoff, flags, DOWN); } -unsigned long arch_mmap_rnd(void) -{ - unsigned long rnd; - -#ifdef CONFIG_COMPAT - if (TASK_IS_32BIT_ADDR) - rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); - else -#endif /* 
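MADV_COLD and MADV_PAGEOUT (added here for MIPS, and for parisc below) let a process deactivate, or ask immediate reclaim of, a range it knows it will not touch soon. A minimal user-space sketch; the fallback defines simply repeat the values from the uapi hunk in case the installed libc headers predate them:

#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_COLD
#define MADV_COLD	20	/* deactivate these pages */
#endif
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT	21	/* reclaim these pages */
#endif

int main(void)
{
	size_t len = 16 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	buf[0] = 1;			/* make at least one page resident */
	madvise(buf, len, MADV_COLD);	/* advice only; may be a no-op */
	madvise(buf, len, MADV_PAGEOUT);
	munmap(buf, len);
	return 0;
}

Both are advisory: the kernel may ignore them, so correct programs cannot depend on the pages actually being reclaimed.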
CONFIG_COMPAT */ - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); - - return rnd << PAGE_SHIFT; -} - -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) -{ - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - if (mmap_is_legacy(rlim_stack)) { - mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, rlim_stack); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } -} - -static inline unsigned long brk_rnd(void) -{ - unsigned long rnd = get_random_long(); - - rnd = rnd << PAGE_SHIFT; - /* 8MB for 32bit, 256MB for 64bit */ - if (TASK_IS_32BIT_ADDR) - rnd = rnd & 0x7ffffful; - else - rnd = rnd & 0xffffffful; - - return rnd; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long base = mm->brk; - unsigned long ret; - - ret = PAGE_ALIGN(base + brk_rnd()); - - if (ret < mm->brk) - return mm->brk; - - return ret; -} - bool __virt_addr_valid(const volatile void *kaddr) { unsigned long vaddr = (unsigned long)kaddr; diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h index e78b43d8389f..37125e6884d7 100644 --- a/arch/nds32/include/asm/pgalloc.h +++ b/arch/nds32/include/asm/pgalloc.h @@ -23,8 +23,6 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t * pgd); -#define check_pgt_cache() do { } while (0) - static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { pgtable_t pte; diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h index c70cc56bec09..0588ec99725c 100644 --- a/arch/nds32/include/asm/pgtable.h +++ b/arch/nds32/include/asm/pgtable.h @@ -403,8 +403,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; * into virtual address `from' */ -#define pgtable_cache_init() do { } while (0) - #endif /* !__ASSEMBLY__ */ #endif /* _ASMNDS32_PGTABLE_H */ diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index 4bc8cf72067e..0b146d773c85 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -41,10 +41,8 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) -#define check_pgt_cache() do { } while (0) - #endif /* _ASM_NIOS2_PGALLOC_H */ diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 95237b7f6fc1..99985d8b7166 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -291,8 +291,6 @@ static inline void pte_clear(struct mm_struct *mm, #include -#define pgtable_cache_init() do { } while (0) - extern void __init paging_init(void); extern void __init mmu_init(void); diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 3d4b397c2d06..da12a4c38c4b 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -75,7 +75,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm) if (!pte) return NULL; clear_page(page_address(pte)); - if (!pgtable_page_ctor(pte)) { + if (!pgtable_pte_page_ctor(pte)) { __free_page(pte); return NULL; } @@ -89,18 +89,16 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, struct page *pte) { - pgtable_page_dtor(pte); + 
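The brk randomization being deleted from MIPS follows the same pattern as the mmap-base code: it is subsumed by the generic layout code selected via ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT. A user-space sketch of the arithmetic the removed helpers performed, with rand() standing in for get_random_long() (illustration only, not a usable entropy source):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

static unsigned long brk_rnd(int is_32bit)
{
	unsigned long rnd = (unsigned long)rand() << PAGE_SHIFT;

	/* page-aligned offset below 8 MB (32-bit) or 256 MB (64-bit) */
	return rnd & (is_32bit ? 0x7ffffful : 0xffffffful);
}

static unsigned long randomize_brk(unsigned long base, int is_32bit)
{
	unsigned long ret = PAGE_ALIGN(base + brk_rnd(is_32bit));

	return ret < base ? base : ret;	/* never move brk backwards */
}

int main(void)
{
	printf("%#lx\n", randomize_brk(0x10000000UL, 1));
	return 0;
}

The "ret < base" check mirrors the deleted arch_randomize_brk(), which fell back to the unrandomized brk on overflow.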
pgtable_pte_page_dtor(pte); __free_page(pte); } #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) #define pmd_pgtable(pmd) pmd_page(pmd) -#define check_pgt_cache() do { } while (0) - #endif diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 2fe9ff5b5d6f..248d22d8faa7 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -443,11 +443,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #include -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - typedef pte_t *pte_addr_t; #endif /* __ASSEMBLY__ */ diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index 4f2059a50fae..d98647c29b74 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -124,6 +124,4 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) pmd_populate_kernel(mm, pmd, page_address(pte_page)) #define pmd_pgtable(pmd) pmd_page(pmd) -#define check_pgt_cache() do { } while (0) - #endif diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 6d58c1739b42..4ac374b3a99f 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -132,8 +132,6 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) #define PTRS_PER_PTE (1UL << BITS_PER_PTE) /* Definitions for 2nd level */ -#define pgtable_cache_init() do { } while (0) - #define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE) #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index c98162f494db..6fd8871e4081 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -48,6 +48,9 @@ #define MADV_DONTFORK 10 /* don't inherit across fork */ #define MADV_DOFORK 11 /* do inherit across fork */ +#define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ + #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 2372d35533ad..327567b8f7d6 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -112,8 +112,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file, unsigned long size, pgprot_t prot); -#define HAVE_ARCH_PCI_RESOURCE_TO_USER - extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose); extern void pcibios_setup_bus_devices(struct pci_bus *bus); extern void pcibios_setup_bus_self(struct pci_bus *bus); diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 2b2c60a1a66d..6dd78a2dc03a 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -64,8 +64,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) extern struct kmem_cache *pgtable_cache[]; #define PGT_CACHE(shift) pgtable_cache[shift] -static inline void check_pgt_cache(void) { } - #ifdef CONFIG_PPC_BOOK3S #include #else diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 8b7865a2d576..4053b2ab427c 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -87,7 +87,6 @@ extern unsigned long 
ioremap_bot; unsigned long vmalloc_to_phys(void *vmalloc_addr); void pgtable_cache_add(unsigned int shift); -void pgtable_cache_init(void); #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32) void mark_initmem_nx(void); diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 3410ea9f4de1..6c123760164e 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1748,7 +1748,7 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr, /* * IF we try to do a HUGE PTE update after a withdraw is done. * we will find the below NULL. This happens when we do - * split_huge_page_pmd + * split_huge_pmd */ if (!hpte_slot_array) return; diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index b056cae3388b..56cc84520577 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -129,11 +129,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, * Allow to use larger than 64k IOMMU pages. Only do that * if we are backed by hugetlb. */ - if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { - struct page *head = compound_head(page); - - pageshift = compound_order(head) + PAGE_SHIFT; - } + if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) + pageshift = page_shift(compound_head(page)); mem->pageshift = min(mem->pageshift, pageshift); /* * We don't need struct page reference any more, switch diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a8953f108808..73d4873fc7f8 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -667,7 +667,7 @@ void flush_dcache_icache_hugepage(struct page *page) BUG_ON(!PageCompound(page)); - for (i = 0; i < (1UL << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { if (!PageHighMem(page)) { __flush_dcache_icache(page_address(page+i)); } else { diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index a7b05214760c..ee4bd6d38602 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -25,7 +25,7 @@ void pte_frag_destroy(void *pte_frag) count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } } @@ -61,7 +61,7 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -113,7 +113,7 @@ void pte_fragment_free(unsigned long *table, int kernel) BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&page->pt_frag_refcount)) { if (!kernel) - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } } diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 065ff14b76e1..1d93e55a2de1 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -10,6 +10,8 @@ #include #include +#include +#include #include #include #include @@ -20,7 +22,6 @@ #include #include #include -#include #include #include @@ -30,7 +31,7 @@ #include "spufs.h" struct spufs_sb_info { - int debug; + bool debug; }; static struct kmem_cache *spufs_inode_cache; 
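The powerpc iommu and hugetlb hunks above switch from open-coded compound_order() arithmetic to page_shift() and compound_nr(). Simplified sketches of what those helpers evaluate to (the real definitions live in include/linux/mm.h):

/* number of constituent small pages: 1 for an order-0 page */
static inline unsigned long compound_nr_sketch(struct page *page)
{
	return 1UL << compound_order(page);
}

/* log2 of the byte size: PAGE_SHIFT for an order-0 page */
static inline unsigned int page_shift_sketch(struct page *page)
{
	return PAGE_SHIFT + compound_order(page);
}

So "compound_order(head) + PAGE_SHIFT" in the old iommu code and page_shift(compound_head(page)) in the new code are the same quantity.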
@@ -574,16 +575,27 @@ long spufs_create(struct path *path, struct dentry *dentry, } /* File system initialization */ -enum { - Opt_uid, Opt_gid, Opt_mode, Opt_debug, Opt_err, +struct spufs_fs_context { + kuid_t uid; + kgid_t gid; + umode_t mode; }; -static const match_table_t spufs_tokens = { - { Opt_uid, "uid=%d" }, - { Opt_gid, "gid=%d" }, - { Opt_mode, "mode=%o" }, - { Opt_debug, "debug" }, - { Opt_err, NULL }, +enum { + Opt_uid, Opt_gid, Opt_mode, Opt_debug, +}; + +static const struct fs_parameter_spec spufs_param_specs[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_u32oct ("mode", Opt_mode), + fsparam_u32 ("uid", Opt_uid), + fsparam_flag ("debug", Opt_debug), + {} +}; + +static const struct fs_parameter_description spufs_fs_parameters = { + .name = "spufs", + .specs = spufs_param_specs, }; static int spufs_show_options(struct seq_file *m, struct dentry *root) @@ -604,47 +616,41 @@ static int spufs_show_options(struct seq_file *m, struct dentry *root) return 0; } -static int -spufs_parse_options(struct super_block *sb, char *options, struct inode *root) +static int spufs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; + struct spufs_fs_context *ctx = fc->fs_private; + struct spufs_sb_info *sbi = fc->s_fs_info; + struct fs_parse_result result; + kuid_t uid; + kgid_t gid; + int opt; - while ((p = strsep(&options, ",")) != NULL) { - int token, option; + opt = fs_parse(fc, &spufs_fs_parameters, param, &result); + if (opt < 0) + return opt; - if (!*p) - continue; - - token = match_token(p, spufs_tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - return 0; - root->i_uid = make_kuid(current_user_ns(), option); - if (!uid_valid(root->i_uid)) - return 0; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return 0; - root->i_gid = make_kgid(current_user_ns(), option); - if (!gid_valid(root->i_gid)) - return 0; - break; - case Opt_mode: - if (match_octal(&args[0], &option)) - return 0; - root->i_mode = option | S_IFDIR; - break; - case Opt_debug: - spufs_get_sb_info(sb)->debug = 1; - break; - default: - return 0; - } + switch (opt) { + case Opt_uid: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + return invalf(fc, "Unknown uid"); + ctx->uid = uid; + break; + case Opt_gid: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + return invalf(fc, "Unknown gid"); + ctx->gid = gid; + break; + case Opt_mode: + ctx->mode = result.uint_32 & S_IALLUGO; + break; + case Opt_debug: + sbi->debug = true; + break; } - return 1; + + return 0; } static void spufs_exit_isolated_loader(void) @@ -678,79 +684,98 @@ spufs_init_isolated_loader(void) printk(KERN_INFO "spufs: SPU isolation mode enabled\n"); } -static int -spufs_create_root(struct super_block *sb, void *data) +static int spufs_create_root(struct super_block *sb, struct fs_context *fc) { + struct spufs_fs_context *ctx = fc->fs_private; struct inode *inode; - int ret; - ret = -ENODEV; if (!spu_management_ops) - goto out; + return -ENODEV; - ret = -ENOMEM; - inode = spufs_new_inode(sb, S_IFDIR | 0775); + inode = spufs_new_inode(sb, S_IFDIR | ctx->mode); if (!inode) - goto out; + return -ENOMEM; + inode->i_uid = ctx->uid; + inode->i_gid = ctx->gid; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; SPUFS_I(inode)->i_ctx = NULL; inc_nlink(inode); - ret = -EINVAL; - if (!spufs_parse_options(sb, data, inode)) - goto out_iput; - - ret = -ENOMEM; sb->s_root = 
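The spufs conversion here replaces ad-hoc match_table_t parsing with the new mount API: a typed fs_parameter_spec table, one spufs_parse_param() call per option, and typed results in fs_parse_result. A condensed sketch of the same pattern for a hypothetical filesystem (all "myfs" names are illustrative; error handling trimmed to the essentials):

#include <linux/cred.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>

enum { Opt_uid, Opt_mode, Opt_debug, };

struct myfs_fs_context {
	kuid_t uid;
	umode_t mode;
	bool debug;
};

static const struct fs_parameter_spec myfs_param_specs[] = {
	fsparam_u32	("uid",   Opt_uid),	/* "uid=1000"  */
	fsparam_u32oct	("mode",  Opt_mode),	/* "mode=0750" */
	fsparam_flag	("debug", Opt_debug),	/* bare flag   */
	{}
};

static const struct fs_parameter_description myfs_fs_parameters = {
	.name	= "myfs",
	.specs	= myfs_param_specs,
};

static int myfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct myfs_fs_context *ctx = fc->fs_private;
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, &myfs_fs_parameters, param, &result);
	if (opt < 0)
		return opt;	/* unknown or badly typed option */

	switch (opt) {
	case Opt_uid:
		ctx->uid = make_kuid(current_user_ns(), result.uint_32);
		if (!uid_valid(ctx->uid))
			return invalf(fc, "myfs: unknown uid");
		break;
	case Opt_mode:
		ctx->mode = result.uint_32 & 0777;
		break;
	case Opt_debug:
		ctx->debug = true;
		break;
	}
	return 0;
}

Mounting with "mount -t myfs -o uid=1000,debug none /mnt" would call myfs_parse_param() twice, each time with an already-split, already-type-checked parameter.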
d_make_root(inode); if (!sb->s_root) - goto out; - + return -ENOMEM; return 0; -out_iput: - iput(inode); -out: - return ret; } -static int -spufs_fill_super(struct super_block *sb, void *data, int silent) +static const struct super_operations spufs_ops = { + .alloc_inode = spufs_alloc_inode, + .free_inode = spufs_free_inode, + .statfs = simple_statfs, + .evict_inode = spufs_evict_inode, + .show_options = spufs_show_options, +}; + +static int spufs_fill_super(struct super_block *sb, struct fs_context *fc) { - struct spufs_sb_info *info; - static const struct super_operations s_ops = { - .alloc_inode = spufs_alloc_inode, - .free_inode = spufs_free_inode, - .statfs = simple_statfs, - .evict_inode = spufs_evict_inode, - .show_options = spufs_show_options, - }; - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = SPUFS_MAGIC; - sb->s_op = &s_ops; - sb->s_fs_info = info; + sb->s_op = &spufs_ops; - return spufs_create_root(sb, data); + return spufs_create_root(sb, fc); } -static struct dentry * -spufs_mount(struct file_system_type *fstype, int flags, - const char *name, void *data) +static int spufs_get_tree(struct fs_context *fc) { - return mount_single(fstype, flags, data, spufs_fill_super); + return get_tree_single(fc, spufs_fill_super); +} + +static void spufs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations spufs_context_ops = { + .free = spufs_free_fc, + .parse_param = spufs_parse_param, + .get_tree = spufs_get_tree, +}; + +static int spufs_init_fs_context(struct fs_context *fc) +{ + struct spufs_fs_context *ctx; + struct spufs_sb_info *sbi; + + ctx = kzalloc(sizeof(struct spufs_fs_context), GFP_KERNEL); + if (!ctx) + goto nomem; + + sbi = kzalloc(sizeof(struct spufs_sb_info), GFP_KERNEL); + if (!sbi) + goto nomem_ctx; + + ctx->uid = current_uid(); + ctx->gid = current_gid(); + ctx->mode = 0755; + + fc->s_fs_info = sbi; + fc->ops = &spufs_context_ops; + return 0; + +nomem_ctx: + kfree(ctx); +nomem: + return -ENOMEM; } static struct file_system_type spufs_type = { .owner = THIS_MODULE, .name = "spufs", - .mount = spufs_mount, + .init_fs_context = spufs_init_fs_context, + .parameters = &spufs_fs_parameters, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("spufs"); diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 71d29fb4008a..8eebbc8860bb 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -59,6 +59,18 @@ config RISCV select ARCH_HAS_GIGANTIC_PAGE select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select SPARSEMEM_STATIC if 32BIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU + select HAVE_ARCH_MMAP_RND_BITS + +config ARCH_MMAP_RND_BITS_MIN + default 18 if 64BIT + default 8 + +# max bits determined by the following formula: +# VA_BITS - PAGE_SHIFT - 3 +config ARCH_MMAP_RND_BITS_MAX + default 24 if 64BIT # SV39 based + default 17 config MMU def_bool y diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 56a67d66f72f..d59ea92285ec 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -78,12 +78,8 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #define __pte_free_tlb(tlb, pte, buf) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ } while (0) -static inline void check_pgt_cache(void) -{ -} - #endif /* _ASM_RISCV_PGALLOC_H */ diff --git 
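The maximum randomization widths in the new RISC-V Kconfig entries check out against the stated formula VA_BITS - PAGE_SHIFT - 3: with Sv39's 39 virtual-address bits and 4 KiB pages that is 39 - 12 - 3 = 24, and a 32-bit layout gives 32 - 12 - 3 = 17 (taking the Sv39 and 32-bit address widths as the intended inputs). The arithmetic as a compile-time check:

#define MMAP_RND_BITS_MAX(va_bits, page_shift) ((va_bits) - (page_shift) - 3)

_Static_assert(MMAP_RND_BITS_MAX(39, 12) == 24, "64BIT / Sv39 default");
_Static_assert(MMAP_RND_BITS_MAX(32, 12) == 17, "32-bit default");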
a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 80905b27ee98..c60123f018f5 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -424,11 +424,6 @@ extern void *dtb_early_va; extern void setup_bootmem(void); extern void paging_init(void); -static inline void pgtable_cache_init(void) -{ - /* No page table caches to initialize */ -} - #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index a4418fc425b8..70139d0791b6 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -12,17 +12,17 @@ #include #include #include +#include +#include #include #include #include #include #include -#include #include #include #include #include -#include #include #include #include "hypfs.h" @@ -207,52 +207,44 @@ static int hypfs_release(struct inode *inode, struct file *filp) return 0; } -enum { opt_uid, opt_gid, opt_err }; +enum { Opt_uid, Opt_gid, }; -static const match_table_t hypfs_tokens = { - {opt_uid, "uid=%u"}, - {opt_gid, "gid=%u"}, - {opt_err, NULL} +static const struct fs_parameter_spec hypfs_param_specs[] = { + fsparam_u32("gid", Opt_gid), + fsparam_u32("uid", Opt_uid), + {} }; -static int hypfs_parse_options(char *options, struct super_block *sb) +static const struct fs_parameter_description hypfs_fs_parameters = { + .name = "hypfs", + .specs = hypfs_param_specs, +}; + +static int hypfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *str; - substring_t args[MAX_OPT_ARGS]; + struct hypfs_sb_info *hypfs_info = fc->s_fs_info; + struct fs_parse_result result; kuid_t uid; kgid_t gid; + int opt; - if (!options) - return 0; - while ((str = strsep(&options, ",")) != NULL) { - int token, option; - struct hypfs_sb_info *hypfs_info = sb->s_fs_info; + opt = fs_parse(fc, &hypfs_fs_parameters, param, &result); + if (opt < 0) + return opt; - if (!*str) - continue; - token = match_token(str, hypfs_tokens, args); - switch (token) { - case opt_uid: - if (match_int(&args[0], &option)) - return -EINVAL; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) - return -EINVAL; - hypfs_info->uid = uid; - break; - case opt_gid: - if (match_int(&args[0], &option)) - return -EINVAL; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) - return -EINVAL; - hypfs_info->gid = gid; - break; - case opt_err: - default: - pr_err("%s is not a valid mount option\n", str); - return -EINVAL; - } + switch (opt) { + case Opt_uid: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + return invalf(fc, "Unknown uid"); + hypfs_info->uid = uid; + break; + case Opt_gid: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + return invalf(fc, "Unknown gid"); + hypfs_info->gid = gid; + break; } return 0; } @@ -266,26 +258,18 @@ static int hypfs_show_options(struct seq_file *s, struct dentry *root) return 0; } -static int hypfs_fill_super(struct super_block *sb, void *data, int silent) +static int hypfs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct hypfs_sb_info *sbi = sb->s_fs_info; struct inode *root_inode; - struct dentry *root_dentry; - int rc = 0; - struct hypfs_sb_info *sbi; + struct dentry *root_dentry, *update_file; + int rc; - sbi = kzalloc(sizeof(struct hypfs_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - mutex_init(&sbi->lock); - sbi->uid = current_uid(); - sbi->gid = current_gid(); - 
sb->s_fs_info = sbi; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = HYPFS_MAGIC; sb->s_op = &hypfs_s_ops; - if (hypfs_parse_options(data, sb)) - return -EINVAL; + root_inode = hypfs_make_inode(sb, S_IFDIR | 0755); if (!root_inode) return -ENOMEM; @@ -300,18 +284,46 @@ static int hypfs_fill_super(struct super_block *sb, void *data, int silent) rc = hypfs_diag_create_files(root_dentry); if (rc) return rc; - sbi->update_file = hypfs_create_update_file(root_dentry); - if (IS_ERR(sbi->update_file)) - return PTR_ERR(sbi->update_file); + update_file = hypfs_create_update_file(root_dentry); + if (IS_ERR(update_file)) + return PTR_ERR(update_file); + sbi->update_file = update_file; hypfs_update_update(sb); pr_info("Hypervisor filesystem mounted\n"); return 0; } -static struct dentry *hypfs_mount(struct file_system_type *fst, int flags, - const char *devname, void *data) +static int hypfs_get_tree(struct fs_context *fc) { - return mount_single(fst, flags, data, hypfs_fill_super); + return get_tree_single(fc, hypfs_fill_super); +} + +static void hypfs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations hypfs_context_ops = { + .free = hypfs_free_fc, + .parse_param = hypfs_parse_param, + .get_tree = hypfs_get_tree, +}; + +static int hypfs_init_fs_context(struct fs_context *fc) +{ + struct hypfs_sb_info *sbi; + + sbi = kzalloc(sizeof(struct hypfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + mutex_init(&sbi->lock); + sbi->uid = current_uid(); + sbi->gid = current_gid(); + + fc->s_fs_info = sbi; + fc->ops = &hypfs_context_ops; + return 0; } static void hypfs_kill_super(struct super_block *sb) @@ -442,7 +454,8 @@ static const struct file_operations hypfs_file_ops = { static struct file_system_type hypfs_type = { .owner = THIS_MODULE, .name = "s390_hypfs", - .mount = hypfs_mount, + .init_fs_context = hypfs_init_fs_context, + .parameters = &hypfs_fs_parameters, .kill_sb = hypfs_kill_super }; diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index ae3e3221d4b5..ceeb552d3472 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -70,7 +70,7 @@ struct hws_qsi_info_block { /* Bit(s) */ unsigned long tear; /* 24-31: TEAR contents */ unsigned long dear; /* 32-39: DEAR contents */ unsigned int rsvrd0; /* 40-43: reserved */ - unsigned int cpu_speed; /* 44-47: CPU speed */ + unsigned int cpu_speed; /* 44-47: CPU speed */ unsigned long long rsvrd1; /* 48-55: reserved */ unsigned long long rsvrd2; /* 56-63: reserved */ } __packed; @@ -89,10 +89,10 @@ struct hws_lsctl_request_block { unsigned long tear; /* 16-23: TEAR contents */ unsigned long dear; /* 24-31: DEAR contents */ /* 32-63: */ - unsigned long rsvrd1; /* reserved */ - unsigned long rsvrd2; /* reserved */ - unsigned long rsvrd3; /* reserved */ - unsigned long rsvrd4; /* reserved */ + unsigned long rsvrd1; /* reserved */ + unsigned long rsvrd2; /* reserved */ + unsigned long rsvrd3; /* reserved */ + unsigned long rsvrd4; /* reserved */ } __packed; struct hws_basic_entry { diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index 560d8f766ddf..4652ffffe0b2 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -60,6 +60,7 @@ struct perf_sf_sde_regs { #define PERF_CPUM_SF_MODE_MASK (PERF_CPUM_SF_BASIC_MODE| \ PERF_CPUM_SF_DIAG_MODE) #define PERF_CPUM_SF_FULL_BLOCKS 0x0004 /* Process full SDBs only */ +#define PERF_CPUM_SF_FREQ_MODE 0x0008 /* 
Sampling with frequency */ #define REG_NONE 0 #define REG_OVERFLOW 1 @@ -70,5 +71,6 @@ struct perf_sf_sde_regs { #define SAMPL_FLAGS(hwc) ((hwc)->config_base) #define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE) #define SDB_FULL_BLOCKS(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FULL_BLOCKS) +#define SAMPLE_FREQ_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FREQ_MODE) #endif /* _ASM_S390_PERF_EVENT_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0c4600725fc2..36c578c0ff96 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1682,12 +1682,6 @@ extern void s390_reset_cmma(struct mm_struct *mm); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -/* - * No page table caches to initialise - */ -static inline void pgtable_cache_init(void) { } -static inline void check_pgt_cache(void) { } - #include #endif /* _S390_PAGE_H */ diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h index 8c5755f41dde..f9e5e1f0821d 100644 --- a/arch/s390/include/uapi/asm/zcrypt.h +++ b/arch/s390/include/uapi/asm/zcrypt.h @@ -4,7 +4,7 @@ * * zcrypt 2.2.1 (user-visible header) * - * Copyright IBM Corp. 2001, 2018 + * Copyright IBM Corp. 2001, 2019 * Author(s): Robert Burroughs * Eric Rossman (edrossma@us.ibm.com) * @@ -286,7 +286,7 @@ struct zcrypt_device_matrix_ext { * 0x08: CEX3A * 0x0a: CEX4 * 0x0b: CEX5 - * 0x0c: CEX6 + * 0x0c: CEX6 and CEX7 * 0x0d: device is disabled * * ZCRYPT_QDEPTH_MASK diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 292a452cd1f3..544a02e944c6 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -673,13 +673,89 @@ static void cpumsf_output_event_pid(struct perf_event *event, rcu_read_unlock(); } +static unsigned long getrate(bool freq, unsigned long sample, + struct hws_qsi_info_block *si) +{ + unsigned long rate; + + if (freq) { + rate = freq_to_sample_rate(si, sample); + rate = hw_limit_rate(si, rate); + } else { + /* The min/max sampling rates specify the valid range + * of sample periods. If the specified sample period is + * out of range, limit the period to the range boundary. + */ + rate = hw_limit_rate(si, sample); + + /* The perf core maintains a maximum sample rate that is + * configurable through the sysctl interface. Ensure the + * sampling rate does not exceed this value. This also helps + * to avoid throttling when pushing samples with + * perf_event_overflow(). + */ + if (sample_rate_to_freq(si, rate) > + sysctl_perf_event_sample_rate) { + debug_sprintf_event(sfdbg, 1, + "Sampling rate exceeds maximum " + "perf sample rate\n"); + rate = 0; + } + } + return rate; +} + +/* The sampling information (si) contains information about the + * min/max sampling intervals and the CPU speed. So calculate the + * correct sampling interval and avoid the whole period adjust + * feedback loop. + * + * Since the CPU Measurement sampling facility cannot handle frequency, + * calculate the sampling interval when frequency is specified using + * this formula: + * interval := cpu_speed * 1000000 / sample_freq + * + * Returns errno on bad input and zero on success, with the sampling + * interval set to the correct rate. + * + * Note: This function turns off the freq bit to avoid calling + * perf_adjust_period(), which would perform frequency adjustment in the + * common code and cause tremendous variations in the counter values.
+ */ +static int __hw_perf_event_init_rate(struct perf_event *event, + struct hws_qsi_info_block *si) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + unsigned long rate; + + if (attr->freq) { + if (!attr->sample_freq) + return -EINVAL; + rate = getrate(attr->freq, attr->sample_freq, si); + attr->freq = 0; /* Don't call perf_adjust_period() */ + SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FREQ_MODE; + } else { + rate = getrate(attr->freq, attr->sample_period, si); + if (!rate) + return -EINVAL; + } + attr->sample_period = rate; + SAMPL_RATE(hwc) = rate; + hw_init_period(hwc, SAMPL_RATE(hwc)); + debug_sprintf_event(sfdbg, 4, "__hw_perf_event_init_rate:" + "cpu:%d period:%llx freq:%d,%#lx\n", event->cpu, + event->attr.sample_period, event->attr.freq, + SAMPLE_FREQ_MODE(hwc)); + return 0; +} + static int __hw_perf_event_init(struct perf_event *event) { struct cpu_hw_sf *cpuhw; struct hws_qsi_info_block si; struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; - unsigned long rate; int cpu, err; /* Reserve CPU-measurement sampling facility */ @@ -745,43 +821,9 @@ static int __hw_perf_event_init(struct perf_event *event) if (attr->config1 & PERF_CPUM_SF_FULL_BLOCKS) SAMPL_FLAGS(hwc) |= PERF_CPUM_SF_FULL_BLOCKS; - /* The sampling information (si) contains information about the - * min/max sampling intervals and the CPU speed. So calculate the - * correct sampling interval and avoid the whole period adjust - * feedback loop. - */ - rate = 0; - if (attr->freq) { - if (!attr->sample_freq) { - err = -EINVAL; - goto out; - } - rate = freq_to_sample_rate(&si, attr->sample_freq); - rate = hw_limit_rate(&si, rate); - attr->freq = 0; - attr->sample_period = rate; - } else { - /* The min/max sampling rates specifies the valid range - * of sample periods. If the specified sample period is - * out of range, limit the period to the range boundary. - */ - rate = hw_limit_rate(&si, hwc->sample_period); - - /* The perf core maintains a maximum sample rate that is - * configurable through the sysctl interface. Ensure the - * sampling rate does not exceed this value. This also helps - * to avoid throttling when pushing samples with - * perf_event_overflow(). - */ - if (sample_rate_to_freq(&si, rate) > - sysctl_perf_event_sample_rate) { - err = -EINVAL; - debug_sprintf_event(sfdbg, 1, "Sampling rate exceeds maximum perf sample rate\n"); - goto out; - } - } - SAMPL_RATE(hwc) = rate; - hw_init_period(hwc, SAMPL_RATE(hwc)); + err = __hw_perf_event_init_rate(event, &si); + if (err) + goto out; /* Initialize sample data overflow accounting */ hwc->extra_reg.reg = REG_OVERFLOW; @@ -904,6 +946,8 @@ static void cpumsf_pmu_enable(struct pmu *pmu) if (sfb_has_pending_allocs(&cpuhw->sfb, hwc)) extend_sampling_buffer(&cpuhw->sfb, hwc); } + /* Rate may be adjusted with ioctl() */ + cpuhw->lsctl.interval = SAMPL_RATE(&cpuhw->event->hw); } /* (Re)enable the PMU and sampling facility */ @@ -922,8 +966,9 @@ static void cpumsf_pmu_enable(struct pmu *pmu) lpp(&S390_lowcore.lpp); debug_sprintf_event(sfdbg, 6, "pmu_enable: es=%i cs=%i ed=%i cd=%i " - "tear=%p dear=%p\n", cpuhw->lsctl.es, - cpuhw->lsctl.cs, cpuhw->lsctl.ed, cpuhw->lsctl.cd, + "interval:%lx tear=%p dear=%p\n", + cpuhw->lsctl.es, cpuhw->lsctl.cs, cpuhw->lsctl.ed, + cpuhw->lsctl.cd, cpuhw->lsctl.interval, (void *) cpuhw->lsctl.tear, (void *) cpuhw->lsctl.dear); } @@ -1717,6 +1762,44 @@ static void cpumsf_pmu_read(struct perf_event *event) /* Nothing to do ... 
updates are interrupt-driven */ } +/* Check if the new sampling period/frequency is appropriate. + * + * Return non-zero on error and zero if the checks pass. + */ +static int cpumsf_pmu_check_period(struct perf_event *event, u64 value) +{ + struct hws_qsi_info_block si; + unsigned long rate; + bool do_freq; + + memset(&si, 0, sizeof(si)); + if (event->cpu == -1) { + if (qsi(&si)) + return -ENODEV; + } else { + /* Event is pinned to a particular CPU, retrieve the per-CPU + * sampling structure for accessing the CPU-specific QSI. + */ + struct cpu_hw_sf *cpuhw = &per_cpu(cpu_hw_sf, event->cpu); + + si = cpuhw->qsi; + } + + do_freq = !!SAMPLE_FREQ_MODE(&event->hw); + rate = getrate(do_freq, value, &si); + if (!rate) + return -EINVAL; + + event->attr.sample_period = rate; + SAMPL_RATE(&event->hw) = rate; + hw_init_period(&event->hw, SAMPL_RATE(&event->hw)); + debug_sprintf_event(sfdbg, 4, "cpumsf_pmu_check_period:" "cpu:%d value:%llx period:%llx freq:%d\n", + event->cpu, value, + event->attr.sample_period, do_freq); + return 0; +} + /* Activate sampling control. * Next call of pmu_enable() starts sampling. */ @@ -1908,6 +1991,8 @@ static struct pmu cpumf_sampling = { .setup_aux = aux_buffer_setup, .free_aux = aux_buffer_free, + + .check_period = cpumsf_pmu_check_period, }; static void cpumf_measurement_alert(struct ext_code ext_code, diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 2db6fb405a9a..3627953007ed 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -311,7 +311,8 @@ int arch_update_cpu_topology(void) on_each_cpu(__arch_update_dedicated_flag, NULL, 0); for_each_online_cpu(cpu) { dev = get_cpu_device(cpu); - kobject_uevent(&dev->kobj, KOBJ_CHANGE); + if (dev) + kobject_uevent(&dev->kobj, KOBJ_CHANGE); } return rc; } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 54fcdf66ae96..3dd253f81a77 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -210,7 +210,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) page = alloc_page(GFP_KERNEL); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -256,7 +256,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) atomic_xor_bits(&page->_refcount, 3U << 24); } - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } @@ -308,7 +308,7 @@ void __tlb_remove_table(void *_table) case 3: /* 4K page table with pgstes */ if (mask & 3) atomic_xor_bits(&page->_refcount, 3 << 24); - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); break; } diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index b56f908b1395..22d968bfe9bb 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -2,10 +2,8 @@ #ifndef __ASM_SH_PGALLOC_H #define __ASM_SH_PGALLOC_H -#include #include - -#define QUICK_PT 0 /* Other page table pages that are zero on free */ +#include extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); @@ -29,44 +27,9 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, } #define pmd_pgtable(pmd) pmd_page(pmd) -/* - * Allocate and free page tables. 
- */ -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - struct page *page; - void *pg; - - pg = quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); - if (!pg) - return NULL; - page = virt_to_page(pg); - if (!pgtable_page_ctor(page)) { - quicklist_free(QUICK_PT, NULL, pg); - return NULL; - } - return page; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - quicklist_free(QUICK_PT, NULL, pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t pte) -{ - pgtable_page_dtor(pte); - quicklist_free_page(QUICK_PT, NULL, pte); -} - #define __pte_free_tlb(tlb,pte,addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) @@ -79,9 +42,4 @@ do { \ } while (0); #endif -static inline void check_pgt_cache(void) -{ - quicklist_trim(QUICK_PT, NULL, 25, 16); -} - #endif /* __ASM_SH_PGALLOC_H */ diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 9085d1142fa3..cbd0f3c55a0c 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -123,11 +123,6 @@ typedef pte_t *pte_addr_t; #define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) -/* - * Initialise the page table caches - */ -extern void pgtable_cache_init(void); - struct vm_area_struct; struct mm_struct; diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 02ed2df25a54..5c8a2ebfc720 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -1,9 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 menu "Memory management options" -config QUICKLIST - def_bool y - config MMU bool "Support for memory management hardware" depends on !CPU_SH2 diff --git a/arch/sh/mm/nommu.c b/arch/sh/mm/nommu.c index cc779a90d917..dca946f426c6 100644 --- a/arch/sh/mm/nommu.c +++ b/arch/sh/mm/nommu.c @@ -97,7 +97,3 @@ void __init page_table_range_init(unsigned long start, unsigned long end, void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { } - -void pgtable_cache_init(void) -{ -} diff --git a/arch/sparc/include/asm/pci.h b/arch/sparc/include/asm/pci.h index cfec79bb1831..4deddf430e5d 100644 --- a/arch/sparc/include/asm/pci.h +++ b/arch/sparc/include/asm/pci.h @@ -38,8 +38,6 @@ static inline int pci_proc_domain(struct pci_bus *bus) #define arch_can_pci_mmap_io() 1 #define HAVE_ARCH_PCI_GET_UNMAPPED_AREA #define get_pci_unmapped_area get_fb_unmapped_area - -#define HAVE_ARCH_PCI_RESOURCE_TO_USER #endif /* CONFIG_SPARC64 */ #if defined(CONFIG_SPARC64) || defined(CONFIG_LEON_PCI) diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h index 282be50a4adf..10538a4d1a1e 100644 --- a/arch/sparc/include/asm/pgalloc_32.h +++ b/arch/sparc/include/asm/pgalloc_32.h @@ -17,8 +17,6 @@ void srmmu_free_nocache(void *addr, int size); extern struct resource sparc_iomap; -#define check_pgt_cache() do { } while (0) - pgd_t *get_pgd_fast(void); static inline void free_pgd_fast(pgd_t *pgd) { diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 48abccba4991..9d3e5cc95bbb 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h @@ -69,8 +69,6 @@ void pte_free(struct mm_struct *mm, pgtable_t ptepage); #define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) #define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD)) -#define check_pgt_cache() do { } while (0) - void pgtable_free(void 
*table, bool is_page); #ifdef CONFIG_SMP diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 4eebed6c6781..31da44826645 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -445,9 +445,4 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, /* We provide our own get_unmapped_area to cope with VA holes for userland */ #define HAVE_ARCH_UNMAPPED_AREA -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - #endif /* !(_SPARC_PGTABLE_H) */ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 1599de730532..6ae8016ef4ec 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1078,7 +1078,7 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, } #define io_remap_pfn_range io_remap_pfn_range -static inline unsigned long untagged_addr(unsigned long start) +static inline unsigned long __untagged_addr(unsigned long start) { if (adi_capable()) { long addr = start; @@ -1098,7 +1098,8 @@ static inline unsigned long untagged_addr(unsigned long start) return start; } -#define untagged_addr untagged_addr +#define untagged_addr(addr) \ + ((__typeof__(addr))(__untagged_addr((unsigned long)(addr)))) static inline bool pte_access_permitted(pte_t pte, bool write) { @@ -1135,7 +1136,6 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long, unsigned long); #define HAVE_ARCH_FB_UNMAPPED_AREA -void pgtable_cache_init(void); void sun4v_register_fault_status(void); void sun4v_ktsb_register(void); void __init cheetah_ecache_flush_init(void); diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 046ab116cc8c..906eda1158b4 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -31,7 +31,6 @@ #include #include #include -#include /* bug in asm-generic/tlb.h: check_pgt_cache */ #include #include #include diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 4b099dd7a767..e6d91819da92 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2903,7 +2903,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) return NULL; - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { free_unref_page(page); return NULL; } @@ -2919,7 +2919,7 @@ static void __pte_free(pgtable_t pte) { struct page *page = virt_to_page(pte); - pgtable_page_dtor(page); + pgtable_pte_page_dtor(page); __free_page(page); } diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index aaebbc00d262..cc3ad64479ac 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -378,7 +378,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) if ((pte = (unsigned long)pte_alloc_one_kernel(mm)) == 0) return NULL; page = pfn_to_page(__nocache_pa(pte) >> PAGE_SHIFT); - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -389,7 +389,7 @@ void pte_free(struct mm_struct *mm, pgtable_t pte) { unsigned long p; - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); p = (unsigned long)page_address(pte); /* Cached address (for test) */ if (p == 0) BUG(); diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 023599c3fa51..881e76da1938 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -29,7 +29,7 @@ extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); #define 
__pte_free_tlb(tlb,pte, address) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb),(pte)); \ } while (0) @@ -43,7 +43,5 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x)) #endif -#define check_pgt_cache() do { } while (0) - #endif diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index e4d3ed980d82..36a44d58f373 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -32,8 +32,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* zero page used for uninitialized stuff */ extern unsigned long *empty_zero_page; -#define pgtable_cache_init() do ; while (0) - /* Just any arbitrary offset to the start of the vmalloc VM area: the * current 8MB value just means that there will be a 8MB "hole" after the * physical memory until the kernel virtual memory starts. That means that diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index 3f0903bd98e9..ba1c9a79993b 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h @@ -18,8 +18,6 @@ #define __HAVE_ARCH_PTE_ALLOC_ONE #include -#define check_pgt_cache() do { } while (0) - #define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) #define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h index 126e961a8cb0..c8f7ba12f309 100644 --- a/arch/unicore32/include/asm/pgtable.h +++ b/arch/unicore32/include/asm/pgtable.h @@ -285,8 +285,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; #include -#define pgtable_cache_init() do { } while (0) - #endif /* !__ASSEMBLY__ */ #endif /* __UNICORE_PGTABLE_H__ */ diff --git a/arch/unicore32/include/asm/tlb.h b/arch/unicore32/include/asm/tlb.h index 10d2356bfddd..4663d8cc80ef 100644 --- a/arch/unicore32/include/asm/tlb.h +++ b/arch/unicore32/include/asm/tlb.h @@ -15,7 +15,7 @@ #define __pte_free_tlb(tlb, pte, addr) \ do { \ - pgtable_page_dtor(pte); \ + pgtable_pte_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ } while (0) diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index c78da8eda8f2..0dca7f7aeff2 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -29,8 +29,6 @@ extern pgd_t swapper_pg_dir[1024]; extern pgd_t initial_page_table[1024]; extern pmd_t initial_pg_pmd[]; -static inline void pgtable_cache_init(void) { } -static inline void check_pgt_cache(void) { } void paging_init(void); void sync_initial_page_table(void); diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 4990d26dfc73..0b6c4042942a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -241,9 +241,6 @@ extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define pgtable_cache_init() do { } while (0) -#define check_pgt_cache() do { } while (0) - #define PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index fa16036fa592..65ebe4b88f7c 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -54,23 +54,10 @@ static u64 get_subtree_max_end(struct rb_node *node) return ret; } -static u64 compute_subtree_max_end(struct memtype *data) -{ - u64 max_end = data->end, child_max_end; +#define NODE_END(node) ((node)->end) - child_max_end = 
get_subtree_max_end(data->rb.rb_right); - if (child_max_end > max_end) - max_end = child_max_end; - - child_max_end = get_subtree_max_end(data->rb.rb_left); - if (child_max_end > max_end) - max_end = child_max_end; - - return max_end; -} - -RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, - u64, subtree_max_end, compute_subtree_max_end) +RB_DECLARE_CALLBACKS_MAX(static, memtype_rb_augment_cb, + struct memtype, rb, u64, subtree_max_end, NODE_END) /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 44816ff6411f..3e4b9035bb9a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -45,7 +45,7 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); paravirt_tlb_remove_table(tlb, pte); } @@ -357,7 +357,7 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, static struct kmem_cache *pgd_cache; -void __init pgd_cache_init(void) +void __init pgtable_cache_init(void) { /* * When PAE kernel is running as a Xen domain, it does not use @@ -402,10 +402,6 @@ static inline void _pgd_free(pgd_t *pgd) } #else -void __init pgd_cache_init(void) -{ -} - static inline pgd_t *_pgd_alloc(void) { return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index dd744aa450fa..1d38f0e755ba 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -55,7 +55,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) if (!pte) return NULL; page = virt_to_page(pte); - if (!pgtable_page_ctor(page)) { + if (!pgtable_pte_page_ctor(page)) { __free_page(page); return NULL; } @@ -69,7 +69,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static inline void pte_free(struct mm_struct *mm, pgtable_t pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); __free_page(pte); } #define pmd_pgtable(pmd) pmd_page(pmd) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index ce3ff5e591b9..3f7fe5a8c286 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -238,7 +238,6 @@ extern void paging_init(void); # define swapper_pg_dir NULL static inline void paging_init(void) { } #endif -static inline void pgtable_cache_init(void) { } /* * The pmd contains the kernel virtual address of the pte page. diff --git a/arch/xtensa/include/asm/tlbflush.h b/arch/xtensa/include/asm/tlbflush.h index 06875feb27c2..856e2da2e397 100644 --- a/arch/xtensa/include/asm/tlbflush.h +++ b/arch/xtensa/include/asm/tlbflush.h @@ -160,9 +160,6 @@ static inline void invalidate_dtlb_mapping (unsigned address) invalidate_dtlb_entry(tlb_entry); } -#define check_pgt_cache() do { } while (0) - - /* * DO NOT USE THESE FUNCTIONS. These instructions aren't part of the Xtensa * ISA and exist only for test purposes.. 
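Editor's note on the two RB_DECLARE_CALLBACKS_MAX conversions in this series (pat_rbtree above, drbd_interval further below): the macro now generates the subtree-max propagation callbacks from a single per-node expression, replacing the hand-rolled compute_subtree_*() helpers. A minimal sketch of the pattern, assuming the v5.4 rbtree_augmented.h API; the demo_* names are illustrative and not part of any patch here:

#include <linux/rbtree_augmented.h>

struct demo_interval {
	struct rb_node rb;
	unsigned long start;
	unsigned long size;
	unsigned long subtree_max_end;	/* augmented: max end in this subtree */
};

/* Per-node "end" expression handed to RB_DECLARE_CALLBACKS_MAX. */
#define DEMO_NODE_END(node) ((node)->start + (node)->size)

/* Generates demo_cb (a struct rb_augment_callbacks) plus the propagate,
 * copy and rotate helpers that keep subtree_max_end up to date. */
RB_DECLARE_CALLBACKS_MAX(static, demo_cb, struct demo_interval, rb,
			 unsigned long, subtree_max_end, DEMO_NODE_END)

static void demo_insert(struct demo_interval *new, struct rb_root *root)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	unsigned long end = DEMO_NODE_END(new);

	/* Walk down, keeping each ancestor's cached maximum correct. */
	while (*link) {
		struct demo_interval *cur =
			rb_entry(*link, struct demo_interval, rb);

		parent = *link;
		if (cur->subtree_max_end < end)
			cur->subtree_max_end = end;
		link = new->start < cur->start ? &parent->rb_left
					       : &parent->rb_right;
	}

	new->subtree_max_end = end;
	rb_link_node(&new->rb, parent, link);
	rb_insert_augmented(&new->rb, root, &demo_cb);
}

The same shape covers both call sites in this series: only the struct, the augmented field type, and the NODE_END() expression differ.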
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index ebbb48842190..e5e643752947 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -103,6 +103,9 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index b33be928d164..0319d6339822 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2016,7 +2016,7 @@ static void bfq_add_request(struct request *rq) (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + - msecs_to_jiffies(100))) { + msecs_to_jiffies(10))) { bfqd->last_empty_occupied_ns = ktime_get_ns(); /* * Start the state machine for measuring the * total service time of rq: setting * wait_dispatch will cause bfqd->waited_rq to * be set when rq will be dispatched. */ bfqd->wait_dispatch = true; - bfqd->rqs_injected = false; + /* + * If there is no I/O in service in the drive, + * then any injection that occurred before the + * arrival of rq will not affect the total + * service time of rq. So the injection limit + * must not be updated as a function of such + * total service time, unless new injection + * occurs before rq is completed. To have the + * injection limit updated only in the latter + * case, reset rqs_injected here (rqs_injected + * will be set in case injection is performed + * on bfqq before rq is completed). + */ + if (bfqd->rq_in_driver == 0) + bfqd->rqs_injected = false; } } @@ -5784,14 +5798,14 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; unsigned int old_limit = bfqq->inject_limit; - if (bfqq->last_serv_time_ns > 0) { + if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) { u64 threshold = (bfqq->last_serv_time_ns * 3)>>1; if (tot_time_ns >= threshold && old_limit > 0) { bfqq->inject_limit--; bfqq->decrease_time_jif = jiffies; } else if (tot_time_ns < threshold && - old_limit < bfqd->max_rq_in_driver<<1) + old_limit <= bfqd->max_rq_in_driver) bfqq->inject_limit++; } @@ -5809,12 +5823,14 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, */ if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || tot_time_ns < bfqq->last_serv_time_ns) { + if (bfqq->last_serv_time_ns == 0) { + /* + * Now we certainly have a base value: make sure we + * start trying injection. + */ + bfqq->inject_limit = max_t(unsigned int, 1, old_limit); + } bfqq->last_serv_time_ns = tot_time_ns; - /* - * Now we certainly have a base value: make sure we - * start trying injection. 
- */ - bfqq->inject_limit = max_t(unsigned int, 1, old_limit); } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) /* * No I/O injected and no request still in service in @@ -5830,6 +5846,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, /* update complete, not waiting for any request completion any longer */ bfqd->waited_rq = NULL; + bfqd->rqs_injected = false; } /* diff --git a/block/blk-core.c b/block/blk-core.c index 875e8d105067..d5e668ec751b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1436,6 +1437,12 @@ bool blk_update_request(struct request *req, blk_status_t error, if (!req->bio) return false; +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && + error == BLK_STS_OK) + req->q->integrity.profile->complete_fn(req, nr_bytes); +#endif + if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET))) print_req_error(req, error, __func__); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index ca39b4624cf8..ff1070edbb40 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -368,10 +368,21 @@ static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) return BLK_STS_OK; } +static void blk_integrity_nop_prepare(struct request *rq) +{ +} + +static void blk_integrity_nop_complete(struct request *rq, + unsigned int nr_bytes) +{ +} + static const struct blk_integrity_profile nop_profile = { .name = "nop", .generate_fn = blk_integrity_nop_fn, .verify_fn = blk_integrity_nop_fn, + .prepare_fn = blk_integrity_nop_prepare, + .complete_fn = blk_integrity_nop_complete, }; /** diff --git a/block/blk-mq.c b/block/blk-mq.c index 20a49be536b5..29275f5a996f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -30,6 +30,7 @@ #include #include +#include #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" @@ -700,6 +701,11 @@ void blk_mq_start_request(struct request *rq) */ rq->nr_phys_segments++; } + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) + q->integrity.profile->prepare_fn(rq); +#endif } EXPORT_SYMBOL(blk_mq_start_request); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 785dd58947f1..347dda16c2f4 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -266,6 +266,7 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct bsg_set *bset = container_of(q->tag_set, struct bsg_set, tag_set); + int sts = BLK_STS_IOERR; int ret; blk_mq_start_request(req); @@ -274,14 +275,15 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_IOERR; if (!bsg_prepare_job(dev, req)) - return BLK_STS_IOERR; + goto out; ret = bset->job_fn(blk_mq_rq_to_pdu(req)); - if (ret) - return BLK_STS_IOERR; + if (!ret) + sts = BLK_STS_OK; +out: put_device(dev); - return BLK_STS_OK; + return sts; } /* called right after the request is allocated for the request_queue */ diff --git a/block/t10-pi.c b/block/t10-pi.c index 0c0094609dd6..9803c7e0376e 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -27,7 +27,7 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len) * tag. 
*/ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, - csum_fn *fn, unsigned int type) + csum_fn *fn, enum t10_dif_type type) { unsigned int i; @@ -37,7 +37,7 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, pi->guard_tag = fn(iter->data_buf, iter->interval); pi->app_tag = 0; - if (type == 1) + if (type == T10_PI_TYPE1_PROTECTION) pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); else pi->ref_tag = 0; @@ -51,17 +51,18 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, } static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, - csum_fn *fn, unsigned int type) + csum_fn *fn, enum t10_dif_type type) { unsigned int i; + BUG_ON(type == T10_PI_TYPE0_PROTECTION); + for (i = 0 ; i < iter->data_size ; i += iter->interval) { struct t10_pi_tuple *pi = iter->prot_buf; __be16 csum; - switch (type) { - case 1: - case 2: + if (type == T10_PI_TYPE1_PROTECTION || + type == T10_PI_TYPE2_PROTECTION) { if (pi->app_tag == T10_PI_APP_ESCAPE) goto next; @@ -73,12 +74,10 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, iter->seed, be32_to_cpu(pi->ref_tag)); return BLK_STS_PROTECTION; } - break; - case 3: + } else if (type == T10_PI_TYPE3_PROTECTION) { if (pi->app_tag == T10_PI_APP_ESCAPE && pi->ref_tag == T10_PI_REF_ESCAPE) goto next; - break; } csum = fn(iter->data_buf, iter->interval); @@ -102,94 +101,40 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) { - return t10_pi_generate(iter, t10_pi_crc_fn, 1); + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) { - return t10_pi_generate(iter, t10_pi_ip_fn, 1); + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) { - return t10_pi_verify(iter, t10_pi_crc_fn, 1); + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); } static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) { - return t10_pi_verify(iter, t10_pi_ip_fn, 1); + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); } -static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_crc_fn, 3); -} - -static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_generate(iter, t10_pi_ip_fn, 3); -} - -static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_crc_fn, 3); -} - -static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) -{ - return t10_pi_verify(iter, t10_pi_ip_fn, 3); -} - -const struct blk_integrity_profile t10_pi_type1_crc = { - .name = "T10-DIF-TYPE1-CRC", - .generate_fn = t10_pi_type1_generate_crc, - .verify_fn = t10_pi_type1_verify_crc, -}; -EXPORT_SYMBOL(t10_pi_type1_crc); - -const struct blk_integrity_profile t10_pi_type1_ip = { - .name = "T10-DIF-TYPE1-IP", - .generate_fn = t10_pi_type1_generate_ip, - .verify_fn = t10_pi_type1_verify_ip, -}; -EXPORT_SYMBOL(t10_pi_type1_ip); - -const struct blk_integrity_profile t10_pi_type3_crc = { - .name = "T10-DIF-TYPE3-CRC", - .generate_fn = t10_pi_type3_generate_crc, - .verify_fn = t10_pi_type3_verify_crc, -}; -EXPORT_SYMBOL(t10_pi_type3_crc); - -const struct blk_integrity_profile t10_pi_type3_ip = { - .name = "T10-DIF-TYPE3-IP", - .generate_fn = 
t10_pi_type3_generate_ip, - .verify_fn = t10_pi_type3_verify_ip, -}; -EXPORT_SYMBOL(t10_pi_type3_ip); - /** - * t10_pi_prepare - prepare PI prior submitting request to device + * t10_pi_type1_prepare - prepare PI prior to submitting request to device * @rq: request with PI that should be prepared - * @protection_type: PI type (Type 1/Type 2/Type 3) * * For Type 1/Type 2, the virtual start sector is the one that was * originally submitted by the block layer for the ref_tag usage. Due to * partitioning, MD/DM cloning, etc. the actual physical start sector is * likely to be different. Remap protection information to match the * physical LBA. - * - * Type 3 does not have a reference tag so no remapping is required. */ -void t10_pi_prepare(struct request *rq, u8 protection_type) +static void t10_pi_type1_prepare(struct request *rq) { const int tuple_sz = rq->q->integrity.tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); struct bio *bio; - if (protection_type == T10_PI_TYPE3_PROTECTION) - return; - __rq_for_each_bio(bio, rq) { struct bio_integrity_payload *bip = bio_integrity(bio); u32 virt = bip_get_seed(bip) & 0xffffffff; @@ -222,13 +167,11 @@ void t10_pi_prepare(struct request *rq, u8 protection_type) bip->bip_flags |= BIP_MAPPED_INTEGRITY; } } -EXPORT_SYMBOL(t10_pi_prepare); /** - * t10_pi_complete - prepare PI prior returning request to the block layer + * t10_pi_type1_complete - prepare PI prior to returning request to the blk layer * @rq: request with PI that should be prepared - * @protection_type: PI type (Type 1/Type 2/Type 3) - * @intervals: total elements to prepare + * @nr_bytes: total bytes to prepare * * For Type 1/Type 2, the virtual start sector is the one that was * originally submitted by the block layer for the ref_tag usage. Due to @@ -236,19 +179,14 @@ EXPORT_SYMBOL(t10_pi_prepare); * likely to be different. Since the physical start sector was submitted * to the device, we should remap it back to virtual values expected by the * block layer. - * - * Type 3 does not have a reference tag so no remapping is required. */ -void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals) +static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { + unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; const int tuple_sz = rq->q->integrity.tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); struct bio *bio; - if (protection_type == T10_PI_TYPE3_PROTECTION) - return; - __rq_for_each_bio(bio, rq) { struct bio_integrity_payload *bip = bio_integrity(bio); u32 virt = bip_get_seed(bip) & 0xffffffff; @@ -276,4 +214,73 @@ void t10_pi_complete(struct request *rq, u8 protection_type, } } } -EXPORT_SYMBOL(t10_pi_complete); + +static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); +} + +/* + * Type 3 does not have a reference tag so no remapping is required. + */ +static void t10_pi_type3_prepare(struct request *rq) +{ +} + +/* + * Type 3 does not have a reference tag so no remapping is required. 
+ */ +static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) +{ +} + +const struct blk_integrity_profile t10_pi_type1_crc = { + .name = "T10-DIF-TYPE1-CRC", + .generate_fn = t10_pi_type1_generate_crc, + .verify_fn = t10_pi_type1_verify_crc, + .prepare_fn = t10_pi_type1_prepare, + .complete_fn = t10_pi_type1_complete, +}; +EXPORT_SYMBOL(t10_pi_type1_crc); + +const struct blk_integrity_profile t10_pi_type1_ip = { + .name = "T10-DIF-TYPE1-IP", + .generate_fn = t10_pi_type1_generate_ip, + .verify_fn = t10_pi_type1_verify_ip, + .prepare_fn = t10_pi_type1_prepare, + .complete_fn = t10_pi_type1_complete, +}; +EXPORT_SYMBOL(t10_pi_type1_ip); + +const struct blk_integrity_profile t10_pi_type3_crc = { + .name = "T10-DIF-TYPE3-CRC", + .generate_fn = t10_pi_type3_generate_crc, + .verify_fn = t10_pi_type3_verify_crc, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL(t10_pi_type3_crc); + +const struct blk_integrity_profile t10_pi_type3_ip = { + .name = "T10-DIF-TYPE3-IP", + .generate_fn = t10_pi_type3_generate_ip, + .verify_fn = t10_pi_type3_verify_ip, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL(t10_pi_type3_ip); diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c index 7cd0c9ac71ea..71511ae2dfcd 100644 --- a/drivers/acpi/acpi_apd.c +++ b/drivers/acpi/acpi_apd.c @@ -160,11 +160,17 @@ static const struct apd_device_desc hip08_i2c_desc = { .setup = acpi_apd_setup, .fixed_clk_rate = 250000000, }; + static const struct apd_device_desc thunderx2_i2c_desc = { .setup = acpi_apd_setup, .fixed_clk_rate = 125000000, }; +static const struct apd_device_desc nxp_i2c_desc = { + .setup = acpi_apd_setup, + .fixed_clk_rate = 350000000, +}; + static const struct apd_device_desc hip08_spi_desc = { .setup = acpi_apd_setup, .fixed_clk_rate = 250000000, @@ -238,6 +244,7 @@ static const struct acpi_device_id acpi_apd_device_ids[] = { { "HISI02A1", APD_ADDR(hip07_i2c_desc) }, { "HISI02A2", APD_ADDR(hip08_i2c_desc) }, { "HISI0173", APD_ADDR(hip08_spi_desc) }, + { "NXP0001", APD_ADDR(nxp_i2c_desc) }, #endif { } }; diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 314a187ed572..d1e666ef3fcc 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index 9e9583a6bba9..e742780950de 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -497,6 +497,7 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev, if (of_property_read_u32(child, "reg", &port)) { rc = -EINVAL; + of_node_put(child); goto err_out; } @@ -514,14 +515,18 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev, if (port_dev) { rc = ahci_platform_get_regulator(hpriv, port, &port_dev->dev); - if (rc == -EPROBE_DEFER) + if (rc == -EPROBE_DEFER) { + of_node_put(child); goto err_out; + } } #endif rc = ahci_platform_get_phy(hpriv, port, dev, child); - if (rc) + if (rc) { + of_node_put(child); goto err_out; + } enabled_ports++; } diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 20c39d1bcef8..6bea4f3f8040 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -100,26 +100,9 @@ unsigned long __weak memory_block_size_bytes(void) } EXPORT_SYMBOL_GPL(memory_block_size_bytes); -static unsigned long get_memory_block_size(void) -{ - unsigned long block_sz; 
- - block_sz = memory_block_size_bytes(); - - /* Validate blk_sz is a power of 2 and not less than section size */ - if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) { - WARN_ON(1); - block_sz = MIN_MEMORY_BLOCK_SIZE; - } - - return block_sz; -} - /* - * use this as the physical section index that this memsection - * uses. + * Show the first physical section index (number) of this memory block. */ - static ssize_t phys_index_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -131,7 +114,10 @@ static ssize_t phys_index_show(struct device *dev, } /* - * Show whether the section of memory is likely to be hot-removable + * Show whether the memory block is likely to be offlineable (or is already + * offline). Once offline, the memory block could be removed. The return + * value does, however, not indicate that there is a way to remove the + * memory block. */ static ssize_t removable_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -455,12 +441,12 @@ static DEVICE_ATTR_RO(phys_device); static DEVICE_ATTR_RO(removable); /* - * Block size attribute stuff + * Show the memory block size (shared by all memory blocks). */ static ssize_t block_size_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%lx\n", get_memory_block_size()); + return sprintf(buf, "%lx\n", memory_block_size_bytes()); } static DEVICE_ATTR_RO(block_size_bytes); @@ -670,10 +656,10 @@ static int init_memory_block(struct memory_block **memory, return -ENOMEM; mem->start_section_nr = block_id * sections_per_block; - mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; mem->state = state; start_pfn = section_nr_to_pfn(mem->start_section_nr); mem->phys_device = arch_get_memory_phys_device(start_pfn); + mem->nid = NUMA_NO_NODE; ret = register_memory(mem); @@ -810,19 +796,22 @@ static const struct attribute_group *memory_root_attr_groups[] = { /* * Initialize the sysfs support for memory devices... 
*/ -int __init memory_dev_init(void) +void __init memory_dev_init(void) { int ret; int err; unsigned long block_sz, nr; + /* Validate the configured memory block size */ + block_sz = memory_block_size_bytes(); + if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE) + panic("Memory block size not suitable: 0x%lx\n", block_sz); + sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; + ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); if (ret) goto out; - block_sz = get_memory_block_size(); - sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; - /* * Create entries for memory sections that were found * during boot and have been initialized @@ -838,8 +827,7 @@ int __init memory_dev_init(void) out: if (ret) - printk(KERN_ERR "%s() failed: %d\n", __func__, ret); - return ret; + panic("%s() failed: %d\n", __func__, ret); } /** diff --git a/drivers/base/node.c b/drivers/base/node.c index 75b7e6f6535b..296546ffed6c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -427,6 +427,8 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d AnonHugePages: %8lu kB\n" "Node %d ShmemHugePages: %8lu kB\n" "Node %d ShmemPmdMapped: %8lu kB\n" + "Node %d FileHugePages: %8lu kB\n" + "Node %d FilePmdMapped: %8lu kB\n" #endif , nid, K(node_page_state(pgdat, NR_FILE_DIRTY)), @@ -452,6 +454,10 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * + HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_FILE_THPS) * + HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) * HPAGE_PMD_NR) #endif ); @@ -756,15 +762,13 @@ static int __ref get_nid_for_pfn(unsigned long pfn) static int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) { + unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE; + unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); + unsigned long end_pfn = start_pfn + memory_block_pfns - 1; int ret, nid = *(int *)arg; - unsigned long pfn, sect_start_pfn, sect_end_pfn; + unsigned long pfn; - mem_blk->nid = nid; - - sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); - sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); - sect_end_pfn += PAGES_PER_SECTION - 1; - for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { + for (pfn = start_pfn; pfn <= end_pfn; pfn++) { int page_nid; /* @@ -789,6 +793,13 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk, if (page_nid != nid) continue; } + + /* + * If this memory block spans multiple nodes, we only indicate + * the last processed node. + */ + mem_blk->nid = nid; + ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, &mem_blk->dev.kobj, kobject_name(&mem_blk->dev.kobj)); @@ -804,32 +815,18 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk, } /* - * Unregister memory block device under all nodes that it spans. - * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes). + * Unregister a memory block device under the node it spans. Memory blocks + * with multiple nodes cannot be offlined and can therefore also never be removed. 
*/ void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { - unsigned long pfn, sect_start_pfn, sect_end_pfn; - static nodemask_t unlinked_nodes; + if (mem_blk->nid == NUMA_NO_NODE) + return; - nodes_clear(unlinked_nodes); - sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); - sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); - for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { - int nid; - - nid = get_nid_for_pfn(pfn); - if (nid < 0) - continue; - if (!node_online(nid)) - continue; - if (node_test_and_set(nid, unlinked_nodes)) - continue; - sysfs_remove_link(&node_devices[nid]->dev.kobj, - kobject_name(&mem_blk->dev.kobj)); - sysfs_remove_link(&mem_blk->dev.kobj, - kobject_name(&node_devices[nid]->dev.kobj)); - } + sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj, + kobject_name(&mem_blk->dev.kobj)); + sysfs_remove_link(&mem_blk->dev.kobj, + kobject_name(&node_devices[mem_blk->nid]->dev.kobj)); } int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c index c58986556161..651bd0236a99 100644 --- a/drivers/block/drbd/drbd_interval.c +++ b/drivers/block/drbd/drbd_interval.c @@ -13,33 +13,10 @@ sector_t interval_end(struct rb_node *node) return this->end; } -/** - * compute_subtree_last - compute end of @node - * - * The end of an interval is the highest (start + (size >> 9)) value of this - * node and of its children. Called for @node and its parents whenever the end - * may have changed. - */ -static inline sector_t -compute_subtree_last(struct drbd_interval *node) -{ - sector_t max = node->sector + (node->size >> 9); +#define NODE_END(node) ((node)->sector + ((node)->size >> 9)) - if (node->rb.rb_left) { - sector_t left = interval_end(node->rb.rb_left); - if (left > max) - max = left; - } - if (node->rb.rb_right) { - sector_t right = interval_end(node->rb.rb_right); - if (right > max) - max = right; - } - return max; -} - -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb, - sector_t, end, compute_subtree_last); +RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks, + struct drbd_interval, rb, sector_t, end, NODE_END); /** * drbd_insert_interval - insert a new interval into a tree diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a8e3815295fe..ac07e8c94c79 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -71,14 +72,17 @@ struct link_dead_args { int index; }; -#define NBD_TIMEDOUT 0 +#define NBD_RT_TIMEDOUT 0 +#define NBD_RT_DISCONNECT_REQUESTED 1 +#define NBD_RT_DISCONNECTED 2 +#define NBD_RT_HAS_PID_FILE 3 +#define NBD_RT_HAS_CONFIG_REF 4 +#define NBD_RT_BOUND 5 +#define NBD_RT_DESTROY_ON_DISCONNECT 6 +#define NBD_RT_DISCONNECT_ON_CLOSE 7 + +#define NBD_DESTROY_ON_DISCONNECT 0 #define NBD_DISCONNECT_REQUESTED 1 -#define NBD_DISCONNECTED 2 -#define NBD_HAS_PID_FILE 3 -#define NBD_HAS_CONFIG_REF 4 -#define NBD_BOUND 5 -#define NBD_DESTROY_ON_DISCONNECT 6 -#define NBD_DISCONNECT_ON_CLOSE 7 struct nbd_config { u32 flags; @@ -113,6 +117,9 @@ struct nbd_device { struct list_head list; struct task_struct *task_recv; struct task_struct *task_setup; + + struct completion *destroy_complete; + unsigned long flags; }; #define NBD_CMD_REQUEUED 1 @@ -223,6 +230,16 @@ static void nbd_dev_remove(struct nbd_device *nbd) disk->private_data = NULL; put_disk(disk); } + + /* + * Do this last, just before the nbd is
freed to + * make sure that the disk and the related kobject are also + * totally removed to avoid duplicate creation of the same + * one. + */ + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete) + complete(nbd->destroy_complete); + kfree(nbd); } @@ -238,8 +255,8 @@ static void nbd_put(struct nbd_device *nbd) static int nbd_disconnected(struct nbd_config *config) { - return test_bit(NBD_DISCONNECTED, &config->runtime_flags) || - test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags); + return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) || + test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); } static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, @@ -257,9 +274,9 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, if (!nsock->dead) { kernel_sock_shutdown(nsock->sock, SHUT_RDWR); if (atomic_dec_return(&nbd->config->live_connections) == 0) { - if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED, + if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED, &nbd->config->runtime_flags)) { - set_bit(NBD_DISCONNECTED, + set_bit(NBD_RT_DISCONNECTED, &nbd->config->runtime_flags); dev_info(nbd_to_dev(nbd), "Disconnected due to user request.\n"); @@ -333,7 +350,7 @@ static void sock_shutdown(struct nbd_device *nbd) if (config->num_connections == 0) return; - if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags)) + if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return; for (i = 0; i < config->num_connections; i++) { @@ -427,7 +444,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, } dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n"); - set_bit(NBD_TIMEDOUT, &config->runtime_flags); + set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags); cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock); sock_shutdown(nbd); @@ -795,7 +812,7 @@ static int find_fallback(struct nbd_device *nbd, int index) struct nbd_sock *nsock = config->socks[index]; int fallback = nsock->fallback_index; - if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) + if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return new_index; if (config->num_connections <= 1) { @@ -836,7 +853,7 @@ static int wait_for_reconnect(struct nbd_device *nbd) struct nbd_config *config = nbd->config; if (!config->dead_conn_timeout) return 0; - if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) + if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) return 0; return wait_event_timeout(config->conn_wait, atomic_read(&config->live_connections) > 0, @@ -969,12 +986,12 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, return err; if (!netlink && !nbd->task_setup && - !test_bit(NBD_BOUND, &config->runtime_flags)) + !test_bit(NBD_RT_BOUND, &config->runtime_flags)) nbd->task_setup = current; if (!netlink && (nbd->task_setup != current || - test_bit(NBD_BOUND, &config->runtime_flags))) { + test_bit(NBD_RT_BOUND, &config->runtime_flags))) { dev_err(disk_to_dev(nbd->disk), "Device being setup by another task"); sockfd_put(sock); @@ -1053,7 +1070,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) mutex_unlock(&nsock->tx_lock); sockfd_put(old); - clear_bit(NBD_DISCONNECTED, &config->runtime_flags); + clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); /* We take the tx_mutex in an error path in the recv_work, so we * need to queue_work outside of the tx_mutex. 
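Editor's note on the nbd rework above and below: the driver splits the old runtime flags into per-config NBD_RT_* bits and persistent per-device bits, and adds a destroy_complete completion so a netlink connect that races with a DESTROY_ON_DISCONNECT teardown waits for the old device to be fully released before creating a new one under the same index. A hedged sketch of that completion handshake, with hypothetical demo_* names rather than the driver's own:

#include <linux/completion.h>
#include <linux/slab.h>

struct demo_dev {
	struct completion *destroy_complete;
};

/* Teardown path: runs once the last reference is dropped. */
static void demo_dev_remove(struct demo_dev *d)
{
	struct completion *done = d->destroy_complete;

	kfree(d);		/* the device is fully gone at this point */
	if (done)
		complete(done);	/* wake anyone waiting to reuse the index */
}

/* (Re)create path: wait for the old instance to finish dying first. */
static void demo_dev_recreate(struct demo_dev *old)
{
	DECLARE_COMPLETION_ONSTACK(destroy_complete);

	old->destroy_complete = &destroy_complete;
	/* ...drop our reference here so teardown can run... */
	wait_for_completion(&destroy_complete);
	/* now it is safe to register a fresh device with the same index */
}

Completing only after the final kfree() is what closes the window in which a new disk/kobject with the same name could collide with the old one.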
@@ -1124,7 +1141,8 @@ static int nbd_disconnect(struct nbd_device *nbd) struct nbd_config *config = nbd->config; dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); - set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags); + set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); + set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags); send_disconnects(nbd); return 0; } @@ -1143,7 +1161,7 @@ static void nbd_config_put(struct nbd_device *nbd) struct nbd_config *config = nbd->config; nbd_dev_dbg_close(nbd); nbd_size_clear(nbd); - if (test_and_clear_bit(NBD_HAS_PID_FILE, + if (test_and_clear_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags)) device_remove_file(disk_to_dev(nbd->disk), &pid_attr); nbd->task_recv = NULL; @@ -1209,7 +1227,7 @@ static int nbd_start_device(struct nbd_device *nbd) dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); return error; } - set_bit(NBD_HAS_PID_FILE, &config->runtime_flags); + set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags); nbd_dev_dbg_init(nbd); for (i = 0; i < num_connections; i++) { @@ -1256,9 +1274,9 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b mutex_lock(&nbd->config_lock); nbd_bdev_reset(bdev); /* user requested, ignore socket errors */ - if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags)) + if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags)) ret = 0; - if (test_bit(NBD_TIMEDOUT, &config->runtime_flags)) + if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags)) ret = -ETIMEDOUT; return ret; } @@ -1269,7 +1287,7 @@ static void nbd_clear_sock_ioctl(struct nbd_device *nbd, sock_shutdown(nbd); __invalidate_device(bdev, true); nbd_bdev_reset(bdev); - if (test_and_clear_bit(NBD_HAS_CONFIG_REF, + if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); } @@ -1364,7 +1382,7 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode, /* Don't allow ioctl operations on a nbd device that was created with * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine. 
*/ - if (!test_bit(NBD_BOUND, &config->runtime_flags) || + if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK)) error = __nbd_ioctl(bdev, nbd, cmd, arg); else @@ -1435,7 +1453,7 @@ static void nbd_release(struct gendisk *disk, fmode_t mode) struct nbd_device *nbd = disk->private_data; struct block_device *bdev = bdget_disk(disk, 0); - if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && + if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && bdev->bd_openers == 0) nbd_disconnect_and_put(nbd); @@ -1636,6 +1654,7 @@ static int nbd_dev_add(int index) nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; + nbd->destroy_complete = NULL; err = blk_mq_alloc_tag_set(&nbd->tag_set); if (err) @@ -1750,6 +1769,7 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) { + DECLARE_COMPLETION_ONSTACK(destroy_complete); struct nbd_device *nbd = NULL; struct nbd_config *config; int index = -1; @@ -1801,6 +1821,17 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&nbd_index_mutex); return -EINVAL; } + + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && + test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { + nbd->destroy_complete = &destroy_complete; + mutex_unlock(&nbd_index_mutex); + + /* Wait until the nbd stuff is totally destroyed */ + wait_for_completion(&destroy_complete); + goto again; + } + if (!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); if (index == -1) @@ -1833,7 +1864,7 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; } refcount_set(&nbd->config_refs, 1); - set_bit(NBD_BOUND, &config->runtime_flags); + set_bit(NBD_RT_BOUND, &config->runtime_flags); ret = nbd_genl_size_set(info, nbd); if (ret) @@ -1853,12 +1884,15 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { - set_bit(NBD_DESTROY_ON_DISCONNECT, + set_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags); + set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); put_dev = true; + } else { + clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { - set_bit(NBD_DISCONNECT_ON_CLOSE, + set_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } } @@ -1897,7 +1931,7 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) out: mutex_unlock(&nbd->config_lock); if (!ret) { - set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags); + set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags); refcount_inc(&nbd->config_refs); nbd_connect_reply(info, nbd->index); } @@ -1919,7 +1953,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) * queue. 
*/ flush_workqueue(nbd->recv_workq); - if (test_and_clear_bit(NBD_HAS_CONFIG_REF, + if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); } @@ -2003,7 +2037,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) mutex_lock(&nbd->config_lock); config = nbd->config; - if (!test_bit(NBD_BOUND, &config->runtime_flags) || + if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || !nbd->task_recv) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); @@ -2026,20 +2060,22 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { - if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT, + if (!test_and_set_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags)) put_dev = true; + set_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } else { - if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT, + if (test_and_clear_bit(NBD_RT_DESTROY_ON_DISCONNECT, &config->runtime_flags)) refcount_inc(&nbd->refs); + clear_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags); } if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { - set_bit(NBD_DISCONNECT_ON_CLOSE, + set_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } else { - clear_bit(NBD_DISCONNECT_ON_CLOSE, + clear_bit(NBD_RT_DISCONNECT_ON_CLOSE, &config->runtime_flags); } } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 024060165afa..76457003f140 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2594,7 +2594,6 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (ret) return ret; if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) { - WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); return -EINVAL; } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c8fb886aebd4..7c4350c0fb77 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1754,8 +1754,6 @@ static struct rbd_img_request *rbd_img_request_create( mutex_init(&img_request->state_mutex); kref_init(&img_request->kref); - dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev, - obj_op_name(op_type), img_request); return img_request; } @@ -2944,6 +2942,9 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) __set_bit(IMG_REQ_CHILD, &child_img_req->flags); child_img_req->obj_request = obj_req; + dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req, + obj_req); + if (!rbd_img_is_write(img_req)) { switch (img_req->data_type) { case OBJ_REQUEST_BIO: @@ -4877,6 +4878,9 @@ static void rbd_queue_workfn(struct work_struct *work) img_request->rq = rq; snapc = NULL; /* img_request consumes a ref */ + dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev, + img_request, obj_op_name(op_type), offset, length); + if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) result = rbd_img_fill_nodata(img_request, offset, length); else @@ -5669,17 +5673,20 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) { + size_t size; void *reply_buf; int ret; void *p; - reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); + /* Response will be an encoded string, which includes a length */ + size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX; + reply_buf = kzalloc(size, GFP_KERNEL); if (!reply_buf) return -ENOMEM; ret = rbd_obj_method_sync(rbd_dev, 
&rbd_dev->header_oid, &rbd_dev->header_oloc, "get_object_prefix", - NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); + NULL, 0, reply_buf, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -6696,7 +6703,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) dout("rbd id object name is %s\n", oid.name); /* Response will be an encoded string, which includes a length */ size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; response = kzalloc(size, GFP_NOIO); if (!response) { @@ -6708,7 +6714,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, "get_id", NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX); + response, size); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret == -ENOENT) { image_id = kstrdup("", GFP_KERNEL); diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index bdab5d9af8d2..80b850ef1bf6 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -68,7 +68,7 @@ static void add_early_randomness(struct hwrng *rng) size_t size = min_t(size_t, 16, rng_buffer_size()); mutex_lock(&reading_mutex); - bytes_read = rng_get_data(rng, rng_buffer, size, 1); + bytes_read = rng_get_data(rng, rng_buffer, size, 0); mutex_unlock(&reading_mutex); if (bytes_read > 0) add_device_randomness(rng_buffer, bytes_read); diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index 1b4f95c13e00..d7a3888ad80f 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -320,18 +320,22 @@ int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, if (!chip) return -ENODEV; - for (i = 0; i < chip->nr_allocated_banks; i++) - if (digests[i].alg_id != chip->allocated_banks[i].alg_id) - return -EINVAL; + for (i = 0; i < chip->nr_allocated_banks; i++) { + if (digests[i].alg_id != chip->allocated_banks[i].alg_id) { + rc = -EINVAL; + goto out; + } + } if (chip->flags & TPM_CHIP_FLAG_TPM2) { rc = tpm2_pcr_extend(chip, pcr_idx, digests); - tpm_put_ops(chip); - return rc; + goto out; } rc = tpm1_pcr_extend(chip, pcr_idx, digests[0].digest, "attempting extend a PCR value"); + +out: tpm_put_ops(chip); return rc; } @@ -354,14 +358,9 @@ int tpm_send(struct tpm_chip *chip, void *cmd, size_t buflen) if (!chip) return -ENODEV; - rc = tpm_buf_init(&buf, 0, 0); - if (rc) - goto out; - - memcpy(buf.data, cmd, buflen); + buf.data = cmd; rc = tpm_transmit_cmd(chip, &buf, 0, "attempting to a send a command"); - tpm_buf_destroy(&buf); -out: + tpm_put_ops(chip); return rc; } diff --git a/drivers/char/xillybus/xillybus_pcie.c b/drivers/char/xillybus/xillybus_pcie.c index 02c15952b103..18b0c392bc93 100644 --- a/drivers/char/xillybus/xillybus_pcie.c +++ b/drivers/char/xillybus/xillybus_pcie.c @@ -9,7 +9,6 @@ #include #include -#include #include #include "xillybus.h" diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c index c70cb5f272cf..0891ab829b1b 100644 --- a/drivers/crypto/chelsio/chtls/chtls_io.c +++ b/drivers/crypto/chelsio/chtls/chtls_io.c @@ -1078,7 +1078,7 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) bool merge; if (page) - pg_size <<= compound_order(page); + pg_size = page_size(page); if (off < pg_size && skb_can_coalesce(skb, i, page, off)) { merge = 1; @@ -1105,8 +1105,7 @@ int chtls_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) __GFP_NORETRY, order); if (page) - pg_size <<= - compound_order(page); + pg_size <<= order; } if
(!page) { page = alloc_page(gfp); diff --git a/drivers/crypto/hisilicon/sec/sec_algs.c b/drivers/crypto/hisilicon/sec/sec_algs.c index e0508ea160f1..c27e7160d2df 100644 --- a/drivers/crypto/hisilicon/sec/sec_algs.c +++ b/drivers/crypto/hisilicon/sec/sec_algs.c @@ -153,6 +153,24 @@ static void sec_alg_skcipher_init_context(struct crypto_skcipher *atfm, ctx->cipher_alg); } +static void sec_free_hw_sgl(struct sec_hw_sgl *hw_sgl, + dma_addr_t psec_sgl, struct sec_dev_info *info) +{ + struct sec_hw_sgl *sgl_current, *sgl_next; + dma_addr_t sgl_next_dma; + + sgl_current = hw_sgl; + while (sgl_current) { + sgl_next = sgl_current->next; + sgl_next_dma = sgl_current->next_sgl; + + dma_pool_free(info->hw_sgl_pool, sgl_current, psec_sgl); + + sgl_current = sgl_next; + psec_sgl = sgl_next_dma; + } +} + static int sec_alloc_and_fill_hw_sgl(struct sec_hw_sgl **sec_sgl, dma_addr_t *psec_sgl, struct scatterlist *sgl, @@ -199,35 +217,12 @@ static int sec_alloc_and_fill_hw_sgl(struct sec_hw_sgl **sec_sgl, return 0; err_free_hw_sgls: - sgl_current = *sec_sgl; - while (sgl_current) { - sgl_next = sgl_current->next; - dma_pool_free(info->hw_sgl_pool, sgl_current, - sgl_current->next_sgl); - sgl_current = sgl_next; - } + sec_free_hw_sgl(*sec_sgl, *psec_sgl, info); *psec_sgl = 0; return ret; } -static void sec_free_hw_sgl(struct sec_hw_sgl *hw_sgl, - dma_addr_t psec_sgl, struct sec_dev_info *info) -{ - struct sec_hw_sgl *sgl_current, *sgl_next; - - if (!hw_sgl) - return; - sgl_current = hw_sgl; - while (sgl_current->next) { - sgl_next = sgl_current->next; - dma_pool_free(info->hw_sgl_pool, sgl_current, - sgl_current->next_sgl); - sgl_current = sgl_next; - } - dma_pool_free(info->hw_sgl_pool, hw_sgl, psec_sgl); -} - static int sec_alg_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen, enum sec_cipher_alg alg) diff --git a/drivers/crypto/hisilicon/zip/zip_crypto.c b/drivers/crypto/hisilicon/zip/zip_crypto.c index 5a3f84dcdcde..59023545a1c4 100644 --- a/drivers/crypto/hisilicon/zip/zip_crypto.c +++ b/drivers/crypto/hisilicon/zip/zip_crypto.c @@ -559,7 +559,7 @@ static int hisi_zip_acompress(struct acomp_req *acomp_req) struct hisi_zip_ctx *ctx = crypto_tfm_ctx(acomp_req->base.tfm); struct hisi_zip_qp_ctx *qp_ctx = &ctx->qp_ctx[QPC_COMP]; struct hisi_zip_req *req; - size_t head_size; + int head_size; int ret; /* let's output compression head now */ @@ -567,7 +567,7 @@ static int hisi_zip_acompress(struct acomp_req *acomp_req) if (head_size < 0) return -ENOMEM; - req = hisi_zip_create_req(acomp_req, qp_ctx, head_size, true); + req = hisi_zip_create_req(acomp_req, qp_ctx, (size_t)head_size, true); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index 6e0ca75585d4..1b2ee96c888d 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -785,7 +785,6 @@ static int hisi_zip_clear_vft_config(struct hisi_zip *hisi_zip) static int hisi_zip_sriov_enable(struct pci_dev *pdev, int max_vfs) { -#ifdef CONFIG_PCI_IOV struct hisi_zip *hisi_zip = pci_get_drvdata(pdev); int pre_existing_vfs, num_vfs, ret; @@ -815,9 +814,6 @@ static int hisi_zip_sriov_enable(struct pci_dev *pdev, int max_vfs) } return num_vfs; -#else - return 0; -#endif } static int hisi_zip_sriov_disable(struct pci_dev *pdev) @@ -948,7 +944,8 @@ static struct pci_driver hisi_zip_pci_driver = { .id_table = hisi_zip_dev_ids, .probe = hisi_zip_probe, .remove = hisi_zip_remove, - .sriov_configure = 
hisi_zip_sriov_configure, + .sriov_configure = IS_ENABLED(CONFIG_PCI_IOV) ? + hisi_zip_sriov_configure : 0, .err_handler = &hisi_zip_err_handler, }; diff --git a/drivers/crypto/inside-secure/safexcel.c b/drivers/crypto/inside-secure/safexcel.c index b456b85f46d3..4ab1bde8dd9b 100644 --- a/drivers/crypto/inside-secure/safexcel.c +++ b/drivers/crypto/inside-secure/safexcel.c @@ -1789,32 +1789,50 @@ static struct pci_driver safexcel_pci_driver = { }; #endif +/* Unfortunately, we have to resort to global variables here */ +#if IS_ENABLED(CONFIG_PCI) +int pcireg_rc = -EINVAL; /* Default safe value */ +#endif +#if IS_ENABLED(CONFIG_OF) +int ofreg_rc = -EINVAL; /* Default safe value */ +#endif + static int __init safexcel_init(void) { - int rc; +#if IS_ENABLED(CONFIG_PCI) + /* Register PCI driver */ + pcireg_rc = pci_register_driver(&safexcel_pci_driver); +#endif #if IS_ENABLED(CONFIG_OF) - /* Register platform driver */ - platform_driver_register(&crypto_safexcel); + /* Register platform driver */ + ofreg_rc = platform_driver_register(&crypto_safexcel); + #if IS_ENABLED(CONFIG_PCI) + /* Return success if either PCI or OF registered OK */ + return pcireg_rc ? ofreg_rc : 0; + #else + return ofreg_rc; + #endif +#else + #if IS_ENABLED(CONFIG_PCI) + return pcireg_rc; + #else + return -EINVAL; + #endif #endif - -#if IS_ENABLED(CONFIG_PCI) - /* Register PCI driver */ - rc = pci_register_driver(&safexcel_pci_driver); -#endif - - return 0; } static void __exit safexcel_exit(void) { #if IS_ENABLED(CONFIG_OF) - /* Unregister platform driver */ + /* Unregister platform driver */ + if (!ofreg_rc) platform_driver_unregister(&crypto_safexcel); #endif #if IS_ENABLED(CONFIG_PCI) - /* Unregister PCI driver if successfully registered before */ + /* Unregister PCI driver if successfully registered before */ + if (!pcireg_rc) pci_unregister_driver(&safexcel_pci_driver); #endif } diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index cb6c10b1bf36..56e3068c9947 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -3116,6 +3116,7 @@ static int talitos_remove(struct platform_device *ofdev) break; case CRYPTO_ALG_TYPE_AEAD: crypto_unregister_aead(&t_alg->algt.alg.aead); + break; case CRYPTO_ALG_TYPE_AHASH: crypto_unregister_ahash(&t_alg->algt.alg.hash); break; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 42b936b6bbf1..6d021ecc8d59 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1103,7 +1103,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( alloc_flags = 0; if (!offset || !*offset) return -EINVAL; - user_addr = *offset; + user_addr = untagged_addr(*offset); } else if (flags & (ALLOC_MEM_FLAGS_DOORBELL | ALLOC_MEM_FLAGS_MMIO_REMAP)) { domain = AMDGPU_GEM_DOMAIN_GTT; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index b174bd5eb38e..8ceb44925947 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -291,6 +291,8 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data, uint32_t handle; int r; + args->addr = untagged_addr(args->addr); + if (offset_in_page(args->addr | args->size)) return -EINVAL; diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c index 98bccace8c1c..3d61c4fb4dec 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c @@ 
-978,10 +978,10 @@ static int adv7511_init_cec_regmap(struct adv7511 *adv) { int ret; - adv->i2c_cec = i2c_new_secondary_device(adv->i2c_main, "cec", + adv->i2c_cec = i2c_new_ancillary_device(adv->i2c_main, "cec", ADV7511_CEC_I2C_ADDR_DEFAULT); - if (!adv->i2c_cec) - return -EINVAL; + if (IS_ERR(adv->i2c_cec)) + return PTR_ERR(adv->i2c_cec); i2c_set_clientdata(adv->i2c_cec, adv); adv->regmap_cec = devm_regmap_init_i2c(adv->i2c_cec, @@ -1162,20 +1162,20 @@ static int adv7511_probe(struct i2c_client *i2c, const struct i2c_device_id *id) adv7511_packet_disable(adv7511, 0xffff); - adv7511->i2c_edid = i2c_new_secondary_device(i2c, "edid", + adv7511->i2c_edid = i2c_new_ancillary_device(i2c, "edid", ADV7511_EDID_I2C_ADDR_DEFAULT); - if (!adv7511->i2c_edid) { - ret = -EINVAL; + if (IS_ERR(adv7511->i2c_edid)) { + ret = PTR_ERR(adv7511->i2c_edid); goto uninit_regulators; } regmap_write(adv7511->regmap, ADV7511_REG_EDID_I2C_ADDR, adv7511->i2c_edid->addr << 1); - adv7511->i2c_packet = i2c_new_secondary_device(i2c, "packet", + adv7511->i2c_packet = i2c_new_ancillary_device(i2c, "packet", ADV7511_PACKET_I2C_ADDR_DEFAULT); - if (!adv7511->i2c_packet) { - ret = -EINVAL; + if (IS_ERR(adv7511->i2c_packet)) { + ret = PTR_ERR(adv7511->i2c_packet); goto err_i2c_unregister_edid; } diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index 4cf58dbbe439..b2b076606f54 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -296,6 +296,8 @@ int radeon_gem_userptr_ioctl(struct drm_device *dev, void *data, uint32_t handle; int r; + args->addr = untagged_addr(args->addr); + if (offset_in_page(args->addr | args->size)) return -EINVAL; diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index feaa538026a0..3db000aacd26 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c @@ -174,7 +174,6 @@ via_map_blit_for_device(struct pci_dev *pdev, static void via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg) { - struct page *page; int i; switch (vsg->state) { @@ -189,13 +188,8 @@ via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg) kfree(vsg->desc_pages); /* fall through */ case dr_via_pages_locked: - for (i = 0; i < vsg->num_pages; ++i) { - if (NULL != (page = vsg->pages[i])) { - if (!PageReserved(page) && (DMA_FROM_DEVICE == vsg->direction)) - SetPageDirty(page); - put_page(page); - } - } + put_user_pages_dirty_lock(vsg->pages, vsg->num_pages, + (vsg->direction == DMA_FROM_DEVICE)); /* fall through */ case dr_via_pages_alloc: vfree(vsg->pages); diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index a958b9625bba..1ecb5124421c 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -273,6 +273,15 @@ config HID_CP2112 and gpiochip to expose these functions of the CP2112. The customizable USB descriptor fields are exposed as sysfs attributes. +config HID_CREATIVE_SB0540 + tristate "Creative SB0540 infrared receiver" + depends on USB_HID + help + Support for Creative infrared SB0540-compatible remote controls, such + as the RM-1500 and RM-1800 remotes. + + Say Y here if you want support for Creative SB0540 infrared receiver. 
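As with any HID driver, the Kconfig entry above only wires the new receiver into the build system; a hypothetical .config fragment selecting it as a module would look like this (symbol names as defined here and in the existing HID options):

	CONFIG_HID=y
	CONFIG_USB_HID=y
	CONFIG_HID_CREATIVE_SB0540=m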
+ config HID_CYPRESS tristate "Cypress mouse and barcode readers" depends on HID diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile index cc5d827c9164..0c03308cfb08 100644 --- a/drivers/hid/Makefile +++ b/drivers/hid/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_HID_ALPS) += hid-alps.o obj-$(CONFIG_HID_ACRUX) += hid-axff.o obj-$(CONFIG_HID_APPLE) += hid-apple.o obj-$(CONFIG_HID_APPLEIR) += hid-appleir.o +obj-$(CONFIG_HID_CREATIVE_SB0540) += hid-creative-sb0540.o obj-$(CONFIG_HID_ASUS) += hid-asus.o obj-$(CONFIG_HID_AUREAL) += hid-aureal.o obj-$(CONFIG_HID_BELKIN) += hid-belkin.o diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c index 81df62f48c4c..6ac8becc2372 100644 --- a/drivers/hid/hid-apple.c +++ b/drivers/hid/hid-apple.c @@ -54,7 +54,6 @@ MODULE_PARM_DESC(swap_opt_cmd, "Swap the Option (\"Alt\") and Command (\"Flag\") struct apple_sc { unsigned long quirks; unsigned int fn_on; - DECLARE_BITMAP(pressed_fn, KEY_CNT); DECLARE_BITMAP(pressed_numlock, KEY_CNT); }; @@ -181,6 +180,8 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, { struct apple_sc *asc = hid_get_drvdata(hid); const struct apple_key_translation *trans, *table; + bool do_translate; + u16 code = 0; if (usage->code == KEY_FN) { asc->fn_on = !!value; @@ -189,8 +190,6 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, } if (fnmode) { - int do_translate; - if (hid->product >= USB_DEVICE_ID_APPLE_WELLSPRING4_ANSI && hid->product <= USB_DEVICE_ID_APPLE_WELLSPRING4A_JIS) table = macbookair_fn_keys; @@ -202,25 +201,33 @@ static int hidinput_apple_event(struct hid_device *hid, struct input_dev *input, trans = apple_find_translation (table, usage->code); if (trans) { - if (test_bit(usage->code, asc->pressed_fn)) - do_translate = 1; - else if (trans->flags & APPLE_FLAG_FKEY) - do_translate = (fnmode == 2 && asc->fn_on) || - (fnmode == 1 && !asc->fn_on); - else - do_translate = asc->fn_on; + if (test_bit(trans->from, input->key)) + code = trans->from; + else if (test_bit(trans->to, input->key)) + code = trans->to; - if (do_translate) { - if (value) - set_bit(usage->code, asc->pressed_fn); - else - clear_bit(usage->code, asc->pressed_fn); + if (!code) { + if (trans->flags & APPLE_FLAG_FKEY) { + switch (fnmode) { + case 1: + do_translate = !asc->fn_on; + break; + case 2: + do_translate = asc->fn_on; + break; + default: + /* should never happen */ + do_translate = false; + } + } else { + do_translate = asc->fn_on; + } - input_event(input, usage->type, trans->to, - value); - - return 1; + code = do_translate ? trans->to : trans->from; } + + input_event(input, usage->type, code, value); + return 1; } if (asc->quirks & APPLE_NUMLOCK_EMULATION && diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 210b81a56e1a..3eaee2c37931 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1311,8 +1311,8 @@ u32 hid_field_extract(const struct hid_device *hid, u8 *report, unsigned offset, unsigned n) { if (n > 32) { - hid_warn(hid, "hid_field_extract() called with n (%d) > 32! (%s)\n", - n, current->comm); + hid_warn_once(hid, "%s() called with n (%d) > 32! 
(%s)\n", + __func__, n, current->comm); n = 32; } diff --git a/drivers/hid/hid-cougar.c b/drivers/hid/hid-cougar.c index e0bb7b34f3a4..4ff3bc1d25e2 100644 --- a/drivers/hid/hid-cougar.c +++ b/drivers/hid/hid-cougar.c @@ -207,7 +207,7 @@ static int cougar_probe(struct hid_device *hdev, error = hid_parse(hdev); if (error) { hid_err(hdev, "parse failed\n"); - goto fail; + return error; } if (hdev->collection->usage == COUGAR_VENDOR_USAGE) { @@ -219,7 +219,7 @@ static int cougar_probe(struct hid_device *hdev, error = hid_hw_start(hdev, connect_mask); if (error) { hid_err(hdev, "hw start failed\n"); - goto fail; + return error; } error = cougar_bind_shared_data(hdev, cougar); @@ -249,8 +249,6 @@ static int cougar_probe(struct hid_device *hdev, fail_stop_and_cleanup: hid_hw_stop(hdev); -fail: - hid_set_drvdata(hdev, NULL); return error; } diff --git a/drivers/hid/hid-creative-sb0540.c b/drivers/hid/hid-creative-sb0540.c new file mode 100644 index 000000000000..b4c8e7a5d3e0 --- /dev/null +++ b/drivers/hid/hid-creative-sb0540.c @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HID driver for the Creative SB0540 receiver + * + * Copyright (C) 2019 Red Hat Inc. All Rights Reserved + * + */ + +#include +#include +#include +#include "hid-ids.h" + +MODULE_AUTHOR("Bastien Nocera "); +MODULE_DESCRIPTION("HID Creative SB0540 receiver"); +MODULE_LICENSE("GPL"); + +static const unsigned short creative_sb0540_key_table[] = { + KEY_POWER, + KEY_RESERVED, /* text: 24bit */ + KEY_RESERVED, /* 24bit wheel up */ + KEY_RESERVED, /* 24bit wheel down */ + KEY_RESERVED, /* text: CMSS */ + KEY_RESERVED, /* CMSS wheel Up */ + KEY_RESERVED, /* CMSS wheel Down */ + KEY_RESERVED, /* text: EAX */ + KEY_RESERVED, /* EAX wheel up */ + KEY_RESERVED, /* EAX wheel down */ + KEY_RESERVED, /* text: 3D Midi */ + KEY_RESERVED, /* 3D Midi wheel up */ + KEY_RESERVED, /* 3D Midi wheel down */ + KEY_MUTE, + KEY_VOLUMEUP, + KEY_VOLUMEDOWN, + KEY_UP, + KEY_LEFT, + KEY_RIGHT, + KEY_REWIND, + KEY_OK, + KEY_FASTFORWARD, + KEY_DOWN, + KEY_AGAIN, /* text: Return, symbol: Jump to */ + KEY_PLAY, /* text: Start */ + KEY_ESC, /* text: Cancel */ + KEY_RECORD, + KEY_OPTION, + KEY_MENU, /* text: Display */ + KEY_PREVIOUS, + KEY_PLAYPAUSE, + KEY_NEXT, + KEY_SLOW, + KEY_STOP, + KEY_NUMERIC_1, + KEY_NUMERIC_2, + KEY_NUMERIC_3, + KEY_NUMERIC_4, + KEY_NUMERIC_5, + KEY_NUMERIC_6, + KEY_NUMERIC_7, + KEY_NUMERIC_8, + KEY_NUMERIC_9, + KEY_NUMERIC_0 +}; + +/* + * Codes and keys from lirc's + * remotes/creative/lircd.conf.alsa_usb + * order and size must match creative_sb0540_key_table[] above + */ +static const unsigned short creative_sb0540_codes[] = { + 0x619E, + 0x916E, + 0x926D, + 0x936C, + 0x718E, + 0x946B, + 0x956A, + 0x8C73, + 0x9669, + 0x9768, + 0x9867, + 0x9966, + 0x9A65, + 0x6E91, + 0x629D, + 0x639C, + 0x7B84, + 0x6B94, + 0x728D, + 0x8778, + 0x817E, + 0x758A, + 0x8D72, + 0x8E71, + 0x8877, + 0x7C83, + 0x738C, + 0x827D, + 0x7689, + 0x7F80, + 0x7986, + 0x7A85, + 0x7D82, + 0x857A, + 0x8B74, + 0x8F70, + 0x906F, + 0x8A75, + 0x847B, + 0x7887, + 0x8976, + 0x837C, + 0x7788, + 0x807F +}; + +struct creative_sb0540 { + struct input_dev *input_dev; + struct hid_device *hid; + unsigned short keymap[ARRAY_SIZE(creative_sb0540_key_table)]; +}; + +static inline u64 reverse(u64 data, int bits) +{ + int i; + u64 c; + + c = 0; + for (i = 0; i < bits; i++) { + c |= (u64) (((data & (((u64) 1) << i)) ? 
1 : 0)) + << (bits - 1 - i); + } + return (c); +} + +static int get_key(struct creative_sb0540 *creative_sb0540, u64 keycode) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(creative_sb0540_codes); i++) { + if (creative_sb0540_codes[i] == keycode) + return creative_sb0540->keymap[i]; + } + + return 0; + +} + +static int creative_sb0540_raw_event(struct hid_device *hid, + struct hid_report *report, u8 *data, int len) +{ + struct creative_sb0540 *creative_sb0540 = hid_get_drvdata(hid); + u64 code, main_code; + int key; + + if (len != 6) + return 0; + + /* From daemons/hw_hiddev.c sb0540_rec() in lirc */ + code = reverse(data[5], 8); + main_code = (code << 8) + ((~code) & 0xff); + + /* + * Flip to get values in the same format as + * remotes/creative/lircd.conf.alsa_usb in lirc + */ + main_code = ((main_code & 0xff) << 8) + + ((main_code & 0xff00) >> 8); + + key = get_key(creative_sb0540, main_code); + if (key == 0 || key == KEY_RESERVED) { + hid_err(hid, "Could not get a key for main_code %llX\n", + main_code); + return 0; + } + + input_report_key(creative_sb0540->input_dev, key, 1); + input_report_key(creative_sb0540->input_dev, key, 0); + input_sync(creative_sb0540->input_dev); + + /* let hidraw and hiddev handle the report */ + return 0; +} + +static int creative_sb0540_input_configured(struct hid_device *hid, + struct hid_input *hidinput) +{ + struct input_dev *input_dev = hidinput->input; + struct creative_sb0540 *creative_sb0540 = hid_get_drvdata(hid); + int i; + + creative_sb0540->input_dev = input_dev; + + input_dev->keycode = creative_sb0540->keymap; + input_dev->keycodesize = sizeof(unsigned short); + input_dev->keycodemax = ARRAY_SIZE(creative_sb0540->keymap); + + input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP); + + memcpy(creative_sb0540->keymap, creative_sb0540_key_table, + sizeof(creative_sb0540->keymap)); + for (i = 0; i < ARRAY_SIZE(creative_sb0540_key_table); i++) + set_bit(creative_sb0540->keymap[i], input_dev->keybit); + clear_bit(KEY_RESERVED, input_dev->keybit); + + return 0; +} + +static int creative_sb0540_input_mapping(struct hid_device *hid, + struct hid_input *hi, struct hid_field *field, + struct hid_usage *usage, unsigned long **bit, int *max) +{ + /* + * We are remapping the keys ourselves, so ignore the hid-input + * keymap processing. 
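hid-input treats a negative return from input_mapping as "ignore this usage", so no default key mapping is created and the raw_event handler above keeps full control of the translation. A minimal sketch of a callback relying on that convention (hypothetical driver name; signature as in struct hid_driver):

	static int example_input_mapping(struct hid_device *hdev,
					 struct hid_input *hi,
					 struct hid_field *field,
					 struct hid_usage *usage,
					 unsigned long **bit, int *max)
	{
		return -1;	/* negative: drop the usage, no default mapping */
	}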
+ */ + return -1; +} + +static int creative_sb0540_probe(struct hid_device *hid, + const struct hid_device_id *id) +{ + int ret; + struct creative_sb0540 *creative_sb0540; + + creative_sb0540 = devm_kzalloc(&hid->dev, + sizeof(struct creative_sb0540), GFP_KERNEL); + + if (!creative_sb0540) + return -ENOMEM; + + creative_sb0540->hid = hid; + + /* force input as some remotes bypass the input registration */ + hid->quirks |= HID_QUIRK_HIDINPUT_FORCE; + + hid_set_drvdata(hid, creative_sb0540); + + ret = hid_parse(hid); + if (ret) { + hid_err(hid, "parse failed\n"); + return ret; + } + + ret = hid_hw_start(hid, HID_CONNECT_DEFAULT); + if (ret) { + hid_err(hid, "hw start failed\n"); + return ret; + } + + return ret; +} + +static const struct hid_device_id creative_sb0540_devices[] = { + { HID_USB_DEVICE(USB_VENDOR_ID_CREATIVELABS, USB_DEVICE_ID_CREATIVE_SB0540) }, + { } +}; +MODULE_DEVICE_TABLE(hid, creative_sb0540_devices); + +static struct hid_driver creative_sb0540_driver = { + .name = "creative-sb0540", + .id_table = creative_sb0540_devices, + .raw_event = creative_sb0540_raw_event, + .input_configured = creative_sb0540_input_configured, + .probe = creative_sb0540_probe, + .input_mapping = creative_sb0540_input_mapping, +}; +module_hid_driver(creative_sb0540_driver); diff --git a/drivers/hid/hid-gfrm.c b/drivers/hid/hid-gfrm.c index 86c317320bf2..699186ff2349 100644 --- a/drivers/hid/hid-gfrm.c +++ b/drivers/hid/hid-gfrm.c @@ -123,12 +123,6 @@ static int gfrm_probe(struct hid_device *hdev, const struct hid_device_id *id) return ret; } -static void gfrm_remove(struct hid_device *hdev) -{ - hid_hw_stop(hdev); - hid_set_drvdata(hdev, NULL); -} - static const struct hid_device_id gfrm_devices[] = { { HID_BLUETOOTH_DEVICE(0x58, 0x2000), .driver_data = GFRM100 }, @@ -142,7 +136,6 @@ static struct hid_driver gfrm_driver = { .name = "gfrm", .id_table = gfrm_devices, .probe = gfrm_probe, - .remove = gfrm_remove, .input_mapping = gfrm_input_mapping, .raw_event = gfrm_raw_event, .input_configured = gfrm_input_configured, diff --git a/drivers/hid/hid-hyperv.c b/drivers/hid/hid-hyperv.c index 7795831d37c2..cc5b09b87ab0 100644 --- a/drivers/hid/hid-hyperv.c +++ b/drivers/hid/hid-hyperv.c @@ -104,8 +104,8 @@ struct synthhid_input_report { #pragma pack(pop) -#define INPUTVSC_SEND_RING_BUFFER_SIZE (10*PAGE_SIZE) -#define INPUTVSC_RECV_RING_BUFFER_SIZE (10*PAGE_SIZE) +#define INPUTVSC_SEND_RING_BUFFER_SIZE (40 * 1024) +#define INPUTVSC_RECV_RING_BUFFER_SIZE (40 * 1024) enum pipe_prot_msg_type { diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 0a00be19f7a0..76969a22b0f2 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -314,6 +314,7 @@ #define USB_VENDOR_ID_CREATIVELABS 0x041e #define USB_DEVICE_ID_CREATIVE_SB_OMNI_SURROUND_51 0x322c #define USB_DEVICE_ID_PRODIKEYS_PCMIDI 0x2801 +#define USB_DEVICE_ID_CREATIVE_SB0540 0x3100 #define USB_VENDOR_ID_CVTOUCH 0x1ff7 #define USB_DEVICE_ID_CVTOUCH_SCREEN 0x0013 @@ -568,6 +569,7 @@ #define USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0B4A 0x0b4a #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE 0x134a #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_094A 0x094a +#define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0941 0x0941 #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0641 0x0641 #define USB_VENDOR_ID_HUION 0x256c @@ -769,7 +771,8 @@ #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER 0xc52f #define USB_DEVICE_ID_LOGITECH_UNIFYING_RECEIVER_2 0xc532 #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_2 0xc534 -#define 
USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED 0xc539 +#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1 0xc539 +#define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_1 0xc53f #define USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_POWERPLAY 0xc53a #define USB_DEVICE_ID_SPACETRAVELLER 0xc623 #define USB_DEVICE_ID_SPACENAVIGATOR 0xc626 diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c index 364bc7f11d9d..96fa2a2c2cd3 100644 --- a/drivers/hid/hid-lenovo.c +++ b/drivers/hid/hid-lenovo.c @@ -866,8 +866,6 @@ static void lenovo_remove_tpkbd(struct hid_device *hdev) led_classdev_unregister(&data_pointer->led_micmute); led_classdev_unregister(&data_pointer->led_mute); - - hid_set_drvdata(hdev, NULL); } static void lenovo_remove_cptkbd(struct hid_device *hdev) diff --git a/drivers/hid/hid-lg.c b/drivers/hid/hid-lg.c index 5008a3dc28f4..0dc7cdfc56f7 100644 --- a/drivers/hid/hid-lg.c +++ b/drivers/hid/hid-lg.c @@ -818,7 +818,7 @@ static int lg_probe(struct hid_device *hdev, const struct hid_device_id *id) if (!buf) { ret = -ENOMEM; - goto err_free; + goto err_stop; } ret = hid_hw_raw_request(hdev, buf[0], buf, sizeof(cbuf), @@ -850,9 +850,12 @@ static int lg_probe(struct hid_device *hdev, const struct hid_device_id *id) ret = lg4ff_init(hdev); if (ret) - goto err_free; + goto err_stop; return 0; + +err_stop: + hid_hw_stop(hdev); err_free: kfree(drv_data); return ret; @@ -863,8 +866,7 @@ static void lg_remove(struct hid_device *hdev) struct lg_drv_data *drv_data = hid_get_drvdata(hdev); if (drv_data->quirks & LG_FF4) lg4ff_deinit(hdev); - else - hid_hw_stop(hdev); + hid_hw_stop(hdev); kfree(drv_data); } diff --git a/drivers/hid/hid-lg4ff.c b/drivers/hid/hid-lg4ff.c index cefba038520c..03f0220062ca 100644 --- a/drivers/hid/hid-lg4ff.c +++ b/drivers/hid/hid-lg4ff.c @@ -1477,7 +1477,6 @@ int lg4ff_deinit(struct hid_device *hid) } } #endif - hid_hw_stop(hid); drv_data->device_props = NULL; kfree(entry); diff --git a/drivers/hid/hid-logitech-dj.c b/drivers/hid/hid-logitech-dj.c index cc47f948c1d0..bb50d6e7745b 100644 --- a/drivers/hid/hid-logitech-dj.c +++ b/drivers/hid/hid-logitech-dj.c @@ -380,9 +380,9 @@ static const char consumer_descriptor[] = { 0x75, 0x10, /* REPORT_SIZE (16) */ 0x95, 0x02, /* REPORT_COUNT (2) */ 0x15, 0x01, /* LOGICAL_MIN (1) */ - 0x26, 0x8C, 0x02, /* LOGICAL_MAX (652) */ + 0x26, 0xFF, 0x02, /* LOGICAL_MAX (767) */ 0x19, 0x01, /* USAGE_MIN (1) */ - 0x2A, 0x8C, 0x02, /* USAGE_MAX (652) */ + 0x2A, 0xFF, 0x02, /* USAGE_MAX (767) */ 0x81, 0x00, /* INPUT (Data Ary Abs) */ 0xC0, /* END_COLLECTION */ }; /* */ @@ -959,6 +959,7 @@ static void logi_hidpp_recv_queue_notif(struct hid_device *hdev, break; case 0x07: device_type = "eQUAD step 4 Gaming"; + logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); break; case 0x08: device_type = "eQUAD step 4 for gamepads"; @@ -968,7 +969,12 @@ static void logi_hidpp_recv_queue_notif(struct hid_device *hdev, logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); break; case 0x0c: - device_type = "eQUAD Lightspeed"; + device_type = "eQUAD Lightspeed 1"; + logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); + workitem.reports_supported |= STD_KEYBOARD; + break; + case 0x0d: + device_type = "eQUAD Lightspeed 1_1"; logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); workitem.reports_supported |= STD_KEYBOARD; break; @@ -1734,14 +1740,14 @@ static int logi_dj_probe(struct hid_device *hdev, if (retval < 0) { hid_err(hdev, "%s: logi_dj_recv_query_paired_devices error:%d\n", __func__, retval); - goto 
logi_dj_recv_query_paired_devices_failed; + /* + * This can happen with a KVM, let the probe succeed, + * logi_dj_recv_queue_unknown_work will retry later. + */ } } - return retval; - -logi_dj_recv_query_paired_devices_failed: - hid_hw_close(hdev); + return 0; llopen_failed: switch_to_dj_mode_fail: @@ -1832,9 +1838,17 @@ static const struct hid_device_id logi_dj_receivers[] = { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_2), .driver_data = recvr_type_hidpp}, + { /* Logitech G700(s) receiver (0xc531) */ + HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, + 0xc531), + .driver_data = recvr_type_gaming_hidpp}, { /* Logitech lightspeed receiver (0xc539) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, - USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED), + USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1), + .driver_data = recvr_type_gaming_hidpp}, + { /* Logitech lightspeed receiver (0xc53f) */ + HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, + USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_1), .driver_data = recvr_type_gaming_hidpp}, { /* Logitech 27 MHz HID++ 1.0 receiver (0xc513) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_MX3000_RECEIVER), diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index b603c14d043b..3cfeb1629f79 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -68,6 +68,7 @@ MODULE_LICENSE("GPL"); #define MT_QUIRK_STICKY_FINGERS BIT(16) #define MT_QUIRK_ASUS_CUSTOM_UP BIT(17) #define MT_QUIRK_WIN8_PTP_BUTTONS BIT(18) +#define MT_QUIRK_SEPARATE_APP_REPORT BIT(19) #define MT_INPUTMODE_TOUCHSCREEN 0x02 #define MT_INPUTMODE_TOUCHPAD 0x03 @@ -103,6 +104,7 @@ struct mt_usages { struct mt_application { struct list_head list; unsigned int application; + unsigned int report_id; struct list_head mt_usages; /* mt usages list */ __s32 quirks; @@ -203,6 +205,7 @@ static void mt_post_parse(struct mt_device *td, struct mt_application *app); #define MT_CLS_VTL 0x0110 #define MT_CLS_GOOGLE 0x0111 #define MT_CLS_RAZER_BLADE_STEALTH 0x0112 +#define MT_CLS_SMART_TECH 0x0113 #define MT_DEFAULT_MAXCONTACT 10 #define MT_MAX_MAXCONTACT 250 @@ -263,7 +266,8 @@ static const struct mt_class mt_classes[] = { MT_QUIRK_HOVERING | MT_QUIRK_CONTACT_CNT_ACCURATE | MT_QUIRK_STICKY_FINGERS | - MT_QUIRK_WIN8_PTP_BUTTONS }, + MT_QUIRK_WIN8_PTP_BUTTONS, + .export_all_inputs = true }, { .name = MT_CLS_EXPORT_ALL_INPUTS, .quirks = MT_QUIRK_ALWAYS_VALID | MT_QUIRK_CONTACT_CNT_ACCURATE, @@ -353,6 +357,12 @@ static const struct mt_class mt_classes[] = { MT_QUIRK_CONTACT_CNT_ACCURATE | MT_QUIRK_WIN8_PTP_BUTTONS, }, + { .name = MT_CLS_SMART_TECH, + .quirks = MT_QUIRK_ALWAYS_VALID | + MT_QUIRK_IGNORE_DUPLICATES | + MT_QUIRK_CONTACT_CNT_ACCURATE | + MT_QUIRK_SEPARATE_APP_REPORT, + }, { } }; @@ -509,8 +519,9 @@ static struct mt_usages *mt_allocate_usage(struct hid_device *hdev, } static struct mt_application *mt_allocate_application(struct mt_device *td, - unsigned int application) + struct hid_report *report) { + unsigned int application = report->application; struct mt_application *mt_application; mt_application = devm_kzalloc(&td->hdev->dev, sizeof(*mt_application), @@ -535,6 +546,7 @@ static struct mt_application *mt_allocate_application(struct mt_device *td, mt_application->scantime = DEFAULT_ZERO; mt_application->raw_cc = DEFAULT_ZERO; mt_application->quirks = td->mtclass.quirks; + mt_application->report_id = report->id; list_add_tail(&mt_application->list, &td->applications); @@ -542,19 +554,23 @@ static struct mt_application 
*mt_allocate_application(struct mt_device *td, } static struct mt_application *mt_find_application(struct mt_device *td, - unsigned int application) + struct hid_report *report) { + unsigned int application = report->application; struct mt_application *tmp, *mt_application = NULL; list_for_each_entry(tmp, &td->applications, list) { if (application == tmp->application) { - mt_application = tmp; - break; + if (!(td->mtclass.quirks & MT_QUIRK_SEPARATE_APP_REPORT) || + tmp->report_id == report->id) { + mt_application = tmp; + break; + } } } if (!mt_application) - mt_application = mt_allocate_application(td, application); + mt_application = mt_allocate_application(td, report); return mt_application; } @@ -571,7 +587,7 @@ static struct mt_report_data *mt_allocate_report_data(struct mt_device *td, return NULL; rdata->report = report; - rdata->application = mt_find_application(td, report->application); + rdata->application = mt_find_application(td, report); if (!rdata->application) { devm_kfree(&td->hdev->dev, rdata); @@ -1561,6 +1577,9 @@ static int mt_input_configured(struct hid_device *hdev, struct hid_input *hi) case HID_VD_ASUS_CUSTOM_MEDIA_KEYS: suffix = "Custom Media Keys"; break; + case HID_DG_PEN: + suffix = "Stylus"; + break; default: suffix = "UNKNOWN"; break; @@ -2022,6 +2041,10 @@ static const struct hid_device_id mt_devices[] = { HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8, USB_VENDOR_ID_SYNAPTICS, 0x8323) }, + /* Smart Tech panels */ + { .driver_data = MT_CLS_SMART_TECH, + MT_USB_DEVICE(0x0b8c, 0x0092)}, + /* Stantum panels */ { .driver_data = MT_CLS_CONFIDENCE, MT_USB_DEVICE(USB_VENDOR_ID_STANTUM_STM, diff --git a/drivers/hid/hid-picolcd_core.c b/drivers/hid/hid-picolcd_core.c index 5f7a39a5d4af..1b5c63241af0 100644 --- a/drivers/hid/hid-picolcd_core.c +++ b/drivers/hid/hid-picolcd_core.c @@ -534,8 +534,7 @@ static int picolcd_probe(struct hid_device *hdev, data = kzalloc(sizeof(struct picolcd_data), GFP_KERNEL); if (data == NULL) { hid_err(hdev, "can't allocate space for Minibox PicoLCD device data\n"); - error = -ENOMEM; - goto err_no_cleanup; + return -ENOMEM; } spin_lock_init(&data->lock); @@ -597,9 +596,6 @@ static int picolcd_probe(struct hid_device *hdev, hid_hw_stop(hdev); err_cleanup_data: kfree(data); -err_no_cleanup: - hid_set_drvdata(hdev, NULL); - return error; } @@ -635,7 +631,6 @@ static void picolcd_remove(struct hid_device *hdev) picolcd_exit_cir(data); picolcd_exit_keys(data); - hid_set_drvdata(hdev, NULL); mutex_destroy(&data->mutex); /* Finally, clean up the picolcd data itself */ kfree(data); diff --git a/drivers/hid/hid-prodikeys.c b/drivers/hid/hid-prodikeys.c index 21544ebff855..5a3b3d974d84 100644 --- a/drivers/hid/hid-prodikeys.c +++ b/drivers/hid/hid-prodikeys.c @@ -551,10 +551,14 @@ static void pcmidi_setup_extra_keys( static int pcmidi_set_operational(struct pcmidi_snd *pm) { + int rc; + if (pm->ifnum != 1) return 0; /* only set up ONCE for interface 1 */ - pcmidi_get_output_report(pm); + rc = pcmidi_get_output_report(pm); + if (rc < 0) + return rc; pcmidi_submit_output_report(pm, 0xc1); return 0; } @@ -683,7 +687,11 @@ static int pcmidi_snd_initialise(struct pcmidi_snd *pm) spin_lock_init(&pm->rawmidi_in_lock); init_sustain_timers(pm); - pcmidi_set_operational(pm); + err = pcmidi_set_operational(pm); + if (err < 0) { + pk_error("failed to find output report\n"); + goto fail_register; + } /* register it */ err = snd_card_register(card); diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index 166f41f3173b..c50bcd967d99 100644 ---
a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -92,6 +92,7 @@ static const struct hid_device_id hid_quirks[] = { { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0B4A), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_094A), HID_QUIRK_ALWAYS_POLL }, + { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0941), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0641), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_IDEACOM, USB_DEVICE_ID_IDEACOM_IDC6680), HID_QUIRK_MULTI_INPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_INNOMEDIA, USB_DEVICE_ID_INNEX_GENESIS_ATARI), HID_QUIRK_MULTI_INPUT }, diff --git a/drivers/hid/hid-sensor-hub.c b/drivers/hid/hid-sensor-hub.c index be92a6f79687..94c7398b5c27 100644 --- a/drivers/hid/hid-sensor-hub.c +++ b/drivers/hid/hid-sensor-hub.c @@ -742,7 +742,6 @@ static void sensor_hub_remove(struct hid_device *hdev) } spin_unlock_irqrestore(&data->lock, flags); mfd_remove_devices(&hdev->dev); - hid_set_drvdata(hdev, NULL); mutex_destroy(&data->mutex); } diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 49dd2d905c7f..73c0f7a95e2d 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -2811,7 +2811,6 @@ static int sony_input_configured(struct hid_device *hdev, sony_cancel_work_sync(sc); sony_remove_dev_list(sc); sony_release_device_id(sc); - hid_hw_stop(hdev); return ret; } @@ -2876,6 +2875,7 @@ static int sony_probe(struct hid_device *hdev, const struct hid_device_id *id) */ if (!(hdev->claimed & HID_CLAIMED_INPUT)) { hid_err(hdev, "failed to claim input\n"); + hid_hw_stop(hdev); return -ENODEV; } diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 006bd6f4f653..bbc6ec1aa5cb 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -252,7 +252,7 @@ static __poll_t hidraw_poll(struct file *file, poll_table *wait) poll_wait(file, &list->hidraw->wait, wait); if (list->head != list->tail) - return EPOLLIN | EPOLLRDNORM; + return EPOLLIN | EPOLLRDNORM | EPOLLOUT; if (!list->hidraw->exist) return EPOLLERR | EPOLLHUP; return 0; @@ -370,7 +370,7 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd, mutex_lock(&minors_lock); dev = hidraw_table[minor]; - if (!dev) { + if (!dev || !dev->exist) { ret = -ENODEV; goto out; } diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c index 90164fed08d3..2a7c6e33bb1c 100644 --- a/drivers/hid/i2c-hid/i2c-hid-core.c +++ b/drivers/hid/i2c-hid/i2c-hid-core.c @@ -169,9 +169,7 @@ static const struct i2c_hid_quirks { __u16 idProduct; __u32 quirks; } i2c_hid_quirks[] = { - { USB_VENDOR_ID_WEIDA, USB_DEVICE_ID_WEIDA_8752, - I2C_HID_QUIRK_SET_PWR_WAKEUP_DEV }, - { USB_VENDOR_ID_WEIDA, USB_DEVICE_ID_WEIDA_8755, + { USB_VENDOR_ID_WEIDA, HID_ANY_ID, I2C_HID_QUIRK_SET_PWR_WAKEUP_DEV }, { I2C_VENDOR_ID_HANTICK, I2C_PRODUCT_ID_HANTICK_5288, I2C_HID_QUIRK_NO_IRQ_AFTER_RESET | diff --git a/drivers/hid/intel-ish-hid/ipc/hw-ish.h b/drivers/hid/intel-ish-hid/ipc/hw-ish.h index 5792a104000a..6c1e6110867f 100644 --- a/drivers/hid/intel-ish-hid/ipc/hw-ish.h +++ b/drivers/hid/intel-ish-hid/ipc/hw-ish.h @@ -78,5 +78,6 @@ irqreturn_t ish_irq_handler(int irq, void *dev_id); struct ishtp_device *ish_dev_init(struct pci_dev *pdev); int ish_hw_start(struct ishtp_device *dev); void 
ish_device_disable(struct ishtp_device *dev); +int ish_disable_dma(struct ishtp_device *dev); #endif /* _ISHTP_HW_ISH_H_ */ diff --git a/drivers/hid/intel-ish-hid/ipc/ipc.c b/drivers/hid/intel-ish-hid/ipc/ipc.c index 18fe8af89aad..8f8dfdf64833 100644 --- a/drivers/hid/intel-ish-hid/ipc/ipc.c +++ b/drivers/hid/intel-ish-hid/ipc/ipc.c @@ -672,7 +672,7 @@ irqreturn_t ish_irq_handler(int irq, void *dev_id) * * Return: 0 for success else error code. */ -static int ish_disable_dma(struct ishtp_device *dev) +int ish_disable_dma(struct ishtp_device *dev) { unsigned int dma_delay; diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index 279567baca3d..784dcc8c7022 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #define CREATE_TRACE_POINTS @@ -98,6 +99,11 @@ static const struct pci_device_id ish_invalid_pci_ids[] = { {} }; +static inline bool ish_should_enter_d0i3(struct pci_dev *pdev) +{ + return !pm_suspend_via_firmware() || pdev->device == CHV_DEVICE_ID; +} + /** * ish_probe() - PCI driver probe callback * @pdev: pci device @@ -148,7 +154,6 @@ static int ish_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* mapping IO device memory */ hw->mem_addr = pcim_iomap_table(pdev)[0]; ishtp->pdev = pdev; - pdev->dev_flags |= PCI_DEV_FLAGS_NO_D3; /* request and enable interrupt */ ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); @@ -185,7 +190,6 @@ static void ish_remove(struct pci_dev *pdev) struct ishtp_device *ishtp_dev = pci_get_drvdata(pdev); ishtp_bus_remove_all_clients(ishtp_dev, false); - pdev->dev_flags &= ~PCI_DEV_FLAGS_NO_D3; ish_device_disable(ishtp_dev); } @@ -207,17 +211,13 @@ static void __maybe_unused ish_resume_handler(struct work_struct *work) { struct pci_dev *pdev = to_pci_dev(ish_resume_device); struct ishtp_device *dev = pci_get_drvdata(pdev); - uint32_t fwsts; int ret; - /* Get ISH FW status */ - fwsts = IPC_GET_ISH_FWSTS(dev->ops->get_fw_status(dev)); + /* Check the NO_D3 flag to distinguish the resume paths */ + if (pdev->dev_flags & PCI_DEV_FLAGS_NO_D3) { + pdev->dev_flags &= ~PCI_DEV_FLAGS_NO_D3; + disable_irq_wake(pdev->irq); - /* - * If currently, in ISH FW, sensor app is loaded or beyond that, - * it means ISH isn't powered off, in this case, send a resume message. - */ - if (fwsts >= FWSTS_SENSOR_APP_LOADED) { ishtp_send_resume(dev); /* Waiting to get resume response */ @@ -225,16 +225,20 @@ static void __maybe_unused ish_resume_handler(struct work_struct *work) ret = wait_event_interruptible_timeout(dev->resume_wait, !dev->resume_flag, msecs_to_jiffies(WAIT_FOR_RESUME_ACK_MS)); - } - /* - * If in ISH FW, sensor app isn't loaded yet, or no resume response. - * That means this platform is not S0ix compatible, or something is - * wrong with ISH FW. So on resume, full reboot of ISH processor will - * happen, so need to go through init sequence again. - */ - if (dev->resume_flag) + /* + * If the flag is not cleared, something is wrong with ISH FW. + * So on resume, need to go through init sequence again. + */ + if (dev->resume_flag) + ish_init(dev); + } else { + /* + * Resume from the D3, full reboot of ISH processor will happen, + * so need to go through init sequence again. 
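Read together, the suspend and resume callbacks use PCI_DEV_FLAGS_NO_D3 as a one-bit message to their future selves; a condensed sketch of the round trip as this hunk arranges it (error handling omitted):

	/* suspend, D0i3 case: leave a marker and arm the wake IRQ */
	pdev->dev_flags |= PCI_DEV_FLAGS_NO_D3;
	enable_irq_wake(pdev->irq);

	/* resume: the marker tells the two paths apart */
	if (pdev->dev_flags & PCI_DEV_FLAGS_NO_D3) {
		pdev->dev_flags &= ~PCI_DEV_FLAGS_NO_D3;
		disable_irq_wake(pdev->irq);
		/* ISH stayed in D0i3: just send a resume message */
	} else {
		/* full D3 exit: ISH firmware rebooted, re-run init */
	}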
+ */ ish_init(dev); + } } /** @@ -250,23 +254,43 @@ static int __maybe_unused ish_suspend(struct device *device) struct pci_dev *pdev = to_pci_dev(device); struct ishtp_device *dev = pci_get_drvdata(pdev); - enable_irq_wake(pdev->irq); - /* - * If previous suspend hasn't been answered then ISH is likely dead, - * don't attempt nested notification - */ - if (dev->suspend_flag) - return 0; + if (ish_should_enter_d0i3(pdev)) { + /* + * If previous suspend hasn't been answered then ISH is likely + * dead, don't attempt nested notification + */ + if (dev->suspend_flag) + return 0; - dev->resume_flag = 0; - dev->suspend_flag = 1; - ishtp_send_suspend(dev); + dev->resume_flag = 0; + dev->suspend_flag = 1; + ishtp_send_suspend(dev); - /* 25 ms should be enough for live ISH to flush all IPC buf */ - if (dev->suspend_flag) - wait_event_interruptible_timeout(dev->suspend_wait, - !dev->suspend_flag, - msecs_to_jiffies(25)); + /* 25 ms should be enough for live ISH to flush all IPC buf */ + if (dev->suspend_flag) + wait_event_interruptible_timeout(dev->suspend_wait, + !dev->suspend_flag, + msecs_to_jiffies(25)); + + if (dev->suspend_flag) { + /* + * It looks like FW halt, clear the DMA bit, and put + * ISH into D3, and FW would reset on resume. + */ + ish_disable_dma(dev); + } else { + /* Set the NO_D3 flag, the ISH would enter D0i3 */ + pdev->dev_flags |= PCI_DEV_FLAGS_NO_D3; + + enable_irq_wake(pdev->irq); + } + } else { + /* + * Clear the DMA bit before putting ISH into D3, + * or ISH FW would reset automatically. + */ + ish_disable_dma(dev); + } return 0; } @@ -288,7 +312,6 @@ static int __maybe_unused ish_resume(struct device *device) ish_resume_device = device; dev->resume_flag = 1; - disable_irq_wake(pdev->irq); schedule_work(&resume_work); return 0; diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index 4e11cc6fc34b..1f9bc4483465 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -428,7 +428,7 @@ static __poll_t hiddev_poll(struct file *file, poll_table *wait) poll_wait(file, &list->hiddev->wait, wait); if (list->head != list->tail) - return EPOLLIN | EPOLLRDNORM; + return EPOLLIN | EPOLLRDNORM | EPOLLOUT; if (!list->hiddev->exist) return EPOLLERR | EPOLLHUP; return 0; diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 53bddb50aeba..5ded94b7bf68 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -88,7 +88,7 @@ static void wacom_wac_queue_flush(struct hid_device *hdev, } static int wacom_wac_pen_serial_enforce(struct hid_device *hdev, - struct hid_report *report, u8 *raw_data, int size) + struct hid_report *report, u8 *raw_data, int report_size) { struct wacom *wacom = hid_get_drvdata(hdev); struct wacom_wac *wacom_wac = &wacom->wacom_wac; @@ -149,7 +149,8 @@ static int wacom_wac_pen_serial_enforce(struct hid_device *hdev, if (flush) wacom_wac_queue_flush(hdev, &wacom_wac->pen_fifo); else if (insert) - wacom_wac_queue_insert(hdev, &wacom_wac->pen_fifo, raw_data, size); + wacom_wac_queue_insert(hdev, &wacom_wac->pen_fifo, + raw_data, report_size); return insert && !flush; } @@ -2176,7 +2177,7 @@ static void wacom_update_name(struct wacom *wacom, const char *suffix) { struct wacom_wac *wacom_wac = &wacom->wacom_wac; struct wacom_features *features = &wacom_wac->features; - char name[WACOM_NAME_MAX]; + char name[WACOM_NAME_MAX - 20]; /* Leave some room for suffixes */ /* Generic devices name unspecified */ if ((features->type == HID_GENERIC) && !strcmp("Wacom HID", features->name)) { @@ -2718,14 +2719,12 @@ static
int wacom_probe(struct hid_device *hdev, wacom_wac->features = *((struct wacom_features *)id->driver_data); features = &wacom_wac->features; - if (features->check_for_hid_type && features->hid_type != hdev->type) { - error = -ENODEV; - goto fail; - } + if (features->check_for_hid_type && features->hid_type != hdev->type) + return -ENODEV; error = kfifo_alloc(&wacom_wac->pen_fifo, WACOM_PKGLEN_MAX, GFP_KERNEL); if (error) - goto fail; + return error; wacom_wac->hid_data.inputmode = -1; wacom_wac->mode_report = -1; @@ -2743,12 +2742,12 @@ static int wacom_probe(struct hid_device *hdev, error = hid_parse(hdev); if (error) { hid_err(hdev, "parse failed\n"); - goto fail; + return error; } error = wacom_parse_and_register(wacom, false); if (error) - goto fail; + return error; if (hdev->bus == BUS_BLUETOOTH) { error = device_create_file(&hdev->dev, &dev_attr_speed); @@ -2759,10 +2758,6 @@ static int wacom_probe(struct hid_device *hdev, } return 0; - -fail: - hid_set_drvdata(hdev, NULL); - return error; } static void wacom_remove(struct hid_device *hdev) @@ -2791,8 +2786,6 @@ static void wacom_remove(struct hid_device *hdev) wacom_release_resources(wacom); kfifo_free(&wacom_wac->pen_fifo); - - hid_set_drvdata(hdev, NULL); } #ifdef CONFIG_PM diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 1713235d28cb..2b0a5b8ca6e6 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -251,7 +251,7 @@ static int wacom_dtu_irq(struct wacom_wac *wacom) static int wacom_dtus_irq(struct wacom_wac *wacom) { - char *data = wacom->data; + unsigned char *data = wacom->data; struct input_dev *input = wacom->pen_input; unsigned short prox, pressure = 0; @@ -483,6 +483,8 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) int ring1 = 0, ring2 = 0; int strip1 = 0, strip2 = 0; bool prox = false; + bool wrench = false, keyboard = false, mute_touch = false, menu = false, + info = false; /* pad packets. Works as a second tool and is always in prox */ if (!(data[0] == WACOM_REPORT_INTUOSPAD || data[0] == WACOM_REPORT_INTUOS5PAD || @@ -512,10 +514,32 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) keys = ((data[3] & 0x1C) ? 1<<2 : 0) | ((data[4] & 0xE0) ? 1<<1 : 0) | ((data[4] & 0x07) ? 
1<<0 : 0); + keyboard = !!(data[4] & 0xE0); + info = !!(data[3] & 0x1C); + + if (features->oPid) { + mute_touch = !!(data[4] & 0x07); + if (mute_touch) + wacom->shared->is_touch_on = + !wacom->shared->is_touch_on; + } else { + wrench = !!(data[4] & 0x07); + } } else if (features->type == WACOM_27QHD) { nkeys = 3; keys = data[2] & 0x07; + wrench = !!(data[2] & 0x01); + keyboard = !!(data[2] & 0x02); + + if (features->oPid) { + mute_touch = !!(data[2] & 0x04); + if (mute_touch) + wacom->shared->is_touch_on = + !wacom->shared->is_touch_on; + } else { + menu = !!(data[2] & 0x04); + } input_report_abs(input, ABS_X, be16_to_cpup((__be16 *)&data[4])); input_report_abs(input, ABS_Y, be16_to_cpup((__be16 *)&data[6])); input_report_abs(input, ABS_Z, be16_to_cpup((__be16 *)&data[8])); @@ -561,6 +585,9 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) if (features->type == WACOM_22HD) { nkeys = 3; keys = data[9] & 0x07; + + info = !!(data[9] & 0x01); + wrench = !!(data[9] & 0x02); } } else { buttons = ((data[6] & 0x10) << 5) | @@ -572,7 +599,7 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) strip2 = ((data[3] & 0x1f) << 8) | data[4]; } - prox = (buttons & ~(~0 << nbuttons)) | (keys & ~(~0 << nkeys)) | + prox = (buttons & ~(~0U << nbuttons)) | (keys & ~(~0U << nkeys)) | (ring1 & 0x80) | (ring2 & 0x80) | strip1 | strip2; wacom_report_numbered_buttons(input, nbuttons, buttons); @@ -580,6 +607,18 @@ static int wacom_intuos_pad(struct wacom_wac *wacom) for (i = 0; i < nkeys; i++) input_report_key(input, KEY_PROG1 + i, keys & (1 << i)); + input_report_key(input, KEY_BUTTONCONFIG, wrench); + input_report_key(input, KEY_ONSCREEN_KEYBOARD, keyboard); + input_report_key(input, KEY_CONTROLPANEL, menu); + input_report_key(input, KEY_INFO, info); + + if (wacom->shared && wacom->shared->touch_input) { + input_report_switch(wacom->shared->touch_input, + SW_MUTE_DEVICE, + !wacom->shared->is_touch_on); + input_sync(wacom->shared->touch_input); + } + input_report_abs(input, ABS_RX, strip1); input_report_abs(input, ABS_RY, strip2); @@ -1483,6 +1522,12 @@ static int wacom_24hdt_irq(struct wacom_wac *wacom) int byte_per_packet = WACOM_BYTES_PER_24HDT_PACKET; int y_offset = 2; + if (wacom->shared->has_mute_touch_switch && + !wacom->shared->is_touch_on) { + if (!wacom->shared->touch_down) + return 0; + } + if (wacom->features.type == WACOM_27QHDT) { current_num_contacts = data[63]; num_contacts_left = 10; @@ -2051,14 +2096,14 @@ static void wacom_wac_pad_event(struct hid_device *hdev, struct hid_field *field (hdev->product == 0x34d || hdev->product == 0x34e || /* MobileStudio Pro */ hdev->product == 0x357 || hdev->product == 0x358 || /* Intuos Pro 2 */ hdev->product == 0x392 || /* Intuos Pro 2 */ - hdev->product == 0x399)) { /* MobileStudio Pro */ + hdev->product == 0x398 || hdev->product == 0x399)) { /* MobileStudio Pro */ value = (field->logical_maximum - value); if (hdev->product == 0x357 || hdev->product == 0x358 || hdev->product == 0x392) value = wacom_offset_rotation(input, usage, value, 3, 16); else if (hdev->product == 0x34d || hdev->product == 0x34e || - hdev->product == 0x399) + hdev->product == 0x398 || hdev->product == 0x399) value = wacom_offset_rotation(input, usage, value, 1, 2); } else { @@ -3815,6 +3860,14 @@ int wacom_setup_touch_input_capabilities(struct input_dev *input_dev, /* fall through */ case WACOM_27QHDT: + if (wacom_wac->shared->touch->product == 0x32C || + wacom_wac->shared->touch->product == 0xF6) { + input_dev->evbit[0] |= BIT_MASK(EV_SW); + __set_bit(SW_MUTE_DEVICE, 
input_dev->swbit); + wacom_wac->shared->has_mute_touch_switch = true; + } + /* fall through */ + case MTSCREEN: case MTTPC: case MTTPC_B: @@ -4050,6 +4103,12 @@ int wacom_setup_pad_input_capabilities(struct input_dev *input_dev, __set_bit(KEY_PROG2, input_dev->keybit); __set_bit(KEY_PROG3, input_dev->keybit); + __set_bit(KEY_ONSCREEN_KEYBOARD, input_dev->keybit); + __set_bit(KEY_INFO, input_dev->keybit); + + if (!features->oPid) + __set_bit(KEY_BUTTONCONFIG, input_dev->keybit); + input_set_abs_params(input_dev, ABS_WHEEL, 0, 71, 0, 0); input_set_abs_params(input_dev, ABS_THROTTLE, 0, 71, 0, 0); break; @@ -4058,6 +4117,12 @@ int wacom_setup_pad_input_capabilities(struct input_dev *input_dev, __set_bit(KEY_PROG1, input_dev->keybit); __set_bit(KEY_PROG2, input_dev->keybit); __set_bit(KEY_PROG3, input_dev->keybit); + + __set_bit(KEY_ONSCREEN_KEYBOARD, input_dev->keybit); + __set_bit(KEY_BUTTONCONFIG, input_dev->keybit); + + if (!features->oPid) + __set_bit(KEY_CONTROLPANEL, input_dev->keybit); input_set_abs_params(input_dev, ABS_X, -2048, 2048, 0, 0); input_abs_set_res(input_dev, ABS_X, 1024); /* points/g */ input_set_abs_params(input_dev, ABS_Y, -2048, 2048, 0, 0); @@ -4071,6 +4136,9 @@ int wacom_setup_pad_input_capabilities(struct input_dev *input_dev, __set_bit(KEY_PROG1, input_dev->keybit); __set_bit(KEY_PROG2, input_dev->keybit); __set_bit(KEY_PROG3, input_dev->keybit); + + __set_bit(KEY_BUTTONCONFIG, input_dev->keybit); + __set_bit(KEY_INFO, input_dev->keybit); /* fall through */ case WACOM_21UX2: diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index addcef50df7a..8eb167540b4f 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -407,7 +407,15 @@ void hv_process_channel_removal(struct vmbus_channel *channel) cpumask_clear_cpu(channel->target_cpu, &primary_channel->alloced_cpus_in_node); - vmbus_release_relid(channel->offermsg.child_relid); + /* + * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and + * the relid is invalidated; after hibernation, when the user-space app + * destroys the channel, the relid is INVALID_RELID, and in this case + * it's unnecessary and unsafe to release the old relid, since the same + * relid can refer to a completely different channel now. + */ + if (channel->offermsg.child_relid != INVALID_RELID) + vmbus_release_relid(channel->offermsg.child_relid); free_channel(channel); } @@ -545,6 +553,10 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) mutex_lock(&vmbus_connection.channel_mutex); + /* Remember the channels that should be cleaned up upon suspend. */ + if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel)) + atomic_inc(&vmbus_connection.nr_chan_close_on_suspend); + /* * Now that we have acquired the channel_mutex, * we can release the potentially racing rescind thread. @@ -847,6 +859,67 @@ void vmbus_initiate_unload(bool crash) vmbus_wait_for_unload(); } +static void check_ready_for_resume_event(void) +{ + /* + * If all the old primary channels have been fixed up, then it's safe + * to resume. + */ + if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume)) + complete(&vmbus_connection.ready_for_resume_event); +} + +static void vmbus_setup_channel_state(struct vmbus_channel *channel, + struct vmbus_channel_offer_channel *offer) +{ + /* + * Setup state for signalling the host. 
+ */ + channel->sig_event = VMBUS_EVENT_CONNECTION_ID; + + if (vmbus_proto_version != VERSION_WS2008) { + channel->is_dedicated_interrupt = + (offer->is_dedicated_interrupt != 0); + channel->sig_event = offer->connection_id; + } + + memcpy(&channel->offermsg, offer, + sizeof(struct vmbus_channel_offer_channel)); + channel->monitor_grp = (u8)offer->monitorid / 32; + channel->monitor_bit = (u8)offer->monitorid % 32; +} + +/* + * find_primary_channel_by_offer - Get the channel object given the new offer. + * This is only used in the resume path of hibernation. + */ +static struct vmbus_channel * +find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer) +{ + struct vmbus_channel *channel = NULL, *iter; + const guid_t *inst1, *inst2; + + /* Ignore sub-channel offers. */ + if (offer->offer.sub_channel_index != 0) + return NULL; + + mutex_lock(&vmbus_connection.channel_mutex); + + list_for_each_entry(iter, &vmbus_connection.chn_list, listentry) { + inst1 = &iter->offermsg.offer.if_instance; + inst2 = &offer->offer.if_instance; + + if (guid_equal(inst1, inst2)) { + channel = iter; + break; + } + } + + mutex_unlock(&vmbus_connection.channel_mutex); + + return channel; +} + /* * vmbus_onoffer - Handler for channel offers from vmbus in parent partition. * @@ -854,12 +927,58 @@ void vmbus_initiate_unload(bool crash) static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) { struct vmbus_channel_offer_channel *offer; - struct vmbus_channel *newchannel; + struct vmbus_channel *oldchannel, *newchannel; + size_t offer_sz; offer = (struct vmbus_channel_offer_channel *)hdr; trace_vmbus_onoffer(offer); + oldchannel = find_primary_channel_by_offer(offer); + + if (oldchannel != NULL) { + atomic_dec(&vmbus_connection.offer_in_progress); + + /* + * We're resuming from hibernation: all the sub-channels and + * hv_sock channels we had before the hibernation should have + * been cleaned up, and now we must be seeing a re-offered + * primary channel from before the hibernation. + */ + + WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID); + /* Fix up the relid. */ + oldchannel->offermsg.child_relid = offer->child_relid; + + offer_sz = sizeof(*offer); + if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) { + check_ready_for_resume_event(); + return; + } + + /* + * This is not an error, since the host can also change the + * other field(s) of the offer, e.g. on WS RS5 (Build 17763), + * the offer->connection_id of the Mellanox VF vmbus device + * can change when the host reoffers the device upon resume. + */ + pr_debug("vmbus offer changed: relid=%d\n", + offer->child_relid); + + print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET, + 16, 4, &oldchannel->offermsg, offer_sz, + false); + print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET, + 16, 4, offer, offer_sz, false); + + /* Fix up the old channel. */ + vmbus_setup_channel_state(oldchannel, offer); + + check_ready_for_resume_event(); + + return; + } + /* Allocate the channel object and save this offer. */ newchannel = alloc_channel(); if (!newchannel) { @@ -869,25 +988,21 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) return; } - /* - * Setup state for signalling the host.
- */ - newchannel->sig_event = VMBUS_EVENT_CONNECTION_ID; - - if (vmbus_proto_version != VERSION_WS2008) { - newchannel->is_dedicated_interrupt = - (offer->is_dedicated_interrupt != 0); - newchannel->sig_event = offer->connection_id; - } - - memcpy(&newchannel->offermsg, offer, - sizeof(struct vmbus_channel_offer_channel)); - newchannel->monitor_grp = (u8)offer->monitorid / 32; - newchannel->monitor_bit = (u8)offer->monitorid % 32; + vmbus_setup_channel_state(newchannel, offer); vmbus_process_offer(newchannel); } +static void check_ready_for_suspend_event(void) +{ + /* + * If all the sub-channels or hv_sock channels have been cleaned up, + * then it's safe to suspend. + */ + if (atomic_dec_and_test(&vmbus_connection.nr_chan_close_on_suspend)) + complete(&vmbus_connection.ready_for_suspend_event); +} + /* * vmbus_onoffer_rescind - Rescind offer handler. * @@ -898,6 +1013,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) struct vmbus_channel_rescind_offer *rescind; struct vmbus_channel *channel; struct device *dev; + bool clean_up_chan_for_suspend; rescind = (struct vmbus_channel_rescind_offer *)hdr; @@ -937,6 +1053,8 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) return; } + clean_up_chan_for_suspend = is_hvsock_channel(channel) || + is_sub_channel(channel); /* * Before setting channel->rescind in vmbus_rescind_cleanup(), we * should make sure the channel callback is not running any more. @@ -962,6 +1080,10 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) if (channel->device_obj) { if (channel->chn_rescind_callback) { channel->chn_rescind_callback(channel); + + if (clean_up_chan_for_suspend) + check_ready_for_suspend_event(); + return; } /* @@ -994,6 +1116,11 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) } mutex_unlock(&vmbus_connection.channel_mutex); } + + /* The "channel" may have been freed. Do not access it any longer. */ + + if (clean_up_chan_for_suspend) + check_ready_for_suspend_event(); } void vmbus_hvsock_device_unregister(struct vmbus_channel *channel) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 09829e15d4a0..6e4c015783ff 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -26,6 +26,11 @@ struct vmbus_connection vmbus_connection = { .conn_state = DISCONNECTED, .next_gpadl_handle = ATOMIC_INIT(0xE1E10), + + .ready_for_suspend_event= COMPLETION_INITIALIZER( + vmbus_connection.ready_for_suspend_event), + .ready_for_resume_event = COMPLETION_INITIALIZER( + vmbus_connection.ready_for_resume_event), }; EXPORT_SYMBOL_GPL(vmbus_connection); @@ -59,8 +64,7 @@ static __u32 vmbus_get_next_version(__u32 current_version) } } -static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, - __u32 version) +int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) { int ret = 0; unsigned int cur_cpu; diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 6188fb7dda42..fcc52797c169 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -154,7 +154,7 @@ void hv_synic_free(void) * retrieve the initialized message and event pages. Otherwise, we create and * initialize the message and event pages. 
*/ -int hv_synic_init(unsigned int cpu) +void hv_synic_enable_regs(unsigned int cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); @@ -196,6 +196,11 @@ int hv_synic_init(unsigned int cpu) sctrl.enable = 1; hv_set_synic_state(sctrl.as_uint64); +} + +int hv_synic_init(unsigned int cpu) +{ + hv_synic_enable_regs(cpu); hv_stimer_init(cpu); @@ -205,20 +210,45 @@ int hv_synic_init(unsigned int cpu) /* * hv_synic_cleanup - Cleanup routine for hv_synic_init(). */ -int hv_synic_cleanup(unsigned int cpu) +void hv_synic_disable_regs(unsigned int cpu) { union hv_synic_sint shared_sint; union hv_synic_simp simp; union hv_synic_siefp siefp; union hv_synic_scontrol sctrl; + + hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + + shared_sint.masked = 1; + + /* Need to correctly cleanup in the case of SMP!!! */ + /* Disable the interrupt */ + hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + + hv_get_simp(simp.as_uint64); + simp.simp_enabled = 0; + simp.base_simp_gpa = 0; + + hv_set_simp(simp.as_uint64); + + hv_get_siefp(siefp.as_uint64); + siefp.siefp_enabled = 0; + siefp.base_siefp_gpa = 0; + + hv_set_siefp(siefp.as_uint64); + + /* Disable the global synic bit */ + hv_get_synic_state(sctrl.as_uint64); + sctrl.enable = 0; + hv_set_synic_state(sctrl.as_uint64); +} + +int hv_synic_cleanup(unsigned int cpu) +{ struct vmbus_channel *channel, *sc; bool channel_found = false; unsigned long flags; - hv_get_synic_state(sctrl.as_uint64); - if (sctrl.enable != 1) - return -EFAULT; - /* * Search for channels which are bound to the CPU we're about to * cleanup. In case we find one and vmbus is still connected we need to @@ -249,29 +279,7 @@ int hv_synic_cleanup(unsigned int cpu) hv_stimer_cleanup(cpu); - hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); - - shared_sint.masked = 1; - - /* Need to correctly cleanup in the case of SMP!!! 
*/ - /* Disable the interrupt */ - hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); - - hv_get_simp(simp.as_uint64); - simp.simp_enabled = 0; - simp.base_simp_gpa = 0; - - hv_set_simp(simp.as_uint64); - - hv_get_siefp(siefp.as_uint64); - siefp.siefp_enabled = 0; - siefp.base_siefp_gpa = 0; - - hv_set_siefp(siefp.as_uint64); - - /* Disable the global synic bit */ - sctrl.enable = 0; - hv_set_synic_state(sctrl.as_uint64); + hv_synic_disable_regs(cpu); return 0; } diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 6fb4ea5f0304..34bd73526afd 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -494,7 +494,7 @@ enum hv_dm_state { static __u8 recv_buffer[PAGE_SIZE]; -static __u8 *send_buffer; +static __u8 balloon_up_send_buffer[PAGE_SIZE]; #define PAGES_IN_2M 512 #define HA_CHUNK (32 * 1024) @@ -1292,8 +1292,8 @@ static void balloon_up(struct work_struct *dummy) } while (!done) { - bl_resp = (struct dm_balloon_response *)send_buffer; - memset(send_buffer, 0, PAGE_SIZE); + memset(balloon_up_send_buffer, 0, PAGE_SIZE); + bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer; bl_resp->hdr.type = DM_BALLOON_RESPONSE; bl_resp->hdr.size = sizeof(struct dm_balloon_response); bl_resp->more_pages = 1; @@ -1564,58 +1564,18 @@ static void balloon_onchannelcallback(void *context) } -static int balloon_probe(struct hv_device *dev, - const struct hv_vmbus_device_id *dev_id) +static int balloon_connect_vsp(struct hv_device *dev) { - int ret; - unsigned long t; struct dm_version_request version_req; struct dm_capabilities cap_msg; - -#ifdef CONFIG_MEMORY_HOTPLUG - do_hot_add = hot_add; -#else - do_hot_add = false; -#endif - - /* - * First allocate a send buffer. - */ - - send_buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!send_buffer) - return -ENOMEM; + unsigned long t; + int ret; ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0, - balloon_onchannelcallback, dev); - + balloon_onchannelcallback, dev); if (ret) - goto probe_error0; + return ret; - dm_device.dev = dev; - dm_device.state = DM_INITIALIZING; - dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8; - init_completion(&dm_device.host_event); - init_completion(&dm_device.config_event); - INIT_LIST_HEAD(&dm_device.ha_region_list); - spin_lock_init(&dm_device.ha_lock); - INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); - INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); - dm_device.host_specified_ha_region = false; - - dm_device.thread = - kthread_run(dm_thread_func, &dm_device, "hv_balloon"); - if (IS_ERR(dm_device.thread)) { - ret = PTR_ERR(dm_device.thread); - goto probe_error1; - } - -#ifdef CONFIG_MEMORY_HOTPLUG - set_online_page_callback(&hv_online_page); - register_memory_notifier(&hv_memory_nb); -#endif - - hv_set_drvdata(dev, &dm_device); /* * Initiate the hand shake with the host and negotiate * a version that the host can support. We start with the @@ -1631,16 +1591,15 @@ static int balloon_probe(struct hv_device *dev, dm_device.version = version_req.version.version; ret = vmbus_sendpacket(dev->channel, &version_req, - sizeof(struct dm_version_request), - (unsigned long)NULL, - VM_PKT_DATA_INBAND, 0); + sizeof(struct dm_version_request), + (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); if (ret) - goto probe_error2; + goto out; t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); if (t == 0) { ret = -ETIMEDOUT; - goto probe_error2; + goto out; } /* @@ -1648,8 +1607,8 @@ static int balloon_probe(struct hv_device *dev, * fail the probe function. 
*/ if (dm_device.state == DM_INIT_ERROR) { - ret = -ETIMEDOUT; - goto probe_error2; + ret = -EPROTO; + goto out; } pr_info("Using Dynamic Memory protocol version %u.%u\n", @@ -1682,16 +1641,15 @@ static int balloon_probe(struct hv_device *dev, cap_msg.max_page_number = -1; ret = vmbus_sendpacket(dev->channel, &cap_msg, - sizeof(struct dm_capabilities), - (unsigned long)NULL, - VM_PKT_DATA_INBAND, 0); + sizeof(struct dm_capabilities), + (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); if (ret) - goto probe_error2; + goto out; t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); if (t == 0) { ret = -ETIMEDOUT; - goto probe_error2; + goto out; } /* @@ -1699,25 +1657,65 @@ static int balloon_probe(struct hv_device *dev, * fail the probe function. */ if (dm_device.state == DM_INIT_ERROR) { - ret = -ETIMEDOUT; - goto probe_error2; + ret = -EPROTO; + goto out; } + return 0; +out: + vmbus_close(dev->channel); + return ret; +} + +static int balloon_probe(struct hv_device *dev, + const struct hv_vmbus_device_id *dev_id) +{ + int ret; + +#ifdef CONFIG_MEMORY_HOTPLUG + do_hot_add = hot_add; +#else + do_hot_add = false; +#endif + dm_device.dev = dev; + dm_device.state = DM_INITIALIZING; + dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8; + init_completion(&dm_device.host_event); + init_completion(&dm_device.config_event); + INIT_LIST_HEAD(&dm_device.ha_region_list); + spin_lock_init(&dm_device.ha_lock); + INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); + INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); + dm_device.host_specified_ha_region = false; + +#ifdef CONFIG_MEMORY_HOTPLUG + set_online_page_callback(&hv_online_page); + register_memory_notifier(&hv_memory_nb); +#endif + + hv_set_drvdata(dev, &dm_device); + + ret = balloon_connect_vsp(dev); + if (ret != 0) + return ret; + dm_device.state = DM_INITIALIZED; - last_post_time = jiffies; + + dm_device.thread = + kthread_run(dm_thread_func, &dm_device, "hv_balloon"); + if (IS_ERR(dm_device.thread)) { + ret = PTR_ERR(dm_device.thread); + goto probe_error; + } return 0; -probe_error2: +probe_error: + vmbus_close(dev->channel); #ifdef CONFIG_MEMORY_HOTPLUG + unregister_memory_notifier(&hv_memory_nb); restore_online_page_callback(&hv_online_page); #endif - kthread_stop(dm_device.thread); - -probe_error1: - vmbus_close(dev->channel); -probe_error0: - kfree(send_buffer); return ret; } @@ -1734,12 +1732,11 @@ static int balloon_remove(struct hv_device *dev) cancel_work_sync(&dm->balloon_wrk.wrk); cancel_work_sync(&dm->ha_wrk.wrk); - vmbus_close(dev->channel); kthread_stop(dm->thread); - kfree(send_buffer); + vmbus_close(dev->channel); #ifdef CONFIG_MEMORY_HOTPLUG - restore_online_page_callback(&hv_online_page); unregister_memory_notifier(&hv_memory_nb); + restore_online_page_callback(&hv_online_page); #endif spin_lock_irqsave(&dm_device.ha_lock, flags); list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) { diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 50eaa1fd6e45..af9379a3bf89 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -169,8 +169,10 @@ extern int hv_synic_alloc(void); extern void hv_synic_free(void); +extern void hv_synic_enable_regs(unsigned int cpu); extern int hv_synic_init(unsigned int cpu); +extern void hv_synic_disable_regs(unsigned int cpu); extern int hv_synic_cleanup(unsigned int cpu); /* Interface */ @@ -256,6 +258,32 @@ struct vmbus_connection { struct workqueue_struct *work_queue; struct workqueue_struct *handle_primary_chan_wq; struct workqueue_struct 
*handle_sub_chan_wq; + + /* + * The number of sub-channels and hv_sock channels that should be + * cleaned up upon suspend: sub-channels will be re-created upon + * resume, and hv_sock channels should not survive suspend. + */ + atomic_t nr_chan_close_on_suspend; + /* + * vmbus_bus_suspend() waits for "nr_chan_close_on_suspend" to + * drop to zero. + */ + struct completion ready_for_suspend_event; + + /* + * The number of primary channels that should be "fixed up" + * upon resume: these channels are re-offered upon resume, and some + * fields of the channel offers (i.e. child_relid and connection_id) + * can change, so the old offermsg must be fixed up, before the resume + * callbacks of the VSC drivers start to further touch the channels. + */ + atomic_t nr_chan_fixup_on_resume; + /* + * vmbus_bus_resume() waits for "nr_chan_fixup_on_resume" to + * drop to zero. + */ + struct completion ready_for_resume_event; }; @@ -270,6 +298,8 @@ struct vmbus_msginfo { extern struct vmbus_connection vmbus_connection; +int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version); + static inline void vmbus_send_interrupt(u32 relid) { sync_set_bit(relid, vmbus_connection.send_int_page); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index ebd35fc35290..391f0b225c9a 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -24,12 +24,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include #include "hyperv_vmbus.h" @@ -910,6 +912,43 @@ static void vmbus_shutdown(struct device *child_device) drv->shutdown(dev); } +/* + * vmbus_suspend - Suspend a vmbus device + */ +static int vmbus_suspend(struct device *child_device) +{ + struct hv_driver *drv; + struct hv_device *dev = device_to_hv_device(child_device); + + /* The device may not be attached yet */ + if (!child_device->driver) + return 0; + + drv = drv_to_hv_drv(child_device->driver); + if (!drv->suspend) + return -EOPNOTSUPP; + + return drv->suspend(dev); +} + +/* + * vmbus_resume - Resume a vmbus device + */ +static int vmbus_resume(struct device *child_device) +{ + struct hv_driver *drv; + struct hv_device *dev = device_to_hv_device(child_device); + + /* The device may not be attached yet */ + if (!child_device->driver) + return 0; + + drv = drv_to_hv_drv(child_device->driver); + if (!drv->resume) + return -EOPNOTSUPP; + + return drv->resume(dev); +} /* * vmbus_device_release - Final callback release of the vmbus child device @@ -925,6 +964,14 @@ static void vmbus_device_release(struct device *device) kfree(hv_dev); } +/* + * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than + * SET_SYSTEM_SLEEP_PM_OPS: see the comment before vmbus_bus_pm. + */ +static const struct dev_pm_ops vmbus_pm = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_suspend, vmbus_resume) +}; + /* The one and only one */ static struct bus_type hv_bus = { .name = "vmbus", @@ -935,6 +982,7 @@ static struct bus_type hv_bus = { .uevent = vmbus_uevent, .dev_groups = vmbus_dev_groups, .drv_groups = vmbus_drv_groups, + .pm = &vmbus_pm, }; struct onmessage_work_context { @@ -1022,6 +1070,41 @@ void vmbus_on_msg_dpc(unsigned long data) vmbus_signal_eom(msg, message_type); } +/* + * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for + * hibernation, because hv_sock connections can not persist across hibernation. 
+ */ +static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) +{ + struct onmessage_work_context *ctx; + struct vmbus_channel_rescind_offer *rescind; + + WARN_ON(!is_hvsock_channel(channel)); + + /* + * sizeof(*ctx) is small and the allocation should really not fail, + * otherwise the state of the hv_sock connections ends up in limbo. + */ + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); + + /* + * So far, these are not really used by Linux. Just set them to + * reasonable values conforming to the definitions of the fields. + */ + ctx->msg.header.message_type = 1; + ctx->msg.header.payload_size = sizeof(*rescind); + + /* These values are actually used by Linux. */ + rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.u.payload; + rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER; + rescind->child_relid = channel->offermsg.child_relid; + + INIT_WORK(&ctx->work, vmbus_onmessage_work); + + queue_work_on(vmbus_connection.connect_cpu, + vmbus_connection.work_queue, + &ctx->work); +} /* * Direct callback for channels using other deferred processing @@ -2042,6 +2125,129 @@ static int vmbus_acpi_add(struct acpi_device *device) return ret_val; } +static int vmbus_bus_suspend(struct device *dev) +{ + struct vmbus_channel *channel, *sc; + unsigned long flags; + + while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { + /* + * We wait here until the completion of any channel + * offers that are currently in progress. + */ + msleep(1); + } + + mutex_lock(&vmbus_connection.channel_mutex); + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (!is_hvsock_channel(channel)) + continue; + + vmbus_force_channel_rescinded(channel); + } + mutex_unlock(&vmbus_connection.channel_mutex); + + /* + * Wait until all the sub-channels and hv_sock channels have been + * cleaned up. Sub-channels should be destroyed upon suspend, otherwise + * they would conflict with the new sub-channels that will be created + * in the resume path. hv_sock channels should also be destroyed, but + * an hv_sock channel of an established hv_sock connection cannot + * really be destroyed, since it may still be referenced by the + * userspace application; instead, we force the hv_sock channel to be + * rescinded by vmbus_force_channel_rescinded(), and the userspace + * application will fully destroy the channel after hibernation. + * + * Note: the counter nr_chan_close_on_suspend may never go above 0 if + * the VM has no sub-channels or hv_sock channels, e.g. a 1-vCPU VM. + */ + if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0) + wait_for_completion(&vmbus_connection.ready_for_suspend_event); + + WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0); + + mutex_lock(&vmbus_connection.channel_mutex); + + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + /* + * Invalidate the field. Upon resume, vmbus_onoffer() will fix + * up the field, and the other fields (if necessary).
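+ * (vmbus_onoffer() has a WARN_ON for the case where a channel is + * re-offered without the relid having been invalidated here first.)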
+ */ + channel->offermsg.child_relid = INVALID_RELID; + + if (is_hvsock_channel(channel)) { + if (!channel->rescind) { + pr_err("hv_sock channel not rescinded!\n"); + WARN_ON_ONCE(1); + } + continue; + } + + spin_lock_irqsave(&channel->lock, flags); + list_for_each_entry(sc, &channel->sc_list, sc_list) { + pr_err("Sub-channel not deleted!\n"); + WARN_ON_ONCE(1); + } + spin_unlock_irqrestore(&channel->lock, flags); + + atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume); + } + + mutex_unlock(&vmbus_connection.channel_mutex); + + vmbus_initiate_unload(false); + + vmbus_connection.conn_state = DISCONNECTED; + + /* Reset the event for the next resume. */ + reinit_completion(&vmbus_connection.ready_for_resume_event); + + return 0; +} + +static int vmbus_bus_resume(struct device *dev) +{ + struct vmbus_channel_msginfo *msginfo; + size_t msgsize; + int ret; + + /* + * We only use the 'vmbus_proto_version', which was in use before + * hibernation, to re-negotiate with the host. + */ + if (vmbus_proto_version == VERSION_INVAL || + vmbus_proto_version == 0) { + pr_err("Invalid proto version = 0x%x\n", vmbus_proto_version); + return -EINVAL; + } + + msgsize = sizeof(*msginfo) + + sizeof(struct vmbus_channel_initiate_contact); + + msginfo = kzalloc(msgsize, GFP_KERNEL); + + if (msginfo == NULL) + return -ENOMEM; + + ret = vmbus_negotiate_version(msginfo, vmbus_proto_version); + + kfree(msginfo); + + if (ret != 0) + return ret; + + WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) == 0); + + vmbus_request_offers(); + + wait_for_completion(&vmbus_connection.ready_for_resume_event); + + /* Reset the event for the next suspend. */ + reinit_completion(&vmbus_connection.ready_for_suspend_event); + + return 0; +} + static const struct acpi_device_id vmbus_acpi_device_ids[] = { {"VMBUS", 0}, {"VMBus", 0}, @@ -2049,6 +2255,19 @@ static const struct acpi_device_id vmbus_acpi_device_ids[] = { }; MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids); +/* + * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than + * SET_SYSTEM_SLEEP_PM_OPS, otherwise NIC SR-IOV can not work, because the + * "pci_dev_pm_ops" uses the "noirq" callbacks: in the resume path, the + * pci "noirq" restore callback runs before "non-noirq" callbacks (see + * resume_target_kernel() -> dpm_resume_start(), and hibernation_restore() -> + * dpm_resume_end()). This means vmbus_bus_resume() and the pci-hyperv's + * resume callback must also run via the "noirq" callbacks. + */ +static const struct dev_pm_ops vmbus_bus_pm = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_bus_suspend, vmbus_bus_resume) +}; + static struct acpi_driver vmbus_acpi_driver = { .name = "vmbus", .ids = vmbus_acpi_device_ids, @@ -2056,6 +2275,7 @@ static struct acpi_driver vmbus_acpi_driver = { .add = vmbus_acpi_add, .remove = vmbus_acpi_remove, }, + .drv.pm = &vmbus_bus_pm, }; static void hv_kexec_handler(void) @@ -2086,6 +2306,47 @@ static void hv_crash_handler(struct pt_regs *regs) hyperv_cleanup(); }; +static int hv_synic_suspend(void) +{ + /* + * When we reach here, all the non-boot CPUs have been offlined, and + * the stimers on them have been unbound in hv_synic_cleanup() -> + * hv_stimer_cleanup() -> clockevents_unbind_device(). + * + * hv_synic_suspend() only runs on CPU0 with interrupts disabled. 
Here + * we do not unbind the stimer on CPU0 because: 1) it's unnecessary + * because the interrupts remain disabled between syscore_suspend() + * and syscore_resume(): see create_image() and resume_target_kernel(); + * 2) the stimer on CPU0 is automatically disabled later by + * syscore_suspend() -> timekeeping_suspend() -> tick_suspend() -> ... + * -> clockevents_shutdown() -> ... -> hv_ce_shutdown(); 3) a warning + * would be triggered if we call clockevents_unbind_device(), which + * may sleep, in an interrupts-disabled context. So, we intentionally + * don't call hv_stimer_cleanup(0) here. + */ + + hv_synic_disable_regs(0); + + return 0; +} + +static void hv_synic_resume(void) +{ + hv_synic_enable_regs(0); + + /* + * Note: we don't need to call hv_stimer_init(0), because the timer + * on CPU0 is not unbound in hv_synic_suspend(), and the timer is + * automatically re-enabled in timekeeping_resume(). + */ +} + +/* The callbacks run only on CPU0, with irqs_disabled. */ +static struct syscore_ops hv_synic_syscore_ops = { + .suspend = hv_synic_suspend, + .resume = hv_synic_resume, +}; + static int __init hv_acpi_init(void) { int ret, t; @@ -2116,6 +2377,8 @@ static int __init hv_acpi_init(void) hv_setup_kexec_handler(hv_kexec_handler); hv_setup_crash_handler(hv_crash_handler); + register_syscore_ops(&hv_synic_syscore_ops); + return 0; cleanup: @@ -2128,6 +2391,8 @@ static void __exit vmbus_exit(void) { int cpu; + unregister_syscore_ops(&hv_synic_syscore_ops); + hv_remove_kexec_handler(); hv_remove_crash_handler(); vmbus_connection.conn_state = DISCONNECTED; diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 5587215b8ddb..146ce40d8e0a 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -429,6 +429,7 @@ config I2C_AXXIA tristate "Axxia I2C controller" depends on ARCH_AXXIA || COMPILE_TEST default ARCH_AXXIA + select I2C_SLAVE help Say yes if you want to support the I2C bus on Axxia platforms. @@ -977,7 +978,7 @@ config I2C_SIRF will be called i2c-sirf. config I2C_SPRD - bool "Spreadtrum I2C interface" + tristate "Spreadtrum I2C interface" depends on I2C=y && ARCH_SPRD help If you say yes to this option, support will be included for the @@ -1309,6 +1310,20 @@ config I2C_ELEKTOR This support is also available as a module. If so, the module will be called i2c-elektor. +config I2C_ICY + tristate "ICY Zorro card" + depends on ZORRO + select I2C_ALGOPCF + help + This supports the PCF8584 Zorro bus I2C adapter, known as ICY. + Say Y if you own such an adapter. + + This support is also available as a module. If so, the module + will be called i2c-icy. + + If you have a 2019 edition board with an LTC2990 sensor at address + 0x4c, loading the module 'ltc2990' is sufficient to enable it. 
+ config I2C_MLXCPLD tristate "Mellanox I2C driver" depends on X86_64 diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 80c23895eaaf..3ab8aebc39c9 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -140,6 +140,7 @@ obj-$(CONFIG_I2C_BCM_KONA) += i2c-bcm-kona.o obj-$(CONFIG_I2C_BRCMSTB) += i2c-brcmstb.o obj-$(CONFIG_I2C_CROS_EC_TUNNEL) += i2c-cros-ec-tunnel.o obj-$(CONFIG_I2C_ELEKTOR) += i2c-elektor.o +obj-$(CONFIG_I2C_ICY) += i2c-icy.o obj-$(CONFIG_I2C_MLXCPLD) += i2c-mlxcpld.o obj-$(CONFIG_I2C_OPAL) += i2c-opal.o obj-$(CONFIG_I2C_PCA_ISA) += i2c-pca-isa.o diff --git a/drivers/i2c/busses/i2c-axxia.c b/drivers/i2c/busses/i2c-axxia.c index ff3142b15cab..0214daa913ff 100644 --- a/drivers/i2c/busses/i2c-axxia.c +++ b/drivers/i2c/busses/i2c-axxia.c @@ -77,6 +77,40 @@ MST_STATUS_IP) #define MST_TX_BYTES_XFRD 0x50 #define MST_RX_BYTES_XFRD 0x54 +#define SLV_ADDR_DEC_CTL 0x58 +#define SLV_ADDR_DEC_GCE BIT(0) /* ACK to General Call Address from own master (loopback) */ +#define SLV_ADDR_DEC_OGCE BIT(1) /* ACK to General Call Address from external masters */ +#define SLV_ADDR_DEC_SA1E BIT(2) /* ACK to addr_1 enabled */ +#define SLV_ADDR_DEC_SA1M BIT(3) /* 10-bit addressing for addr_1 enabled */ +#define SLV_ADDR_DEC_SA2E BIT(4) /* ACK to addr_2 enabled */ +#define SLV_ADDR_DEC_SA2M BIT(5) /* 10-bit addressing for addr_2 enabled */ +#define SLV_ADDR_1 0x5c +#define SLV_ADDR_2 0x60 +#define SLV_RX_CTL 0x64 +#define SLV_RX_ACSA1 BIT(0) /* Generate ACK for writes to addr_1 */ +#define SLV_RX_ACSA2 BIT(1) /* Generate ACK for writes to addr_2 */ +#define SLV_RX_ACGCA BIT(2) /* ACK data phase transfers to General Call Address */ +#define SLV_DATA 0x68 +#define SLV_RX_FIFO 0x6c +#define SLV_FIFO_DV1 BIT(0) /* Data Valid for addr_1 */ +#define SLV_FIFO_DV2 BIT(1) /* Data Valid for addr_2 */ +#define SLV_FIFO_AS BIT(2) /* (N)ACK Sent */ +#define SLV_FIFO_TNAK BIT(3) /* Timeout NACK */ +#define SLV_FIFO_STRC BIT(4) /* First byte after start condition received */ +#define SLV_FIFO_RSC BIT(5) /* Repeated Start Condition */ +#define SLV_FIFO_STPC BIT(6) /* Stop Condition */ +#define SLV_FIFO_DV (SLV_FIFO_DV1 | SLV_FIFO_DV2) +#define SLV_INT_ENABLE 0x70 +#define SLV_INT_STATUS 0x74 +#define SLV_STATUS_RFH BIT(0) /* FIFO service */ +#define SLV_STATUS_WTC BIT(1) /* Write transfer complete */ +#define SLV_STATUS_SRS1 BIT(2) /* Slave read from addr 1 */ +#define SLV_STATUS_SRRS1 BIT(3) /* Repeated start from addr 1 */ +#define SLV_STATUS_SRND1 BIT(4) /* Read request not following start condition */ +#define SLV_STATUS_SRC1 BIT(5) /* Read canceled */ +#define SLV_STATUS_SRAT1 BIT(6) /* Slave Read timed out */ +#define SLV_STATUS_SRDRE1 BIT(7) /* Data written after timed out */ +#define SLV_READ_DUMMY 0x78 #define SCL_HIGH_PERIOD 0x80 #define SCL_LOW_PERIOD 0x84 #define SPIKE_FLTR_LEN 0x88 @@ -111,6 +145,8 @@ struct axxia_i2c_dev { struct clk *i2c_clk; u32 bus_clk_rate; bool last; + struct i2c_client *slave; + int irq; }; static void i2c_int_disable(struct axxia_i2c_dev *idev, u32 mask) @@ -276,13 +312,65 @@ static int axxia_i2c_fill_tx_fifo(struct axxia_i2c_dev *idev) return ret; } +static void axxia_i2c_slv_fifo_event(struct axxia_i2c_dev *idev) +{ + u32 fifo_status = readl(idev->base + SLV_RX_FIFO); + u8 val; + + dev_dbg(idev->dev, "slave irq fifo_status=0x%x\n", fifo_status); + + if (fifo_status & SLV_FIFO_DV1) { + if (fifo_status & SLV_FIFO_STRC) + i2c_slave_event(idev->slave, + I2C_SLAVE_WRITE_REQUESTED, &val); + + val = readl(idev->base + SLV_DATA); + 
i2c_slave_event(idev->slave, I2C_SLAVE_WRITE_RECEIVED, &val); + } + if (fifo_status & SLV_FIFO_STPC) { + readl(idev->base + SLV_DATA); /* dummy read */ + i2c_slave_event(idev->slave, I2C_SLAVE_STOP, &val); + } + if (fifo_status & SLV_FIFO_RSC) + readl(idev->base + SLV_DATA); /* dummy read */ +} + +static irqreturn_t axxia_i2c_slv_isr(struct axxia_i2c_dev *idev) +{ + u32 status = readl(idev->base + SLV_INT_STATUS); + u8 val; + + dev_dbg(idev->dev, "slave irq status=0x%x\n", status); + + if (status & SLV_STATUS_RFH) + axxia_i2c_slv_fifo_event(idev); + if (status & SLV_STATUS_SRS1) { + i2c_slave_event(idev->slave, I2C_SLAVE_READ_REQUESTED, &val); + writel(val, idev->base + SLV_DATA); + } + if (status & SLV_STATUS_SRND1) { + i2c_slave_event(idev->slave, I2C_SLAVE_READ_PROCESSED, &val); + writel(val, idev->base + SLV_DATA); + } + if (status & SLV_STATUS_SRC1) + i2c_slave_event(idev->slave, I2C_SLAVE_STOP, &val); + + writel(INT_SLV, idev->base + INTERRUPT_STATUS); + return IRQ_HANDLED; +} + static irqreturn_t axxia_i2c_isr(int irq, void *_dev) { struct axxia_i2c_dev *idev = _dev; + irqreturn_t ret = IRQ_NONE; u32 status; - if (!(readl(idev->base + INTERRUPT_STATUS) & INT_MST)) - return IRQ_NONE; + status = readl(idev->base + INTERRUPT_STATUS); + + if (status & INT_SLV) + ret = axxia_i2c_slv_isr(idev); + if (!(status & INT_MST)) + return ret; /* Read interrupt status bits */ status = readl(idev->base + MST_INT_STATUS); @@ -583,9 +671,58 @@ static u32 axxia_i2c_func(struct i2c_adapter *adap) return caps; } +static int axxia_i2c_reg_slave(struct i2c_client *slave) +{ + struct axxia_i2c_dev *idev = i2c_get_adapdata(slave->adapter); + u32 slv_int_mask = SLV_STATUS_RFH; + u32 dec_ctl; + + if (idev->slave) + return -EBUSY; + + idev->slave = slave; + + /* Enable slave mode as well */ + writel(GLOBAL_MST_EN | GLOBAL_SLV_EN, idev->base + GLOBAL_CONTROL); + writel(INT_MST | INT_SLV, idev->base + INTERRUPT_ENABLE); + + /* Set slave address */ + dec_ctl = SLV_ADDR_DEC_SA1E; + if (slave->flags & I2C_CLIENT_TEN) + dec_ctl |= SLV_ADDR_DEC_SA1M; + + writel(SLV_RX_ACSA1, idev->base + SLV_RX_CTL); + writel(dec_ctl, idev->base + SLV_ADDR_DEC_CTL); + writel(slave->addr, idev->base + SLV_ADDR_1); + + /* Enable interrupts */ + slv_int_mask |= SLV_STATUS_SRS1 | SLV_STATUS_SRRS1 | SLV_STATUS_SRND1; + slv_int_mask |= SLV_STATUS_SRC1; + writel(slv_int_mask, idev->base + SLV_INT_ENABLE); + + return 0; +} + +static int axxia_i2c_unreg_slave(struct i2c_client *slave) +{ + struct axxia_i2c_dev *idev = i2c_get_adapdata(slave->adapter); + + /* Disable slave mode */ + writel(GLOBAL_MST_EN, idev->base + GLOBAL_CONTROL); + writel(INT_MST, idev->base + INTERRUPT_ENABLE); + + synchronize_irq(idev->irq); + + idev->slave = NULL; + + return 0; +} + static const struct i2c_algorithm axxia_i2c_algo = { .master_xfer = axxia_i2c_xfer, .functionality = axxia_i2c_func, + .reg_slave = axxia_i2c_reg_slave, + .unreg_slave = axxia_i2c_unreg_slave, }; static const struct i2c_adapter_quirks axxia_i2c_quirks = { @@ -599,7 +736,6 @@ static int axxia_i2c_probe(struct platform_device *pdev) struct axxia_i2c_dev *idev = NULL; struct resource *res; void __iomem *base; - int irq; int ret = 0; idev = devm_kzalloc(&pdev->dev, sizeof(*idev), GFP_KERNEL); @@ -611,10 +747,10 @@ static int axxia_i2c_probe(struct platform_device *pdev) if (IS_ERR(base)) return PTR_ERR(base); - irq = platform_get_irq(pdev, 0); - if (irq < 0) { + idev->irq = platform_get_irq(pdev, 0); + if (idev->irq < 0) { dev_err(&pdev->dev, "missing interrupt resource\n"); - return irq; + 
return idev->irq; } idev->i2c_clk = devm_clk_get(&pdev->dev, "i2c"); @@ -643,10 +779,10 @@ static int axxia_i2c_probe(struct platform_device *pdev) goto error_disable_clk; } - ret = devm_request_irq(&pdev->dev, irq, axxia_i2c_isr, 0, + ret = devm_request_irq(&pdev->dev, idev->irq, axxia_i2c_isr, 0, pdev->name, idev); if (ret) { - dev_err(&pdev->dev, "failed to claim IRQ%d\n", irq); + dev_err(&pdev->dev, "failed to claim IRQ%d\n", idev->irq); goto error_disable_clk; } diff --git a/drivers/i2c/busses/i2c-bcm-iproc.c b/drivers/i2c/busses/i2c-bcm-iproc.c index 19ef2b0c682a..9ffdffaf6141 100644 --- a/drivers/i2c/busses/i2c-bcm-iproc.c +++ b/drivers/i2c/busses/i2c-bcm-iproc.c @@ -808,7 +808,7 @@ static struct i2c_algorithm bcm_iproc_algo = { .unreg_slave = bcm_iproc_i2c_unreg_slave, }; -static struct i2c_adapter_quirks bcm_iproc_i2c_quirks = { +static const struct i2c_adapter_quirks bcm_iproc_i2c_quirks = { .max_read_len = M_RX_MAX_READ_LEN, }; @@ -922,7 +922,9 @@ static int bcm_iproc_i2c_probe(struct platform_device *pdev) adap = &iproc_i2c->adapter; i2c_set_adapdata(adap, iproc_i2c); - strlcpy(adap->name, "Broadcom iProc I2C adapter", sizeof(adap->name)); + snprintf(adap->name, sizeof(adap->name), + "Broadcom iProc (%s)", + of_node_full_name(iproc_i2c->device->of_node)); adap->algo = &bcm_iproc_algo; adap->quirks = &bcm_iproc_i2c_quirks; adap->dev.parent = &pdev->dev; diff --git a/drivers/i2c/busses/i2c-bcm2835.c b/drivers/i2c/busses/i2c-bcm2835.c index 67752f7b0371..e01b2b57e724 100644 --- a/drivers/i2c/busses/i2c-bcm2835.c +++ b/drivers/i2c/busses/i2c-bcm2835.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -389,7 +390,7 @@ static const struct i2c_algorithm bcm2835_i2c_algo = { }; /* - * This HW was reported to have problems with clock stretching: + * The BCM2835 was reported to have problems with clock stretching: * http://www.advamation.com/knowhow/raspberrypi/rpi-i2c-bug.html * https://www.raspberrypi.org/forums/viewtopic.php?p=146272 */ @@ -471,11 +472,12 @@ static int bcm2835_i2c_probe(struct platform_device *pdev) i2c_set_adapdata(adap, i2c_dev); adap->owner = THIS_MODULE; adap->class = I2C_CLASS_DEPRECATED; - strlcpy(adap->name, "bcm2835 I2C adapter", sizeof(adap->name)); + snprintf(adap->name, sizeof(adap->name), "bcm2835 (%s)", + of_node_full_name(pdev->dev.of_node)); adap->algo = &bcm2835_i2c_algo; adap->dev.parent = &pdev->dev; adap->dev.of_node = pdev->dev.of_node; - adap->quirks = &bcm2835_i2c_quirks; + adap->quirks = of_device_get_match_data(&pdev->dev); bcm2835_i2c_writel(i2c_dev, BCM2835_I2C_C, 0); @@ -501,7 +503,8 @@ static int bcm2835_i2c_remove(struct platform_device *pdev) } static const struct of_device_id bcm2835_i2c_of_match[] = { - { .compatible = "brcm,bcm2835-i2c" }, + { .compatible = "brcm,bcm2711-i2c" }, + { .compatible = "brcm,bcm2835-i2c", .data = &bcm2835_i2c_quirks }, {}, }; MODULE_DEVICE_TABLE(of, bcm2835_i2c_of_match); diff --git a/drivers/i2c/busses/i2c-cht-wc.c b/drivers/i2c/busses/i2c-cht-wc.c index 66af44bfa67d..b8fde61bb5d8 100644 --- a/drivers/i2c/busses/i2c-cht-wc.c +++ b/drivers/i2c/busses/i2c-cht-wc.c @@ -178,6 +178,51 @@ static const struct i2c_algorithm cht_wc_i2c_adap_algo = { .smbus_xfer = cht_wc_i2c_adap_smbus_xfer, }; +/* + * We are an i2c-adapter which itself is part of an i2c-client. This means that + * transfers done through us take adapter->bus_lock twice, once for our parent + * i2c-adapter and once to take our own bus_lock. 
Lockdep does not like this + * nested locking. To make lockdep happy in the case of busses with muxes, the + * i2c-core's i2c_adapter_lock_bus function calls: + * rt_mutex_lock_nested(&adapter->bus_lock, i2c_adapter_depth(adapter)); + * + * But i2c_adapter_depth only works when the direct parent of the adapter is + * another adapter, as it is only meant for muxes. In our case there is an + * i2c-client and an MFD-instantiated platform_device in the parent->child chain + * between the 2 devices. + * + * So we override the default i2c_lock_operations and pass a hardcoded + * depth of 1 to rt_mutex_lock_nested, to make lockdep happy. + * + * Note that if there were to be a mux attached to our adapter, this would + * break things again since the i2c-mux code expects the root-adapter to have + * a locking depth of 0. But we always have only 1 client directly attached + * in the form of the Charger IC paired with the CHT Whiskey Cove PMIC. + */ +static void cht_wc_i2c_adap_lock_bus(struct i2c_adapter *adapter, + unsigned int flags) +{ + rt_mutex_lock_nested(&adapter->bus_lock, 1); +} + +static int cht_wc_i2c_adap_trylock_bus(struct i2c_adapter *adapter, + unsigned int flags) +{ + return rt_mutex_trylock(&adapter->bus_lock); +} + +static void cht_wc_i2c_adap_unlock_bus(struct i2c_adapter *adapter, + unsigned int flags) +{ + rt_mutex_unlock(&adapter->bus_lock); +} + +static const struct i2c_lock_operations cht_wc_i2c_adap_lock_ops = { + .lock_bus = cht_wc_i2c_adap_lock_bus, + .trylock_bus = cht_wc_i2c_adap_trylock_bus, + .unlock_bus = cht_wc_i2c_adap_unlock_bus, +}; + /**** irqchip for the client connected to the extchgr i2c adapter ****/ static void cht_wc_i2c_irq_lock(struct irq_data *data) { @@ -286,6 +331,7 @@ static int cht_wc_i2c_adap_i2c_probe(struct platform_device *pdev) adap->adapter.owner = THIS_MODULE; adap->adapter.class = I2C_CLASS_HWMON; adap->adapter.algo = &cht_wc_i2c_adap_algo; + adap->adapter.lock_ops = &cht_wc_i2c_adap_lock_ops; strlcpy(adap->adapter.name, "PMIC I2C Adapter", sizeof(adap->adapter.name)); adap->adapter.dev.parent = &pdev->dev; @@ -363,8 +409,7 @@ static int cht_wc_i2c_adap_i2c_remove(struct platform_device *pdev) { struct cht_wc_i2c_adap *adap = platform_get_drvdata(pdev); - if (adap->client) - i2c_unregister_device(adap->client); + i2c_unregister_device(adap->client); i2c_del_adapter(&adap->adapter); irq_domain_remove(adap->irq_domain); diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index d464799e40a3..e8b328242256 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -655,15 +655,11 @@ static int i2c_dw_init_recovery_info(struct dw_i2c_dev *dev) struct i2c_bus_recovery_info *rinfo = &dev->rinfo; struct i2c_adapter *adap = &dev->adapter; struct gpio_desc *gpio; - int r; - gpio = devm_gpiod_get(dev->dev, "scl", GPIOD_OUT_HIGH); - if (IS_ERR(gpio)) { - r = PTR_ERR(gpio); - if (r == -ENOENT || r == -ENOSYS) - return 0; - return r; - } + gpio = devm_gpiod_get_optional(dev->dev, "scl", GPIOD_OUT_HIGH); + if (IS_ERR_OR_NULL(gpio)) + return PTR_ERR_OR_ZERO(gpio); + rinfo->scl_gpiod = gpio; gpio = devm_gpiod_get_optional(dev->dev, "sda", GPIOD_IN); diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c index 76810deb2de6..050adda7c1bd 100644 --- a/drivers/i2c/busses/i2c-designware-pcidrv.c +++ b/drivers/i2c/busses/i2c-designware-pcidrv.c @@ -33,6 +33,7 @@ enum dw_pci_ctl_id_t { baytrail, cherrytrail, haswell, +
elkhartlake, }; struct dw_scl_sda_cfg { @@ -168,13 +169,20 @@ static struct dw_pci_controller dw_pci_controllers[] = { .flags = MODEL_CHERRYTRAIL, .scl_sda_cfg = &byt_config, }, + [elkhartlake] = { + .bus_num = -1, + .bus_cfg = INTEL_MID_STD_CFG | DW_IC_CON_SPEED_FAST, + .tx_fifo_depth = 32, + .rx_fifo_depth = 32, + .functionality = I2C_FUNC_10BIT_ADDR, + .clk_khz = 100000, + }, }; #ifdef CONFIG_PM static int i2c_dw_pci_suspend(struct device *dev) { - struct pci_dev *pdev = to_pci_dev(dev); - struct dw_i2c_dev *i_dev = pci_get_drvdata(pdev); + struct dw_i2c_dev *i_dev = dev_get_drvdata(dev); i_dev->suspended = true; i_dev->disable(i_dev); @@ -184,8 +192,7 @@ static int i2c_dw_pci_suspend(struct device *dev) static int i2c_dw_pci_resume(struct device *dev) { - struct pci_dev *pdev = to_pci_dev(dev); - struct dw_i2c_dev *i_dev = pci_get_drvdata(pdev); + struct dw_i2c_dev *i_dev = dev_get_drvdata(dev); int ret; ret = i_dev->init(i_dev); @@ -227,6 +234,8 @@ static int i2c_dw_pci_probe(struct pci_dev *pdev, return r; } + pci_set_master(pdev); + r = pcim_iomap_regions(pdev, 1 << 0, pci_name(pdev)); if (r) { dev_err(&pdev->dev, "I/O memory remapping failed\n"); @@ -237,18 +246,24 @@ static int i2c_dw_pci_probe(struct pci_dev *pdev, if (!dev) return -ENOMEM; + r = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES); + if (r < 0) + return r; + dev->clk = NULL; dev->controller = controller; dev->get_clk_rate_khz = i2c_dw_get_clk_rate_khz; dev->base = pcim_iomap_table(pdev)[0]; dev->dev = &pdev->dev; - dev->irq = pdev->irq; + dev->irq = pci_irq_vector(pdev, 0); dev->flags |= controller->flags; if (controller->setup) { r = controller->setup(pdev, controller); - if (r) + if (r) { + pci_free_irq_vectors(pdev); return r; + } } dev->functionality = controller->functionality | @@ -276,8 +291,10 @@ static int i2c_dw_pci_probe(struct pci_dev *pdev, adap->nr = controller->bus_num; r = i2c_dw_probe(dev); - if (r) + if (r) { + pci_free_irq_vectors(pdev); return r; + } pm_runtime_set_autosuspend_delay(&pdev->dev, 1000); pm_runtime_use_autosuspend(&pdev->dev); @@ -296,6 +313,7 @@ static void i2c_dw_pci_remove(struct pci_dev *pdev) pm_runtime_get_noresume(&pdev->dev); i2c_del_adapter(&dev->adapter); + pci_free_irq_vectors(pdev); } /* work with hotplug and coldplug */ @@ -331,6 +349,15 @@ static const struct pci_device_id i2_designware_pci_ids[] = { { PCI_VDEVICE(INTEL, 0x22C5), cherrytrail }, { PCI_VDEVICE(INTEL, 0x22C6), cherrytrail }, { PCI_VDEVICE(INTEL, 0x22C7), cherrytrail }, + /* Elkhart Lake (PSE I2C) */ + { PCI_VDEVICE(INTEL, 0x4bb9), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bba), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbb), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbc), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbd), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbe), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bbf), elkhartlake }, + { PCI_VDEVICE(INTEL, 0x4bc0), elkhartlake }, { 0,} }; MODULE_DEVICE_TABLE(pci, i2_designware_pci_ids); diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index ddfb81872906..16dd338877d0 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -279,12 +279,10 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) platform_set_drvdata(pdev, dev); dev->rst = devm_reset_control_get_optional_exclusive(&pdev->dev, NULL); - if (IS_ERR(dev->rst)) { - if (PTR_ERR(dev->rst) == -EPROBE_DEFER) - return -EPROBE_DEFER; - } else { - reset_control_deassert(dev->rst); - } + if 
(IS_ERR(dev->rst)) + return PTR_ERR(dev->rst); + + reset_control_deassert(dev->rst); t = &dev->timings; if (pdata) @@ -346,8 +344,10 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) /* Optional interface clock */ dev->pclk = devm_clk_get_optional(&pdev->dev, "pclk"); - if (IS_ERR(dev->pclk)) - return PTR_ERR(dev->pclk); + if (IS_ERR(dev->pclk)) { + ret = PTR_ERR(dev->pclk); + goto exit_reset; + } dev->clk = devm_clk_get(&pdev->dev, NULL); if (!i2c_dw_prepare_clk(dev, true)) { @@ -400,8 +400,7 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) exit_probe: dw_i2c_plat_pm_cleanup(dev); exit_reset: - if (!IS_ERR_OR_NULL(dev->rst)) - reset_control_assert(dev->rst); + reset_control_assert(dev->rst); return ret; } @@ -419,8 +418,7 @@ static int dw_i2c_plat_remove(struct platform_device *pdev) pm_runtime_put_sync(&pdev->dev); dw_i2c_plat_pm_cleanup(dev); - if (!IS_ERR_OR_NULL(dev->rst)) - reset_control_assert(dev->rst); + reset_control_assert(dev->rst); return 0; } diff --git a/drivers/i2c/busses/i2c-exynos5.c b/drivers/i2c/busses/i2c-exynos5.c index e4e7932f7800..e7514c16b756 100644 --- a/drivers/i2c/busses/i2c-exynos5.c +++ b/drivers/i2c/busses/i2c-exynos5.c @@ -791,9 +791,7 @@ static int exynos5_i2c_probe(struct platform_device *pdev) } ret = devm_request_irq(&pdev->dev, i2c->irq, exynos5_i2c_irq, - IRQF_NO_SUSPEND | IRQF_ONESHOT, - dev_name(&pdev->dev), i2c); - + IRQF_NO_SUSPEND, dev_name(&pdev->dev), i2c); if (ret != 0) { dev_err(&pdev->dev, "cannot request HS-I2C IRQ %d\n", i2c->irq); goto err_clk; diff --git a/drivers/i2c/busses/i2c-fsi.c b/drivers/i2c/busses/i2c-fsi.c index da5eb3960def..e0c256922d4f 100644 --- a/drivers/i2c/busses/i2c-fsi.c +++ b/drivers/i2c/busses/i2c-fsi.c @@ -707,8 +707,10 @@ static int fsi_i2c_probe(struct device *dev) continue; port = kzalloc(sizeof(*port), GFP_KERNEL); - if (!port) + if (!port) { + of_node_put(np); break; + } port->master = i2c; port->port = port_no; diff --git a/drivers/i2c/busses/i2c-hix5hd2.c b/drivers/i2c/busses/i2c-hix5hd2.c index 4df1434b3597..8497c7a95dd4 100644 --- a/drivers/i2c/busses/i2c-hix5hd2.c +++ b/drivers/i2c/busses/i2c-hix5hd2.c @@ -445,8 +445,7 @@ static int hix5hd2_i2c_probe(struct platform_device *pdev) hix5hd2_i2c_init(priv); ret = devm_request_irq(&pdev->dev, irq, hix5hd2_i2c_irq, - IRQF_NO_SUSPEND | IRQF_ONESHOT, - dev_name(&pdev->dev), priv); + IRQF_NO_SUSPEND, dev_name(&pdev->dev), priv); if (ret != 0) { dev_err(&pdev->dev, "cannot request HS-I2C IRQ %d\n", irq); goto err_clk; diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 36e9559f880c..c09791fb4929 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -292,7 +292,8 @@ struct i801_priv { #define FEATURE_HOST_NOTIFY BIT(5) /* Not really a feature, but it's convenient to handle it as such */ #define FEATURE_IDF BIT(15) -#define FEATURE_TCO BIT(16) +#define FEATURE_TCO_SPT BIT(16) +#define FEATURE_TCO_CNL BIT(17) static const char *i801_feature_names[] = { "SMBus PEC", @@ -1500,57 +1501,23 @@ static inline unsigned int i801_get_adapter_class(struct i801_priv *priv) } #endif -static const struct itco_wdt_platform_data tco_platform_data = { +static const struct itco_wdt_platform_data spt_tco_platform_data = { .name = "Intel PCH", .version = 4, }; static DEFINE_SPINLOCK(p2sb_spinlock); -static void i801_add_tco(struct i801_priv *priv) +static struct platform_device * +i801_add_tco_spt(struct i801_priv *priv, struct pci_dev *pci_dev, + struct resource *tco_res) { - struct pci_dev *pci_dev = 
priv->pci_dev; - struct resource tco_res[3], *res; - struct platform_device *pdev; + struct resource *res; unsigned int devfn; - u32 tco_base, tco_ctl; - u32 base_addr, ctrl_val; u64 base64_addr; + u32 base_addr; u8 hidden; - if (!(priv->features & FEATURE_TCO)) - return; - - pci_read_config_dword(pci_dev, TCOBASE, &tco_base); - pci_read_config_dword(pci_dev, TCOCTL, &tco_ctl); - if (!(tco_ctl & TCOCTL_EN)) - return; - - memset(tco_res, 0, sizeof(tco_res)); - - res = &tco_res[ICH_RES_IO_TCO]; - res->start = tco_base & ~1; - res->end = res->start + 32 - 1; - res->flags = IORESOURCE_IO; - - /* - * Power Management registers. - */ - devfn = PCI_DEVFN(PCI_SLOT(pci_dev->devfn), 2); - pci_bus_read_config_dword(pci_dev->bus, devfn, ACPIBASE, &base_addr); - - res = &tco_res[ICH_RES_IO_SMI]; - res->start = (base_addr & ~1) + ACPIBASE_SMI_OFF; - res->end = res->start + 3; - res->flags = IORESOURCE_IO; - - /* - * Enable the ACPI I/O space. - */ - pci_bus_read_config_dword(pci_dev->bus, devfn, ACPICTRL, &ctrl_val); - ctrl_val |= ACPICTRL_EN; - pci_bus_write_config_dword(pci_dev->bus, devfn, ACPICTRL, ctrl_val); - /* * We must access the NO_REBOOT bit over the Primary to Sideband * bridge (P2SB). The BIOS prevents the P2SB device from being @@ -1586,15 +1553,76 @@ static void i801_add_tco(struct i801_priv *priv) res->end = res->start + 3; res->flags = IORESOURCE_MEM; - pdev = platform_device_register_resndata(&pci_dev->dev, "iTCO_wdt", -1, - tco_res, 3, &tco_platform_data, - sizeof(tco_platform_data)); - if (IS_ERR(pdev)) { - dev_warn(&pci_dev->dev, "failed to create iTCO device\n"); - return; - } + return platform_device_register_resndata(&pci_dev->dev, "iTCO_wdt", -1, + tco_res, 3, &spt_tco_platform_data, + sizeof(spt_tco_platform_data)); +} - priv->tco_pdev = pdev; +static const struct itco_wdt_platform_data cnl_tco_platform_data = { + .name = "Intel PCH", + .version = 6, +}; + +static struct platform_device * +i801_add_tco_cnl(struct i801_priv *priv, struct pci_dev *pci_dev, + struct resource *tco_res) +{ + return platform_device_register_resndata(&pci_dev->dev, "iTCO_wdt", -1, + tco_res, 2, &cnl_tco_platform_data, + sizeof(cnl_tco_platform_data)); +} + +static void i801_add_tco(struct i801_priv *priv) +{ + u32 base_addr, tco_base, tco_ctl, ctrl_val; + struct pci_dev *pci_dev = priv->pci_dev; + struct resource tco_res[3], *res; + unsigned int devfn; + + /* If we have ACPI based watchdog use that instead */ + if (acpi_has_watchdog()) + return; + + if (!(priv->features & (FEATURE_TCO_SPT | FEATURE_TCO_CNL))) + return; + + pci_read_config_dword(pci_dev, TCOBASE, &tco_base); + pci_read_config_dword(pci_dev, TCOCTL, &tco_ctl); + if (!(tco_ctl & TCOCTL_EN)) + return; + + memset(tco_res, 0, sizeof(tco_res)); + + res = &tco_res[ICH_RES_IO_TCO]; + res->start = tco_base & ~1; + res->end = res->start + 32 - 1; + res->flags = IORESOURCE_IO; + + /* + * Power Management registers. + */ + devfn = PCI_DEVFN(PCI_SLOT(pci_dev->devfn), 2); + pci_bus_read_config_dword(pci_dev->bus, devfn, ACPIBASE, &base_addr); + + res = &tco_res[ICH_RES_IO_SMI]; + res->start = (base_addr & ~1) + ACPIBASE_SMI_OFF; + res->end = res->start + 3; + res->flags = IORESOURCE_IO; + + /* + * Enable the ACPI I/O space. 
+ */ + pci_bus_read_config_dword(pci_dev->bus, devfn, ACPICTRL, &ctrl_val); + ctrl_val |= ACPICTRL_EN; + pci_bus_write_config_dword(pci_dev->bus, devfn, ACPICTRL, ctrl_val); + + if (priv->features & FEATURE_TCO_CNL) + priv->tco_pdev = i801_add_tco_cnl(priv, pci_dev, tco_res); + else + priv->tco_pdev = i801_add_tco_spt(priv, pci_dev, tco_res); + + if (IS_ERR(priv->tco_pdev)) + dev_warn(&pci_dev->dev, "failed to create iTCO device\n"); } #ifdef CONFIG_ACPI @@ -1704,13 +1732,21 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) switch (dev->device) { case PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_SMBUS: case PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_SMBUS: - case PCI_DEVICE_ID_INTEL_CANNONLAKE_H_SMBUS: - case PCI_DEVICE_ID_INTEL_CANNONLAKE_LP_SMBUS: case PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS: case PCI_DEVICE_ID_INTEL_LEWISBURG_SSKU_SMBUS: - case PCI_DEVICE_ID_INTEL_CDF_SMBUS: case PCI_DEVICE_ID_INTEL_DNV_SMBUS: case PCI_DEVICE_ID_INTEL_KABYLAKE_PCH_H_SMBUS: + priv->features |= FEATURE_I2C_BLOCK_READ; + priv->features |= FEATURE_IRQ; + priv->features |= FEATURE_SMBUS_PEC; + priv->features |= FEATURE_BLOCK_BUFFER; + priv->features |= FEATURE_TCO_SPT; + priv->features |= FEATURE_HOST_NOTIFY; + break; + + case PCI_DEVICE_ID_INTEL_CANNONLAKE_H_SMBUS: + case PCI_DEVICE_ID_INTEL_CANNONLAKE_LP_SMBUS: + case PCI_DEVICE_ID_INTEL_CDF_SMBUS: case PCI_DEVICE_ID_INTEL_ICELAKE_LP_SMBUS: case PCI_DEVICE_ID_INTEL_COMETLAKE_SMBUS: case PCI_DEVICE_ID_INTEL_ELKHART_LAKE_SMBUS: @@ -1720,9 +1756,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) priv->features |= FEATURE_IRQ; priv->features |= FEATURE_SMBUS_PEC; priv->features |= FEATURE_BLOCK_BUFFER; - /* If we have ACPI based watchdog use that instead */ - if (!acpi_has_watchdog()) - priv->features |= FEATURE_TCO; + priv->features |= FEATURE_TCO_CNL; priv->features |= FEATURE_HOST_NOTIFY; break; @@ -1921,8 +1955,7 @@ static int i801_suspend(struct device *dev) static int i801_resume(struct device *dev) { - struct pci_dev *pci_dev = to_pci_dev(dev); - struct i801_priv *priv = pci_get_drvdata(pci_dev); + struct i801_priv *priv = dev_get_drvdata(dev); i801_enable_host_notify(&priv->adapter); diff --git a/drivers/i2c/busses/i2c-icy.c b/drivers/i2c/busses/i2c-icy.c new file mode 100644 index 000000000000..8382eb64b424 --- /dev/null +++ b/drivers/i2c/busses/i2c-icy.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * I2C driver for stand-alone PCF8584 style adapters on Zorro cards + * + * Original ICY documentation can be found on Aminet: + * https://aminet.net/package/docs/hard/icy + * + * There has been a modern community re-print of this design in 2019: + * https://www.a1k.org/forum/index.php?threads/70106/ + * + * The card is basically a Philips PCF8584 connected straight to the + * beginning of the AutoConfig'd address space (register S1 on base+2), + * with /INT on /INT2 on the Zorro bus. + * + * Copyright (c) 2019 Max Staudt + * + * This started as a fork of i2c-elektor.c and has evolved since. + * Thanks go to its authors for providing a base to grow on. + * + * + * IRQ support is currently not implemented. + * + * As it turns out, i2c-algo-pcf is really written with i2c-elektor's + * edge-triggered ISA interrupts in mind, while the Amiga's Zorro bus has + * level-triggered interrupts. This means that once an interrupt occurs, we + * have to tell the PCF8584 to shut up immediately, or it will keep the + * interrupt line busy and cause an IRQ storm. 
+ * + * However, because of the PCF8584's host-side protocol, there is no good + * way to just quieten it without side effects. Rather, we have to perform + * the next read/write operation straight away, which will reset the /INT + * pin. This entails re-designing the core of i2c-algo-pcf in the future. + * For now, we never request an IRQ from the PCF8584, and poll it instead. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "../algos/i2c-algo-pcf.h" + +struct icy_i2c { + struct i2c_adapter adapter; + + void __iomem *reg_s0; + void __iomem *reg_s1; + struct fwnode_handle *ltc2990_fwnode; + struct i2c_client *ltc2990_client; +}; + +/* + * Functions called by i2c-algo-pcf + */ +static void icy_pcf_setpcf(void *data, int ctl, int val) +{ + struct icy_i2c *i2c = (struct icy_i2c *)data; + + u8 __iomem *address = ctl ? i2c->reg_s1 : i2c->reg_s0; + + z_writeb(val, address); +} + +static int icy_pcf_getpcf(void *data, int ctl) +{ + struct icy_i2c *i2c = (struct icy_i2c *)data; + + u8 __iomem *address = ctl ? i2c->reg_s1 : i2c->reg_s0; + + return z_readb(address); +} + +static int icy_pcf_getown(void *data) +{ + return 0x55; +} + +static int icy_pcf_getclock(void *data) +{ + return 0x1c; +} + +static void icy_pcf_waitforpin(void *data) +{ + usleep_range(50, 150); +} + +/* + * Main i2c-icy part + */ +static unsigned short const icy_ltc2990_addresses[] = { + 0x4c, 0x4d, 0x4e, 0x4f, I2C_CLIENT_END +}; + +/* + * Additional sensors exposed once this property is applied: + * + * in1 will be the voltage of the 5V rail, divided by 2. + * in2 will be the voltage of the 12V rail, divided by 4. + * temp3 will be measured using a PCB loop next to the chip. + */ +static const u32 icy_ltc2990_meas_mode[] = {0, 3}; + +static const struct property_entry icy_ltc2990_props[] = { + PROPERTY_ENTRY_U32_ARRAY("lltc,meas-mode", icy_ltc2990_meas_mode), + { } +}; + +static int icy_probe(struct zorro_dev *z, + const struct zorro_device_id *ent) +{ + struct icy_i2c *i2c; + struct i2c_algo_pcf_data *algo_data; + struct fwnode_handle *new_fwnode; + struct i2c_board_info ltc2990_info = { + .type = "ltc2990", + .addr = 0x4c, + }; + + i2c = devm_kzalloc(&z->dev, sizeof(*i2c), GFP_KERNEL); + if (!i2c) + return -ENOMEM; + + algo_data = devm_kzalloc(&z->dev, sizeof(*algo_data), GFP_KERNEL); + if (!algo_data) + return -ENOMEM; + + dev_set_drvdata(&z->dev, i2c); + i2c->adapter.dev.parent = &z->dev; + i2c->adapter.owner = THIS_MODULE; + /* i2c->adapter.algo assigned by i2c_pcf_add_bus() */ + i2c->adapter.algo_data = algo_data; + strlcpy(i2c->adapter.name, "ICY I2C Zorro adapter", + sizeof(i2c->adapter.name)); + + if (!devm_request_mem_region(&z->dev, + z->resource.start, + 4, i2c->adapter.name)) + return -ENXIO; + + /* Driver private data */ + i2c->reg_s0 = ZTWO_VADDR(z->resource.start); + i2c->reg_s1 = ZTWO_VADDR(z->resource.start + 2); + + algo_data->data = i2c; + algo_data->setpcf = icy_pcf_setpcf; + algo_data->getpcf = icy_pcf_getpcf; + algo_data->getown = icy_pcf_getown; + algo_data->getclock = icy_pcf_getclock; + algo_data->waitforpin = icy_pcf_waitforpin; + + if (i2c_pcf_add_bus(&i2c->adapter)) { + dev_err(&z->dev, "i2c_pcf_add_bus() failed\n"); + return -ENXIO; + } + + dev_info(&z->dev, "ICY I2C controller at %pa, IRQ not implemented\n", + &z->resource.start); + + /* + * The 2019 a1k.org PCBs have an LTC2990 at 0x4c, so start + * it automatically once ltc2990 is modprobed. + * + * in0 is the voltage of the internal 5V power supply.
+ * temp1 is the temperature inside the chip. + * + * See property_entry above for in1, in2, temp3. + */ + new_fwnode = fwnode_create_software_node(icy_ltc2990_props, NULL); + if (IS_ERR(new_fwnode)) { + dev_info(&z->dev, "Failed to create fwnode for LTC2990, error: %ld\n", + PTR_ERR(new_fwnode)); + } else { + /* + * Store the fwnode so we can destroy it on .remove(). + * Only store it on success, as fwnode_remove_software_node() + * is NULL safe, but not PTR_ERR safe. + */ + i2c->ltc2990_fwnode = new_fwnode; + ltc2990_info.fwnode = new_fwnode; + + i2c->ltc2990_client = + i2c_new_probed_device(&i2c->adapter, + <c2990_info, + icy_ltc2990_addresses, + NULL); + } + + return 0; +} + +static void icy_remove(struct zorro_dev *z) +{ + struct icy_i2c *i2c = dev_get_drvdata(&z->dev); + + i2c_unregister_device(i2c->ltc2990_client); + fwnode_remove_software_node(i2c->ltc2990_fwnode); + + i2c_del_adapter(&i2c->adapter); +} + +static const struct zorro_device_id icy_zorro_tbl[] = { + { ZORRO_ID(VMC, 15, 0), }, + { 0 } +}; + +MODULE_DEVICE_TABLE(zorro, icy_zorro_tbl); + +static struct zorro_driver icy_driver = { + .name = "i2c-icy", + .id_table = icy_zorro_tbl, + .probe = icy_probe, + .remove = icy_remove, +}; + +module_driver(icy_driver, + zorro_register_driver, + zorro_unregister_driver); + +MODULE_AUTHOR("Max Staudt "); +MODULE_DESCRIPTION("I2C bus via PCF8584 on ICY Zorro card"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/i2c/busses/i2c-imx-lpi2c.c b/drivers/i2c/busses/i2c-imx-lpi2c.c index dc00fabc919a..c92b56485fa6 100644 --- a/drivers/i2c/busses/i2c-imx-lpi2c.c +++ b/drivers/i2c/busses/i2c-imx-lpi2c.c @@ -545,7 +545,6 @@ MODULE_DEVICE_TABLE(of, lpi2c_imx_of_match); static int lpi2c_imx_probe(struct platform_device *pdev) { struct lpi2c_imx_struct *lpi2c_imx; - struct resource *res; unsigned int temp; int irq, ret; @@ -553,8 +552,7 @@ static int lpi2c_imx_probe(struct platform_device *pdev) if (!lpi2c_imx) return -ENOMEM; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - lpi2c_imx->base = devm_ioremap_resource(&pdev->dev, res); + lpi2c_imx->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(lpi2c_imx->base)) return PTR_ERR(lpi2c_imx->base); diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index 15f6cde6452f..a3b61336fe55 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -20,6 +20,7 @@ * */ +#include #include #include #include @@ -255,6 +256,12 @@ static const struct of_device_id i2c_imx_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, i2c_imx_dt_ids); +static const struct acpi_device_id i2c_imx_acpi_ids[] = { + {"NXP0001", .driver_data = (kernel_ulong_t)&vf610_i2c_hwdata}, + { } +}; +MODULE_DEVICE_TABLE(acpi, i2c_imx_acpi_ids); + static inline int is_imx1_i2c(struct imx_i2c_struct *i2c_imx) { return i2c_imx->hwdata->devtype == IMX1_I2C; @@ -1048,14 +1055,13 @@ static const struct i2c_algorithm i2c_imx_algo = { static int i2c_imx_probe(struct platform_device *pdev) { - const struct of_device_id *of_id = of_match_device(i2c_imx_dt_ids, - &pdev->dev); struct imx_i2c_struct *i2c_imx; struct resource *res; struct imxi2c_platform_data *pdata = dev_get_platdata(&pdev->dev); void __iomem *base; int irq, ret; dma_addr_t phy_addr; + const struct imx_i2c_hwdata *match; dev_dbg(&pdev->dev, "<%s>\n", __func__); @@ -1075,8 +1081,9 @@ static int i2c_imx_probe(struct platform_device *pdev) if (!i2c_imx) return -ENOMEM; - if (of_id) - i2c_imx->hwdata = of_id->data; + match = device_get_match_data(&pdev->dev); + if (match) + i2c_imx->hwdata = match; 
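Taken together, the i2c-imx hunks leave a two-stage hwdata lookup; a condensed sketch of the resulting order (the function name is invented, the calls are the ones the patch uses):

static const struct imx_i2c_hwdata *i2c_imx_hwdata_sketch(struct platform_device *pdev)
{
        const struct imx_i2c_hwdata *hwdata;

        /* Resolves both OF (i2c_imx_dt_ids) and ACPI (i2c_imx_acpi_ids) */
        hwdata = device_get_match_data(&pdev->dev);
        if (hwdata)
                return hwdata;

        /* Legacy platform-bus enumeration: fall back to the id_table */
        return (const struct imx_i2c_hwdata *)
                platform_get_device_id(pdev)->driver_data;
}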
else i2c_imx->hwdata = (struct imx_i2c_hwdata *) platform_get_device_id(pdev)->driver_data; @@ -1089,6 +1096,7 @@ static int i2c_imx_probe(struct platform_device *pdev) i2c_imx->adapter.nr = pdev->id; i2c_imx->adapter.dev.of_node = pdev->dev.of_node; i2c_imx->base = base; + ACPI_COMPANION_SET(&i2c_imx->adapter.dev, ACPI_COMPANION(&pdev->dev)); /* Get I2C clock */ i2c_imx->clk = devm_clk_get(&pdev->dev, NULL); @@ -1247,6 +1255,7 @@ static struct platform_driver i2c_imx_driver = { .name = DRIVER_NAME, .pm = &i2c_imx_pm_ops, .of_match_table = i2c_imx_dt_ids, + .acpi_match_table = i2c_imx_acpi_ids, }, .id_table = imx_i2c_devtype, }; diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index 02d23edb2fb1..2f95e25a10f7 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -781,8 +781,6 @@ static int ismt_dev_init(struct ismt_priv *priv) if (!priv->hw) return -ENOMEM; - memset(priv->hw, 0, (ISMT_DESC_ENTRIES * sizeof(struct ismt_desc))); - priv->head = 0; init_completion(&priv->cmp); diff --git a/drivers/i2c/busses/i2c-mxs.c b/drivers/i2c/busses/i2c-mxs.c index 7d79317a1046..89224913f578 100644 --- a/drivers/i2c/busses/i2c-mxs.c +++ b/drivers/i2c/busses/i2c-mxs.c @@ -802,7 +802,6 @@ static int mxs_i2c_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct mxs_i2c_dev *i2c; struct i2c_adapter *adap; - struct resource *res; int err, irq; i2c = devm_kzalloc(dev, sizeof(*i2c), GFP_KERNEL); @@ -814,8 +813,7 @@ static int mxs_i2c_probe(struct platform_device *pdev) i2c->dev_type = device_id->driver_data; } - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - i2c->regs = devm_ioremap_resource(&pdev->dev, res); + i2c->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(i2c->regs)) return PTR_ERR(i2c->regs); diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index 4117f1abc7c6..ca8b3ecfa93d 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -703,8 +703,9 @@ static int ocores_i2c_probe(struct platform_device *pdev) } if (ocores_algorithm.master_xfer != ocores_xfer_polling) { - ret = devm_request_irq(&pdev->dev, irq, ocores_isr, 0, - pdev->name, i2c); + ret = devm_request_any_context_irq(&pdev->dev, irq, + ocores_isr, 0, + pdev->name, i2c); if (ret) { dev_err(&pdev->dev, "Cannot claim IRQ\n"); goto err_clk; diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index cba325eb852f..30ded6422e7b 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -72,7 +72,8 @@ #define PIIX4_BLOCK_DATA 0x14 /* Multi-port constants */ -#define PIIX4_MAX_ADAPTERS 4 +#define PIIX4_MAX_ADAPTERS 4 +#define HUDSON2_MAIN_PORTS 2 /* HUDSON2, KERNCZ reserves ports 3, 4 */ /* SB800 constants */ #define SB800_PIIX4_SMB_IDX 0xcd6 @@ -806,10 +807,12 @@ MODULE_DEVICE_TABLE (pci, piix4_ids); static struct i2c_adapter *piix4_main_adapters[PIIX4_MAX_ADAPTERS]; static struct i2c_adapter *piix4_aux_adapter; +static int piix4_adapter_count; static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba, bool sb800_main, u8 port, bool notify_imc, - const char *name, struct i2c_adapter **padap) + u8 hw_port_nr, const char *name, + struct i2c_adapter **padap) { struct i2c_adapter *adap; struct i2c_piix4_adapdata *adapdata; @@ -841,6 +844,12 @@ static int piix4_add_adapter(struct pci_dev *dev, unsigned short smba, /* set up the sysfs linkage to our parent device */ adap->dev.parent = &dev->dev; + if (has_acpi_companion(&dev->dev)) { + 
acpi_preset_companion(&adap->dev, + ACPI_COMPANION(&dev->dev), + hw_port_nr); + } + snprintf(adap->name, sizeof(adap->name), "SMBus PIIX4 adapter%s at %04x", name, smba); @@ -865,8 +874,19 @@ static int piix4_add_adapters_sb800(struct pci_dev *dev, unsigned short smba, int port; int retval; - for (port = 0; port < PIIX4_MAX_ADAPTERS; port++) { + if (dev->device == PCI_DEVICE_ID_AMD_KERNCZ_SMBUS || + (dev->device == PCI_DEVICE_ID_AMD_HUDSON2_SMBUS && + dev->revision >= 0x1F)) { + piix4_adapter_count = HUDSON2_MAIN_PORTS; + } else { + piix4_adapter_count = PIIX4_MAX_ADAPTERS; + } + + for (port = 0; port < piix4_adapter_count; port++) { + u8 hw_port_nr = port == 0 ? 0 : port + 1; + retval = piix4_add_adapter(dev, smba, true, port, notify_imc, + hw_port_nr, piix4_main_port_names_sb800[port], &piix4_main_adapters[port]); if (retval < 0) @@ -937,8 +957,8 @@ static int piix4_probe(struct pci_dev *dev, const struct pci_device_id *id) return retval; /* Try to register main SMBus adapter, give up if we can't */ - retval = piix4_add_adapter(dev, retval, false, 0, false, "", - &piix4_main_adapters[0]); + retval = piix4_add_adapter(dev, retval, false, 0, false, 0, + "", &piix4_main_adapters[0]); if (retval < 0) return retval; } @@ -964,7 +984,7 @@ static int piix4_probe(struct pci_dev *dev, const struct pci_device_id *id) if (retval > 0) { /* Try to add the aux adapter if it exists, * piix4_add_adapter will clean up if this fails */ - piix4_add_adapter(dev, retval, false, 0, false, + piix4_add_adapter(dev, retval, false, 0, false, 1, is_sb800 ? piix4_aux_port_name_sb800 : "", &piix4_aux_adapter); } @@ -987,7 +1007,7 @@ static void piix4_adap_remove(struct i2c_adapter *adap) static void piix4_remove(struct pci_dev *dev) { - int port = PIIX4_MAX_ADAPTERS; + int port = piix4_adapter_count; while (--port >= 0) { if (piix4_main_adapters[port]) { diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c index 961123529678..b432e7580458 100644 --- a/drivers/i2c/busses/i2c-sprd.c +++ b/drivers/i2c/busses/i2c-sprd.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -465,9 +466,9 @@ static int sprd_i2c_clk_init(struct sprd_i2c *i2c_dev) i2c_dev->clk = devm_clk_get(i2c_dev->dev, "enable"); if (IS_ERR(i2c_dev->clk)) { - dev_warn(i2c_dev->dev, "i2c%d can't get the enable clock\n", - i2c_dev->adap.nr); - i2c_dev->clk = NULL; + dev_err(i2c_dev->dev, "i2c%d can't get the enable clock\n", + i2c_dev->adap.nr); + return PTR_ERR(i2c_dev->clk); } return 0; @@ -477,7 +478,6 @@ static int sprd_i2c_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct sprd_i2c *i2c_dev; - struct resource *res; u32 prop; int ret; @@ -487,8 +487,7 @@ static int sprd_i2c_probe(struct platform_device *pdev) if (!i2c_dev) return -ENOMEM; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - i2c_dev->base = devm_ioremap_resource(dev, res); + i2c_dev->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(i2c_dev->base)) return PTR_ERR(i2c_dev->base); @@ -520,7 +519,10 @@ static int sprd_i2c_probe(struct platform_device *pdev) if (i2c_dev->bus_freq != 100000 && i2c_dev->bus_freq != 400000) return -EINVAL; - sprd_i2c_clk_init(i2c_dev); + ret = sprd_i2c_clk_init(i2c_dev); + if (ret) + return ret; + platform_set_drvdata(pdev, i2c_dev); ret = clk_prepare_enable(i2c_dev->clk); @@ -644,8 +646,7 @@ static struct platform_driver sprd_i2c_driver = { }, }; -static int sprd_i2c_init(void) -{ - return platform_driver_register(&sprd_i2c_driver); -} 
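Several conversions in this series swap the same two-step idiom for devm_platform_ioremap_resource(); the helper is essentially this wrapper, so the conversions are behavior-preserving:

static void __iomem *devm_platform_ioremap_resource_sketch(struct platform_device *pdev,
                                                           unsigned int index)
{
        struct resource *res;

        res = platform_get_resource(pdev, IORESOURCE_MEM, index);
        return devm_ioremap_resource(&pdev->dev, res);
}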
-arch_initcall_sync(sprd_i2c_init); +module_platform_driver(sprd_i2c_driver); + +MODULE_DESCRIPTION("Spreadtrum I2C master controller driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index 266d1c269b83..d36cf08461f7 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -1809,7 +1809,7 @@ static u32 stm32f7_i2c_func(struct i2c_adapter *adap) I2C_FUNC_SMBUS_I2C_BLOCK; } -static struct i2c_algorithm stm32f7_i2c_algo = { +static const struct i2c_algorithm stm32f7_i2c_algo = { .master_xfer = stm32f7_i2c_xfer, .smbus_xfer = stm32f7_i2c_smbus_xfer, .functionality = stm32f7_i2c_func, diff --git a/drivers/i2c/busses/i2c-synquacer.c b/drivers/i2c/busses/i2c-synquacer.c index f724c8e6b360..39762f0611b1 100644 --- a/drivers/i2c/busses/i2c-synquacer.c +++ b/drivers/i2c/busses/i2c-synquacer.c @@ -526,7 +526,7 @@ static const struct i2c_algorithm synquacer_i2c_algo = { .functionality = synquacer_i2c_functionality, }; -static struct i2c_adapter synquacer_i2c_ops = { +static const struct i2c_adapter synquacer_i2c_ops = { .owner = THIS_MODULE, .name = "synquacer_i2c-adapter", .algo = &synquacer_i2c_algo, diff --git a/drivers/i2c/busses/i2c-taos-evm.c b/drivers/i2c/busses/i2c-taos-evm.c index 37347c93e8e0..0bff3f3a8779 100644 --- a/drivers/i2c/busses/i2c-taos-evm.c +++ b/drivers/i2c/busses/i2c-taos-evm.c @@ -39,7 +39,7 @@ struct taos_data { }; /* TAOS TSL2550 EVM */ -static struct i2c_board_info tsl2550_info = { +static const struct i2c_board_info tsl2550_info = { I2C_BOARD_INFO("tsl2550", 0x39), }; diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 9fcb13beeb8f..c1683f9338b4 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -636,7 +636,7 @@ static void tegra_dvc_init(struct tegra_i2c_dev *i2c_dev) dvc_writel(i2c_dev, val, DVC_CTRL_REG1); } -static int tegra_i2c_runtime_resume(struct device *dev) +static int __maybe_unused tegra_i2c_runtime_resume(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); int ret; @@ -665,7 +665,7 @@ static int tegra_i2c_runtime_resume(struct device *dev) return 0; } -static int tegra_i2c_runtime_suspend(struct device *dev) +static int __maybe_unused tegra_i2c_runtime_suspend(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); @@ -713,12 +713,6 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev, bool clk_reinit) u32 tsu_thd; u8 tlow, thigh; - err = pm_runtime_get_sync(i2c_dev->dev); - if (err < 0) { - dev_err(i2c_dev->dev, "runtime resume failed %d\n", err); - return err; - } - reset_control_assert(i2c_dev->rst); udelay(2); reset_control_deassert(i2c_dev->rst); @@ -772,7 +766,7 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev, bool clk_reinit) if (err) { dev_err(i2c_dev->dev, "failed changing clock rate: %d\n", err); - goto err; + return err; } } @@ -787,23 +781,21 @@ static int tegra_i2c_init(struct tegra_i2c_dev *i2c_dev, bool clk_reinit) err = tegra_i2c_flush_fifos(i2c_dev); if (err) - goto err; + return err; if (i2c_dev->is_multimaster_mode && i2c_dev->hw->has_slcg_override_reg) i2c_writel(i2c_dev, I2C_MST_CORE_CLKEN_OVR, I2C_CLKEN_OVERRIDE); err = tegra_i2c_wait_for_config_load(i2c_dev); if (err) - goto err; + return err; if (i2c_dev->irq_disabled) { i2c_dev->irq_disabled = false; enable_irq(i2c_dev->irq); } -err: - pm_runtime_put(i2c_dev->dev); - return err; + return 0; } static int tegra_i2c_disable_packet_mode(struct tegra_i2c_dev *i2c_dev) @@ 
-1616,12 +1608,14 @@ static int tegra_i2c_probe(struct platform_device *pdev) } pm_runtime_enable(&pdev->dev); - if (!pm_runtime_enabled(&pdev->dev)) { + if (!pm_runtime_enabled(&pdev->dev)) ret = tegra_i2c_runtime_resume(&pdev->dev); - if (ret < 0) { - dev_err(&pdev->dev, "runtime resume failed\n"); - goto unprepare_div_clk; - } + else + ret = pm_runtime_get_sync(i2c_dev->dev); + + if (ret < 0) { + dev_err(&pdev->dev, "runtime resume failed\n"); + goto unprepare_div_clk; } if (i2c_dev->is_multimaster_mode) { @@ -1666,6 +1660,8 @@ static int tegra_i2c_probe(struct platform_device *pdev) if (ret) goto release_dma; + pm_runtime_put(&pdev->dev); + return 0; release_dma: @@ -1711,8 +1707,7 @@ static int tegra_i2c_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM_SLEEP -static int tegra_i2c_suspend(struct device *dev) +static int __maybe_unused tegra_i2c_suspend(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); @@ -1721,38 +1716,41 @@ static int tegra_i2c_suspend(struct device *dev) return 0; } -static int tegra_i2c_resume(struct device *dev) +static int __maybe_unused tegra_i2c_resume(struct device *dev) { struct tegra_i2c_dev *i2c_dev = dev_get_drvdata(dev); int err; + err = tegra_i2c_runtime_resume(dev); + if (err) + return err; + err = tegra_i2c_init(i2c_dev, false); if (err) return err; + err = tegra_i2c_runtime_suspend(dev); + if (err) + return err; + i2c_mark_adapter_resumed(&i2c_dev->adapter); return 0; } static const struct dev_pm_ops tegra_i2c_pm = { - SET_SYSTEM_SLEEP_PM_OPS(tegra_i2c_suspend, tegra_i2c_resume) + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(tegra_i2c_suspend, tegra_i2c_resume) SET_RUNTIME_PM_OPS(tegra_i2c_runtime_suspend, tegra_i2c_runtime_resume, NULL) }; -#define TEGRA_I2C_PM (&tegra_i2c_pm) -#else -#define TEGRA_I2C_PM NULL -#endif - static struct platform_driver tegra_i2c_driver = { .probe = tegra_i2c_probe, .remove = tegra_i2c_remove, .driver = { .name = "tegra-i2c", .of_match_table = tegra_i2c_of_match, - .pm = TEGRA_I2C_PM, + .pm = &tegra_i2c_pm, }, }; diff --git a/drivers/i2c/busses/i2c-uniphier-f.c b/drivers/i2c/busses/i2c-uniphier-f.c index 7acca2599f04..4241aac79e7e 100644 --- a/drivers/i2c/busses/i2c-uniphier-f.c +++ b/drivers/i2c/busses/i2c-uniphier-f.c @@ -108,7 +108,6 @@ static void uniphier_fi2c_fill_txfifo(struct uniphier_fi2c_priv *priv, if (fifo_space-- <= 0) break; - dev_dbg(&priv->adap.dev, "write data: %02x\n", *priv->buf); writel(*priv->buf++, priv->membase + UNIPHIER_FI2C_DTTX); priv->len--; } @@ -124,7 +123,6 @@ static void uniphier_fi2c_drain_rxfifo(struct uniphier_fi2c_priv *priv) break; *priv->buf++ = readl(priv->membase + UNIPHIER_FI2C_DTRX); - dev_dbg(&priv->adap.dev, "read data: %02x\n", priv->buf[-1]); priv->len--; } } @@ -142,8 +140,6 @@ static void uniphier_fi2c_clear_irqs(struct uniphier_fi2c_priv *priv, static void uniphier_fi2c_stop(struct uniphier_fi2c_priv *priv) { - dev_dbg(&priv->adap.dev, "stop condition\n"); - priv->enabled_irqs |= UNIPHIER_FI2C_INT_STOP; uniphier_fi2c_set_irqs(priv); writel(UNIPHIER_FI2C_CR_MST | UNIPHIER_FI2C_CR_STO, @@ -160,21 +156,15 @@ static irqreturn_t uniphier_fi2c_interrupt(int irq, void *dev_id) irq_status = readl(priv->membase + UNIPHIER_FI2C_INT); irq_status &= priv->enabled_irqs; - dev_dbg(&priv->adap.dev, - "interrupt: enabled_irqs=%04x, irq_status=%04x\n", - priv->enabled_irqs, irq_status); - if (irq_status & UNIPHIER_FI2C_INT_STOP) goto complete; if (unlikely(irq_status & UNIPHIER_FI2C_INT_AL)) { - dev_dbg(&priv->adap.dev, "arbitration lost\n"); priv->error = 
-EAGAIN; goto complete; } if (unlikely(irq_status & UNIPHIER_FI2C_INT_NA)) { - dev_dbg(&priv->adap.dev, "could not get ACK\n"); priv->error = -ENXIO; if (priv->flags & UNIPHIER_FI2C_RD) { /* @@ -215,18 +205,14 @@ static irqreturn_t uniphier_fi2c_interrupt(int irq, void *dev_id) if (unlikely(priv->flags & UNIPHIER_FI2C_MANUAL_NACK)) { if (priv->len <= UNIPHIER_FI2C_FIFO_SIZE && !(priv->flags & UNIPHIER_FI2C_BYTE_WISE)) { - dev_dbg(&priv->adap.dev, - "enable read byte count IRQ\n"); priv->enabled_irqs |= UNIPHIER_FI2C_INT_RB; uniphier_fi2c_set_irqs(priv); priv->flags |= UNIPHIER_FI2C_BYTE_WISE; } - if (priv->len <= 1) { - dev_dbg(&priv->adap.dev, "set NACK\n"); + if (priv->len <= 1) writel(UNIPHIER_FI2C_CR_MST | UNIPHIER_FI2C_CR_NACK, priv->membase + UNIPHIER_FI2C_CR); - } } goto handled; @@ -334,10 +320,6 @@ static int uniphier_fi2c_master_xfer_one(struct i2c_adapter *adap, bool is_read = msg->flags & I2C_M_RD; unsigned long time_left, flags; - dev_dbg(&adap->dev, "%s: addr=0x%02x, len=%d, repeat=%d, stop=%d\n", - is_read ? "receive" : "transmit", msg->addr, msg->len, - repeat, stop); - priv->len = msg->len; priv->buf = msg->buf; priv->enabled_irqs = UNIPHIER_FI2C_INT_FAULTS; @@ -359,7 +341,6 @@ static int uniphier_fi2c_master_xfer_one(struct i2c_adapter *adap, else uniphier_fi2c_tx_init(priv, msg->addr, repeat); - dev_dbg(&adap->dev, "start condition\n"); /* * For a repeated START condition, writing a slave address to the FIFO * kicks the controller. So, the UNIPHIER_FI2C_CR register should be @@ -383,7 +364,6 @@ static int uniphier_fi2c_master_xfer_one(struct i2c_adapter *adap, uniphier_fi2c_recover(priv); return -ETIMEDOUT; } - dev_dbg(&adap->dev, "complete\n"); if (unlikely(priv->flags & UNIPHIER_FI2C_DEFER_STOP_COMP)) { u32 status; @@ -538,7 +518,6 @@ static int uniphier_fi2c_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct uniphier_fi2c_priv *priv; - struct resource *regs; u32 bus_speed; unsigned long clk_rate; int irq, ret; @@ -547,8 +526,7 @@ static int uniphier_fi2c_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); - priv->membase = devm_ioremap_resource(dev, regs); + priv->membase = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(priv->membase)) return PTR_ERR(priv->membase); diff --git a/drivers/i2c/busses/i2c-uniphier.c b/drivers/i2c/busses/i2c-uniphier.c index 0173840c32af..0270090c0360 100644 --- a/drivers/i2c/busses/i2c-uniphier.c +++ b/drivers/i2c/busses/i2c-uniphier.c @@ -71,7 +71,6 @@ static int uniphier_i2c_xfer_byte(struct i2c_adapter *adap, u32 txdata, reinit_completion(&priv->comp); txdata |= UNIPHIER_I2C_DTRM_IRQEN; - dev_dbg(&adap->dev, "write data: 0x%04x\n", txdata); writel(txdata, priv->membase + UNIPHIER_I2C_DTRM); time_left = wait_for_completion_timeout(&priv->comp, adap->timeout); @@ -81,8 +80,6 @@ static int uniphier_i2c_xfer_byte(struct i2c_adapter *adap, u32 txdata, } rxdata = readl(priv->membase + UNIPHIER_I2C_DREC); - dev_dbg(&adap->dev, "read data: 0x%04x\n", rxdata); - if (rxdatap) *rxdatap = rxdata; @@ -98,14 +95,11 @@ static int uniphier_i2c_send_byte(struct i2c_adapter *adap, u32 txdata) if (ret) return ret; - if (unlikely(rxdata & UNIPHIER_I2C_DREC_LAB)) { - dev_dbg(&adap->dev, "arbitration lost\n"); + if (unlikely(rxdata & UNIPHIER_I2C_DREC_LAB)) return -EAGAIN; - } - if (unlikely(rxdata & UNIPHIER_I2C_DREC_LRB)) { - dev_dbg(&adap->dev, "could not get ACK\n"); + + if (unlikely(rxdata & UNIPHIER_I2C_DREC_LRB)) return -ENXIO; - } return 0; } @@ 
-115,7 +109,6 @@ static int uniphier_i2c_tx(struct i2c_adapter *adap, u16 addr, u16 len, { int ret; - dev_dbg(&adap->dev, "start condition\n"); ret = uniphier_i2c_send_byte(adap, addr << 1 | UNIPHIER_I2C_DTRM_STA | UNIPHIER_I2C_DTRM_NACK); @@ -137,7 +130,6 @@ static int uniphier_i2c_rx(struct i2c_adapter *adap, u16 addr, u16 len, { int ret; - dev_dbg(&adap->dev, "start condition\n"); ret = uniphier_i2c_send_byte(adap, addr << 1 | UNIPHIER_I2C_DTRM_STA | UNIPHIER_I2C_DTRM_NACK | @@ -161,7 +153,6 @@ static int uniphier_i2c_rx(struct i2c_adapter *adap, u16 addr, u16 len, static int uniphier_i2c_stop(struct i2c_adapter *adap) { - dev_dbg(&adap->dev, "stop condition\n"); return uniphier_i2c_send_byte(adap, UNIPHIER_I2C_DTRM_STO | UNIPHIER_I2C_DTRM_NACK); } @@ -173,9 +164,6 @@ static int uniphier_i2c_master_xfer_one(struct i2c_adapter *adap, bool recovery = false; int ret; - dev_dbg(&adap->dev, "%s: addr=0x%02x, len=%d, stop=%d\n", - is_read ? "receive" : "transmit", msg->addr, msg->len, stop); - if (is_read) ret = uniphier_i2c_rx(adap, msg->addr, msg->len, msg->buf); else @@ -326,7 +314,6 @@ static int uniphier_i2c_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct uniphier_i2c_priv *priv; - struct resource *regs; u32 bus_speed; unsigned long clk_rate; int irq, ret; @@ -335,8 +322,7 @@ static int uniphier_i2c_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); - priv->membase = devm_ioremap_resource(dev, regs); + priv->membase = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(priv->membase)) return PTR_ERR(priv->membase); diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index 72b300174cb8..5f6a4985f2bc 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -350,13 +350,11 @@ static int i2c_device_probe(struct device *dev) return -ENODEV; if (client->flags & I2C_CLIENT_WAKE) { - int wakeirq = -ENOENT; + int wakeirq; - if (dev->of_node) { - wakeirq = of_irq_get_byname(dev->of_node, "wakeup"); - if (wakeirq == -EPROBE_DEFER) - return wakeirq; - } + wakeirq = of_irq_get_byname(dev->of_node, "wakeup"); + if (wakeirq == -EPROBE_DEFER) + return wakeirq; device_init_wakeup(&client->dev, true); @@ -966,7 +964,7 @@ struct i2c_client *devm_i2c_new_dummy_device(struct device *dev, EXPORT_SYMBOL_GPL(devm_i2c_new_dummy_device); /** - * i2c_new_secondary_device - Helper to get the instantiated secondary address + * i2c_new_ancillary_device - Helper to get the instantiated secondary address * and create the associated device * @client: Handle to the primary client * @name: Handle to specify which secondary address to get @@ -985,9 +983,9 @@ EXPORT_SYMBOL_GPL(devm_i2c_new_dummy_device); * cell whose "reg-names" value matches the slave name. * * This returns the new i2c client, which should be saved for later use with - * i2c_unregister_device(); or NULL to indicate an error. + * i2c_unregister_device(); or an ERR_PTR to describe the error. 
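Callers of the renamed helper therefore move from NULL checks to the ERR_PTR convention, as the adv748x and adv76xx hunks below do. A minimal caller sketch; the "dpll" name and 0x26 address are placeholders (real users match the name against DT "reg-names"):

static int attach_ancillary_sketch(struct i2c_client *client)
{
        struct i2c_client *ancillary;

        ancillary = i2c_new_ancillary_device(client, "dpll", 0x26);
        if (IS_ERR(ancillary))
                return PTR_ERR(ancillary);      /* ERR_PTR, never NULL */

        /* ... talk to the secondary address, then release it ... */
        i2c_unregister_device(ancillary);
        return 0;
}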
*/ -struct i2c_client *i2c_new_secondary_device(struct i2c_client *client, +struct i2c_client *i2c_new_ancillary_device(struct i2c_client *client, const char *name, u16 default_addr) { @@ -1002,9 +1000,9 @@ struct i2c_client *i2c_new_secondary_device(struct i2c_client *client, } dev_dbg(&client->adapter->dev, "Address for %s : 0x%x\n", name, addr); - return i2c_new_dummy(client->adapter, addr); + return i2c_new_dummy_device(client->adapter, addr); } -EXPORT_SYMBOL_GPL(i2c_new_secondary_device); +EXPORT_SYMBOL_GPL(i2c_new_ancillary_device); /* ------------------------------------------------------------------------- */ diff --git a/drivers/i2c/i2c-slave-eeprom.c b/drivers/i2c/i2c-slave-eeprom.c index be65d3842878..92ff9991bae8 100644 --- a/drivers/i2c/i2c-slave-eeprom.c +++ b/drivers/i2c/i2c-slave-eeprom.c @@ -11,6 +11,13 @@ * pointer, yet implementation is deferred until the need actually arises. */ +/* + * FIXME: What to do if only 8 bits of a 16 bit address are sent? + * The ST-M24C64 sends only 0xff then. Needs verification with other + * EEPROMs, though. We currently use the 8 bit as a valid address. + */ + +#include #include #include #include @@ -21,12 +28,18 @@ struct eeprom_data { struct bin_attribute bin; - bool first_write; spinlock_t buffer_lock; - u8 buffer_idx; + u16 buffer_idx; + u16 address_mask; + u8 num_address_bytes; + u8 idx_write_cnt; u8 buffer[]; }; +#define I2C_SLAVE_BYTELEN GENMASK(15, 0) +#define I2C_SLAVE_FLAG_ADDR16 BIT(16) +#define I2C_SLAVE_DEVICE_MAGIC(_len, _flags) ((_flags) | (_len)) + static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, enum i2c_slave_event event, u8 *val) { @@ -34,12 +47,14 @@ static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, switch (event) { case I2C_SLAVE_WRITE_RECEIVED: - if (eeprom->first_write) { - eeprom->buffer_idx = *val; - eeprom->first_write = false; + if (eeprom->idx_write_cnt < eeprom->num_address_bytes) { + if (eeprom->idx_write_cnt == 0) + eeprom->buffer_idx = 0; + eeprom->buffer_idx = *val | (eeprom->buffer_idx << 8); + eeprom->idx_write_cnt++; } else { spin_lock(&eeprom->buffer_lock); - eeprom->buffer[eeprom->buffer_idx++] = *val; + eeprom->buffer[eeprom->buffer_idx++ & eeprom->address_mask] = *val; spin_unlock(&eeprom->buffer_lock); } break; @@ -50,7 +65,7 @@ static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, /* fallthrough */ case I2C_SLAVE_READ_REQUESTED: spin_lock(&eeprom->buffer_lock); - *val = eeprom->buffer[eeprom->buffer_idx]; + *val = eeprom->buffer[eeprom->buffer_idx & eeprom->address_mask]; spin_unlock(&eeprom->buffer_lock); /* * Do not increment buffer_idx here, because we don't know if @@ -61,7 +76,7 @@ static int i2c_slave_eeprom_slave_cb(struct i2c_client *client, case I2C_SLAVE_STOP: case I2C_SLAVE_WRITE_REQUESTED: - eeprom->first_write = true; + eeprom->idx_write_cnt = 0; break; default: @@ -105,13 +120,16 @@ static int i2c_slave_eeprom_probe(struct i2c_client *client, const struct i2c_de { struct eeprom_data *eeprom; int ret; - unsigned size = id->driver_data; + unsigned int size = FIELD_GET(I2C_SLAVE_BYTELEN, id->driver_data); + unsigned int flag_addr16 = FIELD_GET(I2C_SLAVE_FLAG_ADDR16, id->driver_data); eeprom = devm_kzalloc(&client->dev, sizeof(struct eeprom_data) + size, GFP_KERNEL); if (!eeprom) return -ENOMEM; - eeprom->first_write = true; + eeprom->idx_write_cnt = 0; + eeprom->num_address_bytes = flag_addr16 ? 
2 : 1; + eeprom->address_mask = size - 1; spin_lock_init(&eeprom->buffer_lock); i2c_set_clientdata(client, eeprom); @@ -146,7 +164,9 @@ static int i2c_slave_eeprom_remove(struct i2c_client *client) } static const struct i2c_device_id i2c_slave_eeprom_id[] = { - { "slave-24c02", 2048 / 8 }, + { "slave-24c02", I2C_SLAVE_DEVICE_MAGIC(2048 / 8, 0) }, + { "slave-24c32", I2C_SLAVE_DEVICE_MAGIC(32768 / 8, I2C_SLAVE_FLAG_ADDR16) }, + { "slave-24c64", I2C_SLAVE_DEVICE_MAGIC(65536 / 8, I2C_SLAVE_FLAG_ADDR16) }, { } }; MODULE_DEVICE_TABLE(i2c, i2c_slave_eeprom_id); diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index dce06108c8c3..5337393d4dfe 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -583,8 +583,10 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, break; } - /* P2PDMA contexts do not need to be unmapped */ - if (!is_pci_p2pdma_page(sg_page(sg))) + if (is_pci_p2pdma_page(sg_page(sg))) + pci_p2pdma_unmap_sg(qp->pd->device->dma_device, sg, + sg_cnt, dir); + else ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); } EXPORT_SYMBOL(rdma_rw_ctx_destroy); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 41f9e268e3fb..24244a2f68cc 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -54,10 +54,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { page = sg_page_iter_page(&sg_iter); - if (umem->writable && dirty) - put_user_pages_dirty_lock(&page, 1); - else - put_user_page(page); + put_user_pages_dirty_lock(&page, 1, umem->writable && dirty); } sg_free_table(&umem->sg_head); diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index b89a9b9aef7a..469acb961fbd 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -118,10 +118,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, size_t npages, bool dirty) { - if (dirty) - put_user_pages_dirty_lock(p, npages); - else - put_user_pages(p, npages); + put_user_pages_dirty_lock(p, npages, dirty); if (mm) { /* during close after signal, mm can be NULL */ atomic64_sub(npages, &mm->pinned_vm); diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 9f53f63b1453..7bff0a1e713d 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1041,7 +1041,7 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (cb) iowait_pio_inc(&priv->s_iowait); pbuf = sc_buffer_alloc(sc, plen, cb, qp); - if (unlikely(IS_ERR_OR_NULL(pbuf))) { + if (IS_ERR_OR_NULL(pbuf)) { if (cb) verbs_pio_complete(qp, 0); if (IS_ERR(pbuf)) { diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index bfbfbb7e0ff4..6bf764e41891 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -40,10 +40,7 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages, int dirty) { - if (dirty) - put_user_pages_dirty_lock(p, num_pages); - else - put_user_pages(p, num_pages); + put_user_pages_dirty_lock(p, num_pages, dirty); } /** diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 0b0237d41613..62e6ffa9ad78 100644 --- 
a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -75,10 +75,7 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) for_each_sg(chunk->page_list, sg, chunk->nents, i) { page = sg_page(sg); pa = sg_phys(sg); - if (dirty) - put_user_pages_dirty_lock(&page, 1); - else - put_user_page(page); + put_user_pages_dirty_lock(&page, 1, dirty); usnic_dbg("pa: %pa\n", &pa); } kfree(chunk); diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index 87a56039f0ef..e99983f07663 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -63,15 +63,7 @@ struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index) static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages, bool dirty) { - struct page **p = chunk->plist; - - while (num_pages--) { - if (!PageDirty(*p) && dirty) - put_user_pages_dirty_lock(p, 1); - else - put_user_page(*p); - p++; - } + put_user_pages_dirty_lock(chunk->plist, num_pages, dirty); } void siw_umem_release(struct siw_umem *umem, bool dirty) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 9118ab85cb3a..dab4446fe7d8 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -345,6 +345,14 @@ static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) #define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) #endif +static void dm_integrity_prepare(struct request *rq) +{ +} + +static void dm_integrity_complete(struct request *rq, unsigned int nr_bytes) +{ +} + /* * DM Integrity profile, protection is performed layer above (dm-crypt) */ @@ -352,6 +360,8 @@ static const struct blk_integrity_profile dm_integrity_profile = { .name = "DM-DIF-EXT-TAG", .generate_fn = NULL, .verify_fn = NULL, + .prepare_fn = dm_integrity_prepare, + .complete_fn = dm_integrity_complete, }; static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); diff --git a/drivers/media/i2c/adv748x/adv748x-core.c b/drivers/media/i2c/adv748x/adv748x-core.c index 0a47d474e97a..23e02ff27b17 100644 --- a/drivers/media/i2c/adv748x/adv748x-core.c +++ b/drivers/media/i2c/adv748x/adv748x-core.c @@ -183,14 +183,14 @@ static int adv748x_initialise_clients(struct adv748x_state *state) int ret; for (i = ADV748X_PAGE_DPLL; i < ADV748X_PAGE_MAX; ++i) { - state->i2c_clients[i] = i2c_new_secondary_device( + state->i2c_clients[i] = i2c_new_ancillary_device( state->client, adv748x_default_addresses[i].name, adv748x_default_addresses[i].default_addr); - if (state->i2c_clients[i] == NULL) { + if (IS_ERR(state->i2c_clients[i])) { adv_err(state, "failed to create i2c client %u\n", i); - return -ENOMEM; + return PTR_ERR(state->i2c_clients[i]); } ret = adv748x_configure_regmap(state, i); diff --git a/drivers/media/i2c/adv7604.c b/drivers/media/i2c/adv7604.c index 28a84bf9f8a9..2dedd6ebb236 100644 --- a/drivers/media/i2c/adv7604.c +++ b/drivers/media/i2c/adv7604.c @@ -2862,10 +2862,8 @@ static void adv76xx_unregister_clients(struct adv76xx_state *state) { unsigned int i; - for (i = 1; i < ARRAY_SIZE(state->i2c_clients); ++i) { - if (state->i2c_clients[i]) - i2c_unregister_device(state->i2c_clients[i]); - } + for (i = 1; i < ARRAY_SIZE(state->i2c_clients); ++i) + i2c_unregister_device(state->i2c_clients[i]); } static struct i2c_client *adv76xx_dummy_client(struct v4l2_subdev *sd, @@ -2878,14 +2876,14 @@ static struct i2c_client *adv76xx_dummy_client(struct v4l2_subdev *sd, struct i2c_client *new_client; if (pdata && 
pdata->i2c_addresses[page]) - new_client = i2c_new_dummy(client->adapter, + new_client = i2c_new_dummy_device(client->adapter, pdata->i2c_addresses[page]); else - new_client = i2c_new_secondary_device(client, + new_client = i2c_new_ancillary_device(client, adv76xx_default_addresses[page].name, adv76xx_default_addresses[page].default_addr); - if (new_client) + if (!IS_ERR(new_client)) io_write(sd, io_reg, new_client->addr << 1); return new_client; @@ -3516,15 +3514,19 @@ static int adv76xx_probe(struct i2c_client *client, } for (i = 1; i < ADV76XX_PAGE_MAX; ++i) { + struct i2c_client *dummy_client; + if (!(BIT(i) & state->info->page_mask)) continue; - state->i2c_clients[i] = adv76xx_dummy_client(sd, i); - if (!state->i2c_clients[i]) { - err = -EINVAL; + dummy_client = adv76xx_dummy_client(sd, i); + if (IS_ERR(dummy_client)) { + err = PTR_ERR(dummy_client); v4l2_err(sd, "failed to create i2c client %u\n", i); goto err_i2c; } + + state->i2c_clients[i] = dummy_client; } INIT_DELAYED_WORK(&state->delayed_work_enable_hotplug, diff --git a/drivers/media/v4l2-core/videobuf-dma-contig.c b/drivers/media/v4l2-core/videobuf-dma-contig.c index 76b4ac7b1678..aeb2f497c683 100644 --- a/drivers/media/v4l2-core/videobuf-dma-contig.c +++ b/drivers/media/v4l2-core/videobuf-dma-contig.c @@ -157,6 +157,7 @@ static void videobuf_dma_contig_user_put(struct videobuf_dma_contig_memory *mem) static int videobuf_dma_contig_user_get(struct videobuf_dma_contig_memory *mem, struct videobuf_buffer *vb) { + unsigned long untagged_baddr = untagged_addr(vb->baddr); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long prev_pfn, this_pfn; @@ -164,22 +165,22 @@ static int videobuf_dma_contig_user_get(struct videobuf_dma_contig_memory *mem, unsigned int offset; int ret; - offset = vb->baddr & ~PAGE_MASK; + offset = untagged_baddr & ~PAGE_MASK; mem->size = PAGE_ALIGN(vb->size + offset); ret = -EINVAL; down_read(&mm->mmap_sem); - vma = find_vma(mm, vb->baddr); + vma = find_vma(mm, untagged_baddr); if (!vma) goto out_up; - if ((vb->baddr + mem->size) > vma->vm_end) + if ((untagged_baddr + mem->size) > vma->vm_end) goto out_up; pages_done = 0; prev_pfn = 0; /* kill warning */ - user_address = vb->baddr; + user_address = untagged_baddr; while (pages_done < (mem->size >> PAGE_SHIFT)) { ret = follow_pfn(vma, user_address, &this_pfn); diff --git a/drivers/mfd/88pm800.c b/drivers/mfd/88pm800.c index f2d9fb4c4e8e..4e8d0d6b9b5c 100644 --- a/drivers/mfd/88pm800.c +++ b/drivers/mfd/88pm800.c @@ -425,10 +425,10 @@ static int pm800_pages_init(struct pm80x_chip *chip) return -ENODEV; /* PM800 block power page */ - subchip->power_page = i2c_new_dummy(client->adapter, + subchip->power_page = i2c_new_dummy_device(client->adapter, subchip->power_page_addr); - if (subchip->power_page == NULL) { - ret = -ENODEV; + if (IS_ERR(subchip->power_page)) { + ret = PTR_ERR(subchip->power_page); goto out; } @@ -444,10 +444,10 @@ static int pm800_pages_init(struct pm80x_chip *chip) i2c_set_clientdata(subchip->power_page, chip); /* PM800 block GPADC */ - subchip->gpadc_page = i2c_new_dummy(client->adapter, + subchip->gpadc_page = i2c_new_dummy_device(client->adapter, subchip->gpadc_page_addr); - if (subchip->gpadc_page == NULL) { - ret = -ENODEV; + if (IS_ERR(subchip->gpadc_page)) { + ret = PTR_ERR(subchip->gpadc_page); goto out; } diff --git a/drivers/mfd/88pm860x-core.c b/drivers/mfd/88pm860x-core.c index 9e0bd135730f..c9bae71f643a 100644 --- a/drivers/mfd/88pm860x-core.c +++ b/drivers/mfd/88pm860x-core.c @@ -1178,12 +1178,12 @@ 
static int pm860x_probe(struct i2c_client *client) */ if (pdata->companion_addr && (pdata->companion_addr != client->addr)) { chip->companion_addr = pdata->companion_addr; - chip->companion = i2c_new_dummy(chip->client->adapter, + chip->companion = i2c_new_dummy_device(chip->client->adapter, chip->companion_addr); - if (!chip->companion) { + if (IS_ERR(chip->companion)) { dev_err(&client->dev, "Failed to allocate I2C companion device\n"); - return -ENODEV; + return PTR_ERR(chip->companion); } chip->regmap_companion = regmap_init_i2c(chip->companion, &pm860x_regmap_config); diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index c9c49da42446..ae24d3ea68ea 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -589,6 +589,17 @@ config INTEL_SOC_PMIC_CHTDC_TI Select this option for supporting Dollar Cove (TI version) PMIC device that is found on some Intel Cherry Trail systems. +config INTEL_SOC_PMIC_MRFLD + tristate "Support for Intel Merrifield Basin Cove PMIC" + depends on GPIOLIB + depends on ACPI + depends on INTEL_SCU_IPC + select MFD_CORE + select REGMAP_IRQ + help + Select this option for supporting Basin Cove PMIC device + that is found on Intel Merrifield systems. + config MFD_INTEL_LPSS tristate select COMMON_CLK @@ -641,15 +652,6 @@ config MFD_JANZ_CMODIO host many different types of MODULbus daughterboards, including CAN and GPIO controllers. -config MFD_JZ4740_ADC - bool "Janz JZ4740 ADC core" - select MFD_CORE - select GENERIC_IRQ_CHIP - depends on MACH_JZ4740 - help - Say yes here if you want support for the ADC unit in the JZ4740 SoC. - This driver is necessary for jz4740-battery and jz4740-hwmon driver. - config MFD_KEMPLD tristate "Kontron module PLD device" select MFD_CORE diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 0c0a848e62df..c1067ea46204 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -189,7 +189,6 @@ obj-$(CONFIG_LPC_SCH) += lpc_sch.o obj-$(CONFIG_LPC_ICH) += lpc_ich.o obj-$(CONFIG_MFD_RDC321X) += rdc321x-southbridge.o obj-$(CONFIG_MFD_JANZ_CMODIO) += janz-cmodio.o -obj-$(CONFIG_MFD_JZ4740_ADC) += jz4740-adc.o obj-$(CONFIG_MFD_TPS6586X) += tps6586x.o obj-$(CONFIG_MFD_VX855) += vx855.o obj-$(CONFIG_MFD_WL1273_CORE) += wl1273-core.o @@ -239,7 +238,9 @@ obj-$(CONFIG_INTEL_SOC_PMIC) += intel-soc-pmic.o obj-$(CONFIG_INTEL_SOC_PMIC_BXTWC) += intel_soc_pmic_bxtwc.o obj-$(CONFIG_INTEL_SOC_PMIC_CHTWC) += intel_soc_pmic_chtwc.o obj-$(CONFIG_INTEL_SOC_PMIC_CHTDC_TI) += intel_soc_pmic_chtdc_ti.o -obj-$(CONFIG_MFD_MT6397) += mt6397-core.o +mt6397-objs := mt6397-core.o mt6397-irq.o +obj-$(CONFIG_MFD_MT6397) += mt6397.o +obj-$(CONFIG_INTEL_SOC_PMIC_MRFLD) += intel_soc_pmic_mrfld.o obj-$(CONFIG_MFD_ALTERA_A10SR) += altera-a10sr.o obj-$(CONFIG_MFD_ALTERA_SYSMGR) += altera-sysmgr.o diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c index 9f3dbc31d3e9..57723f116bb5 100644 --- a/drivers/mfd/ab3100-core.c +++ b/drivers/mfd/ab3100-core.c @@ -865,10 +865,10 @@ static int ab3100_probe(struct i2c_client *client, &ab3100->chip_name[0]); /* Attach a second dummy i2c_client to the test register address */ - ab3100->testreg_client = i2c_new_dummy(client->adapter, + ab3100->testreg_client = i2c_new_dummy_device(client->adapter, client->addr + 1); - if (!ab3100->testreg_client) { - err = -ENOMEM; + if (IS_ERR(ab3100->testreg_client)) { + err = PTR_ERR(ab3100->testreg_client); goto exit_no_testreg_client; } diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c index 567a34b073dd..f4e26b6e5362 100644 --- 
a/drivers/mfd/ab8500-debugfs.c +++ b/drivers/mfd/ab8500-debugfs.c @@ -2680,16 +2680,12 @@ static int ab8500_debug_probe(struct platform_device *plf) irq_ab8500 = res->start; irq_first = platform_get_irq_byname(plf, "IRQ_FIRST"); - if (irq_first < 0) { - dev_err(&plf->dev, "First irq not found, err %d\n", irq_first); + if (irq_first < 0) return irq_first; - } irq_last = platform_get_irq_byname(plf, "IRQ_LAST"); - if (irq_last < 0) { - dev_err(&plf->dev, "Last irq not found, err %d\n", irq_last); + if (irq_last < 0) return irq_last; - } ab8500_dir = debugfs_create_dir(AB8500_NAME_STRING, NULL); diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c index 83b18c998d6f..a6bd2134cea2 100644 --- a/drivers/mfd/asic3.c +++ b/drivers/mfd/asic3.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mfd/bcm590xx.c b/drivers/mfd/bcm590xx.c index 1aeb5e498d91..bfac5dc091ca 100644 --- a/drivers/mfd/bcm590xx.c +++ b/drivers/mfd/bcm590xx.c @@ -61,11 +61,11 @@ static int bcm590xx_i2c_probe(struct i2c_client *i2c_pri, } /* Secondary I2C slave address is the base address with A(2) asserted */ - bcm590xx->i2c_sec = i2c_new_dummy(i2c_pri->adapter, + bcm590xx->i2c_sec = i2c_new_dummy_device(i2c_pri->adapter, i2c_pri->addr | BIT(2)); - if (!bcm590xx->i2c_sec) { + if (IS_ERR(bcm590xx->i2c_sec)) { dev_err(&i2c_pri->dev, "failed to add secondary I2C device\n"); - return -ENODEV; + return PTR_ERR(bcm590xx->i2c_sec); } i2c_set_clientdata(bcm590xx->i2c_sec, bcm590xx); diff --git a/drivers/mfd/da9150-core.c b/drivers/mfd/da9150-core.c index 13033068721a..7f0aa1e8db96 100644 --- a/drivers/mfd/da9150-core.c +++ b/drivers/mfd/da9150-core.c @@ -420,10 +420,10 @@ static int da9150_probe(struct i2c_client *client, qif_addr = da9150_reg_read(da9150, DA9150_CORE2WIRE_CTRL_A); qif_addr = (qif_addr & DA9150_CORE_BASE_ADDR_MASK) >> 1; qif_addr |= DA9150_QIF_I2C_ADDR_LSB; - da9150->core_qif = i2c_new_dummy(client->adapter, qif_addr); - if (!da9150->core_qif) { + da9150->core_qif = i2c_new_dummy_device(client->adapter, qif_addr); + if (IS_ERR(da9150->core_qif)) { dev_err(da9150->dev, "Failed to attach QIF client\n"); - return -ENODEV; + return PTR_ERR(da9150->core_qif); } i2c_set_clientdata(da9150->core_qif, da9150); diff --git a/drivers/mfd/davinci_voicecodec.c b/drivers/mfd/davinci_voicecodec.c index 13ca7203e193..e5c8bc998eb4 100644 --- a/drivers/mfd/davinci_voicecodec.c +++ b/drivers/mfd/davinci_voicecodec.c @@ -19,7 +19,6 @@ #include #include -#include static const struct regmap_config davinci_vc_regmap = { .reg_bits = 32, @@ -31,6 +30,7 @@ static int __init davinci_vc_probe(struct platform_device *pdev) struct davinci_vc *davinci_vc; struct resource *res; struct mfd_cell *cell = NULL; + dma_addr_t fifo_base; int ret; davinci_vc = devm_kzalloc(&pdev->dev, @@ -48,6 +48,7 @@ static int __init davinci_vc_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + fifo_base = (dma_addr_t)res->start; davinci_vc->base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(davinci_vc->base)) { ret = PTR_ERR(davinci_vc->base); @@ -70,8 +71,7 @@ static int __init davinci_vc_probe(struct platform_device *pdev) } davinci_vc->davinci_vcif.dma_tx_channel = res->start; - davinci_vc->davinci_vcif.dma_tx_addr = - (dma_addr_t)(io_v2p(davinci_vc->base) + DAVINCI_VC_WFIFO); + davinci_vc->davinci_vcif.dma_tx_addr = fifo_base + DAVINCI_VC_WFIFO; res = platform_get_resource(pdev, IORESOURCE_DMA, 1); if (!res) { @@ -81,8 +81,7 @@ static int __init 
davinci_vc_probe(struct platform_device *pdev) } davinci_vc->davinci_vcif.dma_rx_channel = res->start; - davinci_vc->davinci_vcif.dma_rx_addr = - (dma_addr_t)(io_v2p(davinci_vc->base) + DAVINCI_VC_RFIFO); + davinci_vc->davinci_vcif.dma_rx_addr = fifo_base + DAVINCI_VC_RFIFO; davinci_vc->dev = &pdev->dev; davinci_vc->pdev = pdev; diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c index 90e0f21bc49c..0e019cc5da42 100644 --- a/drivers/mfd/db8500-prcmu.c +++ b/drivers/mfd/db8500-prcmu.c @@ -1695,21 +1695,41 @@ static long round_clock_rate(u8 clock, unsigned long rate) return rounded_rate; } -static const unsigned long armss_freqs[] = { +static const unsigned long db8500_armss_freqs[] = { 200000000, 400000000, 800000000, 998400000 }; +/* The DB8520 has slightly higher ARMSS max frequency */ +static const unsigned long db8520_armss_freqs[] = { + 200000000, + 400000000, + 800000000, + 1152000000 +}; + + + static long round_armss_rate(unsigned long rate) { unsigned long freq = 0; + const unsigned long *freqs; + int nfreqs; int i; + if (fw_info.version.project == PRCMU_FW_PROJECT_U8520) { + freqs = db8520_armss_freqs; + nfreqs = ARRAY_SIZE(db8520_armss_freqs); + } else { + freqs = db8500_armss_freqs; + nfreqs = ARRAY_SIZE(db8500_armss_freqs); + } + /* Find the corresponding arm opp from the cpufreq table. */ - for (i = 0; i < ARRAY_SIZE(armss_freqs); i++) { - freq = armss_freqs[i]; + for (i = 0; i < nfreqs; i++) { + freq = freqs[i]; if (rate <= freq) break; } @@ -1854,11 +1874,21 @@ static int set_armss_rate(unsigned long rate) { unsigned long freq; u8 opps[] = { ARM_EXTCLK, ARM_50_OPP, ARM_100_OPP, ARM_MAX_OPP }; + const unsigned long *freqs; + int nfreqs; int i; + if (fw_info.version.project == PRCMU_FW_PROJECT_U8520) { + freqs = db8520_armss_freqs; + nfreqs = ARRAY_SIZE(db8520_armss_freqs); + } else { + freqs = db8500_armss_freqs; + nfreqs = ARRAY_SIZE(db8500_armss_freqs); + } + /* Find the corresponding arm opp from the cpufreq table. 
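The rounding loop above selects the first table entry at or above the requested rate and saturates at the table maximum; a standalone restatement with worked examples in the comments:

static unsigned long pick_armss_rate_sketch(const unsigned long *freqs,
                                            int nfreqs, unsigned long rate)
{
        unsigned long freq = 0;
        int i;

        for (i = 0; i < nfreqs; i++) {
                freq = freqs[i];
                if (rate <= freq)
                        break;  /* e.g. 500 MHz rounds up to 800 MHz */
        }
        /* A request above the table (e.g. 2 GHz) falls through to the max */
        return freq;
}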
*/ - for (i = 0; i < ARRAY_SIZE(armss_freqs); i++) { - freq = armss_freqs[i]; + for (i = 0; i < nfreqs; i++) { + freq = freqs[i]; if (rate == freq) break; } @@ -3130,10 +3160,8 @@ static int db8500_prcmu_probe(struct platform_device *pdev) writel(ALL_MBOX_BITS, PRCM_ARM_IT1_CLR); irq = platform_get_irq(pdev, 0); - if (irq <= 0) { - dev_err(&pdev->dev, "no prcmu irq provided\n"); + if (irq <= 0) return irq; - } err = request_threaded_irq(irq, prcmu_irq_handler, prcmu_irq_thread_fn, IRQF_NO_SUSPEND, "prcmu", NULL); diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c index f505e3e1274b..70fa18b04ad2 100644 --- a/drivers/mfd/ezx-pcap.c +++ b/drivers/mfd/ezx-pcap.c @@ -35,7 +35,7 @@ struct pcap_chip { /* IO */ u32 buf; - struct mutex io_mutex; + spinlock_t io_lock; /* IRQ */ unsigned int irq_base; @@ -48,7 +48,7 @@ struct pcap_chip { struct pcap_adc_request *adc_queue[PCAP_ADC_MAXQ]; u8 adc_head; u8 adc_tail; - struct mutex adc_mutex; + spinlock_t adc_lock; }; /* IO */ @@ -76,14 +76,15 @@ static int ezx_pcap_putget(struct pcap_chip *pcap, u32 *data) int ezx_pcap_write(struct pcap_chip *pcap, u8 reg_num, u32 value) { + unsigned long flags; int ret; - mutex_lock(&pcap->io_mutex); + spin_lock_irqsave(&pcap->io_lock, flags); value &= PCAP_REGISTER_VALUE_MASK; value |= PCAP_REGISTER_WRITE_OP_BIT | (reg_num << PCAP_REGISTER_ADDRESS_SHIFT); ret = ezx_pcap_putget(pcap, &value); - mutex_unlock(&pcap->io_mutex); + spin_unlock_irqrestore(&pcap->io_lock, flags); return ret; } @@ -91,14 +92,15 @@ EXPORT_SYMBOL_GPL(ezx_pcap_write); int ezx_pcap_read(struct pcap_chip *pcap, u8 reg_num, u32 *value) { + unsigned long flags; int ret; - mutex_lock(&pcap->io_mutex); + spin_lock_irqsave(&pcap->io_lock, flags); *value = PCAP_REGISTER_READ_OP_BIT | (reg_num << PCAP_REGISTER_ADDRESS_SHIFT); ret = ezx_pcap_putget(pcap, value); - mutex_unlock(&pcap->io_mutex); + spin_unlock_irqrestore(&pcap->io_lock, flags); return ret; } @@ -106,11 +108,12 @@ EXPORT_SYMBOL_GPL(ezx_pcap_read); int ezx_pcap_set_bits(struct pcap_chip *pcap, u8 reg_num, u32 mask, u32 val) { + unsigned long flags; int ret; u32 tmp = PCAP_REGISTER_READ_OP_BIT | (reg_num << PCAP_REGISTER_ADDRESS_SHIFT); - mutex_lock(&pcap->io_mutex); + spin_lock_irqsave(&pcap->io_lock, flags); ret = ezx_pcap_putget(pcap, &tmp); if (ret) goto out_unlock; @@ -121,7 +124,7 @@ int ezx_pcap_set_bits(struct pcap_chip *pcap, u8 reg_num, u32 mask, u32 val) ret = ezx_pcap_putget(pcap, &tmp); out_unlock: - mutex_unlock(&pcap->io_mutex); + spin_unlock_irqrestore(&pcap->io_lock, flags); return ret; } @@ -212,14 +215,15 @@ static void pcap_irq_handler(struct irq_desc *desc) /* ADC */ void pcap_set_ts_bits(struct pcap_chip *pcap, u32 bits) { + unsigned long flags; u32 tmp; - mutex_lock(&pcap->adc_mutex); + spin_lock_irqsave(&pcap->adc_lock, flags); ezx_pcap_read(pcap, PCAP_REG_ADC, &tmp); tmp &= ~(PCAP_ADC_TS_M_MASK | PCAP_ADC_TS_REF_LOWPWR); tmp |= bits & (PCAP_ADC_TS_M_MASK | PCAP_ADC_TS_REF_LOWPWR); ezx_pcap_write(pcap, PCAP_REG_ADC, tmp); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, flags); } EXPORT_SYMBOL_GPL(pcap_set_ts_bits); @@ -234,15 +238,16 @@ static void pcap_disable_adc(struct pcap_chip *pcap) static void pcap_adc_trigger(struct pcap_chip *pcap) { + unsigned long flags; u32 tmp; u8 head; - mutex_lock(&pcap->adc_mutex); + spin_lock_irqsave(&pcap->adc_lock, flags); head = pcap->adc_head; if (!pcap->adc_queue[head]) { /* queue is empty, save power */ pcap_disable_adc(pcap); - mutex_unlock(&pcap->adc_mutex); + 
spin_unlock_irqrestore(&pcap->adc_lock, flags); return; } /* start conversion on requested bank, save TS_M bits */ @@ -254,7 +259,7 @@ static void pcap_adc_trigger(struct pcap_chip *pcap) tmp |= PCAP_ADC_AD_SEL1; ezx_pcap_write(pcap, PCAP_REG_ADC, tmp); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, flags); ezx_pcap_write(pcap, PCAP_REG_ADR, PCAP_ADR_ASC); } @@ -265,11 +270,11 @@ static irqreturn_t pcap_adc_irq(int irq, void *_pcap) u16 res[2]; u32 tmp; - mutex_lock(&pcap->adc_mutex); + spin_lock(&pcap->adc_lock); req = pcap->adc_queue[pcap->adc_head]; if (WARN(!req, "adc irq without pending request\n")) { - mutex_unlock(&pcap->adc_mutex); + spin_unlock(&pcap->adc_lock); return IRQ_HANDLED; } @@ -285,7 +290,7 @@ static irqreturn_t pcap_adc_irq(int irq, void *_pcap) pcap->adc_queue[pcap->adc_head] = NULL; pcap->adc_head = (pcap->adc_head + 1) & (PCAP_ADC_MAXQ - 1); - mutex_unlock(&pcap->adc_mutex); + spin_unlock(&pcap->adc_lock); /* pass the results and release memory */ req->callback(req->data, res); @@ -301,6 +306,7 @@ int pcap_adc_async(struct pcap_chip *pcap, u8 bank, u32 flags, u8 ch[], void *callback, void *data) { struct pcap_adc_request *req; + unsigned long irq_flags; /* This will be freed after we have a result */ req = kmalloc(sizeof(struct pcap_adc_request), GFP_KERNEL); @@ -314,15 +320,15 @@ int pcap_adc_async(struct pcap_chip *pcap, u8 bank, u32 flags, u8 ch[], req->callback = callback; req->data = data; - mutex_lock(&pcap->adc_mutex); + spin_lock_irqsave(&pcap->adc_lock, irq_flags); if (pcap->adc_queue[pcap->adc_tail]) { - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, irq_flags); kfree(req); return -EBUSY; } pcap->adc_queue[pcap->adc_tail] = req; pcap->adc_tail = (pcap->adc_tail + 1) & (PCAP_ADC_MAXQ - 1); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, irq_flags); /* start conversion */ pcap_adc_trigger(pcap); @@ -389,16 +395,17 @@ static int pcap_add_subdev(struct pcap_chip *pcap, static int ezx_pcap_remove(struct spi_device *spi) { struct pcap_chip *pcap = spi_get_drvdata(spi); + unsigned long flags; int i; /* remove all registered subdevs */ device_for_each_child(&spi->dev, NULL, pcap_remove_subdev); /* cleanup ADC */ - mutex_lock(&pcap->adc_mutex); + spin_lock_irqsave(&pcap->adc_lock, flags); for (i = 0; i < PCAP_ADC_MAXQ; i++) kfree(pcap->adc_queue[i]); - mutex_unlock(&pcap->adc_mutex); + spin_unlock_irqrestore(&pcap->adc_lock, flags); /* cleanup irqchip */ for (i = pcap->irq_base; i < (pcap->irq_base + PCAP_NIRQS); i++) @@ -426,8 +433,8 @@ static int ezx_pcap_probe(struct spi_device *spi) goto ret; } - mutex_init(&pcap->io_mutex); - mutex_init(&pcap->adc_mutex); + spin_lock_init(&pcap->io_lock); + spin_lock_init(&pcap->adc_lock); INIT_WORK(&pcap->isr_work, pcap_isr_work); INIT_WORK(&pcap->msr_work, pcap_msr_work); spi_set_drvdata(spi, pcap); diff --git a/drivers/mfd/fsl-imx25-tsadc.c b/drivers/mfd/fsl-imx25-tsadc.c index 20791cab7263..a016b39fe9b0 100644 --- a/drivers/mfd/fsl-imx25-tsadc.c +++ b/drivers/mfd/fsl-imx25-tsadc.c @@ -69,10 +69,8 @@ static int mx25_tsadc_setup_irq(struct platform_device *pdev, int irq; irq = platform_get_irq(pdev, 0); - if (irq <= 0) { - dev_err(dev, "Failed to get irq\n"); + if (irq <= 0) return irq; - } tsadc->domain = irq_domain_add_simple(np, 2, 0, &mx25_tsadc_domain_ops, tsadc); diff --git a/drivers/mfd/htc-i2cpld.c b/drivers/mfd/htc-i2cpld.c index 370519af5d0b..8ad6768bd7a2 100644 --- a/drivers/mfd/htc-i2cpld.c +++ b/drivers/mfd/htc-i2cpld.c @@ -385,8 
+385,7 @@ static void htcpld_unregister_chip_i2c( htcpld = platform_get_drvdata(pdev); chip = &htcpld->chip[chip_index]; - if (chip->client) - i2c_unregister_device(chip->client); + i2c_unregister_device(chip->client); } static int htcpld_register_chip_gpio( diff --git a/drivers/mfd/intel-lpss-acpi.c b/drivers/mfd/intel-lpss-acpi.c index 61ffb8b393e4..c8fe334b5fe8 100644 --- a/drivers/mfd/intel-lpss-acpi.c +++ b/drivers/mfd/intel-lpss-acpi.c @@ -18,6 +18,10 @@ #include "intel-lpss.h" +static const struct intel_lpss_platform_info spt_info = { + .clk_rate = 120000000, +}; + static struct property_entry spt_i2c_properties[] = { PROPERTY_ENTRY_U32("i2c-sda-hold-time-ns", 230), { }, @@ -28,6 +32,19 @@ static const struct intel_lpss_platform_info spt_i2c_info = { .properties = spt_i2c_properties, }; +static struct property_entry uart_properties[] = { + PROPERTY_ENTRY_U32("reg-io-width", 4), + PROPERTY_ENTRY_U32("reg-shift", 2), + PROPERTY_ENTRY_BOOL("snps,uart-16550-compatible"), + { }, +}; + +static const struct intel_lpss_platform_info spt_uart_info = { + .clk_rate = 120000000, + .clk_con_id = "baudclk", + .properties = uart_properties, +}; + static const struct intel_lpss_platform_info bxt_info = { .clk_rate = 100000000, }; @@ -58,8 +75,17 @@ static const struct intel_lpss_platform_info apl_i2c_info = { static const struct acpi_device_id intel_lpss_acpi_ids[] = { /* SPT */ + { "INT3440", (kernel_ulong_t)&spt_info }, + { "INT3441", (kernel_ulong_t)&spt_info }, + { "INT3442", (kernel_ulong_t)&spt_i2c_info }, + { "INT3443", (kernel_ulong_t)&spt_i2c_info }, + { "INT3444", (kernel_ulong_t)&spt_i2c_info }, + { "INT3445", (kernel_ulong_t)&spt_i2c_info }, { "INT3446", (kernel_ulong_t)&spt_i2c_info }, { "INT3447", (kernel_ulong_t)&spt_i2c_info }, + { "INT3448", (kernel_ulong_t)&spt_uart_info }, + { "INT3449", (kernel_ulong_t)&spt_uart_info }, + { "INT344A", (kernel_ulong_t)&spt_uart_info }, /* BXT */ { "80860AAC", (kernel_ulong_t)&bxt_i2c_info }, { "80860ABC", (kernel_ulong_t)&bxt_info }, diff --git a/drivers/mfd/intel-lpss-pci.c b/drivers/mfd/intel-lpss-pci.c index ade6e1ce5a98..9355db29d2f9 100644 --- a/drivers/mfd/intel-lpss-pci.c +++ b/drivers/mfd/intel-lpss-pci.c @@ -35,6 +35,8 @@ static int intel_lpss_pci_probe(struct pci_dev *pdev, info->mem = &pdev->resource[0]; info->irq = pdev->irq; + pdev->d3cold_delay = 0; + /* Probably it is enough to set this for iDMA capable devices only */ pci_set_master(pdev); pci_try_set_mwi(pdev); @@ -256,6 +258,29 @@ static const struct pci_device_id intel_lpss_pci_ids[] = { { PCI_VDEVICE(INTEL, 0x9dea), (kernel_ulong_t)&cnl_i2c_info }, { PCI_VDEVICE(INTEL, 0x9deb), (kernel_ulong_t)&cnl_i2c_info }, { PCI_VDEVICE(INTEL, 0x9dfb), (kernel_ulong_t)&spt_info }, + /* TGL-LP */ + { PCI_VDEVICE(INTEL, 0xa0a8), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0a9), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0aa), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0ab), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0c5), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0c6), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0c7), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0d8), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0d9), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0da), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0db), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0dc), (kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0dd), 
(kernel_ulong_t)&bxt_uart_info }, + { PCI_VDEVICE(INTEL, 0xa0de), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0df), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0e8), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0e9), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0ea), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0eb), (kernel_ulong_t)&spt_i2c_info }, + { PCI_VDEVICE(INTEL, 0xa0fb), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0fd), (kernel_ulong_t)&spt_info }, + { PCI_VDEVICE(INTEL, 0xa0fe), (kernel_ulong_t)&spt_info }, /* SPT-H */ { PCI_VDEVICE(INTEL, 0xa127), (kernel_ulong_t)&spt_uart_info }, { PCI_VDEVICE(INTEL, 0xa128), (kernel_ulong_t)&spt_uart_info }, diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c index 277f48f1cc1c..bfe4ff337581 100644 --- a/drivers/mfd/intel-lpss.c +++ b/drivers/mfd/intel-lpss.c @@ -47,10 +47,10 @@ #define LPSS_PRIV_IDLELTR 0x14 #define LPSS_PRIV_LTR_REQ BIT(15) -#define LPSS_PRIV_LTR_SCALE_MASK 0xc00 -#define LPSS_PRIV_LTR_SCALE_1US 0x800 -#define LPSS_PRIV_LTR_SCALE_32US 0xc00 -#define LPSS_PRIV_LTR_VALUE_MASK 0x3ff +#define LPSS_PRIV_LTR_SCALE_MASK GENMASK(11, 10) +#define LPSS_PRIV_LTR_SCALE_1US (2 << 10) +#define LPSS_PRIV_LTR_SCALE_32US (3 << 10) +#define LPSS_PRIV_LTR_VALUE_MASK GENMASK(9, 0) #define LPSS_PRIV_SSP_REG 0x20 #define LPSS_PRIV_SSP_REG_DIS_DMA_FIN BIT(0) @@ -59,8 +59,8 @@ #define LPSS_PRIV_CAPS 0xfc #define LPSS_PRIV_CAPS_NO_IDMA BIT(8) +#define LPSS_PRIV_CAPS_TYPE_MASK GENMASK(7, 4) #define LPSS_PRIV_CAPS_TYPE_SHIFT 4 -#define LPSS_PRIV_CAPS_TYPE_MASK (0xf << LPSS_PRIV_CAPS_TYPE_SHIFT) /* This matches the type field in CAPS register */ enum intel_lpss_dev_type { @@ -128,17 +128,6 @@ static const struct mfd_cell intel_lpss_spi_cell = { static DEFINE_IDA(intel_lpss_devid_ida); static struct dentry *intel_lpss_debugfs; -static int intel_lpss_request_dma_module(const char *name) -{ - static bool intel_lpss_dma_requested; - - if (intel_lpss_dma_requested) - return 0; - - intel_lpss_dma_requested = true; - return request_module("%s", name); -} - static void intel_lpss_cache_ltr(struct intel_lpss *lpss) { lpss->active_ltr = readl(lpss->priv + LPSS_PRIV_ACTIVELTR); @@ -429,16 +418,6 @@ int intel_lpss_probe(struct device *dev, dev_warn(dev, "Failed to create debugfs entries\n"); if (intel_lpss_has_idma(lpss)) { - /* - * Ensure the DMA driver is loaded before the host - * controller device appears, so that the host controller - * driver can request its DMA channels as early as - * possible. - * - * If the DMA module is not there that's OK as well. - */ - intel_lpss_request_dma_module(LPSS_IDMA64_DRIVER_NAME); - ret = mfd_add_devices(dev, lpss->devid, &intel_lpss_idma64_cell, 1, info->mem, info->irq, NULL); if (ret) @@ -554,3 +533,11 @@ MODULE_AUTHOR("Heikki Krogerus "); MODULE_AUTHOR("Jarkko Nikula "); MODULE_DESCRIPTION("Intel LPSS core driver"); MODULE_LICENSE("GPL v2"); +/* + * Ensure the DMA driver is loaded before the host controller device appears, + * so that the host controller driver can request its DMA channels as early + * as possible. + * + * If the DMA module is not there that's OK as well. 
+ */
+MODULE_SOFTDEP("pre: platform:" LPSS_IDMA64_DRIVER_NAME);
diff --git a/drivers/mfd/intel_soc_pmic_bxtwc.c b/drivers/mfd/intel_soc_pmic_bxtwc.c
index 6310c3bdb991..739cfb5b69fe 100644
--- a/drivers/mfd/intel_soc_pmic_bxtwc.c
+++ b/drivers/mfd/intel_soc_pmic_bxtwc.c
@@ -450,10 +450,8 @@ static int bxtwc_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	ret = platform_get_irq(pdev, 0);
-	if (ret < 0) {
-		dev_err(&pdev->dev, "Invalid IRQ\n");
+	if (ret < 0)
 		return ret;
-	}
 	pmic->irq = ret;
 
 	dev_set_drvdata(&pdev->dev, pmic);
diff --git a/drivers/mfd/intel_soc_pmic_mrfld.c b/drivers/mfd/intel_soc_pmic_mrfld.c
new file mode 100644
index 000000000000..26a1551c5faf
--- /dev/null
+++ b/drivers/mfd/intel_soc_pmic_mrfld.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device access for Basin Cove PMIC
+ *
+ * Copyright (c) 2019, Intel Corporation.
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ */
+
+#include <linux/acpi.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/intel_soc_pmic.h>
+#include <linux/mfd/intel_soc_pmic_mrfld.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+#include <asm/intel_scu_ipc.h>
+
+/*
+ * Level 2 IRQs
+ *
+ * Firmware on systems with the Basin Cove PMIC services Level 1 IRQs
+ * without assistance. Thus, each Level 1 IRQ is represented as a
+ * separate RTE in the IOAPIC.
+ */
+static struct resource irq_level2_resources[] = {
+	DEFINE_RES_IRQ(0), /* power button */
+	DEFINE_RES_IRQ(0), /* TMU */
+	DEFINE_RES_IRQ(0), /* thermal */
+	DEFINE_RES_IRQ(0), /* BCU */
+	DEFINE_RES_IRQ(0), /* ADC */
+	DEFINE_RES_IRQ(0), /* charger */
+	DEFINE_RES_IRQ(0), /* GPIO */
+};
+
+static const struct mfd_cell bcove_dev[] = {
+	{
+		.name = "mrfld_bcove_pwrbtn",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[0],
+	}, {
+		.name = "mrfld_bcove_tmu",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[1],
+	}, {
+		.name = "mrfld_bcove_thermal",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[2],
+	}, {
+		.name = "mrfld_bcove_bcu",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[3],
+	}, {
+		.name = "mrfld_bcove_adc",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[4],
+	}, {
+		.name = "mrfld_bcove_charger",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[5],
+	}, {
+		.name = "mrfld_bcove_pwrsrc",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[5],
+	}, {
+		.name = "mrfld_bcove_gpio",
+		.num_resources = 1,
+		.resources = &irq_level2_resources[6],
+	},
+	{ .name = "mrfld_bcove_region", },
+};
+
+static int bcove_ipc_byte_reg_read(void *context, unsigned int reg,
+				   unsigned int *val)
+{
+	u8 ipc_out;
+	int ret;
+
+	ret = intel_scu_ipc_ioread8(reg, &ipc_out);
+	if (ret)
+		return ret;
+
+	*val = ipc_out;
+	return 0;
+}
+
+static int bcove_ipc_byte_reg_write(void *context, unsigned int reg,
+				    unsigned int val)
+{
+	u8 ipc_in = val;
+	int ret;
+
+	ret = intel_scu_ipc_iowrite8(reg, ipc_in);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static const struct regmap_config bcove_regmap_config = {
+	.reg_bits = 16,
+	.val_bits = 8,
+	.max_register = 0xff,
+	.reg_write = bcove_ipc_byte_reg_write,
+	.reg_read = bcove_ipc_byte_reg_read,
+};
+
+static int bcove_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct intel_soc_pmic *pmic;
+	unsigned int i;
+	int ret;
+
+	pmic = devm_kzalloc(dev, sizeof(*pmic), GFP_KERNEL);
+	if (!pmic)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, pmic);
+	pmic->dev = &pdev->dev;
+
+	pmic->regmap = devm_regmap_init(dev, NULL, pmic, &bcove_regmap_config);
+	if (IS_ERR(pmic->regmap))
+		return PTR_ERR(pmic->regmap);
+
+	for (i = 0; i < ARRAY_SIZE(irq_level2_resources); i++) {
+		ret =
platform_get_irq(pdev, i); + if (ret < 0) + return ret; + + irq_level2_resources[i].start = ret; + irq_level2_resources[i].end = ret; + } + + return devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, + bcove_dev, ARRAY_SIZE(bcove_dev), + NULL, 0, NULL); +} + +static const struct acpi_device_id bcove_acpi_ids[] = { + { "INTC100E" }, + {} +}; +MODULE_DEVICE_TABLE(acpi, bcove_acpi_ids); + +static struct platform_driver bcove_driver = { + .driver = { + .name = "intel_soc_pmic_mrfld", + .acpi_match_table = bcove_acpi_ids, + }, + .probe = bcove_probe, +}; +module_platform_driver(bcove_driver); + +MODULE_DESCRIPTION("IPC driver for Intel SoC Basin Cove PMIC"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/mfd/jz4740-adc.c b/drivers/mfd/jz4740-adc.c deleted file mode 100644 index 082f16917519..000000000000 --- a/drivers/mfd/jz4740-adc.c +++ /dev/null @@ -1,324 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2009-2010, Lars-Peter Clausen - * JZ4740 SoC ADC driver - * - * This driver synchronizes access to the JZ4740 ADC core between the - * JZ4740 battery and hwmon drivers. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - - -#define JZ_REG_ADC_ENABLE 0x00 -#define JZ_REG_ADC_CFG 0x04 -#define JZ_REG_ADC_CTRL 0x08 -#define JZ_REG_ADC_STATUS 0x0c - -#define JZ_REG_ADC_TOUCHSCREEN_BASE 0x10 -#define JZ_REG_ADC_BATTERY_BASE 0x1c -#define JZ_REG_ADC_HWMON_BASE 0x20 - -#define JZ_ADC_ENABLE_TOUCH BIT(2) -#define JZ_ADC_ENABLE_BATTERY BIT(1) -#define JZ_ADC_ENABLE_ADCIN BIT(0) - -enum { - JZ_ADC_IRQ_ADCIN = 0, - JZ_ADC_IRQ_BATTERY, - JZ_ADC_IRQ_TOUCH, - JZ_ADC_IRQ_PENUP, - JZ_ADC_IRQ_PENDOWN, -}; - -struct jz4740_adc { - struct resource *mem; - void __iomem *base; - - int irq; - struct irq_chip_generic *gc; - - struct clk *clk; - atomic_t clk_ref; - - spinlock_t lock; -}; - -static void jz4740_adc_irq_demux(struct irq_desc *desc) -{ - struct irq_chip_generic *gc = irq_desc_get_handler_data(desc); - uint8_t status; - unsigned int i; - - status = readb(gc->reg_base + JZ_REG_ADC_STATUS); - - for (i = 0; i < 5; ++i) { - if (status & BIT(i)) - generic_handle_irq(gc->irq_base + i); - } -} - - -/* Refcounting for the ADC clock is done in here instead of in the clock - * framework, because it is the only clock which is shared between multiple - * devices and thus is the only clock which needs refcounting */ -static inline void jz4740_adc_clk_enable(struct jz4740_adc *adc) -{ - if (atomic_inc_return(&adc->clk_ref) == 1) - clk_prepare_enable(adc->clk); -} - -static inline void jz4740_adc_clk_disable(struct jz4740_adc *adc) -{ - if (atomic_dec_return(&adc->clk_ref) == 0) - clk_disable_unprepare(adc->clk); -} - -static inline void jz4740_adc_set_enabled(struct jz4740_adc *adc, int engine, - bool enabled) -{ - unsigned long flags; - uint8_t val; - - spin_lock_irqsave(&adc->lock, flags); - - val = readb(adc->base + JZ_REG_ADC_ENABLE); - if (enabled) - val |= BIT(engine); - else - val &= ~BIT(engine); - writeb(val, adc->base + JZ_REG_ADC_ENABLE); - - spin_unlock_irqrestore(&adc->lock, flags); -} - -static int jz4740_adc_cell_enable(struct platform_device *pdev) -{ - struct jz4740_adc *adc = dev_get_drvdata(pdev->dev.parent); - - jz4740_adc_clk_enable(adc); - jz4740_adc_set_enabled(adc, pdev->id, true); - - return 0; -} - -static int jz4740_adc_cell_disable(struct platform_device *pdev) -{ - struct jz4740_adc *adc = dev_get_drvdata(pdev->dev.parent); - - jz4740_adc_set_enabled(adc, pdev->id, false); - 
jz4740_adc_clk_disable(adc); - - return 0; -} - -int jz4740_adc_set_config(struct device *dev, uint32_t mask, uint32_t val) -{ - struct jz4740_adc *adc = dev_get_drvdata(dev); - unsigned long flags; - uint32_t cfg; - - if (!adc) - return -ENODEV; - - spin_lock_irqsave(&adc->lock, flags); - - cfg = readl(adc->base + JZ_REG_ADC_CFG); - - cfg &= ~mask; - cfg |= val; - - writel(cfg, adc->base + JZ_REG_ADC_CFG); - - spin_unlock_irqrestore(&adc->lock, flags); - - return 0; -} -EXPORT_SYMBOL_GPL(jz4740_adc_set_config); - -static struct resource jz4740_hwmon_resources[] = { - { - .start = JZ_ADC_IRQ_ADCIN, - .flags = IORESOURCE_IRQ, - }, - { - .start = JZ_REG_ADC_HWMON_BASE, - .end = JZ_REG_ADC_HWMON_BASE + 3, - .flags = IORESOURCE_MEM, - }, -}; - -static struct resource jz4740_battery_resources[] = { - { - .start = JZ_ADC_IRQ_BATTERY, - .flags = IORESOURCE_IRQ, - }, - { - .start = JZ_REG_ADC_BATTERY_BASE, - .end = JZ_REG_ADC_BATTERY_BASE + 3, - .flags = IORESOURCE_MEM, - }, -}; - -static const struct mfd_cell jz4740_adc_cells[] = { - { - .id = 0, - .name = "jz4740-hwmon", - .num_resources = ARRAY_SIZE(jz4740_hwmon_resources), - .resources = jz4740_hwmon_resources, - - .enable = jz4740_adc_cell_enable, - .disable = jz4740_adc_cell_disable, - }, - { - .id = 1, - .name = "jz4740-battery", - .num_resources = ARRAY_SIZE(jz4740_battery_resources), - .resources = jz4740_battery_resources, - - .enable = jz4740_adc_cell_enable, - .disable = jz4740_adc_cell_disable, - }, -}; - -static int jz4740_adc_probe(struct platform_device *pdev) -{ - struct irq_chip_generic *gc; - struct irq_chip_type *ct; - struct jz4740_adc *adc; - struct resource *mem_base; - int ret; - int irq_base; - - adc = devm_kzalloc(&pdev->dev, sizeof(*adc), GFP_KERNEL); - if (!adc) - return -ENOMEM; - - adc->irq = platform_get_irq(pdev, 0); - if (adc->irq < 0) { - ret = adc->irq; - dev_err(&pdev->dev, "Failed to get platform irq: %d\n", ret); - return ret; - } - - irq_base = platform_get_irq(pdev, 1); - if (irq_base < 0) { - dev_err(&pdev->dev, "Failed to get irq base: %d\n", irq_base); - return irq_base; - } - - mem_base = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!mem_base) { - dev_err(&pdev->dev, "Failed to get platform mmio resource\n"); - return -ENOENT; - } - - /* Only request the shared registers for the MFD driver */ - adc->mem = request_mem_region(mem_base->start, JZ_REG_ADC_STATUS, - pdev->name); - if (!adc->mem) { - dev_err(&pdev->dev, "Failed to request mmio memory region\n"); - return -EBUSY; - } - - adc->base = ioremap_nocache(adc->mem->start, resource_size(adc->mem)); - if (!adc->base) { - ret = -EBUSY; - dev_err(&pdev->dev, "Failed to ioremap mmio memory\n"); - goto err_release_mem_region; - } - - adc->clk = clk_get(&pdev->dev, "adc"); - if (IS_ERR(adc->clk)) { - ret = PTR_ERR(adc->clk); - dev_err(&pdev->dev, "Failed to get clock: %d\n", ret); - goto err_iounmap; - } - - spin_lock_init(&adc->lock); - atomic_set(&adc->clk_ref, 0); - - platform_set_drvdata(pdev, adc); - - gc = irq_alloc_generic_chip("INTC", 1, irq_base, adc->base, - handle_level_irq); - - ct = gc->chip_types; - ct->regs.mask = JZ_REG_ADC_CTRL; - ct->regs.ack = JZ_REG_ADC_STATUS; - ct->chip.irq_mask = irq_gc_mask_set_bit; - ct->chip.irq_unmask = irq_gc_mask_clr_bit; - ct->chip.irq_ack = irq_gc_ack_set_bit; - - irq_setup_generic_chip(gc, IRQ_MSK(5), IRQ_GC_INIT_MASK_CACHE, 0, - IRQ_NOPROBE | IRQ_LEVEL); - - adc->gc = gc; - - irq_set_chained_handler_and_data(adc->irq, jz4740_adc_irq_demux, gc); - - writeb(0x00, adc->base + JZ_REG_ADC_ENABLE); - 
writeb(0xff, adc->base + JZ_REG_ADC_CTRL); - - ret = mfd_add_devices(&pdev->dev, 0, jz4740_adc_cells, - ARRAY_SIZE(jz4740_adc_cells), mem_base, - irq_base, NULL); - if (ret < 0) - goto err_clk_put; - - return 0; - -err_clk_put: - clk_put(adc->clk); -err_iounmap: - iounmap(adc->base); -err_release_mem_region: - release_mem_region(adc->mem->start, resource_size(adc->mem)); - return ret; -} - -static int jz4740_adc_remove(struct platform_device *pdev) -{ - struct jz4740_adc *adc = platform_get_drvdata(pdev); - - mfd_remove_devices(&pdev->dev); - - irq_remove_generic_chip(adc->gc, IRQ_MSK(5), IRQ_NOPROBE | IRQ_LEVEL, 0); - kfree(adc->gc); - irq_set_chained_handler_and_data(adc->irq, NULL, NULL); - - iounmap(adc->base); - release_mem_region(adc->mem->start, resource_size(adc->mem)); - - clk_put(adc->clk); - - return 0; -} - -static struct platform_driver jz4740_adc_driver = { - .probe = jz4740_adc_probe, - .remove = jz4740_adc_remove, - .driver = { - .name = "jz4740-adc", - }, -}; - -module_platform_driver(jz4740_adc_driver); - -MODULE_DESCRIPTION("JZ4740 SoC ADC driver"); -MODULE_AUTHOR("Lars-Peter Clausen "); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:jz4740-adc"); diff --git a/drivers/mfd/max14577.c b/drivers/mfd/max14577.c index ebb13d5de530..fd8864cafd25 100644 --- a/drivers/mfd/max14577.c +++ b/drivers/mfd/max14577.c @@ -297,11 +297,11 @@ static int max77836_init(struct max14577 *max14577) int ret; u8 intsrc_mask; - max14577->i2c_pmic = i2c_new_dummy(max14577->i2c->adapter, + max14577->i2c_pmic = i2c_new_dummy_device(max14577->i2c->adapter, I2C_ADDR_PMIC); - if (!max14577->i2c_pmic) { + if (IS_ERR(max14577->i2c_pmic)) { dev_err(max14577->dev, "Failed to register PMIC I2C device\n"); - return -ENODEV; + return PTR_ERR(max14577->i2c_pmic); } i2c_set_clientdata(max14577->i2c_pmic, max14577); diff --git a/drivers/mfd/max77620.c b/drivers/mfd/max77620.c index 0c28965fcc6a..a851ff473a44 100644 --- a/drivers/mfd/max77620.c +++ b/drivers/mfd/max77620.c @@ -416,8 +416,10 @@ static int max77620_initialise_fps(struct max77620_chip *chip) for_each_child_of_node(fps_np, fps_child) { ret = max77620_config_fps(chip, fps_child); - if (ret < 0) + if (ret < 0) { + of_node_put(fps_child); return ret; + } } config = chip->enable_global_lpm ? 
MAX77620_ONOFFCNFG2_SLP_LPM_MSK : 0; diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c index 901d99d65924..596ed85cab3b 100644 --- a/drivers/mfd/max77693.c +++ b/drivers/mfd/max77693.c @@ -183,17 +183,17 @@ static int max77693_i2c_probe(struct i2c_client *i2c, } else dev_info(max77693->dev, "device ID: 0x%x\n", reg_data); - max77693->i2c_muic = i2c_new_dummy(i2c->adapter, I2C_ADDR_MUIC); - if (!max77693->i2c_muic) { + max77693->i2c_muic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_MUIC); + if (IS_ERR(max77693->i2c_muic)) { dev_err(max77693->dev, "Failed to allocate I2C device for MUIC\n"); - return -ENODEV; + return PTR_ERR(max77693->i2c_muic); } i2c_set_clientdata(max77693->i2c_muic, max77693); - max77693->i2c_haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC); - if (!max77693->i2c_haptic) { + max77693->i2c_haptic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_HAPTIC); + if (IS_ERR(max77693->i2c_haptic)) { dev_err(max77693->dev, "Failed to allocate I2C device for Haptic\n"); - ret = -ENODEV; + ret = PTR_ERR(max77693->i2c_haptic); goto err_i2c_haptic; } i2c_set_clientdata(max77693->i2c_haptic, max77693); diff --git a/drivers/mfd/max77843.c b/drivers/mfd/max77843.c index 25cbb2242b26..209ee24d9ce1 100644 --- a/drivers/mfd/max77843.c +++ b/drivers/mfd/max77843.c @@ -70,11 +70,11 @@ static int max77843_chg_init(struct max77693_dev *max77843) { int ret; - max77843->i2c_chg = i2c_new_dummy(max77843->i2c->adapter, I2C_ADDR_CHG); - if (!max77843->i2c_chg) { + max77843->i2c_chg = i2c_new_dummy_device(max77843->i2c->adapter, I2C_ADDR_CHG); + if (IS_ERR(max77843->i2c_chg)) { dev_err(&max77843->i2c->dev, "Cannot allocate I2C device for Charger\n"); - return -ENODEV; + return PTR_ERR(max77843->i2c_chg); } i2c_set_clientdata(max77843->i2c_chg, max77843); diff --git a/drivers/mfd/max8907.c b/drivers/mfd/max8907.c index cc01f706cb32..d44baafd9d14 100644 --- a/drivers/mfd/max8907.c +++ b/drivers/mfd/max8907.c @@ -214,9 +214,9 @@ static int max8907_i2c_probe(struct i2c_client *i2c, goto err_regmap_gen; } - max8907->i2c_rtc = i2c_new_dummy(i2c->adapter, MAX8907_RTC_I2C_ADDR); - if (!max8907->i2c_rtc) { - ret = -ENOMEM; + max8907->i2c_rtc = i2c_new_dummy_device(i2c->adapter, MAX8907_RTC_I2C_ADDR); + if (IS_ERR(max8907->i2c_rtc)) { + ret = PTR_ERR(max8907->i2c_rtc); goto err_dummy_rtc; } i2c_set_clientdata(max8907->i2c_rtc, max8907); diff --git a/drivers/mfd/max8925-i2c.c b/drivers/mfd/max8925-i2c.c index 20bb19b71109..114e905bef25 100644 --- a/drivers/mfd/max8925-i2c.c +++ b/drivers/mfd/max8925-i2c.c @@ -176,18 +176,18 @@ static int max8925_probe(struct i2c_client *client, dev_set_drvdata(chip->dev, chip); mutex_init(&chip->io_lock); - chip->rtc = i2c_new_dummy(chip->i2c->adapter, RTC_I2C_ADDR); - if (!chip->rtc) { + chip->rtc = i2c_new_dummy_device(chip->i2c->adapter, RTC_I2C_ADDR); + if (IS_ERR(chip->rtc)) { dev_err(chip->dev, "Failed to allocate I2C device for RTC\n"); - return -ENODEV; + return PTR_ERR(chip->rtc); } i2c_set_clientdata(chip->rtc, chip); - chip->adc = i2c_new_dummy(chip->i2c->adapter, ADC_I2C_ADDR); - if (!chip->adc) { + chip->adc = i2c_new_dummy_device(chip->i2c->adapter, ADC_I2C_ADDR); + if (IS_ERR(chip->adc)) { dev_err(chip->dev, "Failed to allocate I2C device for ADC\n"); i2c_unregister_device(chip->rtc); - return -ENODEV; + return PTR_ERR(chip->adc); } i2c_set_clientdata(chip->adc, chip); diff --git a/drivers/mfd/max8997.c b/drivers/mfd/max8997.c index 8c06c09e36d1..68d8f2b95287 100644 --- a/drivers/mfd/max8997.c +++ b/drivers/mfd/max8997.c @@ -185,25 +185,25 @@ static 
int max8997_i2c_probe(struct i2c_client *i2c, mutex_init(&max8997->iolock); - max8997->rtc = i2c_new_dummy(i2c->adapter, I2C_ADDR_RTC); - if (!max8997->rtc) { + max8997->rtc = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_RTC); + if (IS_ERR(max8997->rtc)) { dev_err(max8997->dev, "Failed to allocate I2C device for RTC\n"); - return -ENODEV; + return PTR_ERR(max8997->rtc); } i2c_set_clientdata(max8997->rtc, max8997); - max8997->haptic = i2c_new_dummy(i2c->adapter, I2C_ADDR_HAPTIC); - if (!max8997->haptic) { + max8997->haptic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_HAPTIC); + if (IS_ERR(max8997->haptic)) { dev_err(max8997->dev, "Failed to allocate I2C device for Haptic\n"); - ret = -ENODEV; + ret = PTR_ERR(max8997->haptic); goto err_i2c_haptic; } i2c_set_clientdata(max8997->haptic, max8997); - max8997->muic = i2c_new_dummy(i2c->adapter, I2C_ADDR_MUIC); - if (!max8997->muic) { + max8997->muic = i2c_new_dummy_device(i2c->adapter, I2C_ADDR_MUIC); + if (IS_ERR(max8997->muic)) { dev_err(max8997->dev, "Failed to allocate I2C device for MUIC\n"); - ret = -ENODEV; + ret = PTR_ERR(max8997->muic); goto err_i2c_muic; } i2c_set_clientdata(max8997->muic, max8997); diff --git a/drivers/mfd/max8998.c b/drivers/mfd/max8998.c index 56409df120f8..785f8e9841b7 100644 --- a/drivers/mfd/max8998.c +++ b/drivers/mfd/max8998.c @@ -195,10 +195,10 @@ static int max8998_i2c_probe(struct i2c_client *i2c, } mutex_init(&max8998->iolock); - max8998->rtc = i2c_new_dummy(i2c->adapter, RTC_I2C_ADDR); - if (!max8998->rtc) { + max8998->rtc = i2c_new_dummy_device(i2c->adapter, RTC_I2C_ADDR); + if (IS_ERR(max8998->rtc)) { dev_err(&i2c->dev, "Failed to allocate I2C device for RTC\n"); - return -ENODEV; + return PTR_ERR(max8998->rtc); } i2c_set_clientdata(max8998->rtc, max8998); diff --git a/drivers/mfd/mt6397-core.c b/drivers/mfd/mt6397-core.c index 337bcccdb914..310dae26ddff 100644 --- a/drivers/mfd/mt6397-core.c +++ b/drivers/mfd/mt6397-core.c @@ -5,34 +5,34 @@ */ #include +#include #include #include #include #include #include -#include #include -#include +#include #include +#include + +#define MT6323_RTC_BASE 0x8000 +#define MT6323_RTC_SIZE 0x40 #define MT6397_RTC_BASE 0xe000 #define MT6397_RTC_SIZE 0x3e -#define MT6323_CID_CODE 0x23 -#define MT6391_CID_CODE 0x91 -#define MT6397_CID_CODE 0x97 +#define MT6323_PWRC_BASE 0x8000 +#define MT6323_PWRC_SIZE 0x40 + +static const struct resource mt6323_rtc_resources[] = { + DEFINE_RES_MEM(MT6323_RTC_BASE, MT6323_RTC_SIZE), + DEFINE_RES_IRQ(MT6323_IRQ_STATUS_RTC), +}; static const struct resource mt6397_rtc_resources[] = { - { - .start = MT6397_RTC_BASE, - .end = MT6397_RTC_BASE + MT6397_RTC_SIZE, - .flags = IORESOURCE_MEM, - }, - { - .start = MT6397_IRQ_RTC, - .end = MT6397_IRQ_RTC, - .flags = IORESOURCE_IRQ, - }, + DEFINE_RES_MEM(MT6397_RTC_BASE, MT6397_RTC_SIZE), + DEFINE_RES_IRQ(MT6397_IRQ_RTC), }; static const struct resource mt6323_keys_resources[] = { @@ -45,8 +45,17 @@ static const struct resource mt6397_keys_resources[] = { DEFINE_RES_IRQ(MT6397_IRQ_HOMEKEY), }; +static const struct resource mt6323_pwrc_resources[] = { + DEFINE_RES_MEM(MT6323_PWRC_BASE, MT6323_PWRC_SIZE), +}; + static const struct mfd_cell mt6323_devs[] = { { + .name = "mt6323-rtc", + .num_resources = ARRAY_SIZE(mt6323_rtc_resources), + .resources = mt6323_rtc_resources, + .of_compatible = "mediatek,mt6323-rtc", + }, { .name = "mt6323-regulator", .of_compatible = "mediatek,mt6323-regulator" }, { @@ -57,6 +66,11 @@ static const struct mfd_cell mt6323_devs[] = { .num_resources = 
ARRAY_SIZE(mt6323_keys_resources), .resources = mt6323_keys_resources, .of_compatible = "mediatek,mt6323-keys" + }, { + .name = "mt6323-pwrc", + .num_resources = ARRAY_SIZE(mt6323_pwrc_resources), + .resources = mt6323_pwrc_resources, + .of_compatible = "mediatek,mt6323-pwrc" }, }; @@ -86,148 +100,6 @@ static const struct mfd_cell mt6397_devs[] = { } }; -static void mt6397_irq_lock(struct irq_data *data) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); - - mutex_lock(&mt6397->irqlock); -} - -static void mt6397_irq_sync_unlock(struct irq_data *data) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); - - regmap_write(mt6397->regmap, mt6397->int_con[0], - mt6397->irq_masks_cur[0]); - regmap_write(mt6397->regmap, mt6397->int_con[1], - mt6397->irq_masks_cur[1]); - - mutex_unlock(&mt6397->irqlock); -} - -static void mt6397_irq_disable(struct irq_data *data) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); - int shift = data->hwirq & 0xf; - int reg = data->hwirq >> 4; - - mt6397->irq_masks_cur[reg] &= ~BIT(shift); -} - -static void mt6397_irq_enable(struct irq_data *data) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data); - int shift = data->hwirq & 0xf; - int reg = data->hwirq >> 4; - - mt6397->irq_masks_cur[reg] |= BIT(shift); -} - -#ifdef CONFIG_PM_SLEEP -static int mt6397_irq_set_wake(struct irq_data *irq_data, unsigned int on) -{ - struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(irq_data); - int shift = irq_data->hwirq & 0xf; - int reg = irq_data->hwirq >> 4; - - if (on) - mt6397->wake_mask[reg] |= BIT(shift); - else - mt6397->wake_mask[reg] &= ~BIT(shift); - - return 0; -} -#else -#define mt6397_irq_set_wake NULL -#endif - -static struct irq_chip mt6397_irq_chip = { - .name = "mt6397-irq", - .irq_bus_lock = mt6397_irq_lock, - .irq_bus_sync_unlock = mt6397_irq_sync_unlock, - .irq_enable = mt6397_irq_enable, - .irq_disable = mt6397_irq_disable, - .irq_set_wake = mt6397_irq_set_wake, -}; - -static void mt6397_irq_handle_reg(struct mt6397_chip *mt6397, int reg, - int irqbase) -{ - unsigned int status; - int i, irq, ret; - - ret = regmap_read(mt6397->regmap, reg, &status); - if (ret) { - dev_err(mt6397->dev, "Failed to read irq status: %d\n", ret); - return; - } - - for (i = 0; i < 16; i++) { - if (status & BIT(i)) { - irq = irq_find_mapping(mt6397->irq_domain, irqbase + i); - if (irq) - handle_nested_irq(irq); - } - } - - regmap_write(mt6397->regmap, reg, status); -} - -static irqreturn_t mt6397_irq_thread(int irq, void *data) -{ - struct mt6397_chip *mt6397 = data; - - mt6397_irq_handle_reg(mt6397, mt6397->int_status[0], 0); - mt6397_irq_handle_reg(mt6397, mt6397->int_status[1], 16); - - return IRQ_HANDLED; -} - -static int mt6397_irq_domain_map(struct irq_domain *d, unsigned int irq, - irq_hw_number_t hw) -{ - struct mt6397_chip *mt6397 = d->host_data; - - irq_set_chip_data(irq, mt6397); - irq_set_chip_and_handler(irq, &mt6397_irq_chip, handle_level_irq); - irq_set_nested_thread(irq, 1); - irq_set_noprobe(irq); - - return 0; -} - -static const struct irq_domain_ops mt6397_irq_domain_ops = { - .map = mt6397_irq_domain_map, -}; - -static int mt6397_irq_init(struct mt6397_chip *mt6397) -{ - int ret; - - mutex_init(&mt6397->irqlock); - - /* Mask all interrupt sources */ - regmap_write(mt6397->regmap, mt6397->int_con[0], 0x0); - regmap_write(mt6397->regmap, mt6397->int_con[1], 0x0); - - mt6397->irq_domain = irq_domain_add_linear(mt6397->dev->of_node, - MT6397_IRQ_NR, &mt6397_irq_domain_ops, mt6397); - if 
(!mt6397->irq_domain) {
-		dev_err(mt6397->dev, "could not create irq domain\n");
-		return -ENOMEM;
-	}
-
-	ret = devm_request_threaded_irq(mt6397->dev, mt6397->irq, NULL,
-		mt6397_irq_thread, IRQF_ONESHOT, "mt6397-pmic", mt6397);
-	if (ret) {
-		dev_err(mt6397->dev, "failed to register irq=%d; err: %d\n",
-			mt6397->irq, ret);
-		return ret;
-	}
-
-	return 0;
-}
-
 #ifdef CONFIG_PM_SLEEP
 static int mt6397_irq_suspend(struct device *dev)
 {
@@ -290,7 +162,7 @@ static int mt6397_probe(struct platform_device *pdev)
 		return pmic->irq;
 
 	switch (id & 0xff) {
-	case MT6323_CID_CODE:
+	case MT6323_CHIP_ID:
 		pmic->int_con[0] = MT6323_INT_CON0;
 		pmic->int_con[1] = MT6323_INT_CON1;
 		pmic->int_status[0] = MT6323_INT_STATUS0;
@@ -304,8 +176,8 @@ static int mt6397_probe(struct platform_device *pdev)
 					   0, pmic->irq_domain);
 		break;
 
-	case MT6397_CID_CODE:
-	case MT6391_CID_CODE:
+	case MT6391_CHIP_ID:
+	case MT6397_CHIP_ID:
 		pmic->int_con[0] = MT6397_INT_CON0;
 		pmic->int_con[1] = MT6397_INT_CON1;
 		pmic->int_status[0] = MT6397_INT_STATUS0;
diff --git a/drivers/mfd/mt6397-irq.c b/drivers/mfd/mt6397-irq.c
new file mode 100644
index 000000000000..b2d3ce1f3115
--- /dev/null
+++ b/drivers/mfd/mt6397-irq.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Copyright (c) 2019 MediaTek Inc.
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/mfd/mt6323/core.h>
+#include <linux/mfd/mt6323/registers.h>
+#include <linux/mfd/mt6397/core.h>
+#include <linux/mfd/mt6397/registers.h>
+
+static void mt6397_irq_lock(struct irq_data *data)
+{
+	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data);
+
+	mutex_lock(&mt6397->irqlock);
+}
+
+static void mt6397_irq_sync_unlock(struct irq_data *data)
+{
+	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data);
+
+	regmap_write(mt6397->regmap, mt6397->int_con[0],
+		     mt6397->irq_masks_cur[0]);
+	regmap_write(mt6397->regmap, mt6397->int_con[1],
+		     mt6397->irq_masks_cur[1]);
+
+	mutex_unlock(&mt6397->irqlock);
+}
+
+static void mt6397_irq_disable(struct irq_data *data)
+{
+	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data);
+	int shift = data->hwirq & 0xf;
+	int reg = data->hwirq >> 4;
+
+	mt6397->irq_masks_cur[reg] &= ~BIT(shift);
+}
+
+static void mt6397_irq_enable(struct irq_data *data)
+{
+	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(data);
+	int shift = data->hwirq & 0xf;
+	int reg = data->hwirq >> 4;
+
+	mt6397->irq_masks_cur[reg] |= BIT(shift);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int mt6397_irq_set_wake(struct irq_data *irq_data, unsigned int on)
+{
+	struct mt6397_chip *mt6397 = irq_data_get_irq_chip_data(irq_data);
+	int shift = irq_data->hwirq & 0xf;
+	int reg = irq_data->hwirq >> 4;
+
+	if (on)
+		mt6397->wake_mask[reg] |= BIT(shift);
+	else
+		mt6397->wake_mask[reg] &= ~BIT(shift);
+
+	return 0;
+}
+#else
+#define mt6397_irq_set_wake NULL
+#endif
+
+static struct irq_chip mt6397_irq_chip = {
+	.name = "mt6397-irq",
+	.irq_bus_lock = mt6397_irq_lock,
+	.irq_bus_sync_unlock = mt6397_irq_sync_unlock,
+	.irq_enable = mt6397_irq_enable,
+	.irq_disable = mt6397_irq_disable,
+	.irq_set_wake = mt6397_irq_set_wake,
+};
+
+static void mt6397_irq_handle_reg(struct mt6397_chip *mt6397, int reg,
+				  int irqbase)
+{
+	unsigned int status;
+	int i, irq, ret;
+
+	ret = regmap_read(mt6397->regmap, reg, &status);
+	if (ret) {
+		dev_err(mt6397->dev, "Failed to read irq status: %d\n", ret);
+		return;
+	}
+
+	for (i = 0; i < 16; i++) {
+		if (status & BIT(i)) {
+			irq = irq_find_mapping(mt6397->irq_domain, irqbase + i);
+			if (irq)
+				handle_nested_irq(irq);
+		}
+	}
+
+	regmap_write(mt6397->regmap, reg, status);
+}
+
+static irqreturn_t
mt6397_irq_thread(int irq, void *data) +{ + struct mt6397_chip *mt6397 = data; + + mt6397_irq_handle_reg(mt6397, mt6397->int_status[0], 0); + mt6397_irq_handle_reg(mt6397, mt6397->int_status[1], 16); + + return IRQ_HANDLED; +} + +static int mt6397_irq_domain_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hw) +{ + struct mt6397_chip *mt6397 = d->host_data; + + irq_set_chip_data(irq, mt6397); + irq_set_chip_and_handler(irq, &mt6397_irq_chip, handle_level_irq); + irq_set_nested_thread(irq, 1); + irq_set_noprobe(irq); + + return 0; +} + +static const struct irq_domain_ops mt6397_irq_domain_ops = { + .map = mt6397_irq_domain_map, +}; + +int mt6397_irq_init(struct mt6397_chip *chip) +{ + int ret; + + mutex_init(&chip->irqlock); + + switch (chip->chip_id) { + case MT6323_CHIP_ID: + chip->int_con[0] = MT6323_INT_CON0; + chip->int_con[1] = MT6323_INT_CON1; + chip->int_status[0] = MT6323_INT_STATUS0; + chip->int_status[1] = MT6323_INT_STATUS1; + break; + + case MT6391_CHIP_ID: + case MT6397_CHIP_ID: + chip->int_con[0] = MT6397_INT_CON0; + chip->int_con[1] = MT6397_INT_CON1; + chip->int_status[0] = MT6397_INT_STATUS0; + chip->int_status[1] = MT6397_INT_STATUS1; + break; + + default: + dev_err(chip->dev, "unsupported chip: 0x%x\n", chip->chip_id); + return -ENODEV; + } + + /* Mask all interrupt sources */ + regmap_write(chip->regmap, chip->int_con[0], 0x0); + regmap_write(chip->regmap, chip->int_con[1], 0x0); + + chip->irq_domain = irq_domain_add_linear(chip->dev->of_node, + MT6397_IRQ_NR, + &mt6397_irq_domain_ops, + chip); + if (!chip->irq_domain) { + dev_err(chip->dev, "could not create irq domain\n"); + return -ENOMEM; + } + + ret = devm_request_threaded_irq(chip->dev, chip->irq, NULL, + mt6397_irq_thread, IRQF_ONESHOT, + "mt6397-pmic", chip); + if (ret) { + dev_err(chip->dev, "failed to register irq=%d; err: %d\n", + chip->irq, ret); + return ret; + } + + return 0; +} diff --git a/drivers/mfd/palmas.c b/drivers/mfd/palmas.c index 6818ff34837c..f5b3fa973b13 100644 --- a/drivers/mfd/palmas.c +++ b/drivers/mfd/palmas.c @@ -549,12 +549,12 @@ static int palmas_i2c_probe(struct i2c_client *i2c, palmas->i2c_clients[i] = i2c; else { palmas->i2c_clients[i] = - i2c_new_dummy(i2c->adapter, + i2c_new_dummy_device(i2c->adapter, i2c->addr + i); - if (!palmas->i2c_clients[i]) { + if (IS_ERR(palmas->i2c_clients[i])) { dev_err(palmas->dev, "can't attach client %d\n", i); - ret = -ENOMEM; + ret = PTR_ERR(palmas->i2c_clients[i]); goto err_i2c; } palmas->i2c_clients[i]->dev.of_node = of_node_get(node); diff --git a/drivers/mfd/qcom_rpm.c b/drivers/mfd/qcom_rpm.c index 4d7e9008628c..71bc34b74bc9 100644 --- a/drivers/mfd/qcom_rpm.c +++ b/drivers/mfd/qcom_rpm.c @@ -561,22 +561,16 @@ static int qcom_rpm_probe(struct platform_device *pdev) clk_prepare_enable(rpm->ramclk); /* Accepts NULL */ irq_ack = platform_get_irq_byname(pdev, "ack"); - if (irq_ack < 0) { - dev_err(&pdev->dev, "required ack interrupt missing\n"); + if (irq_ack < 0) return irq_ack; - } irq_err = platform_get_irq_byname(pdev, "err"); - if (irq_err < 0) { - dev_err(&pdev->dev, "required err interrupt missing\n"); + if (irq_err < 0) return irq_err; - } irq_wakeup = platform_get_irq_byname(pdev, "wakeup"); - if (irq_wakeup < 0) { - dev_err(&pdev->dev, "required wakeup interrupt missing\n"); + if (irq_wakeup < 0) return irq_wakeup; - } match = of_match_device(qcom_rpm_of_match, &pdev->dev); if (!match) diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c index 9b9b06d36cb1..154270f8d8d7 100644 --- a/drivers/mfd/sm501.c +++ 
b/drivers/mfd/sm501.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -1394,10 +1395,8 @@ static int sm501_plat_probe(struct platform_device *dev) sm->platdata = dev_get_platdata(&dev->dev); ret = platform_get_irq(dev, 0); - if (ret < 0) { - dev_err(&dev->dev, "failed to get irq resource\n"); + if (ret < 0) goto err_res; - } sm->irq = ret; sm->io_res = platform_get_resource(dev, IORESOURCE_MEM, 1); diff --git a/drivers/mfd/timberdale.c b/drivers/mfd/timberdale.c index 60c122e9b39f..faecbca6dba3 100644 --- a/drivers/mfd/timberdale.c +++ b/drivers/mfd/timberdale.c @@ -626,8 +626,7 @@ static const struct mfd_cell timberdale_cells_bar2[] = { static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf) { - struct pci_dev *pdev = to_pci_dev(dev); - struct timberdale_device *priv = pci_get_drvdata(pdev); + struct timberdale_device *priv = dev_get_drvdata(dev); return sprintf(buf, "%d.%d.%d\n", priv->fw.major, priv->fw.minor, priv->fw.config); diff --git a/drivers/mfd/tps80031.c b/drivers/mfd/tps80031.c index 865257ade8ac..907452b86e32 100644 --- a/drivers/mfd/tps80031.c +++ b/drivers/mfd/tps80031.c @@ -437,12 +437,11 @@ static int tps80031_probe(struct i2c_client *client, if (tps80031_slave_address[i] == client->addr) tps80031->clients[i] = client; else - tps80031->clients[i] = i2c_new_dummy(client->adapter, - tps80031_slave_address[i]); - if (!tps80031->clients[i]) { + tps80031->clients[i] = devm_i2c_new_dummy_device(&client->dev, + client->adapter, tps80031_slave_address[i]); + if (IS_ERR(tps80031->clients[i])) { dev_err(&client->dev, "can't attach client %d\n", i); - ret = -ENOMEM; - goto fail_client_reg; + return PTR_ERR(tps80031->clients[i]); } i2c_set_clientdata(tps80031->clients[i], tps80031); @@ -452,7 +451,7 @@ static int tps80031_probe(struct i2c_client *client, ret = PTR_ERR(tps80031->regmap[i]); dev_err(&client->dev, "regmap %d init failed, err %d\n", i, ret); - goto fail_client_reg; + return ret; } } @@ -461,7 +460,7 @@ static int tps80031_probe(struct i2c_client *client, if (ret < 0) { dev_err(&client->dev, "Silicon version number read failed: %d\n", ret); - goto fail_client_reg; + return ret; } ret = tps80031_read(&client->dev, TPS80031_SLAVE_ID3, @@ -469,7 +468,7 @@ static int tps80031_probe(struct i2c_client *client, if (ret < 0) { dev_err(&client->dev, "Silicon eeprom version read failed: %d\n", ret); - goto fail_client_reg; + return ret; } dev_info(&client->dev, "ES version 0x%02x and EPROM version 0x%02x\n", @@ -482,7 +481,7 @@ static int tps80031_probe(struct i2c_client *client, ret = tps80031_irq_init(tps80031, client->irq, pdata->irq_base); if (ret) { dev_err(&client->dev, "IRQ init failed: %d\n", ret); - goto fail_client_reg; + return ret; } tps80031_pupd_init(tps80031, pdata); @@ -506,12 +505,6 @@ static int tps80031_probe(struct i2c_client *client, fail_mfd_add: regmap_del_irq_chip(client->irq, tps80031->irq_data); - -fail_client_reg: - for (i = 0; i < TPS80031_NUM_SLAVES; i++) { - if (tps80031->clients[i] && (tps80031->clients[i] != client)) - i2c_unregister_device(tps80031->clients[i]); - } return ret; } diff --git a/drivers/mfd/twl-core.c b/drivers/mfd/twl-core.c index 448d9397ff04..20cf8cfe4f3b 100644 --- a/drivers/mfd/twl-core.c +++ b/drivers/mfd/twl-core.c @@ -1141,12 +1141,12 @@ twl_probe(struct i2c_client *client, const struct i2c_device_id *id) if (i == 0) { twl->client = client; } else { - twl->client = i2c_new_dummy(client->adapter, + twl->client = i2c_new_dummy_device(client->adapter, client->addr + i); 
- if (!twl->client) { + if (IS_ERR(twl->client)) { dev_err(&client->dev, "can't attach client %d\n", i); - status = -ENOMEM; + status = PTR_ERR(twl->client); goto fail; } } diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c index 518945b2f737..2cccd82a3106 100644 --- a/drivers/misc/eeprom/at24.c +++ b/drivers/misc/eeprom/at24.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index 34cd67951aec..6c51b1bad8c4 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 6d52cf5ce20e..25aa400e2e3c 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 0ef01db1f8b8..74f81fe03810 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/wimax/i2400m/tx.c b/drivers/net/wimax/i2400m/tx.c index ebd64e083726..1255302e251e 100644 --- a/drivers/net/wimax/i2400m/tx.c +++ b/drivers/net/wimax/i2400m/tx.c @@ -654,8 +654,7 @@ void i2400m_tx_close(struct i2400m *i2400m) padding = aligned_size - tx_msg_moved->size; if (padding > 0) { pad_buf = i2400m_tx_fifo_push(i2400m, padding, 0, 0); - if (unlikely(WARN_ON(pad_buf == NULL - || pad_buf == TAIL_FULL))) { + if (WARN_ON(pad_buf == NULL || pad_buf == TAIL_FULL)) { /* This should not happen -- append should verify * there is always space left at least to append * tx_block_size */ diff --git a/drivers/net/wireless/ath/ath5k/pci.c b/drivers/net/wireless/ath/ath5k/pci.c index c6156cc38940..d5ee32ce9eb3 100644 --- a/drivers/net/wireless/ath/ath5k/pci.c +++ b/drivers/net/wireless/ath/ath5k/pci.c @@ -18,7 +18,6 @@ #include #include -#include #include #include #include "../ath.h" diff --git a/drivers/net/wireless/intel/iwlegacy/3945-mac.c b/drivers/net/wireless/intel/iwlegacy/3945-mac.c index b82da75a9ae3..4fbcc7fba3cc 100644 --- a/drivers/net/wireless/intel/iwlegacy/3945-mac.c +++ b/drivers/net/wireless/intel/iwlegacy/3945-mac.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c index fa2c02881939..ffb705b18fb1 100644 --- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c +++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index 5ab87a8dc907..f8a1f985a1d8 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -62,7 +62,6 @@ * *****************************************************************************/ #include -#include #include #include #include diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1ede1763a5ee..108f60b46804 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -666,8 +666,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if 
(WARN_ON_ONCE(!nvme_ns_has_pi(ns))) return BLK_STS_NOTSUPP; control |= NVME_RW_PRINFO_PRACT; - } else if (req_op(req) == REQ_OP_WRITE) { - t10_pi_prepare(req, ns->pi_type); } switch (ns->pi_type) { @@ -690,13 +688,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, void nvme_cleanup_cmd(struct request *req) { - if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && - nvme_req(req)->status == 0) { - struct nvme_ns *ns = req->rq_disk->private_data; - - t10_pi_complete(req, ns->pi_type, - blk_rq_bytes(req) >> ns->lba_shift); - } if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { struct nvme_ns *ns = req->rq_disk->private_data; struct page *page = req->special_vec.bv_page; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6b4d7b064b38..c0808f9eb8ab 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -549,8 +549,10 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) WARN_ON_ONCE(!iod->nents); - /* P2PDMA requests do not need to be unmapped */ - if (!is_pci_p2pdma_page(sg_page(iod->sg))) + if (is_pci_p2pdma_page(sg_page(iod->sg))) + pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents, + rq_dma_dir(req)); + else dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req)); @@ -834,8 +836,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, goto out; if (is_pci_p2pdma_page(sg_page(iod->sg))) - nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents, - rq_dma_dir(req)); + nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg, + iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN); else nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN); diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index c313de96a357..a304f5ea11b9 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -52,7 +52,7 @@ config PCI_MSI If you don't know what to do here, say Y. config PCI_MSI_IRQ_DOMAIN - def_bool ARC || ARM || ARM64 || X86 + def_bool ARC || ARM || ARM64 || X86 || RISCV depends on PCI_MSI select GENERIC_MSI_IRQ_DOMAIN @@ -170,7 +170,7 @@ config PCI_P2PDMA Many PCIe root complexes do not support P2P transactions and it's hard to tell which support it at all, so at this time, - P2P DMA transations must be between devices behind the same root + P2P DMA transactions must be between devices behind the same root port. If unsure, say N. 
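Context for the two nvme/host/pci.c hunks above: with pci_p2pdma_map_sg_attrs() and pci_p2pdma_unmap_sg(), peer-to-peer mappings now go through a real map/unmap cycle instead of being treated as static, so the unmap path must mirror the map path exactly. Below is a minimal sketch of that pattern; the example_* helper names are hypothetical, and the sketch assumes (as the nvme code does, by checking only the first page) that a scatterlist is homogeneous, i.e. either all P2P pages or all host memory.

#include <linux/dma-mapping.h>
#include <linux/mm.h>
#include <linux/pci-p2pdma.h>
#include <linux/scatterlist.h>

/* Map: take the P2PDMA path iff the pages are peer-to-peer memory. */
static int example_map_sg(struct device *dev, struct scatterlist *sg,
			  int nents, enum dma_data_direction dir)
{
	if (is_pci_p2pdma_page(sg_page(sg)))
		return pci_p2pdma_map_sg_attrs(dev, sg, nents, dir,
					       DMA_ATTR_NO_WARN);
	return dma_map_sg_attrs(dev, sg, nents, dir, DMA_ATTR_NO_WARN);
}

/* Unmap: must take the same branch that the map side took. */
static void example_unmap_sg(struct device *dev, struct scatterlist *sg,
			     int nents, enum dma_data_direction dir)
{
	if (is_pci_p2pdma_page(sg_page(sg)))
		pci_p2pdma_unmap_sg(dev, sg, nents, dir);
	else
		dma_unmap_sg(dev, sg, nents, dir);
}

Keeping the branch condition identical on both sides is the point of the nvme change: a scatterlist mapped through the P2PDMA path must not be torn down with dma_unmap_sg().
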
@@ -181,7 +181,7 @@ config PCI_LABEL config PCI_HYPERV tristate "Hyper-V PCI Frontend" - depends on X86 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && X86_64 + depends on X86_64 && HYPERV && PCI_MSI && PCI_MSI_IRQ_DOMAIN && SYSFS select PCI_HYPERV_INTERFACE help The PCI device frontend driver allows the kernel to import arbitrary diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 544922f097c0..2fccb5762c76 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -336,15 +336,6 @@ static inline int pcie_cap_version(const struct pci_dev *dev) return pcie_caps_reg(dev) & PCI_EXP_FLAGS_VERS; } -static bool pcie_downstream_port(const struct pci_dev *dev) -{ - int type = pci_pcie_type(dev); - - return type == PCI_EXP_TYPE_ROOT_PORT || - type == PCI_EXP_TYPE_DOWNSTREAM || - type == PCI_EXP_TYPE_PCIE_BRIDGE; -} - bool pcie_cap_has_lnkctl(const struct pci_dev *dev) { int type = pci_pcie_type(dev); diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 495059d923f7..8e40b3e6da77 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -417,11 +417,9 @@ struct pci_bus *pci_bus_get(struct pci_bus *bus) get_device(&bus->dev); return bus; } -EXPORT_SYMBOL(pci_bus_get); void pci_bus_put(struct pci_bus *bus) { if (bus) put_device(&bus->dev); } -EXPORT_SYMBOL(pci_bus_put); diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 6ea778ae4877..0ba988b5b5bc 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -131,13 +131,29 @@ config PCI_KEYSTONE_EP DesignWare core functions to implement the driver. config PCI_LAYERSCAPE - bool "Freescale Layerscape PCIe controller" + bool "Freescale Layerscape PCIe controller - Host mode" depends on OF && (ARM || ARCH_LAYERSCAPE || COMPILE_TEST) depends on PCI_MSI_IRQ_DOMAIN select MFD_SYSCON select PCIE_DW_HOST help - Say Y here if you want PCIe controller support on Layerscape SoCs. + Say Y here if you want to enable PCIe controller support on Layerscape + SoCs to work in Host mode. + This controller can work either as EP or RC. The RCW[HOST_AGT_PEX] + determines which PCIe controller works in EP mode and which PCIe + controller works in RC mode. + +config PCI_LAYERSCAPE_EP + bool "Freescale Layerscape PCIe controller - Endpoint mode" + depends on OF && (ARM || ARCH_LAYERSCAPE || COMPILE_TEST) + depends on PCI_ENDPOINT + select PCIE_DW_EP + help + Say Y here if you want to enable PCIe controller support on Layerscape + SoCs to work in Endpoint mode. + This controller can work either as EP or RC. The RCW[HOST_AGT_PEX] + determines which PCIe controller works in EP mode and which PCIe + controller works in RC mode. config PCI_HISI depends on OF && (ARM64 || COMPILE_TEST) @@ -220,6 +236,16 @@ config PCI_MESON and therefore the driver re-uses the DesignWare core functions to implement the driver. +config PCIE_TEGRA194 + tristate "NVIDIA Tegra194 (and later) PCIe controller" + depends on ARCH_TEGRA_194_SOC || COMPILE_TEST + depends on PCI_MSI_IRQ_DOMAIN + select PCIE_DW_HOST + select PHY_TEGRA194_P2U + help + Say Y here if you want support for DesignWare core based PCIe host + controller found in NVIDIA Tegra194 SoC. + config PCIE_UNIPHIER bool "Socionext UniPhier PCIe controllers" depends on ARCH_UNIPHIER || COMPILE_TEST @@ -230,4 +256,16 @@ config PCIE_UNIPHIER Say Y here if you want PCIe controller support on UniPhier SoCs. This driver supports LD20 and PXs3 SoCs. 
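For reference while reading the drivers/pci/access.c hunk above: this section only shows pcie_downstream_port() being deleted from access.c; presumably the helper is relocated to a shared header elsewhere in the series rather than dropped. Its logic, reproduced verbatim from the removed hunk, classifies ports whose link points away from the root complex:

#include <linux/pci.h>

/* Root Ports, Switch Downstream Ports, and PCI/PCI-X to PCIe bridges
 * all face downstream; everything else (Endpoints, Upstream Ports,
 * PCIe to PCI/PCI-X bridges) does not. */
static bool pcie_downstream_port(const struct pci_dev *dev)
{
	int type = pci_pcie_type(dev);

	return type == PCI_EXP_TYPE_ROOT_PORT ||
	       type == PCI_EXP_TYPE_DOWNSTREAM ||
	       type == PCI_EXP_TYPE_PCIE_BRIDGE;
}
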
+config PCIE_AL
+	bool "Amazon Annapurna Labs PCIe controller"
+	depends on OF && (ARM64 || COMPILE_TEST)
+	depends on PCI_MSI_IRQ_DOMAIN
+	select PCIE_DW_HOST
+	help
+	  Say Y here to enable support for Amazon's Annapurna Labs PCIe
+	  controller IP on Amazon SoCs. The PCIe controller uses the DesignWare
+	  core plus Annapurna Labs proprietary hardware wrappers. This is
+	  required only for DT-based platforms. ACPI platforms with the
+	  Annapurna Labs PCIe controller don't need to enable this.
+
 endmenu
diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile
index b085dfd4fab7..69faff371f11 100644
--- a/drivers/pci/controller/dwc/Makefile
+++ b/drivers/pci/controller/dwc/Makefile
@@ -8,13 +8,15 @@ obj-$(CONFIG_PCI_EXYNOS) += pci-exynos.o
 obj-$(CONFIG_PCI_IMX6) += pci-imx6.o
 obj-$(CONFIG_PCIE_SPEAR13XX) += pcie-spear13xx.o
 obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone.o
-obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o pci-layerscape-ep.o
+obj-$(CONFIG_PCI_LAYERSCAPE) += pci-layerscape.o
+obj-$(CONFIG_PCI_LAYERSCAPE_EP) += pci-layerscape-ep.o
 obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o
 obj-$(CONFIG_PCIE_ARMADA_8K) += pcie-armada8k.o
 obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o
 obj-$(CONFIG_PCIE_KIRIN) += pcie-kirin.o
 obj-$(CONFIG_PCIE_HISI_STB) += pcie-histb.o
 obj-$(CONFIG_PCI_MESON) += pci-meson.o
+obj-$(CONFIG_PCIE_TEGRA194) += pcie-tegra194.o
 obj-$(CONFIG_PCIE_UNIPHIER) += pcie-uniphier.o
 
 # The following drivers are for devices that use the generic ACPI
diff --git a/drivers/pci/controller/dwc/pci-exynos.c b/drivers/pci/controller/dwc/pci-exynos.c
index cee5f2f590e2..14a6ba4067fb 100644
--- a/drivers/pci/controller/dwc/pci-exynos.c
+++ b/drivers/pci/controller/dwc/pci-exynos.c
@@ -465,7 +465,7 @@ static int __init exynos_pcie_probe(struct platform_device *pdev)
 
 	ep->phy = devm_of_phy_get(dev, np, NULL);
 	if (IS_ERR(ep->phy)) {
-		if (PTR_ERR(ep->phy) == -EPROBE_DEFER)
+		if (PTR_ERR(ep->phy) != -ENODEV)
 			return PTR_ERR(ep->phy);
 
 		ep->phy = NULL;
diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c
index 9b5cb5b70389..acfbd34032a8 100644
--- a/drivers/pci/controller/dwc/pci-imx6.c
+++ b/drivers/pci/controller/dwc/pci-imx6.c
@@ -57,6 +57,7 @@ enum imx6_pcie_variants {
 struct imx6_pcie_drvdata {
 	enum imx6_pcie_variants variant;
 	u32 flags;
+	int dbi_length;
 };
 
 struct imx6_pcie {
@@ -1173,8 +1174,8 @@ static int imx6_pcie_probe(struct platform_device *pdev)
 
 	imx6_pcie->vpcie = devm_regulator_get_optional(&pdev->dev, "vpcie");
 	if (IS_ERR(imx6_pcie->vpcie)) {
-		if (PTR_ERR(imx6_pcie->vpcie) == -EPROBE_DEFER)
-			return -EPROBE_DEFER;
+		if (PTR_ERR(imx6_pcie->vpcie) != -ENODEV)
+			return PTR_ERR(imx6_pcie->vpcie);
 		imx6_pcie->vpcie = NULL;
 	}
 
@@ -1212,6 +1213,7 @@ static const struct imx6_pcie_drvdata drvdata[] = {
 		.variant = IMX6Q,
 		.flags = IMX6_PCIE_FLAG_IMX6_PHY |
 			 IMX6_PCIE_FLAG_IMX6_SPEED_CHANGE,
+		.dbi_length = 0x200,
 	},
 	[IMX6SX] = {
 		.variant = IMX6SX,
@@ -1254,6 +1256,37 @@ static struct platform_driver imx6_pcie_driver = {
 	.shutdown = imx6_pcie_shutdown,
 };
 
+static void imx6_pcie_quirk(struct pci_dev *dev)
+{
+	struct pci_bus *bus = dev->bus;
+	struct pcie_port *pp = bus->sysdata;
+
+	/* Bus parent is the PCI bridge, its parent is this platform driver */
+	if (!bus->dev.parent || !bus->dev.parent->parent)
+		return;
+
+	/* Make sure we only quirk devices associated with this driver */
+	if (bus->dev.parent->parent->driver != &imx6_pcie_driver.driver)
+		return;
+
+	if (bus->number == pp->root_bus_nr) {
+		struct dw_pcie *pci =
to_dw_pcie_from_pp(pp); + struct imx6_pcie *imx6_pcie = to_imx6_pcie(pci); + + /* + * Limit config length to avoid the kernel reading beyond + * the register set and causing an abort on i.MX 6Quad + */ + if (imx6_pcie->drvdata->dbi_length) { + dev->cfg_size = imx6_pcie->drvdata->dbi_length; + dev_info(&dev->dev, "Limiting cfg_size to %d\n", + dev->cfg_size); + } + } +} +DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_SYNOPSYS, 0xabcd, + PCI_CLASS_BRIDGE_PCI, 8, imx6_pcie_quirk); + static int __init imx6_pcie_init(void) { #ifdef CONFIG_ARM diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index be61d96cc95e..ca9aa4501e7e 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -44,6 +44,7 @@ static const struct pci_epc_features ls_pcie_epc_features = { .linkup_notifier = false, .msi_capable = true, .msix_capable = false, + .bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4), }; static const struct pci_epc_features* diff --git a/drivers/pci/controller/dwc/pcie-al.c b/drivers/pci/controller/dwc/pcie-al.c index 3ab58f0584a8..1eeda2f6371f 100644 --- a/drivers/pci/controller/dwc/pcie-al.c +++ b/drivers/pci/controller/dwc/pcie-al.c @@ -91,3 +91,368 @@ struct pci_ecam_ops al_pcie_ops = { }; #endif /* defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) */ + +#ifdef CONFIG_PCIE_AL + +#include +#include "pcie-designware.h" + +#define AL_PCIE_REV_ID_2 2 +#define AL_PCIE_REV_ID_3 3 +#define AL_PCIE_REV_ID_4 4 + +#define AXI_BASE_OFFSET 0x0 + +#define DEVICE_ID_OFFSET 0x16c + +#define DEVICE_REV_ID 0x0 +#define DEVICE_REV_ID_DEV_ID_MASK GENMASK(31, 16) + +#define DEVICE_REV_ID_DEV_ID_X4 0 +#define DEVICE_REV_ID_DEV_ID_X8 2 +#define DEVICE_REV_ID_DEV_ID_X16 4 + +#define OB_CTRL_REV1_2_OFFSET 0x0040 +#define OB_CTRL_REV3_5_OFFSET 0x0030 + +#define CFG_TARGET_BUS 0x0 +#define CFG_TARGET_BUS_MASK_MASK GENMASK(7, 0) +#define CFG_TARGET_BUS_BUSNUM_MASK GENMASK(15, 8) + +#define CFG_CONTROL 0x4 +#define CFG_CONTROL_SUBBUS_MASK GENMASK(15, 8) +#define CFG_CONTROL_SEC_BUS_MASK GENMASK(23, 16) + +struct al_pcie_reg_offsets { + unsigned int ob_ctrl; +}; + +struct al_pcie_target_bus_cfg { + u8 reg_val; + u8 reg_mask; + u8 ecam_mask; +}; + +struct al_pcie { + struct dw_pcie *pci; + void __iomem *controller_base; /* base of PCIe unit (not DW core) */ + struct device *dev; + resource_size_t ecam_size; + unsigned int controller_rev_id; + struct al_pcie_reg_offsets reg_offsets; + struct al_pcie_target_bus_cfg target_bus_cfg; +}; + +#define PCIE_ECAM_DEVFN(x) (((x) & 0xff) << 12) + +#define to_al_pcie(x) dev_get_drvdata((x)->dev) + +static inline u32 al_pcie_controller_readl(struct al_pcie *pcie, u32 offset) +{ + return readl_relaxed(pcie->controller_base + offset); +} + +static inline void al_pcie_controller_writel(struct al_pcie *pcie, u32 offset, + u32 val) +{ + writel_relaxed(val, pcie->controller_base + offset); +} + +static int al_pcie_rev_id_get(struct al_pcie *pcie, unsigned int *rev_id) +{ + u32 dev_rev_id_val; + u32 dev_id_val; + + dev_rev_id_val = al_pcie_controller_readl(pcie, AXI_BASE_OFFSET + + DEVICE_ID_OFFSET + + DEVICE_REV_ID); + dev_id_val = FIELD_GET(DEVICE_REV_ID_DEV_ID_MASK, dev_rev_id_val); + + switch (dev_id_val) { + case DEVICE_REV_ID_DEV_ID_X4: + *rev_id = AL_PCIE_REV_ID_2; + break; + case DEVICE_REV_ID_DEV_ID_X8: + *rev_id = AL_PCIE_REV_ID_3; + break; + case DEVICE_REV_ID_DEV_ID_X16: + *rev_id = AL_PCIE_REV_ID_4; + break; + default: + dev_err(pcie->dev, "Unsupported dev_id_val (0x%x)\n", 
+ dev_id_val); + return -EINVAL; + } + + dev_dbg(pcie->dev, "dev_id_val: 0x%x\n", dev_id_val); + + return 0; +} + +static int al_pcie_reg_offsets_set(struct al_pcie *pcie) +{ + switch (pcie->controller_rev_id) { + case AL_PCIE_REV_ID_2: + pcie->reg_offsets.ob_ctrl = OB_CTRL_REV1_2_OFFSET; + break; + case AL_PCIE_REV_ID_3: + case AL_PCIE_REV_ID_4: + pcie->reg_offsets.ob_ctrl = OB_CTRL_REV3_5_OFFSET; + break; + default: + dev_err(pcie->dev, "Unsupported controller rev_id: 0x%x\n", + pcie->controller_rev_id); + return -EINVAL; + } + + return 0; +} + +static inline void al_pcie_target_bus_set(struct al_pcie *pcie, + u8 target_bus, + u8 mask_target_bus) +{ + u32 reg; + + reg = FIELD_PREP(CFG_TARGET_BUS_MASK_MASK, mask_target_bus) | + FIELD_PREP(CFG_TARGET_BUS_BUSNUM_MASK, target_bus); + + al_pcie_controller_writel(pcie, AXI_BASE_OFFSET + + pcie->reg_offsets.ob_ctrl + CFG_TARGET_BUS, + reg); +} + +static void __iomem *al_pcie_conf_addr_map(struct al_pcie *pcie, + unsigned int busnr, + unsigned int devfn) +{ + struct al_pcie_target_bus_cfg *target_bus_cfg = &pcie->target_bus_cfg; + unsigned int busnr_ecam = busnr & target_bus_cfg->ecam_mask; + unsigned int busnr_reg = busnr & target_bus_cfg->reg_mask; + struct pcie_port *pp = &pcie->pci->pp; + void __iomem *pci_base_addr; + + pci_base_addr = (void __iomem *)((uintptr_t)pp->va_cfg0_base + + (busnr_ecam << 20) + + PCIE_ECAM_DEVFN(devfn)); + + if (busnr_reg != target_bus_cfg->reg_val) { + dev_dbg(pcie->pci->dev, "Changing target bus busnum val from 0x%x to 0x%x\n", + target_bus_cfg->reg_val, busnr_reg); + target_bus_cfg->reg_val = busnr_reg; + al_pcie_target_bus_set(pcie, + target_bus_cfg->reg_val, + target_bus_cfg->reg_mask); + } + + return pci_base_addr; +} + +static int al_pcie_rd_other_conf(struct pcie_port *pp, struct pci_bus *bus, + unsigned int devfn, int where, int size, + u32 *val) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct al_pcie *pcie = to_al_pcie(pci); + unsigned int busnr = bus->number; + void __iomem *pci_addr; + int rc; + + pci_addr = al_pcie_conf_addr_map(pcie, busnr, devfn); + + rc = dw_pcie_read(pci_addr + where, size, val); + + dev_dbg(pci->dev, "%d-byte config read from %04x:%02x:%02x.%d offset 0x%x (pci_addr: 0x%px) - val:0x%x\n", + size, pci_domain_nr(bus), bus->number, + PCI_SLOT(devfn), PCI_FUNC(devfn), where, + (pci_addr + where), *val); + + return rc; +} + +static int al_pcie_wr_other_conf(struct pcie_port *pp, struct pci_bus *bus, + unsigned int devfn, int where, int size, + u32 val) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct al_pcie *pcie = to_al_pcie(pci); + unsigned int busnr = bus->number; + void __iomem *pci_addr; + int rc; + + pci_addr = al_pcie_conf_addr_map(pcie, busnr, devfn); + + rc = dw_pcie_write(pci_addr + where, size, val); + + dev_dbg(pci->dev, "%d-byte config write to %04x:%02x:%02x.%d offset 0x%x (pci_addr: 0x%px) - val:0x%x\n", + size, pci_domain_nr(bus), bus->number, + PCI_SLOT(devfn), PCI_FUNC(devfn), where, + (pci_addr + where), val); + + return rc; +} + +static void al_pcie_config_prepare(struct al_pcie *pcie) +{ + struct al_pcie_target_bus_cfg *target_bus_cfg; + struct pcie_port *pp = &pcie->pci->pp; + unsigned int ecam_bus_mask; + u32 cfg_control_offset; + u8 subordinate_bus; + u8 secondary_bus; + u32 cfg_control; + u32 reg; + + target_bus_cfg = &pcie->target_bus_cfg; + + ecam_bus_mask = (pcie->ecam_size >> 20) - 1; + if (ecam_bus_mask > 255) { + dev_warn(pcie->dev, "ECAM window size is larger than 256MB. 
Cutting off at 256\n"); + ecam_bus_mask = 255; + } + + /* This portion is taken from the transaction address */ + target_bus_cfg->ecam_mask = ecam_bus_mask; + /* This portion is taken from the cfg_target_bus reg */ + target_bus_cfg->reg_mask = ~target_bus_cfg->ecam_mask; + target_bus_cfg->reg_val = pp->busn->start & target_bus_cfg->reg_mask; + + al_pcie_target_bus_set(pcie, target_bus_cfg->reg_val, + target_bus_cfg->reg_mask); + + secondary_bus = pp->busn->start + 1; + subordinate_bus = pp->busn->end; + + /* Set the valid values of secondary and subordinate buses */ + cfg_control_offset = AXI_BASE_OFFSET + pcie->reg_offsets.ob_ctrl + + CFG_CONTROL; + + cfg_control = al_pcie_controller_readl(pcie, cfg_control_offset); + + reg = cfg_control & + ~(CFG_CONTROL_SEC_BUS_MASK | CFG_CONTROL_SUBBUS_MASK); + + reg |= FIELD_PREP(CFG_CONTROL_SUBBUS_MASK, subordinate_bus) | + FIELD_PREP(CFG_CONTROL_SEC_BUS_MASK, secondary_bus); + + al_pcie_controller_writel(pcie, cfg_control_offset, reg); +} + +static int al_pcie_host_init(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct al_pcie *pcie = to_al_pcie(pci); + int rc; + + rc = al_pcie_rev_id_get(pcie, &pcie->controller_rev_id); + if (rc) + return rc; + + rc = al_pcie_reg_offsets_set(pcie); + if (rc) + return rc; + + al_pcie_config_prepare(pcie); + + return 0; +} + +static const struct dw_pcie_host_ops al_pcie_host_ops = { + .rd_other_conf = al_pcie_rd_other_conf, + .wr_other_conf = al_pcie_wr_other_conf, + .host_init = al_pcie_host_init, +}; + +static int al_add_pcie_port(struct pcie_port *pp, + struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + int ret; + + pp->ops = &al_pcie_host_ops; + + ret = dw_pcie_host_init(pp); + if (ret) { + dev_err(dev, "failed to initialize host\n"); + return ret; + } + + return 0; +} + +static const struct dw_pcie_ops dw_pcie_ops = { +}; + +static int al_pcie_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct resource *controller_res; + struct resource *ecam_res; + struct resource *dbi_res; + struct al_pcie *al_pcie; + struct dw_pcie *pci; + + al_pcie = devm_kzalloc(dev, sizeof(*al_pcie), GFP_KERNEL); + if (!al_pcie) + return -ENOMEM; + + pci = devm_kzalloc(dev, sizeof(*pci), GFP_KERNEL); + if (!pci) + return -ENOMEM; + + pci->dev = dev; + pci->ops = &dw_pcie_ops; + + al_pcie->pci = pci; + al_pcie->dev = dev; + + dbi_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); + pci->dbi_base = devm_pci_remap_cfg_resource(dev, dbi_res); + if (IS_ERR(pci->dbi_base)) { + dev_err(dev, "couldn't remap dbi base %pR\n", dbi_res); + return PTR_ERR(pci->dbi_base); + } + + ecam_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "config"); + if (!ecam_res) { + dev_err(dev, "couldn't find 'config' reg in DT\n"); + return -ENOENT; + } + al_pcie->ecam_size = resource_size(ecam_res); + + controller_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "controller"); + al_pcie->controller_base = devm_ioremap_resource(dev, controller_res); + if (IS_ERR(al_pcie->controller_base)) { + dev_err(dev, "couldn't remap controller base %pR\n", + controller_res); + return PTR_ERR(al_pcie->controller_base); + } + + dev_dbg(dev, "From DT: dbi_base: %pR, controller_base: %pR\n", + dbi_res, controller_res); + + platform_set_drvdata(pdev, al_pcie); + + return al_add_pcie_port(&pci->pp, pdev); +} + +static const struct of_device_id al_pcie_of_match[] = { + { .compatible = "amazon,al-alpine-v2-pcie", + }, + { .compatible = "amazon,al-alpine-v3-pcie", + }, + {}, 
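+ /* no per-compatible data: the controller revision is read at runtime (see al_pcie_rev_id_get()) */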
+}; + +static struct platform_driver al_pcie_driver = { + .driver = { + .name = "al-pcie", + .of_match_table = al_pcie_of_match, + .suppress_bind_attrs = true, + }, + .probe = al_pcie_probe, +}; +builtin_platform_driver(al_pcie_driver); + +#endif /* CONFIG_PCIE_AL*/ diff --git a/drivers/pci/controller/dwc/pcie-armada8k.c b/drivers/pci/controller/dwc/pcie-armada8k.c index 3d55dc78d999..49596547e8c2 100644 --- a/drivers/pci/controller/dwc/pcie-armada8k.c +++ b/drivers/pci/controller/dwc/pcie-armada8k.c @@ -118,11 +118,10 @@ static int armada8k_pcie_setup_phys(struct armada8k_pcie *pcie) for (i = 0; i < ARMADA8K_PCIE_MAX_LANES; i++) { pcie->phy[i] = devm_of_phy_get_by_index(dev, node, i); - if (IS_ERR(pcie->phy[i]) && - (PTR_ERR(pcie->phy[i]) == -EPROBE_DEFER)) - return PTR_ERR(pcie->phy[i]); - if (IS_ERR(pcie->phy[i])) { + if (PTR_ERR(pcie->phy[i]) != -ENODEV) + return PTR_ERR(pcie->phy[i]); + pcie->phy[i] = NULL; continue; } diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 2bf5a35c0570..3dd2e2697294 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -40,39 +40,6 @@ void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar) __dw_pcie_ep_reset_bar(pci, bar, 0); } -static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie *pci, u8 cap_ptr, - u8 cap) -{ - u8 cap_id, next_cap_ptr; - u16 reg; - - if (!cap_ptr) - return 0; - - reg = dw_pcie_readw_dbi(pci, cap_ptr); - cap_id = (reg & 0x00ff); - - if (cap_id > PCI_CAP_ID_MAX) - return 0; - - if (cap_id == cap) - return cap_ptr; - - next_cap_ptr = (reg & 0xff00) >> 8; - return __dw_pcie_ep_find_next_cap(pci, next_cap_ptr, cap); -} - -static u8 dw_pcie_ep_find_capability(struct dw_pcie *pci, u8 cap) -{ - u8 next_cap_ptr; - u16 reg; - - reg = dw_pcie_readw_dbi(pci, PCI_CAPABILITY_LIST); - next_cap_ptr = (reg & 0x00ff); - - return __dw_pcie_ep_find_next_cap(pci, next_cap_ptr, cap); -} - static int dw_pcie_ep_write_header(struct pci_epc *epc, u8 func_no, struct pci_epf_header *hdr) { @@ -531,6 +498,7 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) int ret; u32 reg; void *addr; + u8 hdr_type; unsigned int nbars; unsigned int offset; struct pci_epc *epc; @@ -595,6 +563,13 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) if (ep->ops->ep_init) ep->ops->ep_init(ep); + hdr_type = dw_pcie_readb_dbi(pci, PCI_HEADER_TYPE); + if (hdr_type != PCI_HEADER_TYPE_NORMAL) { + dev_err(pci->dev, "PCIe controller is not set to EP mode (hdr_type:0x%x)!\n", + hdr_type); + return -EIO; + } + ret = of_property_read_u8(np, "max-functions", &epc->max_functions); if (ret < 0) epc->max_functions = 1; @@ -612,9 +587,9 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) dev_err(dev, "Failed to reserve memory for MSI/MSI-X\n"); return -ENOMEM; } - ep->msi_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSI); + ep->msi_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSI); - ep->msix_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSIX); + ep->msix_cap = dw_pcie_find_capability(pci, PCI_CAP_ID_MSIX); offset = dw_pcie_ep_find_ext_capability(pci, PCI_EXT_CAP_ID_REBAR); if (offset) { diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index f93252d0da5b..0f36a926059a 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -323,6 +323,7 @@ int dw_pcie_host_init(struct pcie_port *pp) struct pci_bus *child; struct pci_host_bridge *bridge; 
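+ /* hdr_type below caches PCI_HEADER_TYPE for the new bridge-mode sanity check */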
struct resource *cfg_res; + u32 hdr_type; int ret; raw_spin_lock_init(&pci->pp.lock); @@ -464,6 +465,21 @@ int dw_pcie_host_init(struct pcie_port *pp) goto err_free_msi; } + ret = dw_pcie_rd_own_conf(pp, PCI_HEADER_TYPE, 1, &hdr_type); + if (ret != PCIBIOS_SUCCESSFUL) { + dev_err(pci->dev, "Failed reading PCI_HEADER_TYPE cfg space reg (ret: 0x%x)\n", + ret); + ret = pcibios_err_to_errno(ret); + goto err_free_msi; + } + if (hdr_type != PCI_HEADER_TYPE_BRIDGE) { + dev_err(pci->dev, + "PCIe controller is not set to bridge type (hdr_type: 0x%x)!\n", + hdr_type); + ret = -EIO; + goto err_free_msi; + } + pp->root_bus_nr = pp->busn->start; bridge->dev.parent = dev; @@ -628,6 +644,12 @@ void dw_pcie_setup_rc(struct pcie_port *pp) u32 val, ctrl, num_ctrls; struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + /* + * Enable DBI read-only registers for writing/updating configuration. + * Write permission gets disabled towards the end of this function. + */ + dw_pcie_dbi_ro_wr_en(pci); + dw_pcie_setup(pci); if (!pp->ops->msi_host_init) { @@ -650,12 +672,10 @@ void dw_pcie_setup_rc(struct pcie_port *pp) dw_pcie_writel_dbi(pci, PCI_BASE_ADDRESS_1, 0x00000000); /* Setup interrupt pins */ - dw_pcie_dbi_ro_wr_en(pci); val = dw_pcie_readl_dbi(pci, PCI_INTERRUPT_LINE); val &= 0xffff00ff; val |= 0x00000100; dw_pcie_writel_dbi(pci, PCI_INTERRUPT_LINE, val); - dw_pcie_dbi_ro_wr_dis(pci); /* Setup bus numbers */ val = dw_pcie_readl_dbi(pci, PCI_PRIMARY_BUS); @@ -687,15 +707,13 @@ void dw_pcie_setup_rc(struct pcie_port *pp) dw_pcie_wr_own_conf(pp, PCI_BASE_ADDRESS_0, 4, 0); - /* Enable write permission for the DBI read-only register */ - dw_pcie_dbi_ro_wr_en(pci); /* Program correct class for RC */ dw_pcie_wr_own_conf(pp, PCI_CLASS_DEVICE, 2, PCI_CLASS_BRIDGE_PCI); - /* Better disable write permission right after the update */ - dw_pcie_dbi_ro_wr_dis(pci); dw_pcie_rd_own_conf(pp, PCIE_LINK_WIDTH_SPEED_CONTROL, 4, &val); val |= PORT_LOGIC_SPEED_CHANGE; dw_pcie_wr_own_conf(pp, PCIE_LINK_WIDTH_SPEED_CONTROL, 4, val); + + dw_pcie_dbi_ro_wr_dis(pci); } EXPORT_SYMBOL_GPL(dw_pcie_setup_rc); diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 7d25102c304c..820488dfeaed 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -14,6 +14,86 @@ #include "pcie-designware.h" +/* + * These interfaces resemble the pci_find_*capability() interfaces, but these + * are for configuring host controllers, which are bridges *to* PCI devices but + * are not PCI devices themselves. 
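+ * The accesses go through the DBI interface rather than the standard PCI + * config accessors, walking the controller's own capability chain.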
+ */ +static u8 __dw_pcie_find_next_cap(struct dw_pcie *pci, u8 cap_ptr, + u8 cap) +{ + u8 cap_id, next_cap_ptr; + u16 reg; + + if (!cap_ptr) + return 0; + + reg = dw_pcie_readw_dbi(pci, cap_ptr); + cap_id = (reg & 0x00ff); + + if (cap_id > PCI_CAP_ID_MAX) + return 0; + + if (cap_id == cap) + return cap_ptr; + + next_cap_ptr = (reg & 0xff00) >> 8; + return __dw_pcie_find_next_cap(pci, next_cap_ptr, cap); +} + +u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap) +{ + u8 next_cap_ptr; + u16 reg; + + reg = dw_pcie_readw_dbi(pci, PCI_CAPABILITY_LIST); + next_cap_ptr = (reg & 0x00ff); + + return __dw_pcie_find_next_cap(pci, next_cap_ptr, cap); +} +EXPORT_SYMBOL_GPL(dw_pcie_find_capability); + +static u16 dw_pcie_find_next_ext_capability(struct dw_pcie *pci, u16 start, + u8 cap) +{ + u32 header; + int ttl; + int pos = PCI_CFG_SPACE_SIZE; + + /* minimum 8 bytes per capability */ + ttl = (PCI_CFG_SPACE_EXP_SIZE - PCI_CFG_SPACE_SIZE) / 8; + + if (start) + pos = start; + + header = dw_pcie_readl_dbi(pci, pos); + /* + * If we have no capabilities, this is indicated by cap ID, + * cap version and next pointer all being 0. + */ + if (header == 0) + return 0; + + while (ttl-- > 0) { + if (PCI_EXT_CAP_ID(header) == cap && pos != start) + return pos; + + pos = PCI_EXT_CAP_NEXT(header); + if (pos < PCI_CFG_SPACE_SIZE) + break; + + header = dw_pcie_readl_dbi(pci, pos); + } + + return 0; +} + +u16 dw_pcie_find_ext_capability(struct dw_pcie *pci, u8 cap) +{ + return dw_pcie_find_next_ext_capability(pci, 0, cap); +} +EXPORT_SYMBOL_GPL(dw_pcie_find_ext_capability); + int dw_pcie_read(void __iomem *addr, int size, u32 *val) { if (!IS_ALIGNED((uintptr_t)addr, size)) { @@ -376,10 +456,11 @@ int dw_pcie_wait_for_link(struct dw_pcie *pci) usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX); } - dev_err(pci->dev, "Phy link never came up\n"); + dev_info(pci->dev, "Phy link never came up\n"); return -ETIMEDOUT; } +EXPORT_SYMBOL_GPL(dw_pcie_wait_for_link); int dw_pcie_link_up(struct dw_pcie *pci) { @@ -423,8 +504,10 @@ void dw_pcie_setup(struct dw_pcie *pci) ret = of_property_read_u32(np, "num-lanes", &lanes); - if (ret) - lanes = 0; + if (ret) { + dev_dbg(pci->dev, "property num-lanes isn't found\n"); + return; + } /* Set the number of lanes */ val = dw_pcie_readl_dbi(pci, PCIE_PORT_LINK_CONTROL); @@ -466,4 +549,11 @@ void dw_pcie_setup(struct dw_pcie *pci) break; } dw_pcie_writel_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL, val); + + if (of_property_read_bool(np, "snps,enable-cdm-check")) { + val = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS); + val |= PCIE_PL_CHK_REG_CHK_REG_CONTINUOUS | + PCIE_PL_CHK_REG_CHK_REG_START; + dw_pcie_writel_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS, val); + } } diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index ffed084a0b4f..5a18e94e52c8 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -86,6 +86,15 @@ #define PCIE_MISC_CONTROL_1_OFF 0x8BC #define PCIE_DBI_RO_WR_EN BIT(0) +#define PCIE_PL_CHK_REG_CONTROL_STATUS 0xB20 +#define PCIE_PL_CHK_REG_CHK_REG_START BIT(0) +#define PCIE_PL_CHK_REG_CHK_REG_CONTINUOUS BIT(1) +#define PCIE_PL_CHK_REG_CHK_REG_COMPARISON_ERROR BIT(16) +#define PCIE_PL_CHK_REG_CHK_REG_LOGIC_ERROR BIT(17) +#define PCIE_PL_CHK_REG_CHK_REG_COMPLETE BIT(18) + +#define PCIE_PL_CHK_REG_ERR_ADDR 0xB28 + /* * iATU Unroll-specific register definitions * From 4.80 core version the address translation will be made by unroll @@ -251,6 +260,9 @@ struct 
dw_pcie { #define to_dw_pcie_from_ep(endpoint) \ container_of((endpoint), struct dw_pcie, ep) +u8 dw_pcie_find_capability(struct dw_pcie *pci, u8 cap); +u16 dw_pcie_find_ext_capability(struct dw_pcie *pci, u8 cap); + int dw_pcie_read(void __iomem *addr, int size, u32 *val); int dw_pcie_write(void __iomem *addr, int size, u32 val); diff --git a/drivers/pci/controller/dwc/pcie-histb.c b/drivers/pci/controller/dwc/pcie-histb.c index 954bc2b74bbc..811b5c6d62ea 100644 --- a/drivers/pci/controller/dwc/pcie-histb.c +++ b/drivers/pci/controller/dwc/pcie-histb.c @@ -340,8 +340,8 @@ static int histb_pcie_probe(struct platform_device *pdev) hipcie->vpcie = devm_regulator_get_optional(dev, "vpcie"); if (IS_ERR(hipcie->vpcie)) { - if (PTR_ERR(hipcie->vpcie) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(hipcie->vpcie) != -ENODEV) + return PTR_ERR(hipcie->vpcie); hipcie->vpcie = NULL; } diff --git a/drivers/pci/controller/dwc/pcie-kirin.c b/drivers/pci/controller/dwc/pcie-kirin.c index 8df1914226be..c19617a912bd 100644 --- a/drivers/pci/controller/dwc/pcie-kirin.c +++ b/drivers/pci/controller/dwc/pcie-kirin.c @@ -436,7 +436,7 @@ static int kirin_pcie_host_init(struct pcie_port *pp) return 0; } -static struct dw_pcie_ops kirin_dw_pcie_ops = { +static const struct dw_pcie_ops kirin_dw_pcie_ops = { .read_dbi = kirin_pcie_read_dbi, .write_dbi = kirin_pcie_write_dbi, .link_up = kirin_pcie_link_up, diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c new file mode 100644 index 000000000000..f89f5acee72d --- /dev/null +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -0,0 +1,1732 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * PCIe host controller driver for Tegra194 SoC + * + * Copyright (C) 2019 NVIDIA Corporation. 
+ * + * Author: Vidya Sagar <vidyas@nvidia.com> + */ + +#include <linux/clk.h> +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/gpio.h> +#include <linux/interrupt.h> +#include <linux/iopoll.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_device.h> +#include <linux/of_gpio.h> +#include <linux/of_irq.h> +#include <linux/of_pci.h> +#include <linux/pci.h> +#include <linux/phy/phy.h> +#include <linux/pinctrl/consumer.h> +#include <linux/platform_device.h> +#include <linux/pm_runtime.h> +#include <linux/random.h> +#include <linux/reset.h> +#include <linux/resource.h> +#include <linux/types.h> +#include "pcie-designware.h" +#include <soc/tegra/bpmp.h> +#include <soc/tegra/bpmp-abi.h> +#include "../../pci.h" + +#define APPL_PINMUX 0x0 +#define APPL_PINMUX_PEX_RST BIT(0) +#define APPL_PINMUX_CLKREQ_OVERRIDE_EN BIT(2) +#define APPL_PINMUX_CLKREQ_OVERRIDE BIT(3) +#define APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE_EN BIT(4) +#define APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE BIT(5) +#define APPL_PINMUX_CLKREQ_OUT_OVRD_EN BIT(9) +#define APPL_PINMUX_CLKREQ_OUT_OVRD BIT(10) + +#define APPL_CTRL 0x4 +#define APPL_CTRL_SYS_PRE_DET_STATE BIT(6) +#define APPL_CTRL_LTSSM_EN BIT(7) +#define APPL_CTRL_HW_HOT_RST_EN BIT(20) +#define APPL_CTRL_HW_HOT_RST_MODE_MASK GENMASK(1, 0) +#define APPL_CTRL_HW_HOT_RST_MODE_SHIFT 22 +#define APPL_CTRL_HW_HOT_RST_MODE_IMDT_RST 0x1 + +#define APPL_INTR_EN_L0_0 0x8 +#define APPL_INTR_EN_L0_0_LINK_STATE_INT_EN BIT(0) +#define APPL_INTR_EN_L0_0_MSI_RCV_INT_EN BIT(4) +#define APPL_INTR_EN_L0_0_INT_INT_EN BIT(8) +#define APPL_INTR_EN_L0_0_CDM_REG_CHK_INT_EN BIT(19) +#define APPL_INTR_EN_L0_0_SYS_INTR_EN BIT(30) +#define APPL_INTR_EN_L0_0_SYS_MSI_INTR_EN BIT(31) + +#define APPL_INTR_STATUS_L0 0xC +#define APPL_INTR_STATUS_L0_LINK_STATE_INT BIT(0) +#define APPL_INTR_STATUS_L0_INT_INT BIT(8) +#define APPL_INTR_STATUS_L0_CDM_REG_CHK_INT BIT(18) + +#define APPL_INTR_EN_L1_0_0 0x1C +#define APPL_INTR_EN_L1_0_0_LINK_REQ_RST_NOT_INT_EN BIT(1) + +#define APPL_INTR_STATUS_L1_0_0 0x20 +#define APPL_INTR_STATUS_L1_0_0_LINK_REQ_RST_NOT_CHGED BIT(1) + +#define APPL_INTR_STATUS_L1_1 0x2C +#define APPL_INTR_STATUS_L1_2 0x30 +#define APPL_INTR_STATUS_L1_3 0x34 +#define APPL_INTR_STATUS_L1_6 0x3C +#define APPL_INTR_STATUS_L1_7 0x40 + +#define APPL_INTR_EN_L1_8_0 0x44 +#define APPL_INTR_EN_L1_8_BW_MGT_INT_EN BIT(2) +#define APPL_INTR_EN_L1_8_AUTO_BW_INT_EN BIT(3) +#define APPL_INTR_EN_L1_8_INTX_EN BIT(11) +#define APPL_INTR_EN_L1_8_AER_INT_EN BIT(15) + +#define APPL_INTR_STATUS_L1_8_0 0x4C +#define APPL_INTR_STATUS_L1_8_0_EDMA_INT_MASK GENMASK(11, 6) +#define APPL_INTR_STATUS_L1_8_0_BW_MGT_INT_STS BIT(2) +#define APPL_INTR_STATUS_L1_8_0_AUTO_BW_INT_STS BIT(3) + +#define APPL_INTR_STATUS_L1_9 0x54 +#define APPL_INTR_STATUS_L1_10 0x58 +#define APPL_INTR_STATUS_L1_11 0x64 +#define APPL_INTR_STATUS_L1_13 0x74 +#define APPL_INTR_STATUS_L1_14 0x78 +#define APPL_INTR_STATUS_L1_15 0x7C +#define APPL_INTR_STATUS_L1_17 0x88 + +#define APPL_INTR_EN_L1_18 0x90 +#define APPL_INTR_EN_L1_18_CDM_REG_CHK_CMPLT BIT(2) +#define APPL_INTR_EN_L1_18_CDM_REG_CHK_CMP_ERR BIT(1) +#define APPL_INTR_EN_L1_18_CDM_REG_CHK_LOGIC_ERR BIT(0) + +#define APPL_INTR_STATUS_L1_18 0x94 +#define APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMPLT BIT(2) +#define APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMP_ERR BIT(1) +#define APPL_INTR_STATUS_L1_18_CDM_REG_CHK_LOGIC_ERR BIT(0) + +#define APPL_MSI_CTRL_2 0xB0 + +#define APPL_LTR_MSG_1 0xC4 +#define LTR_MSG_REQ BIT(15) +#define LTR_MST_NO_SNOOP_SHIFT 16 + +#define APPL_LTR_MSG_2 0xC8 +#define APPL_LTR_MSG_2_LTR_MSG_REQ_STATE BIT(3) + +#define APPL_LINK_STATUS 0xCC +#define APPL_LINK_STATUS_RDLH_LINK_UP BIT(0) + +#define APPL_DEBUG 0xD0 +#define APPL_DEBUG_PM_LINKST_IN_L2_LAT BIT(21) +#define APPL_DEBUG_PM_LINKST_IN_L0 0x11 +#define APPL_DEBUG_LTSSM_STATE_MASK GENMASK(8, 3) +#define APPL_DEBUG_LTSSM_STATE_SHIFT 3 +#define LTSSM_STATE_PRE_DETECT 5 + +#define APPL_RADM_STATUS 0xE4 +#define 
APPL_PM_XMT_TURNOFF_STATE BIT(0) + +#define APPL_DM_TYPE 0x100 +#define APPL_DM_TYPE_MASK GENMASK(3, 0) +#define APPL_DM_TYPE_RP 0x4 +#define APPL_DM_TYPE_EP 0x0 + +#define APPL_CFG_BASE_ADDR 0x104 +#define APPL_CFG_BASE_ADDR_MASK GENMASK(31, 12) + +#define APPL_CFG_IATU_DMA_BASE_ADDR 0x108 +#define APPL_CFG_IATU_DMA_BASE_ADDR_MASK GENMASK(31, 18) + +#define APPL_CFG_MISC 0x110 +#define APPL_CFG_MISC_SLV_EP_MODE BIT(14) +#define APPL_CFG_MISC_ARCACHE_MASK GENMASK(13, 10) +#define APPL_CFG_MISC_ARCACHE_SHIFT 10 +#define APPL_CFG_MISC_ARCACHE_VAL 3 + +#define APPL_CFG_SLCG_OVERRIDE 0x114 +#define APPL_CFG_SLCG_OVERRIDE_SLCG_EN_MASTER BIT(0) + +#define APPL_CAR_RESET_OVRD 0x12C +#define APPL_CAR_RESET_OVRD_CYA_OVERRIDE_CORE_RST_N BIT(0) + +#define IO_BASE_IO_DECODE BIT(0) +#define IO_BASE_IO_DECODE_BIT8 BIT(8) + +#define CFG_PREF_MEM_LIMIT_BASE_MEM_DECODE BIT(0) +#define CFG_PREF_MEM_LIMIT_BASE_MEM_LIMIT_DECODE BIT(16) + +#define CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF 0x718 +#define CFG_TIMER_CTRL_ACK_NAK_SHIFT (19) + +#define EVENT_COUNTER_ALL_CLEAR 0x3 +#define EVENT_COUNTER_ENABLE_ALL 0x7 +#define EVENT_COUNTER_ENABLE_SHIFT 2 +#define EVENT_COUNTER_EVENT_SEL_MASK GENMASK(7, 0) +#define EVENT_COUNTER_EVENT_SEL_SHIFT 16 +#define EVENT_COUNTER_EVENT_Tx_L0S 0x2 +#define EVENT_COUNTER_EVENT_Rx_L0S 0x3 +#define EVENT_COUNTER_EVENT_L1 0x5 +#define EVENT_COUNTER_EVENT_L1_1 0x7 +#define EVENT_COUNTER_EVENT_L1_2 0x8 +#define EVENT_COUNTER_GROUP_SEL_SHIFT 24 +#define EVENT_COUNTER_GROUP_5 0x5 + +#define PORT_LOGIC_ACK_F_ASPM_CTRL 0x70C +#define ENTER_ASPM BIT(30) +#define L0S_ENTRANCE_LAT_SHIFT 24 +#define L0S_ENTRANCE_LAT_MASK GENMASK(26, 24) +#define L1_ENTRANCE_LAT_SHIFT 27 +#define L1_ENTRANCE_LAT_MASK GENMASK(29, 27) +#define N_FTS_SHIFT 8 +#define N_FTS_MASK GENMASK(7, 0) +#define N_FTS_VAL 52 + +#define PORT_LOGIC_GEN2_CTRL 0x80C +#define PORT_LOGIC_GEN2_CTRL_DIRECT_SPEED_CHANGE BIT(17) +#define FTS_MASK GENMASK(7, 0) +#define FTS_VAL 52 + +#define PORT_LOGIC_MSI_CTRL_INT_0_EN 0x828 + +#define GEN3_EQ_CONTROL_OFF 0x8a8 +#define GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_SHIFT 8 +#define GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_MASK GENMASK(23, 8) +#define GEN3_EQ_CONTROL_OFF_FB_MODE_MASK GENMASK(3, 0) + +#define GEN3_RELATED_OFF 0x890 +#define GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL BIT(0) +#define GEN3_RELATED_OFF_GEN3_EQ_DISABLE BIT(16) +#define GEN3_RELATED_OFF_RATE_SHADOW_SEL_SHIFT 24 +#define GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK GENMASK(25, 24) + +#define PORT_LOGIC_AMBA_ERROR_RESPONSE_DEFAULT 0x8D0 +#define AMBA_ERROR_RESPONSE_CRS_SHIFT 3 +#define AMBA_ERROR_RESPONSE_CRS_MASK GENMASK(1, 0) +#define AMBA_ERROR_RESPONSE_CRS_OKAY 0 +#define AMBA_ERROR_RESPONSE_CRS_OKAY_FFFFFFFF 1 +#define AMBA_ERROR_RESPONSE_CRS_OKAY_FFFF0001 2 + +#define PORT_LOGIC_MSIX_DOORBELL 0x948 + +#define CAP_SPCIE_CAP_OFF 0x154 +#define CAP_SPCIE_CAP_OFF_DSP_TX_PRESET0_MASK GENMASK(3, 0) +#define CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_MASK GENMASK(11, 8) +#define CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_SHIFT 8 + +#define PME_ACK_TIMEOUT 10000 + +#define LTSSM_TIMEOUT 50000 /* 50ms */ + +#define GEN3_GEN4_EQ_PRESET_INIT 5 + +#define GEN1_CORE_CLK_FREQ 62500000 +#define GEN2_CORE_CLK_FREQ 125000000 +#define GEN3_CORE_CLK_FREQ 250000000 +#define GEN4_CORE_CLK_FREQ 500000000 + +static const unsigned int pcie_gen_freq[] = { + GEN1_CORE_CLK_FREQ, + GEN2_CORE_CLK_FREQ, + GEN3_CORE_CLK_FREQ, + GEN4_CORE_CLK_FREQ +}; + +static const u32 event_cntr_ctrl_offset[] = { + 0x1d8, + 0x1a8, + 0x1a8, + 0x1a8, + 0x1c4, + 0x1d8 +}; + +static const u32 
event_cntr_data_offset[] = { + 0x1dc, + 0x1ac, + 0x1ac, + 0x1ac, + 0x1c8, + 0x1dc +}; + +struct tegra_pcie_dw { + struct device *dev; + struct resource *appl_res; + struct resource *dbi_res; + struct resource *atu_dma_res; + void __iomem *appl_base; + struct clk *core_clk; + struct reset_control *core_apb_rst; + struct reset_control *core_rst; + struct dw_pcie pci; + struct tegra_bpmp *bpmp; + + bool supports_clkreq; + bool enable_cdm_check; + bool link_state; + bool update_fc_fixup; + u8 init_link_width; + u32 msi_ctrl_int; + u32 num_lanes; + u32 max_speed; + u32 cid; + u32 cfg_link_cap_l1sub; + u32 pcie_cap_base; + u32 aspm_cmrt; + u32 aspm_pwr_on_t; + u32 aspm_l0s_enter_lat; + + struct regulator *pex_ctl_supply; + struct regulator *slot_ctl_3v3; + struct regulator *slot_ctl_12v; + + unsigned int phy_count; + struct phy **phys; + + struct dentry *debugfs; +}; + +static inline struct tegra_pcie_dw *to_tegra_pcie(struct dw_pcie *pci) +{ + return container_of(pci, struct tegra_pcie_dw, pci); +} + +static inline void appl_writel(struct tegra_pcie_dw *pcie, const u32 value, + const u32 reg) +{ + writel_relaxed(value, pcie->appl_base + reg); +} + +static inline u32 appl_readl(struct tegra_pcie_dw *pcie, const u32 reg) +{ + return readl_relaxed(pcie->appl_base + reg); +} + +struct tegra_pcie_soc { + enum dw_pcie_device_mode mode; +}; + +static void apply_bad_link_workaround(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 current_link_width; + u16 val; + + /* + * NOTE:- Since this scenario is uncommon and link as such is not + * stable anyway, not waiting to confirm if link is really + * transitioning to Gen-2 speed + */ + val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA); + if (val & PCI_EXP_LNKSTA_LBMS) { + current_link_width = (val & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + if (pcie->init_link_width > current_link_width) { + dev_warn(pci->dev, "PCIe link is bad, width reduced\n"); + val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL2); + val &= ~PCI_EXP_LNKCTL2_TLS; + val |= PCI_EXP_LNKCTL2_TLS_2_5GT; + dw_pcie_writew_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL2, val); + + val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL); + val |= PCI_EXP_LNKCTL_RL; + dw_pcie_writew_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL, val); + } + } +} + +static irqreturn_t tegra_pcie_rp_irq_handler(struct tegra_pcie_dw *pcie) +{ + struct dw_pcie *pci = &pcie->pci; + struct pcie_port *pp = &pci->pp; + u32 val, tmp; + u16 val_w; + + val = appl_readl(pcie, APPL_INTR_STATUS_L0); + if (val & APPL_INTR_STATUS_L0_LINK_STATE_INT) { + val = appl_readl(pcie, APPL_INTR_STATUS_L1_0_0); + if (val & APPL_INTR_STATUS_L1_0_0_LINK_REQ_RST_NOT_CHGED) { + appl_writel(pcie, val, APPL_INTR_STATUS_L1_0_0); + + /* SBR & Surprise Link Down WAR */ + val = appl_readl(pcie, APPL_CAR_RESET_OVRD); + val &= ~APPL_CAR_RESET_OVRD_CYA_OVERRIDE_CORE_RST_N; + appl_writel(pcie, val, APPL_CAR_RESET_OVRD); + udelay(1); + val = appl_readl(pcie, APPL_CAR_RESET_OVRD); + val |= APPL_CAR_RESET_OVRD_CYA_OVERRIDE_CORE_RST_N; + appl_writel(pcie, val, APPL_CAR_RESET_OVRD); + + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_GEN2_CTRL); + val |= PORT_LOGIC_GEN2_CTRL_DIRECT_SPEED_CHANGE; + dw_pcie_writel_dbi(pci, PORT_LOGIC_GEN2_CTRL, val); + } + } + + if (val & APPL_INTR_STATUS_L0_INT_INT) { + val = appl_readl(pcie, APPL_INTR_STATUS_L1_8_0); + if (val & APPL_INTR_STATUS_L1_8_0_AUTO_BW_INT_STS) { + appl_writel(pcie, + 
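+ /* acknowledge by writing the status bit back (write-one-to-clear) */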
APPL_INTR_STATUS_L1_8_0_AUTO_BW_INT_STS, + APPL_INTR_STATUS_L1_8_0); + apply_bad_link_workaround(pp); + } + if (val & APPL_INTR_STATUS_L1_8_0_BW_MGT_INT_STS) { + appl_writel(pcie, + APPL_INTR_STATUS_L1_8_0_BW_MGT_INT_STS, + APPL_INTR_STATUS_L1_8_0); + + val_w = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKSTA); + dev_dbg(pci->dev, "Link Speed: Gen-%u\n", val_w & + PCI_EXP_LNKSTA_CLS); + } + } + + val = appl_readl(pcie, APPL_INTR_STATUS_L0); + if (val & APPL_INTR_STATUS_L0_CDM_REG_CHK_INT) { + val = appl_readl(pcie, APPL_INTR_STATUS_L1_18); + tmp = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS); + if (val & APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMPLT) { + dev_info(pci->dev, "CDM check complete\n"); + tmp |= PCIE_PL_CHK_REG_CHK_REG_COMPLETE; + } + if (val & APPL_INTR_STATUS_L1_18_CDM_REG_CHK_CMP_ERR) { + dev_err(pci->dev, "CDM comparison mismatch\n"); + tmp |= PCIE_PL_CHK_REG_CHK_REG_COMPARISON_ERROR; + } + if (val & APPL_INTR_STATUS_L1_18_CDM_REG_CHK_LOGIC_ERR) { + dev_err(pci->dev, "CDM Logic error\n"); + tmp |= PCIE_PL_CHK_REG_CHK_REG_LOGIC_ERROR; + } + dw_pcie_writel_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS, tmp); + tmp = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_ERR_ADDR); + dev_err(pci->dev, "CDM Error Address Offset = 0x%08X\n", tmp); + } + + return IRQ_HANDLED; +} + +static irqreturn_t tegra_pcie_irq_handler(int irq, void *arg) +{ + struct tegra_pcie_dw *pcie = arg; + + return tegra_pcie_rp_irq_handler(pcie); +} + +static int tegra_pcie_dw_rd_own_conf(struct pcie_port *pp, int where, int size, + u32 *val) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + + /* + * This is an endpoint-mode-specific register that happens to appear + * even when the controller is operating in root port mode, and the + * system hangs when it is accessed with the link in the ASPM-L1 state. + * So skip accessing it altogether + */ + if (where == PORT_LOGIC_MSIX_DOORBELL) { + *val = 0x00000000; + return PCIBIOS_SUCCESSFUL; + } + + return dw_pcie_read(pci->dbi_base + where, size, val); +} + +static int tegra_pcie_dw_wr_own_conf(struct pcie_port *pp, int where, int size, + u32 val) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + + /* + * This is an endpoint-mode-specific register that happens to appear + * even when the controller is operating in root port mode, and the + * system hangs when it is accessed with the link in the ASPM-L1 state. 
+ * So skip accessing it altogether + */ + if (where == PORT_LOGIC_MSIX_DOORBELL) + return PCIBIOS_SUCCESSFUL; + + return dw_pcie_write(pci->dbi_base + where, size, val); +} + +#if defined(CONFIG_PCIEASPM) +static void disable_aspm_l11(struct tegra_pcie_dw *pcie) +{ + u32 val; + + val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub); + val &= ~PCI_L1SS_CAP_ASPM_L1_1; + dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val); +} + +static void disable_aspm_l12(struct tegra_pcie_dw *pcie) +{ + u32 val; + + val = dw_pcie_readl_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub); + val &= ~PCI_L1SS_CAP_ASPM_L1_2; + dw_pcie_writel_dbi(&pcie->pci, pcie->cfg_link_cap_l1sub, val); +} + +static inline u32 event_counter_prog(struct tegra_pcie_dw *pcie, u32 event) +{ + u32 val; + + val = dw_pcie_readl_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid]); + val &= ~(EVENT_COUNTER_EVENT_SEL_MASK << EVENT_COUNTER_EVENT_SEL_SHIFT); + val |= EVENT_COUNTER_GROUP_5 << EVENT_COUNTER_GROUP_SEL_SHIFT; + val |= event << EVENT_COUNTER_EVENT_SEL_SHIFT; + val |= EVENT_COUNTER_ENABLE_ALL << EVENT_COUNTER_ENABLE_SHIFT; + dw_pcie_writel_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid], val); + val = dw_pcie_readl_dbi(&pcie->pci, event_cntr_data_offset[pcie->cid]); + + return val; +} + +static int aspm_state_cnt(struct seq_file *s, void *data) +{ + struct tegra_pcie_dw *pcie = (struct tegra_pcie_dw *) + dev_get_drvdata(s->private); + u32 val; + + seq_printf(s, "Tx L0s entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_Tx_L0S)); + + seq_printf(s, "Rx L0s entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_Rx_L0S)); + + seq_printf(s, "Link L1 entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_L1)); + + seq_printf(s, "Link L1.1 entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_L1_1)); + + seq_printf(s, "Link L1.2 entry count : %u\n", + event_counter_prog(pcie, EVENT_COUNTER_EVENT_L1_2)); + + /* Clear all counters */ + dw_pcie_writel_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid], + EVENT_COUNTER_ALL_CLEAR); + + /* Re-enable counting */ + val = EVENT_COUNTER_ENABLE_ALL << EVENT_COUNTER_ENABLE_SHIFT; + val |= EVENT_COUNTER_GROUP_5 << EVENT_COUNTER_GROUP_SEL_SHIFT; + dw_pcie_writel_dbi(&pcie->pci, event_cntr_ctrl_offset[pcie->cid], val); + + return 0; +} + +static void init_host_aspm(struct tegra_pcie_dw *pcie) +{ + struct dw_pcie *pci = &pcie->pci; + u32 val; + + val = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_L1SS); + pcie->cfg_link_cap_l1sub = val + PCI_L1SS_CAP; + + /* Enable ASPM counters */ + val = EVENT_COUNTER_ENABLE_ALL << EVENT_COUNTER_ENABLE_SHIFT; + val |= EVENT_COUNTER_GROUP_5 << EVENT_COUNTER_GROUP_SEL_SHIFT; + dw_pcie_writel_dbi(pci, event_cntr_ctrl_offset[pcie->cid], val); + + /* Program T_cmrt and T_pwr_on values */ + val = dw_pcie_readl_dbi(pci, pcie->cfg_link_cap_l1sub); + val &= ~(PCI_L1SS_CAP_CM_RESTORE_TIME | PCI_L1SS_CAP_P_PWR_ON_VALUE); + val |= (pcie->aspm_cmrt << 8); + val |= (pcie->aspm_pwr_on_t << 19); + dw_pcie_writel_dbi(pci, pcie->cfg_link_cap_l1sub, val); + + /* Program L0s and L1 entrance latencies */ + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL); + val &= ~L0S_ENTRANCE_LAT_MASK; + val |= (pcie->aspm_l0s_enter_lat << L0S_ENTRANCE_LAT_SHIFT); + val |= ENTER_ASPM; + dw_pcie_writel_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL, val); +} + +static int init_debugfs(struct tegra_pcie_dw *pcie) +{ + struct dentry *d; + + d = debugfs_create_devm_seqfile(pcie->dev, "aspm_state_cnt", + pcie->debugfs, aspm_state_cnt); + 
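+ /* the seq file is device-managed (devm), so no explicit debugfs cleanup is needed */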
if (IS_ERR_OR_NULL(d)) + dev_err(pcie->dev, + "Failed to create debugfs file \"aspm_state_cnt\"\n"); + + return 0; +} +#else +static inline void disable_aspm_l12(struct tegra_pcie_dw *pcie) { return; } +static inline void disable_aspm_l11(struct tegra_pcie_dw *pcie) { return; } +static inline void init_host_aspm(struct tegra_pcie_dw *pcie) { return; } +static inline int init_debugfs(struct tegra_pcie_dw *pcie) { return 0; } +#endif + +static void tegra_pcie_enable_system_interrupts(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val; + u16 val_w; + + val = appl_readl(pcie, APPL_INTR_EN_L0_0); + val |= APPL_INTR_EN_L0_0_LINK_STATE_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L0_0); + + val = appl_readl(pcie, APPL_INTR_EN_L1_0_0); + val |= APPL_INTR_EN_L1_0_0_LINK_REQ_RST_NOT_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L1_0_0); + + if (pcie->enable_cdm_check) { + val = appl_readl(pcie, APPL_INTR_EN_L0_0); + val |= APPL_INTR_EN_L0_0_CDM_REG_CHK_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L0_0); + + val = appl_readl(pcie, APPL_INTR_EN_L1_18); + val |= APPL_INTR_EN_L1_18_CDM_REG_CHK_CMP_ERR; + val |= APPL_INTR_EN_L1_18_CDM_REG_CHK_LOGIC_ERR; + appl_writel(pcie, val, APPL_INTR_EN_L1_18); + } + + val_w = dw_pcie_readw_dbi(&pcie->pci, pcie->pcie_cap_base + + PCI_EXP_LNKSTA); + pcie->init_link_width = (val_w & PCI_EXP_LNKSTA_NLW) >> + PCI_EXP_LNKSTA_NLW_SHIFT; + + val_w = dw_pcie_readw_dbi(&pcie->pci, pcie->pcie_cap_base + + PCI_EXP_LNKCTL); + val_w |= PCI_EXP_LNKCTL_LBMIE; + dw_pcie_writew_dbi(&pcie->pci, pcie->pcie_cap_base + PCI_EXP_LNKCTL, + val_w); +} + +static void tegra_pcie_enable_legacy_interrupts(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val; + + /* Enable legacy interrupt generation */ + val = appl_readl(pcie, APPL_INTR_EN_L0_0); + val |= APPL_INTR_EN_L0_0_SYS_INTR_EN; + val |= APPL_INTR_EN_L0_0_INT_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L0_0); + + val = appl_readl(pcie, APPL_INTR_EN_L1_8_0); + val |= APPL_INTR_EN_L1_8_INTX_EN; + val |= APPL_INTR_EN_L1_8_AUTO_BW_INT_EN; + val |= APPL_INTR_EN_L1_8_BW_MGT_INT_EN; + if (IS_ENABLED(CONFIG_PCIEAER)) + val |= APPL_INTR_EN_L1_8_AER_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L1_8_0); +} + +static void tegra_pcie_enable_msi_interrupts(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val; + + dw_pcie_msi_init(pp); + + /* Enable MSI interrupt generation */ + val = appl_readl(pcie, APPL_INTR_EN_L0_0); + val |= APPL_INTR_EN_L0_0_SYS_MSI_INTR_EN; + val |= APPL_INTR_EN_L0_0_MSI_RCV_INT_EN; + appl_writel(pcie, val, APPL_INTR_EN_L0_0); +} + +static void tegra_pcie_enable_interrupts(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + + /* Clear interrupt statuses before enabling interrupts */ + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L0); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_0_0); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_1); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_2); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_3); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_6); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_7); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_8_0); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_9); + appl_writel(pcie, 0xFFFFFFFF, 
APPL_INTR_STATUS_L1_10); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_11); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_13); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_14); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_15); + appl_writel(pcie, 0xFFFFFFFF, APPL_INTR_STATUS_L1_17); + + tegra_pcie_enable_system_interrupts(pp); + tegra_pcie_enable_legacy_interrupts(pp); + if (IS_ENABLED(CONFIG_PCI_MSI)) + tegra_pcie_enable_msi_interrupts(pp); +} + +static void config_gen3_gen4_eq_presets(struct tegra_pcie_dw *pcie) +{ + struct dw_pcie *pci = &pcie->pci; + u32 val, offset, i; + + /* Program init preset */ + for (i = 0; i < pcie->num_lanes; i++) { + dw_pcie_read(pci->dbi_base + CAP_SPCIE_CAP_OFF + + (i * 2), 2, &val); + val &= ~CAP_SPCIE_CAP_OFF_DSP_TX_PRESET0_MASK; + val |= GEN3_GEN4_EQ_PRESET_INIT; + val &= ~CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_MASK; + val |= (GEN3_GEN4_EQ_PRESET_INIT << + CAP_SPCIE_CAP_OFF_USP_TX_PRESET0_SHIFT); + dw_pcie_write(pci->dbi_base + CAP_SPCIE_CAP_OFF + + (i * 2), 2, val); + + offset = dw_pcie_find_ext_capability(pci, + PCI_EXT_CAP_ID_PL_16GT) + + PCI_PL_16GT_LE_CTRL; + dw_pcie_read(pci->dbi_base + offset + i, 1, &val); + val &= ~PCI_PL_16GT_LE_CTRL_DSP_TX_PRESET_MASK; + val |= GEN3_GEN4_EQ_PRESET_INIT; + val &= ~PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK; + val |= (GEN3_GEN4_EQ_PRESET_INIT << + PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT); + dw_pcie_write(pci->dbi_base + offset + i, 1, val); + } + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val &= ~GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK; + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); + + val = dw_pcie_readl_dbi(pci, GEN3_EQ_CONTROL_OFF); + val &= ~GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_MASK; + val |= (0x3ff << GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_SHIFT); + val &= ~GEN3_EQ_CONTROL_OFF_FB_MODE_MASK; + dw_pcie_writel_dbi(pci, GEN3_EQ_CONTROL_OFF, val); + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val &= ~GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK; + val |= (0x1 << GEN3_RELATED_OFF_RATE_SHADOW_SEL_SHIFT); + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); + + val = dw_pcie_readl_dbi(pci, GEN3_EQ_CONTROL_OFF); + val &= ~GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_MASK; + val |= (0x360 << GEN3_EQ_CONTROL_OFF_PSET_REQ_VEC_SHIFT); + val &= ~GEN3_EQ_CONTROL_OFF_FB_MODE_MASK; + dw_pcie_writel_dbi(pci, GEN3_EQ_CONTROL_OFF, val); + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val &= ~GEN3_RELATED_OFF_RATE_SHADOW_SEL_MASK; + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); +} + +static void tegra_pcie_prepare_host(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val; + + val = dw_pcie_readl_dbi(pci, PCI_IO_BASE); + val &= ~(IO_BASE_IO_DECODE | IO_BASE_IO_DECODE_BIT8); + dw_pcie_writel_dbi(pci, PCI_IO_BASE, val); + + val = dw_pcie_readl_dbi(pci, PCI_PREF_MEMORY_BASE); + val |= CFG_PREF_MEM_LIMIT_BASE_MEM_DECODE; + val |= CFG_PREF_MEM_LIMIT_BASE_MEM_LIMIT_DECODE; + dw_pcie_writel_dbi(pci, PCI_PREF_MEMORY_BASE, val); + + dw_pcie_writel_dbi(pci, PCI_BASE_ADDRESS_0, 0); + + /* Configure FTS */ + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL); + val &= ~(N_FTS_MASK << N_FTS_SHIFT); + val |= N_FTS_VAL << N_FTS_SHIFT; + dw_pcie_writel_dbi(pci, PORT_LOGIC_ACK_F_ASPM_CTRL, val); + + val = dw_pcie_readl_dbi(pci, PORT_LOGIC_GEN2_CTRL); + val &= ~FTS_MASK; + val |= FTS_VAL; + dw_pcie_writel_dbi(pci, PORT_LOGIC_GEN2_CTRL, val); + + /* Enable as 0xFFFF0001 response for CRS */ + val = dw_pcie_readl_dbi(pci, 
PORT_LOGIC_AMBA_ERROR_RESPONSE_DEFAULT); + val &= ~(AMBA_ERROR_RESPONSE_CRS_MASK << AMBA_ERROR_RESPONSE_CRS_SHIFT); + val |= (AMBA_ERROR_RESPONSE_CRS_OKAY_FFFF0001 << + AMBA_ERROR_RESPONSE_CRS_SHIFT); + dw_pcie_writel_dbi(pci, PORT_LOGIC_AMBA_ERROR_RESPONSE_DEFAULT, val); + + /* Configure Max Speed from DT */ + if (pcie->max_speed && pcie->max_speed != -EINVAL) { + val = dw_pcie_readl_dbi(pci, pcie->pcie_cap_base + + PCI_EXP_LNKCAP); + val &= ~PCI_EXP_LNKCAP_SLS; + val |= pcie->max_speed; + dw_pcie_writel_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKCAP, + val); + } + + /* Configure Max lane width from DT */ + val = dw_pcie_readl_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKCAP); + val &= ~PCI_EXP_LNKCAP_MLW; + val |= (pcie->num_lanes << PCI_EXP_LNKSTA_NLW_SHIFT); + dw_pcie_writel_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKCAP, val); + + config_gen3_gen4_eq_presets(pcie); + + init_host_aspm(pcie); + + val = dw_pcie_readl_dbi(pci, GEN3_RELATED_OFF); + val &= ~GEN3_RELATED_OFF_GEN3_ZRXDC_NONCOMPL; + dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); + + if (pcie->update_fc_fixup) { + val = dw_pcie_readl_dbi(pci, CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF); + val |= 0x1 << CFG_TIMER_CTRL_ACK_NAK_SHIFT; + dw_pcie_writel_dbi(pci, CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF, val); + } + + dw_pcie_setup_rc(pp); + + clk_set_rate(pcie->core_clk, GEN4_CORE_CLK_FREQ); + + /* Assert RST */ + val = appl_readl(pcie, APPL_PINMUX); + val &= ~APPL_PINMUX_PEX_RST; + appl_writel(pcie, val, APPL_PINMUX); + + usleep_range(100, 200); + + /* Enable LTSSM */ + val = appl_readl(pcie, APPL_CTRL); + val |= APPL_CTRL_LTSSM_EN; + appl_writel(pcie, val, APPL_CTRL); + + /* De-assert RST */ + val = appl_readl(pcie, APPL_PINMUX); + val |= APPL_PINMUX_PEX_RST; + appl_writel(pcie, val, APPL_PINMUX); + + msleep(100); +} + +static int tegra_pcie_dw_host_init(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val, tmp, offset, speed; + + tegra_pcie_prepare_host(pp); + + if (dw_pcie_wait_for_link(pci)) { + /* + * There are some endpoints that can't bring the link up if + * the root port has the Data Link Feature (DLF) enabled. + * Refer to Spec rev 4.0 ver 1.0, sec 3.4.2 & 7.7.4, for more + * info on Scaled Flow Control and DLF. + * So confirm that this is indeed the case here and attempt + * link up once again with DLF disabled. 
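+ * (DLF is located below via its extended capability, PCI_EXT_CAP_ID_DLF.)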
+ */ + val = appl_readl(pcie, APPL_DEBUG); + val &= APPL_DEBUG_LTSSM_STATE_MASK; + val >>= APPL_DEBUG_LTSSM_STATE_SHIFT; + tmp = appl_readl(pcie, APPL_LINK_STATUS); + tmp &= APPL_LINK_STATUS_RDLH_LINK_UP; + if (!(val == 0x11 && !tmp)) { + /* Link is down for all good reasons */ + return 0; + } + + dev_info(pci->dev, "Link is down in DLL\n"); + dev_info(pci->dev, "Trying again with DLFE disabled\n"); + /* Disable LTSSM */ + val = appl_readl(pcie, APPL_CTRL); + val &= ~APPL_CTRL_LTSSM_EN; + appl_writel(pcie, val, APPL_CTRL); + + reset_control_assert(pcie->core_rst); + reset_control_deassert(pcie->core_rst); + + offset = dw_pcie_find_ext_capability(pci, PCI_EXT_CAP_ID_DLF); + val = dw_pcie_readl_dbi(pci, offset + PCI_DLF_CAP); + val &= ~PCI_DLF_EXCHANGE_ENABLE; + dw_pcie_writel_dbi(pci, offset + PCI_DLF_CAP, val); + + tegra_pcie_prepare_host(pp); + + if (dw_pcie_wait_for_link(pci)) + return 0; + } + + speed = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA) & + PCI_EXP_LNKSTA_CLS; + clk_set_rate(pcie->core_clk, pcie_gen_freq[speed - 1]); + + tegra_pcie_enable_interrupts(pp); + + return 0; +} + +static int tegra_pcie_dw_link_up(struct dw_pcie *pci) +{ + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + u32 val = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA); + + return !!(val & PCI_EXP_LNKSTA_DLLLA); +} + +static void tegra_pcie_set_msi_vec_num(struct pcie_port *pp) +{ + pp->num_vectors = MAX_MSI_IRQS; +} + +static const struct dw_pcie_ops tegra_dw_pcie_ops = { + .link_up = tegra_pcie_dw_link_up, +}; + +static struct dw_pcie_host_ops tegra_pcie_dw_host_ops = { + .rd_own_conf = tegra_pcie_dw_rd_own_conf, + .wr_own_conf = tegra_pcie_dw_wr_own_conf, + .host_init = tegra_pcie_dw_host_init, + .set_num_vectors = tegra_pcie_set_msi_vec_num, +}; + +static void tegra_pcie_disable_phy(struct tegra_pcie_dw *pcie) +{ + unsigned int phy_count = pcie->phy_count; + + while (phy_count--) { + phy_power_off(pcie->phys[phy_count]); + phy_exit(pcie->phys[phy_count]); + } +} + +static int tegra_pcie_enable_phy(struct tegra_pcie_dw *pcie) +{ + unsigned int i; + int ret; + + for (i = 0; i < pcie->phy_count; i++) { + ret = phy_init(pcie->phys[i]); + if (ret < 0) + goto phy_power_off; + + ret = phy_power_on(pcie->phys[i]); + if (ret < 0) + goto phy_exit; + } + + return 0; + +phy_power_off: + while (i--) { + phy_power_off(pcie->phys[i]); +phy_exit: + phy_exit(pcie->phys[i]); + } + + return ret; +} + +static int tegra_pcie_dw_parse_dt(struct tegra_pcie_dw *pcie) +{ + struct device_node *np = pcie->dev->of_node; + int ret; + + ret = of_property_read_u32(np, "nvidia,aspm-cmrt-us", &pcie->aspm_cmrt); + if (ret < 0) { + dev_info(pcie->dev, "Failed to read ASPM T_cmrt: %d\n", ret); + return ret; + } + + ret = of_property_read_u32(np, "nvidia,aspm-pwr-on-t-us", + &pcie->aspm_pwr_on_t); + if (ret < 0) + dev_info(pcie->dev, "Failed to read ASPM Power On time: %d\n", + ret); + + ret = of_property_read_u32(np, "nvidia,aspm-l0s-entrance-latency-us", + &pcie->aspm_l0s_enter_lat); + if (ret < 0) + dev_info(pcie->dev, + "Failed to read ASPM L0s Entrance latency: %d\n", ret); + + ret = of_property_read_u32(np, "num-lanes", &pcie->num_lanes); + if (ret < 0) { + dev_err(pcie->dev, "Failed to read num-lanes: %d\n", ret); + return ret; + } + + pcie->max_speed = of_pci_get_max_link_speed(np); + + ret = of_property_read_u32_index(np, "nvidia,bpmp", 1, &pcie->cid); + if (ret) { + dev_err(pcie->dev, "Failed to read Controller-ID: %d\n", ret); + return ret; + } + + ret = of_property_count_strings(np, "phy-names"); + if (ret < 0) { 
+ dev_err(pcie->dev, "Failed to find PHY entries: %d\n", + ret); + return ret; + } + pcie->phy_count = ret; + + if (of_property_read_bool(np, "nvidia,update-fc-fixup")) + pcie->update_fc_fixup = true; + + pcie->supports_clkreq = + of_property_read_bool(pcie->dev->of_node, "supports-clkreq"); + + pcie->enable_cdm_check = + of_property_read_bool(np, "snps,enable-cdm-check"); + + return 0; +} + +static int tegra_pcie_bpmp_set_ctrl_state(struct tegra_pcie_dw *pcie, + bool enable) +{ + struct mrq_uphy_response resp; + struct tegra_bpmp_message msg; + struct mrq_uphy_request req; + + /* Controller-5 doesn't need to have its state set by BPMP-FW */ + if (pcie->cid == 5) + return 0; + + memset(&req, 0, sizeof(req)); + memset(&resp, 0, sizeof(resp)); + + req.cmd = CMD_UPHY_PCIE_CONTROLLER_STATE; + req.controller_state.pcie_controller = pcie->cid; + req.controller_state.enable = enable; + + memset(&msg, 0, sizeof(msg)); + msg.mrq = MRQ_UPHY; + msg.tx.data = &req; + msg.tx.size = sizeof(req); + msg.rx.data = &resp; + msg.rx.size = sizeof(resp); + + return tegra_bpmp_transfer(pcie->bpmp, &msg); +} + +static void tegra_pcie_downstream_dev_to_D0(struct tegra_pcie_dw *pcie) +{ + struct pcie_port *pp = &pcie->pci.pp; + struct pci_bus *child, *root_bus = NULL; + struct pci_dev *pdev; + + /* + * link doesn't go into L2 state with some of the endpoints with Tegra + * if they are not in D0 state. So, need to make sure that immediate + * downstream devices are in D0 state before sending PME_TurnOff to put + * link into L2 state. + * This is as per PCI Express Base r4.0 v1.0 September 27-2017, + * 5.2 Link State Power Management (Page #428). + */ + + list_for_each_entry(child, &pp->root_bus->children, node) { + /* Bring downstream devices to D0 if they are not already in */ + if (child->parent == pp->root_bus) { + root_bus = child; + break; + } + } + + if (!root_bus) { + dev_err(pcie->dev, "Failed to find downstream devices\n"); + return; + } + + list_for_each_entry(pdev, &root_bus->devices, bus_list) { + if (PCI_SLOT(pdev->devfn) == 0) { + if (pci_set_power_state(pdev, PCI_D0)) + dev_err(pcie->dev, + "Failed to transition %s to D0 state\n", + dev_name(&pdev->dev)); + } + } +} + +static int tegra_pcie_get_slot_regulators(struct tegra_pcie_dw *pcie) +{ + pcie->slot_ctl_3v3 = devm_regulator_get_optional(pcie->dev, "vpcie3v3"); + if (IS_ERR(pcie->slot_ctl_3v3)) { + if (PTR_ERR(pcie->slot_ctl_3v3) != -ENODEV) + return PTR_ERR(pcie->slot_ctl_3v3); + + pcie->slot_ctl_3v3 = NULL; + } + + pcie->slot_ctl_12v = devm_regulator_get_optional(pcie->dev, "vpcie12v"); + if (IS_ERR(pcie->slot_ctl_12v)) { + if (PTR_ERR(pcie->slot_ctl_12v) != -ENODEV) + return PTR_ERR(pcie->slot_ctl_12v); + + pcie->slot_ctl_12v = NULL; + } + + return 0; +} + +static int tegra_pcie_enable_slot_regulators(struct tegra_pcie_dw *pcie) +{ + int ret; + + if (pcie->slot_ctl_3v3) { + ret = regulator_enable(pcie->slot_ctl_3v3); + if (ret < 0) { + dev_err(pcie->dev, + "Failed to enable 3.3V slot supply: %d\n", ret); + return ret; + } + } + + if (pcie->slot_ctl_12v) { + ret = regulator_enable(pcie->slot_ctl_12v); + if (ret < 0) { + dev_err(pcie->dev, + "Failed to enable 12V slot supply: %d\n", ret); + goto fail_12v_enable; + } + } + + /* + * According to PCI Express Card Electromechanical Specification + * Revision 1.1, Table-2.4, T_PVPERL (Power stable to PERST# inactive) + * should be a minimum of 100ms. 
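+ * (Hence the msleep(100) below whenever either slot supply was enabled.)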
+ */ + if (pcie->slot_ctl_3v3 || pcie->slot_ctl_12v) + msleep(100); + + return 0; + +fail_12v_enable: + if (pcie->slot_ctl_3v3) + regulator_disable(pcie->slot_ctl_3v3); + return ret; +} + +static void tegra_pcie_disable_slot_regulators(struct tegra_pcie_dw *pcie) +{ + if (pcie->slot_ctl_12v) + regulator_disable(pcie->slot_ctl_12v); + if (pcie->slot_ctl_3v3) + regulator_disable(pcie->slot_ctl_3v3); +} + +static int tegra_pcie_config_controller(struct tegra_pcie_dw *pcie, + bool en_hw_hot_rst) +{ + int ret; + u32 val; + + ret = tegra_pcie_bpmp_set_ctrl_state(pcie, true); + if (ret) { + dev_err(pcie->dev, + "Failed to enable controller %u: %d\n", pcie->cid, ret); + return ret; + } + + ret = tegra_pcie_enable_slot_regulators(pcie); + if (ret < 0) + goto fail_slot_reg_en; + + ret = regulator_enable(pcie->pex_ctl_supply); + if (ret < 0) { + dev_err(pcie->dev, "Failed to enable regulator: %d\n", ret); + goto fail_reg_en; + } + + ret = clk_prepare_enable(pcie->core_clk); + if (ret) { + dev_err(pcie->dev, "Failed to enable core clock: %d\n", ret); + goto fail_core_clk; + } + + ret = reset_control_deassert(pcie->core_apb_rst); + if (ret) { + dev_err(pcie->dev, "Failed to deassert core APB reset: %d\n", + ret); + goto fail_core_apb_rst; + } + + if (en_hw_hot_rst) { + /* Enable HW_HOT_RST mode */ + val = appl_readl(pcie, APPL_CTRL); + val &= ~(APPL_CTRL_HW_HOT_RST_MODE_MASK << + APPL_CTRL_HW_HOT_RST_MODE_SHIFT); + val |= APPL_CTRL_HW_HOT_RST_EN; + appl_writel(pcie, val, APPL_CTRL); + } + + ret = tegra_pcie_enable_phy(pcie); + if (ret) { + dev_err(pcie->dev, "Failed to enable PHY: %d\n", ret); + goto fail_phy; + } + + /* Update CFG base address */ + appl_writel(pcie, pcie->dbi_res->start & APPL_CFG_BASE_ADDR_MASK, + APPL_CFG_BASE_ADDR); + + /* Configure this core for RP mode operation */ + appl_writel(pcie, APPL_DM_TYPE_RP, APPL_DM_TYPE); + + appl_writel(pcie, 0x0, APPL_CFG_SLCG_OVERRIDE); + + val = appl_readl(pcie, APPL_CTRL); + appl_writel(pcie, val | APPL_CTRL_SYS_PRE_DET_STATE, APPL_CTRL); + + val = appl_readl(pcie, APPL_CFG_MISC); + val |= (APPL_CFG_MISC_ARCACHE_VAL << APPL_CFG_MISC_ARCACHE_SHIFT); + appl_writel(pcie, val, APPL_CFG_MISC); + + if (!pcie->supports_clkreq) { + val = appl_readl(pcie, APPL_PINMUX); + val |= APPL_PINMUX_CLKREQ_OUT_OVRD_EN; + val |= APPL_PINMUX_CLKREQ_OUT_OVRD; + appl_writel(pcie, val, APPL_PINMUX); + } + + /* Update iATU_DMA base address */ + appl_writel(pcie, + pcie->atu_dma_res->start & APPL_CFG_IATU_DMA_BASE_ADDR_MASK, + APPL_CFG_IATU_DMA_BASE_ADDR); + + reset_control_deassert(pcie->core_rst); + + pcie->pcie_cap_base = dw_pcie_find_capability(&pcie->pci, + PCI_CAP_ID_EXP); + + /* Disable ASPM-L1SS advertisement as there is no CLKREQ routing */ + if (!pcie->supports_clkreq) { + disable_aspm_l11(pcie); + disable_aspm_l12(pcie); + } + + return ret; + +fail_phy: + reset_control_assert(pcie->core_apb_rst); +fail_core_apb_rst: + clk_disable_unprepare(pcie->core_clk); +fail_core_clk: + regulator_disable(pcie->pex_ctl_supply); +fail_reg_en: + tegra_pcie_disable_slot_regulators(pcie); +fail_slot_reg_en: + tegra_pcie_bpmp_set_ctrl_state(pcie, false); + + return ret; +} + +static int __deinit_controller(struct tegra_pcie_dw *pcie) +{ + int ret; + + ret = reset_control_assert(pcie->core_rst); + if (ret) { + dev_err(pcie->dev, "Failed to assert \"core\" reset: %d\n", + ret); + return ret; + } + + tegra_pcie_disable_phy(pcie); + + ret = reset_control_assert(pcie->core_apb_rst); + if (ret) { + dev_err(pcie->dev, "Failed to assert APB reset: %d\n", ret); + return ret; + } + + 
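+ /* continue tearing down in the reverse order of tegra_pcie_config_controller() */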
clk_disable_unprepare(pcie->core_clk); + + ret = regulator_disable(pcie->pex_ctl_supply); + if (ret) { + dev_err(pcie->dev, "Failed to disable regulator: %d\n", ret); + return ret; + } + + tegra_pcie_disable_slot_regulators(pcie); + + ret = tegra_pcie_bpmp_set_ctrl_state(pcie, false); + if (ret) { + dev_err(pcie->dev, "Failed to disable controller %d: %d\n", + pcie->cid, ret); + return ret; + } + + return ret; +} + +static int tegra_pcie_init_controller(struct tegra_pcie_dw *pcie) +{ + struct dw_pcie *pci = &pcie->pci; + struct pcie_port *pp = &pci->pp; + int ret; + + ret = tegra_pcie_config_controller(pcie, false); + if (ret < 0) + return ret; + + pp->ops = &tegra_pcie_dw_host_ops; + + ret = dw_pcie_host_init(pp); + if (ret < 0) { + dev_err(pcie->dev, "Failed to add PCIe port: %d\n", ret); + goto fail_host_init; + } + + return 0; + +fail_host_init: + return __deinit_controller(pcie); +} + +static int tegra_pcie_try_link_l2(struct tegra_pcie_dw *pcie) +{ + u32 val; + + if (!tegra_pcie_dw_link_up(&pcie->pci)) + return 0; + + val = appl_readl(pcie, APPL_RADM_STATUS); + val |= APPL_PM_XMT_TURNOFF_STATE; + appl_writel(pcie, val, APPL_RADM_STATUS); + + return readl_poll_timeout_atomic(pcie->appl_base + APPL_DEBUG, val, + val & APPL_DEBUG_PM_LINKST_IN_L2_LAT, + 1, PME_ACK_TIMEOUT); +} + +static void tegra_pcie_dw_pme_turnoff(struct tegra_pcie_dw *pcie) +{ + u32 data; + int err; + + if (!tegra_pcie_dw_link_up(&pcie->pci)) { + dev_dbg(pcie->dev, "PCIe link is not up...!\n"); + return; + } + + if (tegra_pcie_try_link_l2(pcie)) { + dev_info(pcie->dev, "Link didn't transition to L2 state\n"); + /* + * TX lane clock freq will reset to Gen1 only if link is in L2 + * or detect state. + * So apply pex_rst to end point to force RP to go into detect + * state + */ + data = appl_readl(pcie, APPL_PINMUX); + data &= ~APPL_PINMUX_PEX_RST; + appl_writel(pcie, data, APPL_PINMUX); + + err = readl_poll_timeout_atomic(pcie->appl_base + APPL_DEBUG, + data, + ((data & + APPL_DEBUG_LTSSM_STATE_MASK) >> + APPL_DEBUG_LTSSM_STATE_SHIFT) == + LTSSM_STATE_PRE_DETECT, + 1, LTSSM_TIMEOUT); + if (err) { + dev_info(pcie->dev, "Link didn't go to detect state\n"); + } else { + /* Disable LTSSM after link is in detect state */ + data = appl_readl(pcie, APPL_CTRL); + data &= ~APPL_CTRL_LTSSM_EN; + appl_writel(pcie, data, APPL_CTRL); + } + } + /* + * DBI registers may not be accessible after this as PLL-E would be + * down depending on how CLKREQ is pulled by end point + */ + data = appl_readl(pcie, APPL_PINMUX); + data |= (APPL_PINMUX_CLKREQ_OVERRIDE_EN | APPL_PINMUX_CLKREQ_OVERRIDE); + /* Cut REFCLK to slot */ + data |= APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE_EN; + data &= ~APPL_PINMUX_CLK_OUTPUT_IN_OVERRIDE; + appl_writel(pcie, data, APPL_PINMUX); +} + +static int tegra_pcie_deinit_controller(struct tegra_pcie_dw *pcie) +{ + tegra_pcie_downstream_dev_to_D0(pcie); + dw_pcie_host_deinit(&pcie->pci.pp); + tegra_pcie_dw_pme_turnoff(pcie); + + return __deinit_controller(pcie); +} + +static int tegra_pcie_config_rp(struct tegra_pcie_dw *pcie) +{ + struct pcie_port *pp = &pcie->pci.pp; + struct device *dev = pcie->dev; + char *name; + int ret; + + if (IS_ENABLED(CONFIG_PCI_MSI)) { + pp->msi_irq = of_irq_get_byname(dev->of_node, "msi"); + if (!pp->msi_irq) { + dev_err(dev, "Failed to get MSI interrupt\n"); + return -ENODEV; + } + } + + pm_runtime_enable(dev); + + ret = pm_runtime_get_sync(dev); + if (ret < 0) { + dev_err(dev, "Failed to get runtime sync for PCIe dev: %d\n", + ret); + goto fail_pm_get_sync; + } + + ret = 
pinctrl_pm_select_default_state(dev); + if (ret < 0) { + dev_err(dev, "Failed to configure sideband pins: %d\n", ret); + goto fail_pinctrl; + } + + tegra_pcie_init_controller(pcie); + + pcie->link_state = tegra_pcie_dw_link_up(&pcie->pci); + if (!pcie->link_state) { + ret = -ENOMEDIUM; + goto fail_host_init; + } + + name = devm_kasprintf(dev, GFP_KERNEL, "%pOFP", dev->of_node); + if (!name) { + ret = -ENOMEM; + goto fail_host_init; + } + + pcie->debugfs = debugfs_create_dir(name, NULL); + if (!pcie->debugfs) + dev_err(dev, "Failed to create debugfs\n"); + else + init_debugfs(pcie); + + return ret; + +fail_host_init: + tegra_pcie_deinit_controller(pcie); +fail_pinctrl: + pm_runtime_put_sync(dev); +fail_pm_get_sync: + pm_runtime_disable(dev); + return ret; +} + +static int tegra_pcie_dw_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct resource *atu_dma_res; + struct tegra_pcie_dw *pcie; + struct resource *dbi_res; + struct pcie_port *pp; + struct dw_pcie *pci; + struct phy **phys; + char *name; + int ret; + u32 i; + + pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); + if (!pcie) + return -ENOMEM; + + pci = &pcie->pci; + pci->dev = &pdev->dev; + pci->ops = &tegra_dw_pcie_ops; + pp = &pci->pp; + pcie->dev = &pdev->dev; + + ret = tegra_pcie_dw_parse_dt(pcie); + if (ret < 0) { + dev_err(dev, "Failed to parse device tree: %d\n", ret); + return ret; + } + + ret = tegra_pcie_get_slot_regulators(pcie); + if (ret < 0) { + dev_err(dev, "Failed to get slot regulators: %d\n", ret); + return ret; + } + + pcie->pex_ctl_supply = devm_regulator_get(dev, "vddio-pex-ctl"); + if (IS_ERR(pcie->pex_ctl_supply)) { + ret = PTR_ERR(pcie->pex_ctl_supply); + if (ret != -EPROBE_DEFER) + dev_err(dev, "Failed to get regulator: %ld\n", + PTR_ERR(pcie->pex_ctl_supply)); + return ret; + } + + pcie->core_clk = devm_clk_get(dev, "core"); + if (IS_ERR(pcie->core_clk)) { + dev_err(dev, "Failed to get core clock: %ld\n", + PTR_ERR(pcie->core_clk)); + return PTR_ERR(pcie->core_clk); + } + + pcie->appl_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "appl"); + if (!pcie->appl_res) { + dev_err(dev, "Failed to find \"appl\" region\n"); + return -ENODEV; + } + + pcie->appl_base = devm_ioremap_resource(dev, pcie->appl_res); + if (IS_ERR(pcie->appl_base)) + return PTR_ERR(pcie->appl_base); + + pcie->core_apb_rst = devm_reset_control_get(dev, "apb"); + if (IS_ERR(pcie->core_apb_rst)) { + dev_err(dev, "Failed to get APB reset: %ld\n", + PTR_ERR(pcie->core_apb_rst)); + return PTR_ERR(pcie->core_apb_rst); + } + + phys = devm_kcalloc(dev, pcie->phy_count, sizeof(*phys), GFP_KERNEL); + if (!phys) + return -ENOMEM; + + for (i = 0; i < pcie->phy_count; i++) { + name = kasprintf(GFP_KERNEL, "p2u-%u", i); + if (!name) { + dev_err(dev, "Failed to create P2U string\n"); + return -ENOMEM; + } + phys[i] = devm_phy_get(dev, name); + kfree(name); + if (IS_ERR(phys[i])) { + ret = PTR_ERR(phys[i]); + if (ret != -EPROBE_DEFER) + dev_err(dev, "Failed to get PHY: %d\n", ret); + return ret; + } + } + + pcie->phys = phys; + + dbi_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi"); + if (!dbi_res) { + dev_err(dev, "Failed to find \"dbi\" region\n"); + return -ENODEV; + } + pcie->dbi_res = dbi_res; + + pci->dbi_base = devm_ioremap_resource(dev, dbi_res); + if (IS_ERR(pci->dbi_base)) + return PTR_ERR(pci->dbi_base); + + /* Tegra HW locates DBI2 at a fixed offset from DBI */ + pci->dbi_base2 = pci->dbi_base + 0x1000; + + atu_dma_res = platform_get_resource_byname(pdev, IORESOURCE_MEM, + "atu_dma"); + 
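+ /* Tegra exposes the DWC iATU/DMA registers through a separate "atu_dma" aperture */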
if (!atu_dma_res) { + dev_err(dev, "Failed to find \"atu_dma\" region\n"); + return -ENODEV; + } + pcie->atu_dma_res = atu_dma_res; + + pci->atu_base = devm_ioremap_resource(dev, atu_dma_res); + if (IS_ERR(pci->atu_base)) + return PTR_ERR(pci->atu_base); + + pcie->core_rst = devm_reset_control_get(dev, "core"); + if (IS_ERR(pcie->core_rst)) { + dev_err(dev, "Failed to get core reset: %ld\n", + PTR_ERR(pcie->core_rst)); + return PTR_ERR(pcie->core_rst); + } + + pp->irq = platform_get_irq_byname(pdev, "intr"); + if (pp->irq < 0) { + dev_err(dev, "Failed to get \"intr\" interrupt\n"); + return pp->irq; + } + + ret = devm_request_irq(dev, pp->irq, tegra_pcie_irq_handler, + IRQF_SHARED, "tegra-pcie-intr", pcie); + if (ret) { + dev_err(dev, "Failed to request IRQ %d: %d\n", pp->irq, ret); + return ret; + } + + pcie->bpmp = tegra_bpmp_get(dev); + if (IS_ERR(pcie->bpmp)) + return PTR_ERR(pcie->bpmp); + + platform_set_drvdata(pdev, pcie); + + ret = tegra_pcie_config_rp(pcie); + if (ret && ret != -ENOMEDIUM) + goto fail; + + return 0; + +fail: + tegra_bpmp_put(pcie->bpmp); + return ret; +} + +static int tegra_pcie_dw_remove(struct platform_device *pdev) +{ + struct tegra_pcie_dw *pcie = platform_get_drvdata(pdev); + + if (!pcie->link_state) + return 0; + + debugfs_remove_recursive(pcie->debugfs); + tegra_pcie_deinit_controller(pcie); + pm_runtime_put_sync(pcie->dev); + pm_runtime_disable(pcie->dev); + tegra_bpmp_put(pcie->bpmp); + + return 0; +} + +static int tegra_pcie_dw_suspend_late(struct device *dev) +{ + struct tegra_pcie_dw *pcie = dev_get_drvdata(dev); + u32 val; + + if (!pcie->link_state) + return 0; + + /* Enable HW_HOT_RST mode */ + val = appl_readl(pcie, APPL_CTRL); + val &= ~(APPL_CTRL_HW_HOT_RST_MODE_MASK << + APPL_CTRL_HW_HOT_RST_MODE_SHIFT); + val |= APPL_CTRL_HW_HOT_RST_EN; + appl_writel(pcie, val, APPL_CTRL); + + return 0; +} + +static int tegra_pcie_dw_suspend_noirq(struct device *dev) +{ + struct tegra_pcie_dw *pcie = dev_get_drvdata(dev); + + if (!pcie->link_state) + return 0; + + /* Save MSI interrupt vector */ + pcie->msi_ctrl_int = dw_pcie_readl_dbi(&pcie->pci, + PORT_LOGIC_MSI_CTRL_INT_0_EN); + tegra_pcie_downstream_dev_to_D0(pcie); + tegra_pcie_dw_pme_turnoff(pcie); + + return __deinit_controller(pcie); +} + +static int tegra_pcie_dw_resume_noirq(struct device *dev) +{ + struct tegra_pcie_dw *pcie = dev_get_drvdata(dev); + int ret; + + if (!pcie->link_state) + return 0; + + ret = tegra_pcie_config_controller(pcie, true); + if (ret < 0) + return ret; + + ret = tegra_pcie_dw_host_init(&pcie->pci.pp); + if (ret < 0) { + dev_err(dev, "Failed to init host: %d\n", ret); + goto fail_host_init; + } + + /* Restore MSI interrupt vector */ + dw_pcie_writel_dbi(&pcie->pci, PORT_LOGIC_MSI_CTRL_INT_0_EN, + pcie->msi_ctrl_int); + + return 0; + +fail_host_init: + return __deinit_controller(pcie); +} + +static int tegra_pcie_dw_resume_early(struct device *dev) +{ + struct tegra_pcie_dw *pcie = dev_get_drvdata(dev); + u32 val; + + if (!pcie->link_state) + return 0; + + /* Disable HW_HOT_RST mode */ + val = appl_readl(pcie, APPL_CTRL); + val &= ~(APPL_CTRL_HW_HOT_RST_MODE_MASK << + APPL_CTRL_HW_HOT_RST_MODE_SHIFT); + val |= APPL_CTRL_HW_HOT_RST_MODE_IMDT_RST << + APPL_CTRL_HW_HOT_RST_MODE_SHIFT; + val &= ~APPL_CTRL_HW_HOT_RST_EN; + appl_writel(pcie, val, APPL_CTRL); + + return 0; +} + +static void tegra_pcie_dw_shutdown(struct platform_device *pdev) +{ + struct tegra_pcie_dw *pcie = platform_get_drvdata(pdev); + + if (!pcie->link_state) + return; + + debugfs_remove_recursive(pcie->debugfs);
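+	/*
+	 * Sketch: the four sleep callbacks above could equivalently be wired
+	 * up with the <linux/pm.h> helper macros instead of the open-coded
+	 * dev_pm_ops initializer below; note the macros additionally bind
+	 * the hibernation transitions to the same functions, which the
+	 * open-coded form does not.
+	 *
+	 *	static const struct dev_pm_ops tegra_pcie_dw_pm_ops = {
+	 *		SET_LATE_SYSTEM_SLEEP_PM_OPS(tegra_pcie_dw_suspend_late,
+	 *					     tegra_pcie_dw_resume_early)
+	 *		SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(tegra_pcie_dw_suspend_noirq,
+	 *					      tegra_pcie_dw_resume_noirq)
+	 *	};
+	 */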
+ tegra_pcie_downstream_dev_to_D0(pcie); + + disable_irq(pcie->pci.pp.irq); + if (IS_ENABLED(CONFIG_PCI_MSI)) + disable_irq(pcie->pci.pp.msi_irq); + + tegra_pcie_dw_pme_turnoff(pcie); + __deinit_controller(pcie); +} + +static const struct of_device_id tegra_pcie_dw_of_match[] = { + { + .compatible = "nvidia,tegra194-pcie", + }, + {}, +}; + +static const struct dev_pm_ops tegra_pcie_dw_pm_ops = { + .suspend_late = tegra_pcie_dw_suspend_late, + .suspend_noirq = tegra_pcie_dw_suspend_noirq, + .resume_noirq = tegra_pcie_dw_resume_noirq, + .resume_early = tegra_pcie_dw_resume_early, +}; + +static struct platform_driver tegra_pcie_dw_driver = { + .probe = tegra_pcie_dw_probe, + .remove = tegra_pcie_dw_remove, + .shutdown = tegra_pcie_dw_shutdown, + .driver = { + .name = "tegra194-pcie", + .pm = &tegra_pcie_dw_pm_ops, + .of_match_table = tegra_pcie_dw_of_match, + }, +}; +module_platform_driver(tegra_pcie_dw_driver); + +MODULE_DEVICE_TABLE(of, tegra_pcie_dw_of_match); + +MODULE_AUTHOR("Vidya Sagar "); +MODULE_DESCRIPTION("NVIDIA PCIe host controller driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c index c742881b5061..c8cb9c5188a4 100644 --- a/drivers/pci/controller/pci-host-common.c +++ b/drivers/pci/controller/pci-host-common.c @@ -43,9 +43,8 @@ static struct pci_config_window *gen_pci_init(struct device *dev, goto err_out; } - err = devm_add_action(dev, gen_pci_unmap_cfg, cfg); + err = devm_add_action_or_reset(dev, gen_pci_unmap_cfg, cfg); if (err) { - gen_pci_unmap_cfg(cfg); goto err_out; } return cfg; diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 0ca73c851e0f..f1f300218fab 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2809,6 +2809,48 @@ static void put_hvpcibus(struct hv_pcibus_device *hbus) complete(&hbus->remove_event); } +#define HVPCI_DOM_MAP_SIZE (64 * 1024) +static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE); + +/* + * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0 + * as invalid for passthrough PCI devices of this driver. + */ +#define HVPCI_DOM_INVALID 0 + +/** + * hv_get_dom_num() - Get a valid PCI domain number + * @dom: Requested domain number + * + * Check whether the requested PCI domain number is already in use, and + * if so return another, unused number. + * + * Return: domain number on success, HVPCI_DOM_INVALID on failure + */ +static u16 hv_get_dom_num(u16 dom) +{ + unsigned int i; + + if (test_and_set_bit(dom, hvpci_dom_map) == 0) + return dom; + + for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) { + if (test_and_set_bit(i, hvpci_dom_map) == 0) + return i; + } + + return HVPCI_DOM_INVALID; +} + +/** + * hv_put_dom_num() - Mark the PCI domain number as free + * @dom: Domain number to be freed + */ +static void hv_put_dom_num(u16 dom) +{ + clear_bit(dom, hvpci_dom_map); +} + /** * hv_pci_probe() - New VMBus channel probe, for a root PCI bus * @hdev: VMBus's tracking struct for this root PCI bus @@ -2820,6 +2862,7 @@ static int hv_pci_probe(struct hv_device *hdev, const struct hv_vmbus_device_id *dev_id) { struct hv_pcibus_device *hbus; + u16 dom_req, dom; char *name; int ret; @@ -2835,19 +2878,34 @@ static int hv_pci_probe(struct hv_device *hdev, hbus->state = hv_pcibus_init; /* - * The PCI bus "domain" is what is called "segment" in ACPI and - * other specs. Pull it from the instance ID, to get something - * unique.
Bytes 8 and 9 are what is used in Windows guests, so - * do the same thing for consistency. Note that, since this code - * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee - * that (1) the only domain in use for something that looks like - * a physical PCI bus (which is actually emulated by the - * hypervisor) is domain 0 and (2) there will be no overlap - * between domains derived from these instance IDs in the same - * VM. + * The PCI bus "domain" is what is called "segment" in ACPI and other + * specs. Pull it from the instance ID, to get something usually + * unique. In the rare case of a collision, we find another number + * that is not in use. + * + * Note that, since this code only runs in a Hyper-V VM, Hyper-V + * together with this guest driver can guarantee that (1) the only + * domain used by Gen1 VMs for something that looks like a physical + * PCI bus (which is actually emulated by the hypervisor) is domain 0, + * and (2) there will be no overlap between domains (after resolving + * possible collisions) in the same VM. */ - hbus->sysdata.domain = hdev->dev_instance.b[9] | - hdev->dev_instance.b[8] << 8; + dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4]; + dom = hv_get_dom_num(dom_req); + + if (dom == HVPCI_DOM_INVALID) { + dev_err(&hdev->device, + "Unable to use dom# 0x%hx or other numbers\n", dom_req); + ret = -EINVAL; + goto free_bus; + } + + if (dom != dom_req) + dev_info(&hdev->device, + "PCI dom# 0x%hx has collision, using 0x%hx\n", + dom_req, dom); + + hbus->sysdata.domain = dom; hbus->hdev = hdev; refcount_set(&hbus->remove_lock, 1); @@ -2862,7 +2920,7 @@ hbus->sysdata.domain); if (!hbus->wq) { ret = -ENOMEM; - goto free_bus; + goto free_dom; } ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, @@ -2946,6 +3004,8 @@ vmbus_close(hdev->channel); destroy_wq: destroy_workqueue(hbus->wq); +free_dom: + hv_put_dom_num(hbus->sysdata.domain); free_bus: free_page((unsigned long)hbus); return ret; @@ -3008,8 +3068,8 @@ static int hv_pci_remove(struct hv_device *hdev) /* Remove the bus from PCI's point of view.
*/ pci_lock_rescan_remove(); pci_stop_root_bus(hbus->pci_bus); - pci_remove_root_bus(hbus->pci_bus); hv_pci_remove_slots(hbus); + pci_remove_root_bus(hbus->pci_bus); pci_unlock_rescan_remove(); hbus->state = hv_pcibus_removed; } @@ -3027,6 +3087,9 @@ static int hv_pci_remove(struct hv_device *hdev) put_hvpcibus(hbus); wait_for_completion(&hbus->remove_event); destroy_workqueue(hbus->wq); + + hv_put_dom_num(hbus->sysdata.domain); + free_page((unsigned long)hbus); return 0; } @@ -3058,6 +3121,9 @@ static void __exit exit_hv_pci_drv(void) static int __init init_hv_pci_drv(void) { + /* Set the invalid domain number's bit, so it will not be used */ + set_bit(HVPCI_DOM_INVALID, hvpci_dom_map); + /* Initialize PCI block r/w interface */ hvpci_block_ops.read_block = hv_read_config_block; hvpci_block_ops.write_block = hv_write_config_block; diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c index 9a917b2456f6..673a1725ef38 100644 --- a/drivers/pci/controller/pci-tegra.c +++ b/drivers/pci/controller/pci-tegra.c @@ -2237,14 +2237,15 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) err = of_pci_get_devfn(port); if (err < 0) { dev_err(dev, "failed to parse address: %d\n", err); - return err; + goto err_node_put; } index = PCI_SLOT(err); if (index < 1 || index > soc->num_ports) { dev_err(dev, "invalid port number: %d\n", index); - return -EINVAL; + err = -EINVAL; + goto err_node_put; } index--; @@ -2253,12 +2254,13 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) if (err < 0) { dev_err(dev, "failed to parse # of lanes: %d\n", err); - return err; + goto err_node_put; } if (value > 16) { dev_err(dev, "invalid # of lanes: %u\n", value); - return -EINVAL; + err = -EINVAL; + goto err_node_put; } lanes |= value << (index << 3); @@ -2272,13 +2274,15 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) lane += value; rp = devm_kzalloc(dev, sizeof(*rp), GFP_KERNEL); - if (!rp) - return -ENOMEM; + if (!rp) { + err = -ENOMEM; + goto err_node_put; + } err = of_address_to_resource(port, 0, &rp->regs); if (err < 0) { dev_err(dev, "failed to parse address: %d\n", err); - return err; + goto err_node_put; } INIT_LIST_HEAD(&rp->list); @@ -2330,6 +2334,10 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie) return err; return 0; + +err_node_put: + of_node_put(port); + return err; } /* diff --git a/drivers/pci/controller/pcie-iproc-platform.c b/drivers/pci/controller/pcie-iproc-platform.c index 5a3550b6bb29..9ee6200a66f4 100644 --- a/drivers/pci/controller/pcie-iproc-platform.c +++ b/drivers/pci/controller/pcie-iproc-platform.c @@ -93,12 +93,9 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev) pcie->need_ib_cfg = of_property_read_bool(np, "dma-ranges"); /* PHY use is optional */ - pcie->phy = devm_phy_get(dev, "pcie-phy"); - if (IS_ERR(pcie->phy)) { - if (PTR_ERR(pcie->phy) == -EPROBE_DEFER) - return -EPROBE_DEFER; - pcie->phy = NULL; - } + pcie->phy = devm_phy_optional_get(dev, "pcie-phy"); + if (IS_ERR(pcie->phy)) + return PTR_ERR(pcie->phy); ret = devm_of_pci_get_host_bridge_resources(dev, 0, 0xff, &resources, &iobase); diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c index 80601e1b939e..626a7c352dfd 100644 --- a/drivers/pci/controller/pcie-mediatek.c +++ b/drivers/pci/controller/pcie-mediatek.c @@ -73,6 +73,7 @@ #define PCIE_MSI_VECTOR 0x0c0 #define PCIE_CONF_VEND_ID 0x100 +#define PCIE_CONF_DEVICE_ID 0x102 #define PCIE_CONF_CLASS_ID 0x106 #define PCIE_INT_MASK 0x420 @@ -141,12 +142,16 @@ struct 
mtk_pcie_port; /** * struct mtk_pcie_soc - differentiate between host generations * @need_fix_class_id: whether this host's class ID needed to be fixed or not + * @need_fix_device_id: whether this host's device ID needs to be fixed or not + * @device_id: device ID to program when @need_fix_device_id is set * @ops: pointer to configuration access functions * @startup: pointer to controller setting functions * @setup_irq: pointer to initialize IRQ functions */ struct mtk_pcie_soc { bool need_fix_class_id; + bool need_fix_device_id; + unsigned int device_id; struct pci_ops *ops; int (*startup)(struct mtk_pcie_port *port); int (*setup_irq)(struct mtk_pcie_port *port, struct device_node *node); @@ -630,8 +635,6 @@ static void mtk_pcie_intr_handler(struct irq_desc *desc) } chained_irq_exit(irqchip, desc); - - return; } static int mtk_pcie_setup_irq(struct mtk_pcie_port *port, @@ -696,6 +699,9 @@ static int mtk_pcie_startup_port_v2(struct mtk_pcie_port *port) writew(val, port->base + PCIE_CONF_CLASS_ID); } + + if (soc->need_fix_device_id) + writew(soc->device_id, port->base + PCIE_CONF_DEVICE_ID); + /* 100ms timeout value should be enough for Gen1/2 training */ err = readl_poll_timeout(port->base + PCIE_LINK_STATUS_V2, val, !!(val & PCIE_PORT_LINKUP_V2), 20, @@ -1216,11 +1222,21 @@ static const struct mtk_pcie_soc mtk_pcie_soc_mt7622 = { .setup_irq = mtk_pcie_setup_irq, }; +static const struct mtk_pcie_soc mtk_pcie_soc_mt7629 = { + .need_fix_class_id = true, + .need_fix_device_id = true, + .device_id = PCI_DEVICE_ID_MEDIATEK_7629, + .ops = &mtk_pcie_ops_v2, + .startup = mtk_pcie_startup_port_v2, + .setup_irq = mtk_pcie_setup_irq, +}; + static const struct of_device_id mtk_pcie_ids[] = { { .compatible = "mediatek,mt2701-pcie", .data = &mtk_pcie_soc_v1 }, { .compatible = "mediatek,mt7623-pcie", .data = &mtk_pcie_soc_v1 }, { .compatible = "mediatek,mt2712-pcie", .data = &mtk_pcie_soc_mt2712 }, { .compatible = "mediatek,mt7622-pcie", .data = &mtk_pcie_soc_mt7622 }, + { .compatible = "mediatek,mt7629-pcie", .data = &mtk_pcie_soc_mt7629 }, {}, }; diff --git a/drivers/pci/controller/pcie-mobiveil.c b/drivers/pci/controller/pcie-mobiveil.c index 672e633601c7..a45a6447b01d 100644 --- a/drivers/pci/controller/pcie-mobiveil.c +++ b/drivers/pci/controller/pcie-mobiveil.c @@ -88,6 +88,7 @@ #define AMAP_CTRL_TYPE_MASK 3 #define PAB_EXT_PEX_AMAP_SIZEN(win) PAB_EXT_REG_ADDR(0xbef0, win) +#define PAB_EXT_PEX_AMAP_AXI_WIN(win) PAB_EXT_REG_ADDR(0xb4a0, win) #define PAB_PEX_AMAP_AXI_WIN(win) PAB_REG_ADDR(0x4ba4, win) #define PAB_PEX_AMAP_PEX_WIN_L(win) PAB_REG_ADDR(0x4ba8, win) #define PAB_PEX_AMAP_PEX_WIN_H(win) PAB_REG_ADDR(0x4bac, win) @@ -462,7 +463,7 @@ static int mobiveil_pcie_parse_dt(struct mobiveil_pcie *pcie) } static void program_ib_windows(struct mobiveil_pcie *pcie, int win_num, - u64 pci_addr, u32 type, u64 size) + u64 cpu_addr, u64 pci_addr, u32 type, u64 size) { u32 value; u64 size64 = ~(size - 1); @@ -482,7 +483,10 @@ static void program_ib_windows(struct mobiveil_pcie *pcie, int win_num, csr_writel(pcie, upper_32_bits(size64), PAB_EXT_PEX_AMAP_SIZEN(win_num)); - csr_writel(pcie, pci_addr, PAB_PEX_AMAP_AXI_WIN(win_num)); + csr_writel(pcie, lower_32_bits(cpu_addr), + PAB_PEX_AMAP_AXI_WIN(win_num)); + csr_writel(pcie, upper_32_bits(cpu_addr), + PAB_EXT_PEX_AMAP_AXI_WIN(win_num)); csr_writel(pcie, lower_32_bits(pci_addr), PAB_PEX_AMAP_PEX_WIN_L(win_num)); @@ -624,7 +628,7 @@ static int mobiveil_host_init(struct mobiveil_pcie *pcie) CFG_WINDOW_TYPE, resource_size(pcie->ob_io_res)); /* memory inbound translation
window */ - program_ib_windows(pcie, WIN_NUM_0, 0, MEM_WINDOW_TYPE, IB_WIN_SIZE); + program_ib_windows(pcie, WIN_NUM_0, 0, 0, MEM_WINDOW_TYPE, IB_WIN_SIZE); /* Get the I/O and memory ranges from DT */ resource_list_for_each_entry(win, &pcie->resources) { diff --git a/drivers/pci/controller/pcie-rockchip-host.c b/drivers/pci/controller/pcie-rockchip-host.c index 8d20f1793a61..ef8e677ce9d1 100644 --- a/drivers/pci/controller/pcie-rockchip-host.c +++ b/drivers/pci/controller/pcie-rockchip-host.c @@ -608,29 +608,29 @@ static int rockchip_pcie_parse_host_dt(struct rockchip_pcie *rockchip) rockchip->vpcie12v = devm_regulator_get_optional(dev, "vpcie12v"); if (IS_ERR(rockchip->vpcie12v)) { - if (PTR_ERR(rockchip->vpcie12v) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(rockchip->vpcie12v) != -ENODEV) + return PTR_ERR(rockchip->vpcie12v); dev_info(dev, "no vpcie12v regulator found\n"); } rockchip->vpcie3v3 = devm_regulator_get_optional(dev, "vpcie3v3"); if (IS_ERR(rockchip->vpcie3v3)) { - if (PTR_ERR(rockchip->vpcie3v3) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(rockchip->vpcie3v3) != -ENODEV) + return PTR_ERR(rockchip->vpcie3v3); dev_info(dev, "no vpcie3v3 regulator found\n"); } rockchip->vpcie1v8 = devm_regulator_get_optional(dev, "vpcie1v8"); if (IS_ERR(rockchip->vpcie1v8)) { - if (PTR_ERR(rockchip->vpcie1v8) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(rockchip->vpcie1v8) != -ENODEV) + return PTR_ERR(rockchip->vpcie1v8); dev_info(dev, "no vpcie1v8 regulator found\n"); } rockchip->vpcie0v9 = devm_regulator_get_optional(dev, "vpcie0v9"); if (IS_ERR(rockchip->vpcie0v9)) { - if (PTR_ERR(rockchip->vpcie0v9) == -EPROBE_DEFER) - return -EPROBE_DEFER; + if (PTR_ERR(rockchip->vpcie0v9) != -ENODEV) + return PTR_ERR(rockchip->vpcie0v9); dev_info(dev, "no vpcie0v9 regulator found\n"); } diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 4575e0c6dc4b..a35d3f3996d7 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -31,6 +31,9 @@ #define PCI_REG_VMLOCK 0x70 #define MB2_SHADOW_EN(vmlock) (vmlock & 0x2) +#define MB2_SHADOW_OFFSET 0x2000 +#define MB2_SHADOW_SIZE 16 + enum vmd_features { /* * Device may contain registers which hint the physical location of the @@ -94,6 +97,7 @@ struct vmd_dev { struct resource resources[3]; struct irq_domain *irq_domain; struct pci_bus *bus; + u8 busn_start; struct dma_map_ops dma_ops; struct dma_domain dma_domain; @@ -440,7 +444,8 @@ static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, unsigned int devfn, int reg, int len) { char __iomem *addr = vmd->cfgbar + - (bus->number << 20) + (devfn << 12) + reg; + ((bus->number - vmd->busn_start) << 20) + + (devfn << 12) + reg; if ((addr - vmd->cfgbar) + len >= resource_size(&vmd->dev->resource[VMD_CFGBAR])) @@ -563,7 +568,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) unsigned long flags; LIST_HEAD(resources); resource_size_t offset[2] = {0}; - resource_size_t membar2_offset = 0x2000, busn_start = 0; + resource_size_t membar2_offset = 0x2000; struct pci_bus *child; /* @@ -576,7 +581,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) u32 vmlock; int ret; - membar2_offset = 0x2018; + membar2_offset = MB2_SHADOW_OFFSET + MB2_SHADOW_SIZE; ret = pci_read_config_dword(vmd->dev, PCI_REG_VMLOCK, &vmlock); if (ret || vmlock == ~0) return -ENODEV; @@ -588,9 +593,9 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) if (!membar2) return -ENOMEM; 
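+	/*
+	 * Sketch (hypothetical consumer code) of the optional-supply idiom
+	 * used in the pcie-rockchip-host.c hunk above: with
+	 * devm_regulator_get_optional(), -ENODEV means the supply is simply
+	 * not described and the driver carries on without it, while any
+	 * other error, including -EPROBE_DEFER, must be propagated.
+	 *
+	 *	reg = devm_regulator_get_optional(dev, "vpcie3v3");
+	 *	if (IS_ERR(reg)) {
+	 *		if (PTR_ERR(reg) != -ENODEV)
+	 *			return PTR_ERR(reg);
+	 *		reg = NULL;
+	 *	}
+	 */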
offset[0] = vmd->dev->resource[VMD_MEMBAR1].start - - readq(membar2 + 0x2008); + readq(membar2 + MB2_SHADOW_OFFSET); offset[1] = vmd->dev->resource[VMD_MEMBAR2].start - - readq(membar2 + 0x2010); + readq(membar2 + MB2_SHADOW_OFFSET + 8); pci_iounmap(vmd->dev, membar2); } } @@ -606,14 +611,14 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_read_config_dword(vmd->dev, PCI_REG_VMCONFIG, &vmconfig); if (BUS_RESTRICT_CAP(vmcap) && (BUS_RESTRICT_CFG(vmconfig) == 0x1)) - busn_start = 128; + vmd->busn_start = 128; } res = &vmd->dev->resource[VMD_CFGBAR]; vmd->resources[0] = (struct resource) { .name = "VMD CFGBAR", - .start = busn_start, - .end = busn_start + (resource_size(res) >> 20) - 1, + .start = vmd->busn_start, + .end = vmd->busn_start + (resource_size(res) >> 20) - 1, .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED, }; @@ -681,8 +686,8 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]); pci_add_resource_offset(&resources, &vmd->resources[2], offset[1]); - vmd->bus = pci_create_root_bus(&vmd->dev->dev, busn_start, &vmd_ops, - sd, &resources); + vmd->bus = pci_create_root_bus(&vmd->dev->dev, vmd->busn_start, + &vmd_ops, sd, &resources); if (!vmd->bus) { pci_free_resource_list(&resources); irq_domain_remove(vmd->irq_domain); diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c index 603eadf3d965..d0559d2faf50 100644 --- a/drivers/pci/hotplug/cpci_hotplug_core.c +++ b/drivers/pci/hotplug/cpci_hotplug_core.c @@ -563,7 +563,6 @@ cleanup_slots(void) } cleanup_null: up_write(&list_rwsem); - return; } int diff --git a/drivers/pci/hotplug/cpqphp_core.c b/drivers/pci/hotplug/cpqphp_core.c index 16bbb183695a..b8aacb41a83c 100644 --- a/drivers/pci/hotplug/cpqphp_core.c +++ b/drivers/pci/hotplug/cpqphp_core.c @@ -173,7 +173,6 @@ static void pci_print_IRQ_route(void) dbg("%d %d %d %d\n", tbus, tdevice >> 3, tdevice & 0x7, tslot); } - return; } diff --git a/drivers/pci/hotplug/cpqphp_ctrl.c b/drivers/pci/hotplug/cpqphp_ctrl.c index b7f4e1f099d9..68de958a9be8 100644 --- a/drivers/pci/hotplug/cpqphp_ctrl.c +++ b/drivers/pci/hotplug/cpqphp_ctrl.c @@ -1872,8 +1872,6 @@ static void interrupt_event_handler(struct controller *ctrl) } } /* End of FOR loop */ } - - return; } @@ -1943,8 +1941,6 @@ void cpqhp_pushbutton_thread(struct timer_list *t) p_slot->state = STATIC_STATE; } - - return; } diff --git a/drivers/pci/hotplug/cpqphp_nvram.h b/drivers/pci/hotplug/cpqphp_nvram.h index 918ff8dbfe62..70e879b6a23f 100644 --- a/drivers/pci/hotplug/cpqphp_nvram.h +++ b/drivers/pci/hotplug/cpqphp_nvram.h @@ -16,10 +16,7 @@ #ifndef CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM -static inline void compaq_nvram_init(void __iomem *rom_start) -{ - return; -} +static inline void compaq_nvram_init(void __iomem *rom_start) { } static inline int compaq_nvram_load(void __iomem *rom_start, struct controller *ctrl) { diff --git a/drivers/pci/hotplug/ibmphp_res.c b/drivers/pci/hotplug/ibmphp_res.c index 5e8caf7a4452..5c93aa14f0de 100644 --- a/drivers/pci/hotplug/ibmphp_res.c +++ b/drivers/pci/hotplug/ibmphp_res.c @@ -1941,6 +1941,7 @@ static int __init update_bridge_ranges(struct bus_node **bus) break; case PCI_HEADER_TYPE_BRIDGE: function = 0x8; + /* fall through */ case PCI_HEADER_TYPE_MULTIBRIDGE: /* We assume here that only 1 bus behind the bridge TO DO: add functionality for several: diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 
8c51a04b8083..654c972b8ea0 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -110,9 +110,9 @@ struct controller { * * @OFF_STATE: slot is powered off, no subordinate devices are enumerated * @BLINKINGON_STATE: slot will be powered on after the 5 second delay, - * green led is blinking + * Power Indicator is blinking * @BLINKINGOFF_STATE: slot will be powered off after the 5 second delay, - * green led is blinking + * Power Indicator is blinking * @POWERON_STATE: slot is currently powering on * @POWEROFF_STATE: slot is currently powering off * @ON_STATE: slot is powered on, subordinate devices have been enumerated @@ -167,12 +167,11 @@ int pciehp_power_on_slot(struct controller *ctrl); void pciehp_power_off_slot(struct controller *ctrl); void pciehp_get_power_status(struct controller *ctrl, u8 *status); -void pciehp_set_attention_status(struct controller *ctrl, u8 status); +#define INDICATOR_NOOP -1 /* Leave indicator unchanged */ +void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn); + void pciehp_get_latch_status(struct controller *ctrl, u8 *status); int pciehp_query_power_fault(struct controller *ctrl); -void pciehp_green_led_on(struct controller *ctrl); -void pciehp_green_led_off(struct controller *ctrl); -void pciehp_green_led_blink(struct controller *ctrl); bool pciehp_card_present(struct controller *ctrl); bool pciehp_card_present_or_link_active(struct controller *ctrl); int pciehp_check_link_status(struct controller *ctrl); diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 6ad0d86762cb..b3122c151b80 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -95,15 +95,20 @@ static void cleanup_slot(struct controller *ctrl) } /* - * set_attention_status - Turns the Amber LED for a slot on, off or blink + * set_attention_status - Turns the Attention Indicator on, off or blinking */ static int set_attention_status(struct hotplug_slot *hotplug_slot, u8 status) { struct controller *ctrl = to_ctrl(hotplug_slot); struct pci_dev *pdev = ctrl->pcie->port; + if (status) + status <<= PCI_EXP_SLTCTL_ATTN_IND_SHIFT; + else + status = PCI_EXP_SLTCTL_ATTN_IND_OFF; + pci_config_pm_runtime_get(pdev); - pciehp_set_attention_status(ctrl, status); + pciehp_set_indicators(ctrl, INDICATOR_NOOP, status); pci_config_pm_runtime_put(pdev); return 0; } diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c index 631ced0ab28a..21af7b16d7a4 100644 --- a/drivers/pci/hotplug/pciehp_ctrl.c +++ b/drivers/pci/hotplug/pciehp_ctrl.c @@ -30,7 +30,10 @@ static void set_slot_off(struct controller *ctrl) { - /* turn off slot, turn on Amber LED, turn off Green LED if supported*/ + /* + * Turn off slot, turn on attention indicator, turn off power + * indicator + */ if (POWER_CTRL(ctrl)) { pciehp_power_off_slot(ctrl); @@ -42,8 +45,8 @@ static void set_slot_off(struct controller *ctrl) msleep(1000); } - pciehp_green_led_off(ctrl); - pciehp_set_attention_status(ctrl, 1); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + PCI_EXP_SLTCTL_ATTN_IND_ON); } /** @@ -65,7 +68,8 @@ static int board_added(struct controller *ctrl) return retval; } - pciehp_green_led_blink(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK, + INDICATOR_NOOP); /* Check link training status */ retval = pciehp_check_link_status(ctrl); @@ -90,8 +94,8 @@ static int board_added(struct controller *ctrl) } } - pciehp_green_led_on(ctrl); - pciehp_set_attention_status(ctrl, 0); + 
pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_ON, + PCI_EXP_SLTCTL_ATTN_IND_OFF); return 0; err_exit: @@ -100,7 +104,7 @@ static int board_added(struct controller *ctrl) } /** - * remove_board - Turns off slot and LEDs + * remove_board - Turn off slot and Power Indicator * @ctrl: PCIe hotplug controller where board is being removed * @safe_removal: whether the board is safely removed (versus surprise removed) */ @@ -123,8 +127,8 @@ static void remove_board(struct controller *ctrl, bool safe_removal) &ctrl->pending_events); } - /* turn off Green LED */ - pciehp_green_led_off(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + INDICATOR_NOOP); } static int pciehp_enable_slot(struct controller *ctrl); @@ -171,9 +175,9 @@ void pciehp_handle_button_press(struct controller *ctrl) ctrl_info(ctrl, "Slot(%s) Powering on due to button press\n", slot_name(ctrl)); } - /* blink green LED and turn off amber */ - pciehp_green_led_blink(ctrl); - pciehp_set_attention_status(ctrl, 0); + /* blink power indicator and turn off attention */ + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK, + PCI_EXP_SLTCTL_ATTN_IND_OFF); schedule_delayed_work(&ctrl->button_work, 5 * HZ); break; case BLINKINGOFF_STATE: @@ -187,12 +191,13 @@ void pciehp_handle_button_press(struct controller *ctrl) cancel_delayed_work(&ctrl->button_work); if (ctrl->state == BLINKINGOFF_STATE) { ctrl->state = ON_STATE; - pciehp_green_led_on(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_ON, + PCI_EXP_SLTCTL_ATTN_IND_OFF); } else { ctrl->state = OFF_STATE; - pciehp_green_led_off(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + PCI_EXP_SLTCTL_ATTN_IND_OFF); } - pciehp_set_attention_status(ctrl, 0); ctrl_info(ctrl, "Slot(%s): Action canceled due to button press\n", slot_name(ctrl)); break; @@ -310,7 +315,9 @@ static int pciehp_enable_slot(struct controller *ctrl) pm_runtime_get_sync(&ctrl->pcie->port->dev); ret = __pciehp_enable_slot(ctrl); if (ret && ATTN_BUTTN(ctrl)) - pciehp_green_led_off(ctrl); /* may be blinking */ + /* may be blinking */ + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + INDICATOR_NOOP); pm_runtime_put(&ctrl->pcie->port->dev); mutex_lock(&ctrl->state_lock); diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index bd990e3371e3..1a522c1c4177 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -418,65 +418,40 @@ int pciehp_set_raw_indicator_status(struct hotplug_slot *hotplug_slot, return 0; } -void pciehp_set_attention_status(struct controller *ctrl, u8 value) +/** + * pciehp_set_indicators() - set attention indicator, power indicator, or both + * @ctrl: PCIe hotplug controller + * @pwr: one of: + * PCI_EXP_SLTCTL_PWR_IND_ON + * PCI_EXP_SLTCTL_PWR_IND_BLINK + * PCI_EXP_SLTCTL_PWR_IND_OFF + * @attn: one of: + * PCI_EXP_SLTCTL_ATTN_IND_ON + * PCI_EXP_SLTCTL_ATTN_IND_BLINK + * PCI_EXP_SLTCTL_ATTN_IND_OFF + * + * Either @pwr or @attn can also be INDICATOR_NOOP to leave that indicator + * unchanged. 
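+ *
+ * Example (illustrative): start the Power Indicator blinking while
+ * leaving the Attention Indicator untouched:
+ *
+ *	pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK,
+ *			      INDICATOR_NOOP);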
+ */ +void pciehp_set_indicators(struct controller *ctrl, int pwr, int attn) { - u16 slot_cmd; + u16 cmd = 0, mask = 0; - if (!ATTN_LED(ctrl)) - return; - - switch (value) { - case 0: /* turn off */ - slot_cmd = PCI_EXP_SLTCTL_ATTN_IND_OFF; - break; - case 1: /* turn on */ - slot_cmd = PCI_EXP_SLTCTL_ATTN_IND_ON; - break; - case 2: /* turn blink */ - slot_cmd = PCI_EXP_SLTCTL_ATTN_IND_BLINK; - break; - default: - return; + if (PWR_LED(ctrl) && pwr != INDICATOR_NOOP) { + cmd |= (pwr & PCI_EXP_SLTCTL_PIC); + mask |= PCI_EXP_SLTCTL_PIC; } - pcie_write_cmd_nowait(ctrl, slot_cmd, PCI_EXP_SLTCTL_AIC); - ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, - pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, slot_cmd); -} -void pciehp_green_led_on(struct controller *ctrl) -{ - if (!PWR_LED(ctrl)) - return; + if (ATTN_LED(ctrl) && attn != INDICATOR_NOOP) { + cmd |= (attn & PCI_EXP_SLTCTL_AIC); + mask |= PCI_EXP_SLTCTL_AIC; + } - pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_PWR_IND_ON, - PCI_EXP_SLTCTL_PIC); - ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, - pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, - PCI_EXP_SLTCTL_PWR_IND_ON); -} - -void pciehp_green_led_off(struct controller *ctrl) -{ - if (!PWR_LED(ctrl)) - return; - - pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, - PCI_EXP_SLTCTL_PIC); - ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, - pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, - PCI_EXP_SLTCTL_PWR_IND_OFF); -} - -void pciehp_green_led_blink(struct controller *ctrl) -{ - if (!PWR_LED(ctrl)) - return; - - pcie_write_cmd_nowait(ctrl, PCI_EXP_SLTCTL_PWR_IND_BLINK, - PCI_EXP_SLTCTL_PIC); - ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, - pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, - PCI_EXP_SLTCTL_PWR_IND_BLINK); + if (cmd) { + pcie_write_cmd_nowait(ctrl, cmd, mask); + ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__, + pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, cmd); + } } int pciehp_power_on_slot(struct controller *ctrl) @@ -638,8 +613,8 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) if ((events & PCI_EXP_SLTSTA_PFD) && !ctrl->power_fault_detected) { ctrl->power_fault_detected = 1; ctrl_err(ctrl, "Slot(%s): Power fault\n", slot_name(ctrl)); - pciehp_set_attention_status(ctrl, 1); - pciehp_green_led_off(ctrl); + pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF, + PCI_EXP_SLTCTL_ATTN_IND_ON); } /* diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c index 182f9e3443ee..977946e4e613 100644 --- a/drivers/pci/hotplug/rpadlpar_core.c +++ b/drivers/pci/hotplug/rpadlpar_core.c @@ -473,7 +473,6 @@ int __init rpadlpar_io_init(void) void rpadlpar_io_exit(void) { dlpar_sysfs_exit(); - return; } module_init(rpadlpar_io_init); diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c index c3899ee1db99..18627bb21e9e 100644 --- a/drivers/pci/hotplug/rpaphp_core.c +++ b/drivers/pci/hotplug/rpaphp_core.c @@ -408,7 +408,6 @@ static void __exit cleanup_slots(void) pci_hp_deregister(&slot->hotplug_slot); dealloc_slot_struct(slot); } - return; } static int __init rpaphp_init(void) diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 525fd3f272b3..b3f972e8cfed 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -240,6 +240,173 @@ void pci_iov_remove_virtfn(struct pci_dev *dev, int id) pci_dev_put(dev); } +static ssize_t sriov_totalvfs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + 
return sprintf(buf, "%u\n", pci_sriov_get_totalvfs(pdev)); +} + +static ssize_t sriov_numvfs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pdev->sriov->num_VFs); +} + +/* + * num_vfs > 0; number of VFs to enable + * num_vfs = 0; disable all VFs + * + * Note: SRIOV spec does not allow partial VF + * disable, so it's all or none. + */ +static ssize_t sriov_numvfs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = to_pci_dev(dev); + int ret; + u16 num_vfs; + + ret = kstrtou16(buf, 0, &num_vfs); + if (ret < 0) + return ret; + + if (num_vfs > pci_sriov_get_totalvfs(pdev)) + return -ERANGE; + + device_lock(&pdev->dev); + + if (num_vfs == pdev->sriov->num_VFs) + goto exit; + + /* is PF driver loaded w/callback */ + if (!pdev->driver || !pdev->driver->sriov_configure) { + pci_info(pdev, "Driver does not support SRIOV configuration via sysfs\n"); + ret = -ENOENT; + goto exit; + } + + if (num_vfs == 0) { + /* disable VFs */ + ret = pdev->driver->sriov_configure(pdev, 0); + goto exit; + } + + /* enable VFs */ + if (pdev->sriov->num_VFs) { + pci_warn(pdev, "%d VFs already enabled. Disable before enabling %d VFs\n", + pdev->sriov->num_VFs, num_vfs); + ret = -EBUSY; + goto exit; + } + + ret = pdev->driver->sriov_configure(pdev, num_vfs); + if (ret < 0) + goto exit; + + if (ret != num_vfs) + pci_warn(pdev, "%d VFs requested; only %d enabled\n", + num_vfs, ret); + +exit: + device_unlock(&pdev->dev); + + if (ret < 0) + return ret; + + return count; +} + +static ssize_t sriov_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pdev->sriov->offset); +} + +static ssize_t sriov_stride_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pdev->sriov->stride); +} + +static ssize_t sriov_vf_device_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%x\n", pdev->sriov->vf_device); +} + +static ssize_t sriov_drivers_autoprobe_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return sprintf(buf, "%u\n", pdev->sriov->drivers_autoprobe); +} + +static ssize_t sriov_drivers_autoprobe_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = to_pci_dev(dev); + bool drivers_autoprobe; + + if (kstrtobool(buf, &drivers_autoprobe) < 0) + return -EINVAL; + + pdev->sriov->drivers_autoprobe = drivers_autoprobe; + + return count; +} + +static DEVICE_ATTR_RO(sriov_totalvfs); +static DEVICE_ATTR_RW(sriov_numvfs); +static DEVICE_ATTR_RO(sriov_offset); +static DEVICE_ATTR_RO(sriov_stride); +static DEVICE_ATTR_RO(sriov_vf_device); +static DEVICE_ATTR_RW(sriov_drivers_autoprobe); + +static struct attribute *sriov_dev_attrs[] = { + &dev_attr_sriov_totalvfs.attr, + &dev_attr_sriov_numvfs.attr, + &dev_attr_sriov_offset.attr, + &dev_attr_sriov_stride.attr, + &dev_attr_sriov_vf_device.attr, + &dev_attr_sriov_drivers_autoprobe.attr, + NULL, +}; + +static umode_t sriov_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + + if (!dev_is_pf(dev)) + return 0; + + return a->mode; +} + +const struct 
attribute_group sriov_dev_attr_group = { + .attrs = sriov_dev_attrs, + .is_visible = sriov_attrs_are_visible, +}; + int __weak pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) { return 0; @@ -557,8 +724,8 @@ static void sriov_restore_state(struct pci_dev *dev) ctrl |= iov->ctrl & PCI_SRIOV_CTRL_ARI; pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, ctrl); - for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++) - pci_update_resource(dev, i); + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) + pci_update_resource(dev, i + PCI_IOV_RESOURCES); pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz); pci_iov_set_numvfs(dev, iov->num_VFs); diff --git a/drivers/pci/of.c b/drivers/pci/of.c index bc7b27a28795..36891e7deee3 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -353,7 +353,7 @@ EXPORT_SYMBOL_GPL(devm_of_pci_get_host_bridge_resources); /** * of_irq_parse_pci - Resolve the interrupt for a PCI device * @pdev: the device whose interrupt is to be resolved - * @out_irq: structure of_irq filled by this function + * @out_irq: structure of_phandle_args filled by this function * * This function resolves the PCI interrupt for a given PCI device. If a * device-node exists for a given pci_dev, it will use normal OF tree diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 234476226529..0608aae72ccc 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -18,13 +18,32 @@ #include #include #include -#include +#include + +enum pci_p2pdma_map_type { + PCI_P2PDMA_MAP_UNKNOWN = 0, + PCI_P2PDMA_MAP_NOT_SUPPORTED, + PCI_P2PDMA_MAP_BUS_ADDR, + PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, +}; struct pci_p2pdma { struct gen_pool *pool; bool p2pmem_published; + struct xarray map_types; }; +struct pci_p2pdma_pagemap { + struct dev_pagemap pgmap; + struct pci_dev *provider; + u64 bus_offset; +}; + +static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) +{ + return container_of(pgmap, struct pci_p2pdma_pagemap, pgmap); +} + static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -87,6 +106,7 @@ static void pci_p2pdma_release(void *data) gen_pool_destroy(p2pdma->pool); sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group); + xa_destroy(&p2pdma->map_types); } static int pci_p2pdma_setup(struct pci_dev *pdev) @@ -98,6 +118,8 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) if (!p2p) return -ENOMEM; + xa_init(&p2p->map_types); + p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); if (!p2p->pool) goto out; @@ -135,6 +157,7 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { + struct pci_p2pdma_pagemap *p2p_pgmap; struct dev_pagemap *pgmap; void *addr; int error; @@ -157,14 +180,18 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, return error; } - pgmap = devm_kzalloc(&pdev->dev, sizeof(*pgmap), GFP_KERNEL); - if (!pgmap) + p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL); + if (!p2p_pgmap) return -ENOMEM; + + pgmap = &p2p_pgmap->pgmap; pgmap->res.start = pci_resource_start(pdev, bar) + offset; pgmap->res.end = pgmap->res.start + size - 1; pgmap->res.flags = pci_resource_flags(pdev, bar); pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; - pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) - + + p2p_pgmap->provider = pdev; + p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); addr = devm_memremap_pages(&pdev->dev, pgmap); @@ -246,18 +273,31 @@ static void 
seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev) seq_buf_printf(buf, "%s;", pci_name(pdev)); } -/* - * If we can't find a common upstream bridge take a look at the root - * complex and compare it to a whitelist of known good hardware. - */ -static bool root_complex_whitelist(struct pci_dev *dev) -{ - struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); - struct pci_dev *root = pci_get_slot(host->bus, PCI_DEVFN(0, 0)); - unsigned short vendor, device; +static const struct pci_p2pdma_whitelist_entry { + unsigned short vendor; + unsigned short device; + enum { + REQ_SAME_HOST_BRIDGE = 1 << 0, + } flags; +} pci_p2pdma_whitelist[] = { + /* AMD ZEN */ + {PCI_VENDOR_ID_AMD, 0x1450, 0}, - if (iommu_present(dev->dev.bus)) - return false; + /* Intel Xeon E5/Core i7 */ + {PCI_VENDOR_ID_INTEL, 0x3c00, REQ_SAME_HOST_BRIDGE}, + {PCI_VENDOR_ID_INTEL, 0x3c01, REQ_SAME_HOST_BRIDGE}, + /* Intel Xeon E7 v3/Xeon E5 v3/Core i7 */ + {PCI_VENDOR_ID_INTEL, 0x2f00, REQ_SAME_HOST_BRIDGE}, + {PCI_VENDOR_ID_INTEL, 0x2f01, REQ_SAME_HOST_BRIDGE}, + {} +}; + +static bool __host_bridge_whitelist(struct pci_host_bridge *host, + bool same_host_bridge) +{ + struct pci_dev *root = pci_get_slot(host->bus, PCI_DEVFN(0, 0)); + const struct pci_p2pdma_whitelist_entry *entry; + unsigned short vendor, device; if (!root) return false; @@ -266,65 +306,49 @@ static bool root_complex_whitelist(struct pci_dev *dev) device = root->device; pci_dev_put(root); - /* AMD ZEN host bridges can do peer to peer */ - if (vendor == PCI_VENDOR_ID_AMD && device == 0x1450) + for (entry = pci_p2pdma_whitelist; entry->vendor; entry++) { + if (vendor != entry->vendor || device != entry->device) + continue; + if (entry->flags & REQ_SAME_HOST_BRIDGE && !same_host_bridge) + return false; + return true; + } return false; } /* - * Find the distance through the nearest common upstream bridge between - * two PCI devices. - * - * If the two devices are the same device then 0 will be returned. - * - * If there are two virtual functions of the same device behind the same - * bridge port then 2 will be returned (one step down to the PCIe switch, - * then one step back to the same device). - * - * In the case where two devices are connected to the same PCIe switch, the - * value 4 will be returned. This corresponds to the following PCI tree: - * - * -+ Root Port - * \+ Switch Upstream Port - * +-+ Switch Downstream Port - * + \- Device A - * \-+ Switch Downstream Port - * \- Device B - * - * The distance is 4 because we traverse from Device A through the downstream - * port of the switch, to the common upstream port, back up to the second - * downstream port and then to Device B. - * - * Any two devices that don't have a common upstream bridge will return -1. - * In this way devices on separate PCIe root ports will be rejected, which - * is what we want for peer-to-peer seeing each PCIe root port defines a - * separate hierarchy domain and there's no way to determine whether the root - * complex supports forwarding between them. - * - * In the case where two devices are connected to different PCIe switches, - * this function will still return a positive distance as long as both - * switches eventually have a common upstream bridge. Note this covers - * the case of using multiple PCIe switches to achieve a desired level of - * fan-out from a root port. The exact distance will be a function of the - * number of switches between Device A and Device B. 
- * - * If a bridge which has any ACS redirection bits set is in the path - * then this functions will return -2. This is so we reject any - * cases where the TLPs are forwarded up into the root complex. - * In this case, a list of all infringing bridge addresses will be - * populated in acs_list (assuming it's non-null) for printk purposes. + * If we can't find a common upstream bridge take a look at the root + * complex and compare it to a whitelist of known good hardware. */ -static int upstream_bridge_distance(struct pci_dev *provider, - struct pci_dev *client, - struct seq_buf *acs_list) +static bool host_bridge_whitelist(struct pci_dev *a, struct pci_dev *b) +{ + struct pci_host_bridge *host_a = pci_find_host_bridge(a->bus); + struct pci_host_bridge *host_b = pci_find_host_bridge(b->bus); + + if (host_a == host_b) + return __host_bridge_whitelist(host_a, true); + + if (__host_bridge_whitelist(host_a, false) && + __host_bridge_whitelist(host_b, false)) + return true; + + return false; +} + +static enum pci_p2pdma_map_type +__upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, + int *dist, bool *acs_redirects, struct seq_buf *acs_list) { struct pci_dev *a = provider, *b = client, *bb; int dist_a = 0; int dist_b = 0; int acs_cnt = 0; + if (acs_redirects) + *acs_redirects = false; + /* * Note, we don't need to take references to devices returned by * pci_upstream_bridge() seeing we hold a reference to a child @@ -353,15 +377,10 @@ static int upstream_bridge_distance(struct pci_dev *provider, dist_a++; } - /* - * Allow the connection if both devices are on a whitelisted root - * complex, but add an arbitrary large value to the distance. - */ - if (root_complex_whitelist(provider) && - root_complex_whitelist(client)) - return 0x1000 + dist_a + dist_b; + if (dist) + *dist = dist_a + dist_b; - return -1; + return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; check_b_path_acs: bb = b; @@ -378,33 +397,110 @@ static int upstream_bridge_distance(struct pci_dev *provider, bb = pci_upstream_bridge(bb); } - if (acs_cnt) - return -2; + if (dist) + *dist = dist_a + dist_b; - return dist_a + dist_b; + if (acs_cnt) { + if (acs_redirects) + *acs_redirects = true; + + return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; + } + + return PCI_P2PDMA_MAP_BUS_ADDR; } -static int upstream_bridge_distance_warn(struct pci_dev *provider, - struct pci_dev *client) +static unsigned long map_types_idx(struct pci_dev *client) +{ + return (pci_domain_nr(client->bus) << 16) | + (client->bus->number << 8) | client->devfn; +} + +/* + * Find the distance through the nearest common upstream bridge between + * two PCI devices. + * + * If the two devices are the same device then 0 will be returned. + * + * If there are two virtual functions of the same device behind the same + * bridge port then 2 will be returned (one step down to the PCIe switch, + * then one step back to the same device). + * + * In the case where two devices are connected to the same PCIe switch, the + * value 4 will be returned. This corresponds to the following PCI tree: + * + * -+ Root Port + * \+ Switch Upstream Port + * +-+ Switch Downstream Port + * + \- Device A + * \-+ Switch Downstream Port + * \- Device B + * + * The distance is 4 because we traverse from Device A through the downstream + * port of the switch, to the common upstream port, back up to the second + * downstream port and then to Device B. + * + * Any two devices that cannot communicate using p2pdma will return + * PCI_P2PDMA_MAP_NOT_SUPPORTED. 
+ * + * Any two devices that have a data path that goes through the host bridge + * will consult a whitelist. If the host bridges are on the whitelist, + * this function will return PCI_P2PDMA_MAP_THRU_HOST_BRIDGE. + * + * If either bridge is not on the whitelist this function returns + * PCI_P2PDMA_MAP_NOT_SUPPORTED. + * + * If a bridge which has any ACS redirection bits set is in the path, + * acs_redirects will be set to true. In this case, a list of all infringing + * bridge addresses will be populated in acs_list (assuming it's non-null) + * for printk purposes. + */ +static enum pci_p2pdma_map_type +upstream_bridge_distance(struct pci_dev *provider, struct pci_dev *client, + int *dist, bool *acs_redirects, struct seq_buf *acs_list) +{ + enum pci_p2pdma_map_type map_type; + + map_type = __upstream_bridge_distance(provider, client, dist, + acs_redirects, acs_list); + + if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) { + if (!host_bridge_whitelist(provider, client)) + map_type = PCI_P2PDMA_MAP_NOT_SUPPORTED; + } + + if (provider->p2pdma) + xa_store(&provider->p2pdma->map_types, map_types_idx(client), + xa_mk_value(map_type), GFP_KERNEL); + + return map_type; +} + +static enum pci_p2pdma_map_type +upstream_bridge_distance_warn(struct pci_dev *provider, struct pci_dev *client, + int *dist) { struct seq_buf acs_list; + bool acs_redirects; int ret; seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); if (!acs_list.buffer) return -ENOMEM; - ret = upstream_bridge_distance(provider, client, &acs_list); - if (ret == -2) { - pci_warn(client, "cannot be used for peer-to-peer DMA as ACS redirect is set between the client and provider (%s)\n", + ret = upstream_bridge_distance(provider, client, dist, &acs_redirects, + &acs_list); + if (acs_redirects) { + pci_warn(client, "ACS redirect is set between the client and provider (%s)\n", pci_name(provider)); /* Drop final semicolon */ acs_list.buffer[acs_list.len-1] = 0; pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n", acs_list.buffer); + } - } else if (ret < 0) { - pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge\n", + if (ret == PCI_P2PDMA_MAP_NOT_SUPPORTED) { + pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge or whitelisted host bridge\n", pci_name(provider)); } @@ -421,22 +517,22 @@ static int upstream_bridge_distance_warn(struct pci_dev *provider, * @num_clients: number of clients in the array * @verbose: if true, print warnings for devices when we return -1 * - * Returns -1 if any of the clients are not compatible (behind the same - * root port as the provider), otherwise returns a positive number where - * a lower number is the preferable choice. (If there's one client - * that's the same as the provider it will return 0, which is best choice). + * Returns -1 if any of the clients are not compatible, otherwise returns a + * positive number where a lower number is the preferable choice. (If there's + * one client that's the same as the provider it will return 0, which is best + * choice). * - * For now, "compatible" means the provider and the clients are all behind - * the same PCI root port. This cuts out cases that may work but is safest - * for the user. Future work can expand this to white-list root complexes that - * can safely forward between each ports. 
+ * "compatible" means the provider and the clients are either all behind + * the same PCI root port or the host bridges connected to each of the devices + * are listed in the 'pci_p2pdma_whitelist'. */ int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, int num_clients, bool verbose) { bool not_supported = false; struct pci_dev *pci_client; - int distance = 0; + int total_dist = 0; + int distance; int i, ret; if (num_clients == 0) @@ -461,26 +557,26 @@ int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, if (verbose) ret = upstream_bridge_distance_warn(provider, - pci_client); + pci_client, &distance); else ret = upstream_bridge_distance(provider, pci_client, - NULL); + &distance, NULL, NULL); pci_dev_put(pci_client); - if (ret < 0) + if (ret == PCI_P2PDMA_MAP_NOT_SUPPORTED) not_supported = true; if (not_supported && !verbose) break; - distance += ret; + total_dist += distance; } if (not_supported) return -1; - return distance; + return total_dist; } EXPORT_SYMBOL_GPL(pci_p2pdma_distance_many); @@ -706,21 +802,19 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) } EXPORT_SYMBOL_GPL(pci_p2pmem_publish); -/** - * pci_p2pdma_map_sg - map a PCI peer-to-peer scatterlist for DMA - * @dev: device doing the DMA request - * @sg: scatter list to map - * @nents: elements in the scatterlist - * @dir: DMA direction - * - * Scatterlists mapped with this function should not be unmapped in any way. - * - * Returns the number of SG entries mapped or 0 on error. - */ -int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir) +static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct pci_dev *provider, + struct pci_dev *client) +{ + if (!provider->p2pdma) + return PCI_P2PDMA_MAP_NOT_SUPPORTED; + + return xa_to_value(xa_load(&provider->p2pdma->map_types, + map_types_idx(client))); +} + +static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap, + struct device *dev, struct scatterlist *sg, int nents) { - struct dev_pagemap *pgmap; struct scatterlist *s; phys_addr_t paddr; int i; @@ -736,16 +830,80 @@ int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, return 0; for_each_sg(sg, s, nents, i) { - pgmap = sg_page(s)->pgmap; paddr = sg_phys(s); - s->dma_address = paddr - pgmap->pci_p2pdma_bus_offset; + s->dma_address = paddr - p2p_pgmap->bus_offset; sg_dma_len(s) = s->length; } return nents; } -EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg); + +/** + * pci_p2pdma_map_sg - map a PCI peer-to-peer scatterlist for DMA + * @dev: device doing the DMA request + * @sg: scatter list to map + * @nents: elements in the scatterlist + * @dir: DMA direction + * @attrs: DMA attributes passed to dma_map_sg() (if called) + * + * Scatterlists mapped with this function should be unmapped using + * pci_p2pdma_unmap_sg_attrs(). + * + * Returns the number of SG entries mapped or 0 on error. 
+ */ +int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + struct pci_p2pdma_pagemap *p2p_pgmap = + to_p2p_pgmap(sg_page(sg)->pgmap); + struct pci_dev *client; + + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return 0; + + client = to_pci_dev(dev); + + switch (pci_p2pdma_map_type(p2p_pgmap->provider, client)) { + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + return dma_map_sg_attrs(dev, sg, nents, dir, attrs); + case PCI_P2PDMA_MAP_BUS_ADDR: + return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents); + default: + WARN_ON_ONCE(1); + return 0; + } +} +EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs); + +/** + * pci_p2pdma_unmap_sg_attrs - unmap a PCI peer-to-peer scatterlist that was + * mapped with pci_p2pdma_map_sg_attrs() + * @dev: device doing the DMA request + * @sg: scatter list to unmap + * @nents: number of elements returned by pci_p2pdma_map_sg_attrs() + * @dir: DMA direction + * @attrs: DMA attributes passed to dma_unmap_sg() (if called) + */ +void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + struct pci_p2pdma_pagemap *p2p_pgmap = + to_p2p_pgmap(sg_page(sg)->pgmap); + enum pci_p2pdma_map_type map_type; + struct pci_dev *client; + + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return; + + client = to_pci_dev(dev); + + map_type = pci_p2pdma_map_type(p2p_pgmap->provider, client); + + if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + dma_unmap_sg_attrs(dev, sg, nents, dir, attrs); +} +EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs); /** * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 45049f558860..0c02d500158f 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -118,8 +117,58 @@ phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle) return (phys_addr_t)mcfg_addr; } +/* _HPX PCI Setting Record (Type 0); same as _HPP */ +struct hpx_type0 { + u32 revision; /* Not present in _HPP */ + u8 cache_line_size; /* Not applicable to PCIe */ + u8 latency_timer; /* Not applicable to PCIe */ + u8 enable_serr; + u8 enable_perr; +}; + +static struct hpx_type0 pci_default_type0 = { + .revision = 1, + .cache_line_size = 8, + .latency_timer = 0x40, + .enable_serr = 0, + .enable_perr = 0, +}; + +static void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx) +{ + u16 pci_cmd, pci_bctl; + + if (!hpx) + hpx = &pci_default_type0; + + if (hpx->revision > 1) { + pci_warn(dev, "PCI settings rev %d not supported; using defaults\n", + hpx->revision); + hpx = &pci_default_type0; + } + + pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpx->cache_line_size); + pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpx->latency_timer); + pci_read_config_word(dev, PCI_COMMAND, &pci_cmd); + if (hpx->enable_serr) + pci_cmd |= PCI_COMMAND_SERR; + if (hpx->enable_perr) + pci_cmd |= PCI_COMMAND_PARITY; + pci_write_config_word(dev, PCI_COMMAND, pci_cmd); + + /* Program bridge control value */ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { + pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER, + hpx->latency_timer); + pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl); + if (hpx->enable_perr) + pci_bctl |= PCI_BRIDGE_CTL_PARITY; + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl); + } +} + static acpi_status decode_type0_hpx_record(union acpi_object *record, - struct hpp_type0 *hpx0) + struct
hpx_type0 *hpx0) { int i; union acpi_object *fields = record->package.elements; @@ -146,8 +195,30 @@ static acpi_status decode_type0_hpx_record(union acpi_object *record, return AE_OK; } +/* _HPX PCI-X Setting Record (Type 1) */ +struct hpx_type1 { + u32 revision; + u8 max_mem_read; + u8 avg_max_split; + u16 tot_max_split; +}; + +static void program_hpx_type1(struct pci_dev *dev, struct hpx_type1 *hpx) +{ + int pos; + + if (!hpx) + return; + + pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); + if (!pos) + return; + + pci_warn(dev, "PCI-X settings not supported\n"); +} + static acpi_status decode_type1_hpx_record(union acpi_object *record, - struct hpp_type1 *hpx1) + struct hpx_type1 *hpx1) { int i; union acpi_object *fields = record->package.elements; @@ -173,8 +244,130 @@ static acpi_status decode_type1_hpx_record(union acpi_object *record, return AE_OK; } +static bool pcie_root_rcb_set(struct pci_dev *dev) +{ + struct pci_dev *rp = pcie_find_root_port(dev); + u16 lnkctl; + + if (!rp) + return false; + + pcie_capability_read_word(rp, PCI_EXP_LNKCTL, &lnkctl); + if (lnkctl & PCI_EXP_LNKCTL_RCB) + return true; + + return false; +} + +/* _HPX PCI Express Setting Record (Type 2) */ +struct hpx_type2 { + u32 revision; + u32 unc_err_mask_and; + u32 unc_err_mask_or; + u32 unc_err_sever_and; + u32 unc_err_sever_or; + u32 cor_err_mask_and; + u32 cor_err_mask_or; + u32 adv_err_cap_and; + u32 adv_err_cap_or; + u16 pci_exp_devctl_and; + u16 pci_exp_devctl_or; + u16 pci_exp_lnkctl_and; + u16 pci_exp_lnkctl_or; + u32 sec_unc_err_sever_and; + u32 sec_unc_err_sever_or; + u32 sec_unc_err_mask_and; + u32 sec_unc_err_mask_or; +}; + +static void program_hpx_type2(struct pci_dev *dev, struct hpx_type2 *hpx) +{ + int pos; + u32 reg32; + + if (!hpx) + return; + + if (!pci_is_pcie(dev)) + return; + + if (hpx->revision > 1) { + pci_warn(dev, "PCIe settings rev %d not supported\n", + hpx->revision); + return; + } + + /* + * Don't allow _HPX to change MPS or MRRS settings. We manage + * those to make sure they're consistent with the rest of the + * platform. + */ + hpx->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD | + PCI_EXP_DEVCTL_READRQ; + hpx->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD | + PCI_EXP_DEVCTL_READRQ); + + /* Initialize Device Control Register */ + pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, + ~hpx->pci_exp_devctl_and, hpx->pci_exp_devctl_or); + + /* Initialize Link Control Register */ + if (pcie_cap_has_lnkctl(dev)) { + + /* + * If the Root Port supports Read Completion Boundary of + * 128, set RCB to 128. Otherwise, clear it. 
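+ *
+ * The lnkctl_and/lnkctl_or pair below follows the general _HPX
+ * convention of applying an AND mask and then an OR mask, i.e.
+ * new = (old & mask_and) | mask_or. With made-up example values,
+ * old = 0x0041, mask_and = 0xfff7 (clear RCB, bit 3) and
+ * mask_or = 0x0008 (set RCB) yield new = 0x0049.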
+ */
+ hpx->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB;
+ hpx->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB;
+ if (pcie_root_rcb_set(dev))
+ hpx->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB;
+
+ pcie_capability_clear_and_set_word(dev, PCI_EXP_LNKCTL,
+ ~hpx->pci_exp_lnkctl_and, hpx->pci_exp_lnkctl_or);
+ }
+
+ /* Find Advanced Error Reporting Enhanced Capability */
+ pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
+ if (!pos)
+ return;
+
+ /* Initialize Uncorrectable Error Mask Register */
+ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, &reg32);
+ reg32 = (reg32 & hpx->unc_err_mask_and) | hpx->unc_err_mask_or;
+ pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32);
+
+ /* Initialize Uncorrectable Error Severity Register */
+ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &reg32);
+ reg32 = (reg32 & hpx->unc_err_sever_and) | hpx->unc_err_sever_or;
+ pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32);
+
+ /* Initialize Correctable Error Mask Register */
+ pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, &reg32);
+ reg32 = (reg32 & hpx->cor_err_mask_and) | hpx->cor_err_mask_or;
+ pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32);
+
+ /* Initialize Advanced Error Capabilities and Control Register */
+ pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32);
+ reg32 = (reg32 & hpx->adv_err_cap_and) | hpx->adv_err_cap_or;
+
+ /* Don't enable ECRC generation or checking if unsupported */
+ if (!(reg32 & PCI_ERR_CAP_ECRC_GENC))
+ reg32 &= ~PCI_ERR_CAP_ECRC_GENE;
+ if (!(reg32 & PCI_ERR_CAP_ECRC_CHKC))
+ reg32 &= ~PCI_ERR_CAP_ECRC_CHKE;
+ pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32);
+
+ /*
+ * FIXME: The following two registers are not supported yet.
+ *
+ * o Secondary Uncorrectable Error Severity Register
+ * o Secondary Uncorrectable Error Mask Register
+ */
+}
+
static acpi_status decode_type2_hpx_record(union acpi_object *record,
- struct hpp_type2 *hpx2)
+ struct hpx_type2 *hpx2)
{ int i; union acpi_object *fields = record->package.elements;
@@ -213,6 +406,164 @@ static acpi_status decode_type2_hpx_record(union acpi_object *record,
return AE_OK; }
+/* _HPX PCI Express Setting Record (Type 3) */
+struct hpx_type3 {
+ u16 device_type;
+ u16 function_type;
+ u16 config_space_location;
+ u16 pci_exp_cap_id;
+ u16 pci_exp_cap_ver;
+ u16 pci_exp_vendor_id;
+ u16 dvsec_id;
+ u16 dvsec_rev;
+ u16 match_offset;
+ u32 match_mask_and;
+ u32 match_value;
+ u16 reg_offset;
+ u32 reg_mask_and;
+ u32 reg_mask_or;
+};
+
+enum hpx_type3_dev_type {
+ HPX_TYPE_ENDPOINT = BIT(0),
+ HPX_TYPE_LEG_END = BIT(1),
+ HPX_TYPE_RC_END = BIT(2),
+ HPX_TYPE_RC_EC = BIT(3),
+ HPX_TYPE_ROOT_PORT = BIT(4),
+ HPX_TYPE_UPSTREAM = BIT(5),
+ HPX_TYPE_DOWNSTREAM = BIT(6),
+ HPX_TYPE_PCI_BRIDGE = BIT(7),
+ HPX_TYPE_PCIE_BRIDGE = BIT(8),
+};
+
+static u16 hpx3_device_type(struct pci_dev *dev)
+{
+ u16 pcie_type = pci_pcie_type(dev);
+ const int pcie_to_hpx3_type[] = {
+ [PCI_EXP_TYPE_ENDPOINT] = HPX_TYPE_ENDPOINT,
+ [PCI_EXP_TYPE_LEG_END] = HPX_TYPE_LEG_END,
+ [PCI_EXP_TYPE_RC_END] = HPX_TYPE_RC_END,
+ [PCI_EXP_TYPE_RC_EC] = HPX_TYPE_RC_EC,
+ [PCI_EXP_TYPE_ROOT_PORT] = HPX_TYPE_ROOT_PORT,
+ [PCI_EXP_TYPE_UPSTREAM] = HPX_TYPE_UPSTREAM,
+ [PCI_EXP_TYPE_DOWNSTREAM] = HPX_TYPE_DOWNSTREAM,
+ [PCI_EXP_TYPE_PCI_BRIDGE] = HPX_TYPE_PCI_BRIDGE,
+ [PCI_EXP_TYPE_PCIE_BRIDGE] = HPX_TYPE_PCIE_BRIDGE,
+ };
+
+ if (pcie_type >= ARRAY_SIZE(pcie_to_hpx3_type))
+ return 0;
+
+ return pcie_to_hpx3_type[pcie_type];
+}
+
+enum hpx_type3_fn_type {
+ HPX_FN_NORMAL = BIT(0),
+ HPX_FN_SRIOV_PHYS = BIT(1),
+
HPX_FN_SRIOV_VIRT = BIT(2), +}; + +static u8 hpx3_function_type(struct pci_dev *dev) +{ + if (dev->is_virtfn) + return HPX_FN_SRIOV_VIRT; + else if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV) > 0) + return HPX_FN_SRIOV_PHYS; + else + return HPX_FN_NORMAL; +} + +static bool hpx3_cap_ver_matches(u8 pcie_cap_id, u8 hpx3_cap_id) +{ + u8 cap_ver = hpx3_cap_id & 0xf; + + if ((hpx3_cap_id & BIT(4)) && cap_ver >= pcie_cap_id) + return true; + else if (cap_ver == pcie_cap_id) + return true; + + return false; +} + +enum hpx_type3_cfg_loc { + HPX_CFG_PCICFG = 0, + HPX_CFG_PCIE_CAP = 1, + HPX_CFG_PCIE_CAP_EXT = 2, + HPX_CFG_VEND_CAP = 3, + HPX_CFG_DVSEC = 4, + HPX_CFG_MAX, +}; + +static void program_hpx_type3_register(struct pci_dev *dev, + const struct hpx_type3 *reg) +{ + u32 match_reg, write_reg, header, orig_value; + u16 pos; + + if (!(hpx3_device_type(dev) & reg->device_type)) + return; + + if (!(hpx3_function_type(dev) & reg->function_type)) + return; + + switch (reg->config_space_location) { + case HPX_CFG_PCICFG: + pos = 0; + break; + case HPX_CFG_PCIE_CAP: + pos = pci_find_capability(dev, reg->pci_exp_cap_id); + if (pos == 0) + return; + + break; + case HPX_CFG_PCIE_CAP_EXT: + pos = pci_find_ext_capability(dev, reg->pci_exp_cap_id); + if (pos == 0) + return; + + pci_read_config_dword(dev, pos, &header); + if (!hpx3_cap_ver_matches(PCI_EXT_CAP_VER(header), + reg->pci_exp_cap_ver)) + return; + + break; + case HPX_CFG_VEND_CAP: /* Fall through */ + case HPX_CFG_DVSEC: /* Fall through */ + default: + pci_warn(dev, "Encountered _HPX type 3 with unsupported config space location"); + return; + } + + pci_read_config_dword(dev, pos + reg->match_offset, &match_reg); + + if ((match_reg & reg->match_mask_and) != reg->match_value) + return; + + pci_read_config_dword(dev, pos + reg->reg_offset, &write_reg); + orig_value = write_reg; + write_reg &= reg->reg_mask_and; + write_reg |= reg->reg_mask_or; + + if (orig_value == write_reg) + return; + + pci_write_config_dword(dev, pos + reg->reg_offset, write_reg); + + pci_dbg(dev, "Applied _HPX3 at [0x%x]: 0x%08x -> 0x%08x", + pos, orig_value, write_reg); +} + +static void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx) +{ + if (!hpx) + return; + + if (!pci_is_pcie(dev)) + return; + + program_hpx_type3_register(dev, hpx); +} + static void parse_hpx3_register(struct hpx_type3 *hpx3_reg, union acpi_object *reg_fields) { @@ -233,8 +584,7 @@ static void parse_hpx3_register(struct hpx_type3 *hpx3_reg, } static acpi_status program_type3_hpx_record(struct pci_dev *dev, - union acpi_object *record, - const struct hotplug_program_ops *hp_ops) + union acpi_object *record) { union acpi_object *fields = record->package.elements; u32 desc_count, expected_length, revision; @@ -258,7 +608,7 @@ static acpi_status program_type3_hpx_record(struct pci_dev *dev, for (i = 0; i < desc_count; i++) { reg_fields = fields + 3 + i * 14; parse_hpx3_register(&hpx3, reg_fields); - hp_ops->program_type3(dev, &hpx3); + program_hpx_type3(dev, &hpx3); } break; @@ -271,15 +621,14 @@ static acpi_status program_type3_hpx_record(struct pci_dev *dev, return AE_OK; } -static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle, - const struct hotplug_program_ops *hp_ops) +static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle) { acpi_status status; struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; union acpi_object *package, *record, *fields; - struct hpp_type0 hpx0; - struct hpp_type1 hpx1; - struct hpp_type2 hpx2; + struct hpx_type0 hpx0; + 
struct hpx_type1 hpx1; + struct hpx_type2 hpx2; u32 type; int i; @@ -314,24 +663,24 @@ static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle, status = decode_type0_hpx_record(record, &hpx0); if (ACPI_FAILURE(status)) goto exit; - hp_ops->program_type0(dev, &hpx0); + program_hpx_type0(dev, &hpx0); break; case 1: memset(&hpx1, 0, sizeof(hpx1)); status = decode_type1_hpx_record(record, &hpx1); if (ACPI_FAILURE(status)) goto exit; - hp_ops->program_type1(dev, &hpx1); + program_hpx_type1(dev, &hpx1); break; case 2: memset(&hpx2, 0, sizeof(hpx2)); status = decode_type2_hpx_record(record, &hpx2); if (ACPI_FAILURE(status)) goto exit; - hp_ops->program_type2(dev, &hpx2); + program_hpx_type2(dev, &hpx2); break; case 3: - status = program_type3_hpx_record(dev, record, hp_ops); + status = program_type3_hpx_record(dev, record); if (ACPI_FAILURE(status)) goto exit; break; @@ -347,16 +696,15 @@ static acpi_status acpi_run_hpx(struct pci_dev *dev, acpi_handle handle, return status; } -static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle, - const struct hotplug_program_ops *hp_ops) +static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle) { acpi_status status; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *package, *fields; - struct hpp_type0 hpp0; + struct hpx_type0 hpx0; int i; - memset(&hpp0, 0, sizeof(hpp0)); + memset(&hpx0, 0, sizeof(hpx0)); status = acpi_evaluate_object(handle, "_HPP", NULL, &buffer); if (ACPI_FAILURE(status)) @@ -377,26 +725,24 @@ static acpi_status acpi_run_hpp(struct pci_dev *dev, acpi_handle handle, } } - hpp0.revision = 1; - hpp0.cache_line_size = fields[0].integer.value; - hpp0.latency_timer = fields[1].integer.value; - hpp0.enable_serr = fields[2].integer.value; - hpp0.enable_perr = fields[3].integer.value; + hpx0.revision = 1; + hpx0.cache_line_size = fields[0].integer.value; + hpx0.latency_timer = fields[1].integer.value; + hpx0.enable_serr = fields[2].integer.value; + hpx0.enable_perr = fields[3].integer.value; - hp_ops->program_type0(dev, &hpp0); + program_hpx_type0(dev, &hpx0); exit: kfree(buffer.pointer); return status; } -/* pci_get_hp_params +/* pci_acpi_program_hp_params * * @dev - the pci_dev for which we want parameters - * @hpp - allocated by the caller */ -int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops) +int pci_acpi_program_hp_params(struct pci_dev *dev) { acpi_status status; acpi_handle handle, phandle; @@ -419,10 +765,10 @@ int pci_acpi_program_hp_params(struct pci_dev *dev, * this pci dev. 
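 *
 * For reference, _HPP evaluates to a package of four integers that
 * map one-to-one onto hpx_type0; e.g. an illustrative (made-up)
 * firmware package of { 0x08, 0x40, 0x01, 0x00 } decodes as:
 *
 *	struct hpx_type0 hpx0 = {
 *		.revision        = 1,		(implied; not in _HPP)
 *		.cache_line_size = 0x08,	(units of DWORDs)
 *		.latency_timer   = 0x40,
 *		.enable_serr     = 0x01,
 *		.enable_perr     = 0x00,
 *	};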
*/ while (handle) { - status = acpi_run_hpx(dev, handle, hp_ops); + status = acpi_run_hpx(dev, handle); if (ACPI_SUCCESS(status)) return 0; - status = acpi_run_hpp(dev, handle, hp_ops); + status = acpi_run_hpp(dev, handle); if (ACPI_SUCCESS(status)) return 0; if (acpi_is_root_bridge(handle)) diff --git a/drivers/pci/pci-bridge-emul.c b/drivers/pci/pci-bridge-emul.c index 06083b86d4f4..5fd90105510d 100644 --- a/drivers/pci/pci-bridge-emul.c +++ b/drivers/pci/pci-bridge-emul.c @@ -38,7 +38,7 @@ struct pci_bridge_reg_behavior { u32 rsvd; }; -const static struct pci_bridge_reg_behavior pci_regs_behavior[] = { +static const struct pci_bridge_reg_behavior pci_regs_behavior[] = { [PCI_VENDOR_ID / 4] = { .ro = ~0 }, [PCI_COMMAND / 4] = { .rw = (PCI_COMMAND_IO | PCI_COMMAND_MEMORY | @@ -173,7 +173,7 @@ const static struct pci_bridge_reg_behavior pci_regs_behavior[] = { }, }; -const static struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { +static const struct pci_bridge_reg_behavior pcie_cap_regs_behavior[] = { [PCI_CAP_LIST_ID / 4] = { /* * Capability ID, Next Capability Pointer and diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 965c72104150..868e35109284 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -464,9 +464,7 @@ static ssize_t dev_rescan_store(struct device *dev, } return count; } -static struct device_attribute dev_rescan_attr = __ATTR(rescan, - (S_IWUSR|S_IWGRP), - NULL, dev_rescan_store); +static DEVICE_ATTR_WO(dev_rescan); static ssize_t remove_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -480,13 +478,12 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr, pci_stop_and_remove_bus_device_locked(to_pci_dev(dev)); return count; } -static struct device_attribute dev_remove_attr = __ATTR_IGNORE_LOCKDEP(remove, - (S_IWUSR|S_IWGRP), - NULL, remove_store); +static DEVICE_ATTR_IGNORE_LOCKDEP(remove, 0220, NULL, + remove_store); -static ssize_t dev_bus_rescan_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t bus_rescan_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { unsigned long val; struct pci_bus *bus = to_pci_bus(dev); @@ -504,7 +501,7 @@ static ssize_t dev_bus_rescan_store(struct device *dev, } return count; } -static DEVICE_ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, dev_bus_rescan_store); +static DEVICE_ATTR_WO(bus_rescan); #if defined(CONFIG_PM) && defined(CONFIG_ACPI) static ssize_t d3cold_allowed_store(struct device *dev, @@ -551,154 +548,6 @@ static ssize_t devspec_show(struct device *dev, static DEVICE_ATTR_RO(devspec); #endif -#ifdef CONFIG_PCI_IOV -static ssize_t sriov_totalvfs_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pci_sriov_get_totalvfs(pdev)); -} - - -static ssize_t sriov_numvfs_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pdev->sriov->num_VFs); -} - -/* - * num_vfs > 0; number of VFs to enable - * num_vfs = 0; disable all VFs - * - * Note: SRIOV spec doesn't allow partial VF - * disable, so it's all or none. 
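For context, writes to sriov_numvfs are forwarded to the PF driver's ->sriov_configure() hook by the store method below (which this series relocates out of pci-sysfs.c). A minimal, hedged sketch of such a hook for a hypothetical PF driver "mydrv", with error handling trimmed; returning the enabled VF count lets the sysfs code detect partial enables:

	static int mydrv_sriov_configure(struct pci_dev *pdev, int num_vfs)
	{
		if (num_vfs == 0) {
			/* all or none: disable every VF */
			pci_disable_sriov(pdev);
			return 0;
		}

		return pci_enable_sriov(pdev, num_vfs) ?: num_vfs;
	}

The hook is wired up through the .sriov_configure member of struct pci_driver.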
- */ -static ssize_t sriov_numvfs_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct pci_dev *pdev = to_pci_dev(dev); - int ret; - u16 num_vfs; - - ret = kstrtou16(buf, 0, &num_vfs); - if (ret < 0) - return ret; - - if (num_vfs > pci_sriov_get_totalvfs(pdev)) - return -ERANGE; - - device_lock(&pdev->dev); - - if (num_vfs == pdev->sriov->num_VFs) - goto exit; - - /* is PF driver loaded w/callback */ - if (!pdev->driver || !pdev->driver->sriov_configure) { - pci_info(pdev, "Driver doesn't support SRIOV configuration via sysfs\n"); - ret = -ENOENT; - goto exit; - } - - if (num_vfs == 0) { - /* disable VFs */ - ret = pdev->driver->sriov_configure(pdev, 0); - goto exit; - } - - /* enable VFs */ - if (pdev->sriov->num_VFs) { - pci_warn(pdev, "%d VFs already enabled. Disable before enabling %d VFs\n", - pdev->sriov->num_VFs, num_vfs); - ret = -EBUSY; - goto exit; - } - - ret = pdev->driver->sriov_configure(pdev, num_vfs); - if (ret < 0) - goto exit; - - if (ret != num_vfs) - pci_warn(pdev, "%d VFs requested; only %d enabled\n", - num_vfs, ret); - -exit: - device_unlock(&pdev->dev); - - if (ret < 0) - return ret; - - return count; -} - -static ssize_t sriov_offset_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pdev->sriov->offset); -} - -static ssize_t sriov_stride_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pdev->sriov->stride); -} - -static ssize_t sriov_vf_device_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%x\n", pdev->sriov->vf_device); -} - -static ssize_t sriov_drivers_autoprobe_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - return sprintf(buf, "%u\n", pdev->sriov->drivers_autoprobe); -} - -static ssize_t sriov_drivers_autoprobe_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct pci_dev *pdev = to_pci_dev(dev); - bool drivers_autoprobe; - - if (kstrtobool(buf, &drivers_autoprobe) < 0) - return -EINVAL; - - pdev->sriov->drivers_autoprobe = drivers_autoprobe; - - return count; -} - -static struct device_attribute sriov_totalvfs_attr = __ATTR_RO(sriov_totalvfs); -static struct device_attribute sriov_numvfs_attr = - __ATTR(sriov_numvfs, (S_IRUGO|S_IWUSR|S_IWGRP), - sriov_numvfs_show, sriov_numvfs_store); -static struct device_attribute sriov_offset_attr = __ATTR_RO(sriov_offset); -static struct device_attribute sriov_stride_attr = __ATTR_RO(sriov_stride); -static struct device_attribute sriov_vf_device_attr = __ATTR_RO(sriov_vf_device); -static struct device_attribute sriov_drivers_autoprobe_attr = - __ATTR(sriov_drivers_autoprobe, (S_IRUGO|S_IWUSR|S_IWGRP), - sriov_drivers_autoprobe_show, sriov_drivers_autoprobe_store); -#endif /* CONFIG_PCI_IOV */ - static ssize_t driver_override_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -792,7 +641,7 @@ static struct attribute *pcie_dev_attrs[] = { }; static struct attribute *pcibus_attrs[] = { - &dev_attr_rescan.attr, + &dev_attr_bus_rescan.attr, &dev_attr_cpuaffinity.attr, &dev_attr_cpulistaffinity.attr, NULL, @@ -820,7 +669,7 @@ static ssize_t boot_vga_show(struct device *dev, struct device_attribute *attr, 
!!(pdev->resource[PCI_ROM_RESOURCE].flags & IORESOURCE_ROM_SHADOW)); } -static struct device_attribute vga_attr = __ATTR_RO(boot_vga); +static DEVICE_ATTR_RO(boot_vga); static ssize_t pci_read_config(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, @@ -1085,7 +934,7 @@ void pci_create_legacy_files(struct pci_bus *b) sysfs_bin_attr_init(b->legacy_io); b->legacy_io->attr.name = "legacy_io"; b->legacy_io->size = 0xffff; - b->legacy_io->attr.mode = S_IRUSR | S_IWUSR; + b->legacy_io->attr.mode = 0600; b->legacy_io->read = pci_read_legacy_io; b->legacy_io->write = pci_write_legacy_io; b->legacy_io->mmap = pci_mmap_legacy_io; @@ -1099,7 +948,7 @@ void pci_create_legacy_files(struct pci_bus *b) sysfs_bin_attr_init(b->legacy_mem); b->legacy_mem->attr.name = "legacy_mem"; b->legacy_mem->size = 1024*1024; - b->legacy_mem->attr.mode = S_IRUSR | S_IWUSR; + b->legacy_mem->attr.mode = 0600; b->legacy_mem->mmap = pci_mmap_legacy_mem; pci_adjust_legacy_attr(b, pci_mmap_mem); error = device_create_bin_file(&b->dev, b->legacy_mem); @@ -1306,7 +1155,7 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) } } res_attr->attr.name = res_attr_name; - res_attr->attr.mode = S_IRUSR | S_IWUSR; + res_attr->attr.mode = 0600; res_attr->size = pci_resource_len(pdev, num); res_attr->private = (void *)(unsigned long)num; retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr); @@ -1419,7 +1268,7 @@ static ssize_t pci_read_rom(struct file *filp, struct kobject *kobj, static const struct bin_attribute pci_config_attr = { .attr = { .name = "config", - .mode = S_IRUGO | S_IWUSR, + .mode = 0644, }, .size = PCI_CFG_SPACE_SIZE, .read = pci_read_config, @@ -1429,7 +1278,7 @@ static const struct bin_attribute pci_config_attr = { static const struct bin_attribute pcie_config_attr = { .attr = { .name = "config", - .mode = S_IRUGO | S_IWUSR, + .mode = 0644, }, .size = PCI_CFG_SPACE_EXP_SIZE, .read = pci_read_config, @@ -1458,7 +1307,7 @@ static ssize_t reset_store(struct device *dev, struct device_attribute *attr, return count; } -static struct device_attribute reset_attr = __ATTR(reset, 0200, NULL, reset_store); +static DEVICE_ATTR(reset, 0200, NULL, reset_store); static int pci_create_capabilities_sysfs(struct pci_dev *dev) { @@ -1468,7 +1317,7 @@ static int pci_create_capabilities_sysfs(struct pci_dev *dev) pcie_aspm_create_sysfs_dev_files(dev); if (dev->reset_fn) { - retval = device_create_file(&dev->dev, &reset_attr); + retval = device_create_file(&dev->dev, &dev_attr_reset); if (retval) goto error; } @@ -1511,7 +1360,7 @@ int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) sysfs_bin_attr_init(attr); attr->size = rom_size; attr->attr.name = "rom"; - attr->attr.mode = S_IRUSR | S_IWUSR; + attr->attr.mode = 0600; attr->read = pci_read_rom; attr->write = pci_write_rom; retval = sysfs_create_bin_file(&pdev->dev.kobj, attr); @@ -1553,7 +1402,7 @@ static void pci_remove_capabilities_sysfs(struct pci_dev *dev) pcie_vpd_remove_sysfs_dev_files(dev); pcie_aspm_remove_sysfs_dev_files(dev); if (dev->reset_fn) { - device_remove_file(&dev->dev, &reset_attr); + device_remove_file(&dev->dev, &dev_attr_reset); dev->reset_fn = 0; } } @@ -1606,7 +1455,7 @@ static int __init pci_sysfs_init(void) late_initcall(pci_sysfs_init); static struct attribute *pci_dev_dev_attrs[] = { - &vga_attr.attr, + &dev_attr_boot_vga.attr, NULL, }; @@ -1616,7 +1465,7 @@ static umode_t pci_dev_attrs_are_visible(struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct pci_dev *pdev = 
to_pci_dev(dev); - if (a == &vga_attr.attr) + if (a == &dev_attr_boot_vga.attr) if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA) return 0; @@ -1624,8 +1473,8 @@ static umode_t pci_dev_attrs_are_visible(struct kobject *kobj, } static struct attribute *pci_dev_hp_attrs[] = { - &dev_remove_attr.attr, - &dev_rescan_attr.attr, + &dev_attr_remove.attr, + &dev_attr_dev_rescan.attr, NULL, }; @@ -1697,34 +1546,6 @@ static const struct attribute_group pci_dev_hp_attr_group = { .is_visible = pci_dev_hp_attrs_are_visible, }; -#ifdef CONFIG_PCI_IOV -static struct attribute *sriov_dev_attrs[] = { - &sriov_totalvfs_attr.attr, - &sriov_numvfs_attr.attr, - &sriov_offset_attr.attr, - &sriov_stride_attr.attr, - &sriov_vf_device_attr.attr, - &sriov_drivers_autoprobe_attr.attr, - NULL, -}; - -static umode_t sriov_attrs_are_visible(struct kobject *kobj, - struct attribute *a, int n) -{ - struct device *dev = kobj_to_dev(kobj); - - if (!dev_is_pf(dev)) - return 0; - - return a->mode; -} - -static const struct attribute_group sriov_dev_attr_group = { - .attrs = sriov_dev_attrs, - .is_visible = sriov_attrs_are_visible, -}; -#endif /* CONFIG_PCI_IOV */ - static const struct attribute_group pci_dev_attr_group = { .attrs = pci_dev_dev_attrs, .is_visible = pci_dev_attrs_are_visible, diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 1b27b5af3d55..e7982af9a5d8 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -890,8 +890,8 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state) pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK); - if (dev->current_state != state && printk_ratelimit()) - pci_info(dev, "Refused to change power state, currently in D%d\n", + if (dev->current_state != state) + pci_info_ratelimited(dev, "Refused to change power state, currently in D%d\n", dev->current_state); /* @@ -1443,7 +1443,7 @@ static void pci_restore_rebar_state(struct pci_dev *pdev) pci_read_config_dword(pdev, pos + PCI_REBAR_CTRL, &ctrl); bar_idx = ctrl & PCI_REBAR_CTRL_BAR_IDX; res = pdev->resource + bar_idx; - size = order_base_2((resource_size(res) >> 20) | 1) - 1; + size = ilog2(resource_size(res)) - 20; ctrl &= ~PCI_REBAR_CTRL_BAR_SIZE; ctrl |= size << PCI_REBAR_CTRL_BAR_SHIFT; pci_write_config_dword(pdev, pos + PCI_REBAR_CTRL, ctrl); @@ -3581,7 +3581,7 @@ int pci_enable_atomic_ops_to_root(struct pci_dev *dev, u32 cap_mask) } /* Ensure upstream ports don't block AtomicOps on egress */ - if (!bridge->has_secondary_link) { + if (pci_pcie_type(bridge) == PCI_EXP_TYPE_UPSTREAM) { pcie_capability_read_dword(bridge, PCI_EXP_DEVCTL2, &ctl2); if (ctl2 & PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK) @@ -5923,8 +5923,19 @@ resource_size_t __weak pcibios_default_alignment(void) return 0; } -#define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE -static char resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0}; +/* + * Arches that don't want to expose struct resource to userland as-is in + * sysfs and /proc can implement their own pci_resource_to_user(). 
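+ *
+ * A hedged sketch of such an arch override (hypothetical architecture
+ * that reports BAR positions as bus addresses rather than the raw
+ * struct resource values):
+ *
+ *	void pci_resource_to_user(const struct pci_dev *dev, int bar,
+ *				  const struct resource *rsrc,
+ *				  resource_size_t *start,
+ *				  resource_size_t *end)
+ *	{
+ *		struct pci_bus_region region;
+ *
+ *		pcibios_resource_to_bus(dev->bus, &region,
+ *					(struct resource *)rsrc);
+ *		*start = region.start;
+ *		*end = region.end;
+ *	}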
+ */ +void __weak pci_resource_to_user(const struct pci_dev *dev, int bar, + const struct resource *rsrc, + resource_size_t *start, resource_size_t *end) +{ + *start = rsrc->start; + *end = rsrc->end; +} + +static char *resource_alignment_param; static DEFINE_SPINLOCK(resource_alignment_lock); /** @@ -5945,7 +5956,7 @@ static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev, spin_lock(&resource_alignment_lock); p = resource_alignment_param; - if (!*p && !align) + if (!p || !*p) goto out; if (pci_has_flag(PCI_PROBE_ONLY)) { align = 0; @@ -6109,35 +6120,41 @@ void pci_reassigndev_resource_alignment(struct pci_dev *dev) } } -static ssize_t pci_set_resource_alignment_param(const char *buf, size_t count) -{ - if (count > RESOURCE_ALIGNMENT_PARAM_SIZE - 1) - count = RESOURCE_ALIGNMENT_PARAM_SIZE - 1; - spin_lock(&resource_alignment_lock); - strncpy(resource_alignment_param, buf, count); - resource_alignment_param[count] = '\0'; - spin_unlock(&resource_alignment_lock); - return count; -} - -static ssize_t pci_get_resource_alignment_param(char *buf, size_t size) -{ - size_t count; - spin_lock(&resource_alignment_lock); - count = snprintf(buf, size, "%s", resource_alignment_param); - spin_unlock(&resource_alignment_lock); - return count; -} - static ssize_t resource_alignment_show(struct bus_type *bus, char *buf) { - return pci_get_resource_alignment_param(buf, PAGE_SIZE); + size_t count = 0; + + spin_lock(&resource_alignment_lock); + if (resource_alignment_param) + count = snprintf(buf, PAGE_SIZE, "%s", resource_alignment_param); + spin_unlock(&resource_alignment_lock); + + /* + * When set by the command line, resource_alignment_param will not + * have a trailing line feed, which is ugly. So conditionally add + * it here. + */ + if (count >= 2 && buf[count - 2] != '\n' && count < PAGE_SIZE - 1) { + buf[count - 1] = '\n'; + buf[count++] = 0; + } + + return count; } static ssize_t resource_alignment_store(struct bus_type *bus, const char *buf, size_t count) { - return pci_set_resource_alignment_param(buf, count); + char *param = kstrndup(buf, count, GFP_KERNEL); + + if (!param) + return -ENOMEM; + + spin_lock(&resource_alignment_lock); + kfree(resource_alignment_param); + resource_alignment_param = param; + spin_unlock(&resource_alignment_lock); + return count; } static BUS_ATTR_RW(resource_alignment); @@ -6266,8 +6283,7 @@ static int __init pci_setup(char *str) } else if (!strncmp(str, "cbmemsize=", 10)) { pci_cardbus_mem_size = memparse(str + 10, &str); } else if (!strncmp(str, "resource_alignment=", 19)) { - pci_set_resource_alignment_param(str + 19, - strlen(str + 19)); + resource_alignment_param = str + 19; } else if (!strncmp(str, "ecrc=", 5)) { pcie_ecrc_get_policy(str + 5); } else if (!strncmp(str, "hpiosize=", 9)) { @@ -6302,15 +6318,18 @@ static int __init pci_setup(char *str) early_param("pci", pci_setup); /* - * 'disable_acs_redir_param' is initialized in pci_setup(), above, to point - * to data in the __initdata section which will be freed after the init - * sequence is complete. We can't allocate memory in pci_setup() because some - * architectures do not have any memory allocation service available during - * an early_param() call. So we allocate memory and copy the variable here - * before the init section is freed. + * 'resource_alignment_param' and 'disable_acs_redir_param' are initialized + * in pci_setup(), above, to point to data in the __initdata section which + * will be freed after the init sequence is complete. 
We can't allocate memory + * in pci_setup() because some architectures do not have any memory allocation + * service available during an early_param() call. So we allocate memory and + * copy the variable here before the init section is freed. + * */ static int __init pci_realloc_setup_params(void) { + resource_alignment_param = kstrdup(resource_alignment_param, + GFP_KERNEL); disable_acs_redir_param = kstrdup(disable_acs_redir_param, GFP_KERNEL); return 0; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index d22d1b807701..3f6947ee3324 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -39,6 +39,11 @@ int pci_probe_reset_function(struct pci_dev *dev); int pci_bridge_secondary_bus_reset(struct pci_dev *dev); int pci_bus_error_reset(struct pci_dev *dev); +#define PCI_PM_D2_DELAY 200 +#define PCI_PM_D3_WAIT 10 +#define PCI_PM_D3COLD_WAIT 100 +#define PCI_PM_BUS_WAIT 50 + /** * struct pci_platform_pm_ops - Firmware PM callbacks * @@ -84,6 +89,8 @@ void pci_power_up(struct pci_dev *dev); void pci_disable_enabled_device(struct pci_dev *dev); int pci_finish_runtime_suspend(struct pci_dev *dev); void pcie_clear_root_pme_status(struct pci_dev *dev); +bool pci_check_pme_status(struct pci_dev *dev); +void pci_pme_wakeup_bus(struct pci_bus *bus); int __pci_pme_wakeup(struct pci_dev *dev, void *ign); void pci_pme_restore(struct pci_dev *dev); bool pci_dev_need_resume(struct pci_dev *dev); @@ -118,11 +125,25 @@ static inline bool pci_power_manageable(struct pci_dev *pci_dev) return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3; } +static inline bool pcie_downstream_port(const struct pci_dev *dev) +{ + int type = pci_pcie_type(dev); + + return type == PCI_EXP_TYPE_ROOT_PORT || + type == PCI_EXP_TYPE_DOWNSTREAM || + type == PCI_EXP_TYPE_PCIE_BRIDGE; +} + int pci_vpd_init(struct pci_dev *dev); void pci_vpd_release(struct pci_dev *dev); void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev); void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev); +/* PCI Virtual Channel */ +int pci_save_vc_state(struct pci_dev *dev); +void pci_restore_vc_state(struct pci_dev *dev); +void pci_allocate_vc_save_buffers(struct pci_dev *dev); + /* PCI /proc functions */ #ifdef CONFIG_PROC_FS int pci_proc_attach_device(struct pci_dev *dev); @@ -196,6 +217,9 @@ extern const struct attribute_group *pcibus_groups[]; extern const struct device_type pci_dev_type; extern const struct attribute_group *pci_bus_groups[]; +extern unsigned long pci_hotplug_io_size; +extern unsigned long pci_hotplug_mem_size; +extern unsigned long pci_hotplug_bus_size; /** * pci_match_one_device - Tell if a PCI device structure has a matching @@ -236,6 +260,9 @@ enum pci_bar_type { pci_bar_mem64, /* A 64-bit memory BAR */ }; +struct device *pci_get_host_bridge_device(struct pci_dev *dev); +void pci_put_host_bridge_device(struct device *dev); + int pci_configure_extended_tags(struct pci_dev *dev, void *ign); bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout); @@ -256,6 +283,8 @@ bool pci_bus_clip_resource(struct pci_dev *dev, int idx); void pci_reassigndev_resource_alignment(struct pci_dev *dev); void pci_disable_bridge_window(struct pci_dev *dev); +struct pci_bus *pci_bus_get(struct pci_bus *bus); +void pci_bus_put(struct pci_bus *bus); /* PCIe link information */ #define PCIE_SPEED2STR(speed) \ @@ -279,6 +308,7 @@ u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed, enum pcie_link_width *width); void __pcie_print_link_status(struct pci_dev *dev, bool verbose); void 
pcie_report_downtraining(struct pci_dev *dev); +void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); /* Single Root I/O Virtualization */ struct pci_sriov { @@ -418,11 +448,12 @@ static inline void pci_restore_dpc_state(struct pci_dev *dev) {} #endif #ifdef CONFIG_PCI_ATS +/* Address Translation Service */ +void pci_ats_init(struct pci_dev *dev); void pci_restore_ats_state(struct pci_dev *dev); #else -static inline void pci_restore_ats_state(struct pci_dev *dev) -{ -} +static inline void pci_ats_init(struct pci_dev *d) { } +static inline void pci_restore_ats_state(struct pci_dev *dev) { } #endif /* CONFIG_PCI_ATS */ #ifdef CONFIG_PCI_IOV @@ -433,7 +464,7 @@ void pci_iov_update_resource(struct pci_dev *dev, int resno); resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno); void pci_restore_iov_state(struct pci_dev *dev); int pci_iov_bus_range(struct pci_bus *bus); - +extern const struct attribute_group sriov_dev_attr_group; #else static inline int pci_iov_init(struct pci_dev *dev) { @@ -518,10 +549,21 @@ static inline void pcie_aspm_create_sysfs_dev_files(struct pci_dev *pdev) { } static inline void pcie_aspm_remove_sysfs_dev_files(struct pci_dev *pdev) { } #endif +#ifdef CONFIG_PCIE_ECRC +void pcie_set_ecrc_checking(struct pci_dev *dev); +void pcie_ecrc_get_policy(char *str); +#else +static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } +static inline void pcie_ecrc_get_policy(char *str) { } +#endif + #ifdef CONFIG_PCIE_PTM void pci_ptm_init(struct pci_dev *dev); +int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); #else static inline void pci_ptm_init(struct pci_dev *dev) { } +static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) +{ return -EINVAL; } #endif struct pci_dev_reset_methods { @@ -558,6 +600,10 @@ struct device_node; int of_pci_parse_bus_range(struct device_node *node, struct resource *res); int of_get_pci_domain_nr(struct device_node *node); int of_pci_get_max_link_speed(struct device_node *node); +void pci_set_of_node(struct pci_dev *dev); +void pci_release_of_node(struct pci_dev *dev); +void pci_set_bus_of_node(struct pci_bus *bus); +void pci_release_bus_of_node(struct pci_bus *bus); #else static inline int @@ -577,6 +623,11 @@ of_pci_get_max_link_speed(struct device_node *node) { return -EINVAL; } + +static inline void pci_set_of_node(struct pci_dev *dev) { } +static inline void pci_release_of_node(struct pci_dev *dev) { } +static inline void pci_set_bus_of_node(struct pci_bus *bus) { } +static inline void pci_release_bus_of_node(struct pci_bus *bus) { } #endif /* CONFIG_OF */ #if defined(CONFIG_OF_ADDRESS) @@ -607,4 +658,13 @@ static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline void pci_aer_clear_device_status(struct pci_dev *dev) { } #endif +#ifdef CONFIG_ACPI +int pci_acpi_program_hp_params(struct pci_dev *dev); +#else +static inline int pci_acpi_program_hp_params(struct pci_dev *dev) +{ + return -ENODEV; +} +#endif + #endif /* DRIVERS_PCI_H */ diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 464f8f92653f..652ef23bba35 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "../pci.h" #ifdef MODULE_PARAM_PREFIX @@ -913,10 +912,10 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev) /* * We allocate pcie_link_state for the component on the upstream - * end of a Link, so there's nothing to do unless this device has a - * Link on its secondary side. 
+ * end of a Link, so there's nothing to do unless this device is
+ * a downstream port.
*/
- if (!pdev->has_secondary_link)
+ if (!pcie_downstream_port(pdev))
return;
/* VIA has a strange chipset, root port is under a bridge */
@@ -1070,7 +1069,7 @@ static int __pci_disable_link_state(struct pci_dev *pdev, int state, bool sem)
if (!pci_is_pcie(pdev))
return 0;
- if (pdev->has_secondary_link)
+ if (pcie_downstream_port(pdev))
parent = pdev;
if (!parent || !parent->link_state)
return -EINVAL;
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 773197a12568..b0e6048a9208 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -166,7 +166,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
driver = pcie_port_find_service(dev, service);
if (driver && driver->reset_link) {
status = driver->reset_link(dev);
- } else if (dev->has_secondary_link) {
+ } else if (pcie_downstream_port(dev)) {
status = default_reset_link(dev);
} else {
pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index dbeeb385fb9f..3d5271a7a849 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1426,26 +1426,38 @@ void set_pcie_port_type(struct pci_dev *pdev)
pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
+ parent = pci_upstream_bridge(pdev);
+ if (!parent)
+ return;
+
/*
- * A Root Port or a PCI-to-PCIe bridge is always the upstream end
- * of a Link. No PCIe component has two Links. Two Links are
- * connected by a Switch that has a Port on each Link and internal
- * logic to connect the two Ports.
+ * Some systems do not identify their upstream/downstream ports
+ * correctly, so detect impossible configurations here and correct
+ * the port type accordingly.
*/
type = pci_pcie_type(pdev);
- if (type == PCI_EXP_TYPE_ROOT_PORT ||
- type == PCI_EXP_TYPE_PCIE_BRIDGE)
- pdev->has_secondary_link = 1;
- else if (type == PCI_EXP_TYPE_UPSTREAM ||
- type == PCI_EXP_TYPE_DOWNSTREAM) {
- parent = pci_upstream_bridge(pdev);
-
+ if (type == PCI_EXP_TYPE_DOWNSTREAM) {
/*
- * Usually there's an upstream device (Root Port or Switch
- * Downstream Port), but we can't assume one exists.
+ * If pdev claims to be a downstream port but the parent
+ * device is also a downstream port, assume pdev is
+ * actually an upstream port.
*/
- if (parent && !parent->has_secondary_link)
- pdev->has_secondary_link = 1;
+ if (pcie_downstream_port(parent)) {
+ pci_info(pdev, "claims to be downstream port but is acting as upstream port, correcting type\n");
+ pdev->pcie_flags_reg &= ~PCI_EXP_FLAGS_TYPE;
+ pdev->pcie_flags_reg |= PCI_EXP_TYPE_UPSTREAM;
+ }
+ } else if (type == PCI_EXP_TYPE_UPSTREAM) {
+ /*
+ * If pdev claims to be an upstream port but the parent
+ * device is also an upstream port, assume pdev is
+ * actually a downstream port.
+ */ + if (pci_pcie_type(parent) == PCI_EXP_TYPE_UPSTREAM) { + pci_info(pdev, "claims to be upstream port but is acting as downstream port, correcting type\n"); + pdev->pcie_flags_reg &= ~PCI_EXP_FLAGS_TYPE; + pdev->pcie_flags_reg |= PCI_EXP_TYPE_DOWNSTREAM; + } } } @@ -1915,275 +1927,6 @@ static void pci_configure_mps(struct pci_dev *dev) p_mps, mps, mpss); } -static struct hpp_type0 pci_default_type0 = { - .revision = 1, - .cache_line_size = 8, - .latency_timer = 0x40, - .enable_serr = 0, - .enable_perr = 0, -}; - -static void program_hpp_type0(struct pci_dev *dev, struct hpp_type0 *hpp) -{ - u16 pci_cmd, pci_bctl; - - if (!hpp) - hpp = &pci_default_type0; - - if (hpp->revision > 1) { - pci_warn(dev, "PCI settings rev %d not supported; using defaults\n", - hpp->revision); - hpp = &pci_default_type0; - } - - pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpp->cache_line_size); - pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpp->latency_timer); - pci_read_config_word(dev, PCI_COMMAND, &pci_cmd); - if (hpp->enable_serr) - pci_cmd |= PCI_COMMAND_SERR; - if (hpp->enable_perr) - pci_cmd |= PCI_COMMAND_PARITY; - pci_write_config_word(dev, PCI_COMMAND, pci_cmd); - - /* Program bridge control value */ - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { - pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER, - hpp->latency_timer); - pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl); - if (hpp->enable_perr) - pci_bctl |= PCI_BRIDGE_CTL_PARITY; - pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl); - } -} - -static void program_hpp_type1(struct pci_dev *dev, struct hpp_type1 *hpp) -{ - int pos; - - if (!hpp) - return; - - pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); - if (!pos) - return; - - pci_warn(dev, "PCI-X settings not supported\n"); -} - -static bool pcie_root_rcb_set(struct pci_dev *dev) -{ - struct pci_dev *rp = pcie_find_root_port(dev); - u16 lnkctl; - - if (!rp) - return false; - - pcie_capability_read_word(rp, PCI_EXP_LNKCTL, &lnkctl); - if (lnkctl & PCI_EXP_LNKCTL_RCB) - return true; - - return false; -} - -static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) -{ - int pos; - u32 reg32; - - if (!hpp) - return; - - if (!pci_is_pcie(dev)) - return; - - if (hpp->revision > 1) { - pci_warn(dev, "PCIe settings rev %d not supported\n", - hpp->revision); - return; - } - - /* - * Don't allow _HPX to change MPS or MRRS settings. We manage - * those to make sure they're consistent with the rest of the - * platform. - */ - hpp->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD | - PCI_EXP_DEVCTL_READRQ; - hpp->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD | - PCI_EXP_DEVCTL_READRQ); - - /* Initialize Device Control Register */ - pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, - ~hpp->pci_exp_devctl_and, hpp->pci_exp_devctl_or); - - /* Initialize Link Control Register */ - if (pcie_cap_has_lnkctl(dev)) { - - /* - * If the Root Port supports Read Completion Boundary of - * 128, set RCB to 128. Otherwise, clear it. 
- */
- hpp->pci_exp_lnkctl_and |= PCI_EXP_LNKCTL_RCB;
- hpp->pci_exp_lnkctl_or &= ~PCI_EXP_LNKCTL_RCB;
- if (pcie_root_rcb_set(dev))
- hpp->pci_exp_lnkctl_or |= PCI_EXP_LNKCTL_RCB;
-
- pcie_capability_clear_and_set_word(dev, PCI_EXP_LNKCTL,
- ~hpp->pci_exp_lnkctl_and, hpp->pci_exp_lnkctl_or);
- }
-
- /* Find Advanced Error Reporting Enhanced Capability */
- pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
- if (!pos)
- return;
-
- /* Initialize Uncorrectable Error Mask Register */
- pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, &reg32);
- reg32 = (reg32 & hpp->unc_err_mask_and) | hpp->unc_err_mask_or;
- pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32);
-
- /* Initialize Uncorrectable Error Severity Register */
- pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &reg32);
- reg32 = (reg32 & hpp->unc_err_sever_and) | hpp->unc_err_sever_or;
- pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32);
-
- /* Initialize Correctable Error Mask Register */
- pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, &reg32);
- reg32 = (reg32 & hpp->cor_err_mask_and) | hpp->cor_err_mask_or;
- pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32);
-
- /* Initialize Advanced Error Capabilities and Control Register */
- pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32);
- reg32 = (reg32 & hpp->adv_err_cap_and) | hpp->adv_err_cap_or;
-
- /* Don't enable ECRC generation or checking if unsupported */
- if (!(reg32 & PCI_ERR_CAP_ECRC_GENC))
- reg32 &= ~PCI_ERR_CAP_ECRC_GENE;
- if (!(reg32 & PCI_ERR_CAP_ECRC_CHKC))
- reg32 &= ~PCI_ERR_CAP_ECRC_CHKE;
- pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32);
-
- /*
- * FIXME: The following two registers are not supported yet.
- *
- * o Secondary Uncorrectable Error Severity Register
- * o Secondary Uncorrectable Error Mask Register
- */
-}
-
-static u16 hpx3_device_type(struct pci_dev *dev)
-{
- u16 pcie_type = pci_pcie_type(dev);
- const int pcie_to_hpx3_type[] = {
- [PCI_EXP_TYPE_ENDPOINT] = HPX_TYPE_ENDPOINT,
- [PCI_EXP_TYPE_LEG_END] = HPX_TYPE_LEG_END,
- [PCI_EXP_TYPE_RC_END] = HPX_TYPE_RC_END,
- [PCI_EXP_TYPE_RC_EC] = HPX_TYPE_RC_EC,
- [PCI_EXP_TYPE_ROOT_PORT] = HPX_TYPE_ROOT_PORT,
- [PCI_EXP_TYPE_UPSTREAM] = HPX_TYPE_UPSTREAM,
- [PCI_EXP_TYPE_DOWNSTREAM] = HPX_TYPE_DOWNSTREAM,
- [PCI_EXP_TYPE_PCI_BRIDGE] = HPX_TYPE_PCI_BRIDGE,
- [PCI_EXP_TYPE_PCIE_BRIDGE] = HPX_TYPE_PCIE_BRIDGE,
- };
-
- if (pcie_type >= ARRAY_SIZE(pcie_to_hpx3_type))
- return 0;
-
- return pcie_to_hpx3_type[pcie_type];
-}
-
-static u8 hpx3_function_type(struct pci_dev *dev)
-{
- if (dev->is_virtfn)
- return HPX_FN_SRIOV_VIRT;
- else if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_SRIOV) > 0)
- return HPX_FN_SRIOV_PHYS;
- else
- return HPX_FN_NORMAL;
-}
-
-static bool hpx3_cap_ver_matches(u8 pcie_cap_id, u8 hpx3_cap_id)
-{
- u8 cap_ver = hpx3_cap_id & 0xf;
-
- if ((hpx3_cap_id & BIT(4)) && cap_ver >= pcie_cap_id)
- return true;
- else if (cap_ver == pcie_cap_id)
- return true;
-
- return false;
-}
-
-static void program_hpx_type3_register(struct pci_dev *dev,
- const struct hpx_type3 *reg)
-{
- u32 match_reg, write_reg, header, orig_value;
- u16 pos;
-
- if (!(hpx3_device_type(dev) & reg->device_type))
- return;
-
- if (!(hpx3_function_type(dev) & reg->function_type))
- return;
-
- switch (reg->config_space_location) {
- case HPX_CFG_PCICFG:
- pos = 0;
- break;
- case HPX_CFG_PCIE_CAP:
- pos = pci_find_capability(dev, reg->pci_exp_cap_id);
- if (pos == 0)
- return;
-
- break;
- case HPX_CFG_PCIE_CAP_EXT:
- pos = pci_find_ext_capability(dev, reg->pci_exp_cap_id);
-
if (pos == 0)
- return;
-
- pci_read_config_dword(dev, pos, &header);
- if (!hpx3_cap_ver_matches(PCI_EXT_CAP_VER(header),
- reg->pci_exp_cap_ver))
- return;
-
- break;
- case HPX_CFG_VEND_CAP: /* Fall through */
- case HPX_CFG_DVSEC: /* Fall through */
- default:
- pci_warn(dev, "Encountered _HPX type 3 with unsupported config space location");
- return;
- }
-
- pci_read_config_dword(dev, pos + reg->match_offset, &match_reg);
-
- if ((match_reg & reg->match_mask_and) != reg->match_value)
- return;
-
- pci_read_config_dword(dev, pos + reg->reg_offset, &write_reg);
- orig_value = write_reg;
- write_reg &= reg->reg_mask_and;
- write_reg |= reg->reg_mask_or;
-
- if (orig_value == write_reg)
- return;
-
- pci_write_config_dword(dev, pos + reg->reg_offset, write_reg);
-
- pci_dbg(dev, "Applied _HPX3 at [0x%x]: 0x%08x -> 0x%08x",
- pos, orig_value, write_reg);
-}
-
-static void program_hpx_type3(struct pci_dev *dev, struct hpx_type3 *hpx3)
-{
- if (!hpx3)
- return;
-
- if (!pci_is_pcie(dev))
- return;
-
- program_hpx_type3_register(dev, hpx3);
-}
-
int pci_configure_extended_tags(struct pci_dev *dev, void *ign)
{ struct pci_host_bridge *host;
@@ -2364,13 +2107,6 @@ static void pci_configure_serr(struct pci_dev *dev)
static void pci_configure_device(struct pci_dev *dev)
{
- static const struct hotplug_program_ops hp_ops = {
- .program_type0 = program_hpp_type0,
- .program_type1 = program_hpp_type1,
- .program_type2 = program_hpp_type2,
- .program_type3 = program_hpx_type3,
- };
-
pci_configure_mps(dev);
pci_configure_extended_tags(dev, NULL);
pci_configure_relaxed_ordering(dev);
@@ -2378,7 +2114,7 @@ static void pci_configure_device(struct pci_dev *dev)
pci_configure_eetlp_prefix(dev);
pci_configure_serr(dev);
- pci_acpi_program_hp_params(dev, &hp_ops);
+ pci_acpi_program_hp_params(dev);
}
static void pci_release_capabilities(struct pci_dev *dev)
@@ -2759,12 +2495,8 @@ static int only_one_child(struct pci_bus *bus)
* A PCIe Downstream Port normally leads to a Link with only Device
* 0 on it (PCIe spec r3.1, sec 7.3.1). As an optimization, scan
* only for Device 0 in that situation.
-
- * Checking has_secondary_link is a hack to identify Downstream
- * Ports because sometimes Switches are configured such that the
- * PCIe Port Type labels are backwards.
*/
- if (bridge && pci_is_pcie(bridge) && bridge->has_secondary_link)
+ if (bridge && pci_is_pcie(bridge) && pcie_downstream_port(bridge))
return 1;
return 0;
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 44c4ae1abd00..320255e5e8f8 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -20,7 +20,6 @@
#include #include #include
-#include
#include #include #include
@@ -2592,6 +2591,59 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NVENET_15,
nvenet_msi_disable);
+
+/*
+ * PCIe spec r4.0 sec 7.7.1.2 and sec 7.7.2.2 say that if MSI/MSI-X is enabled,
+ * then the device can't use INTx interrupts. Tegra's PCIe root ports don't
+ * generate MSI interrupts for PME and AER events; instead, only INTx interrupts
+ * are generated. Though Tegra's PCIe root ports can generate MSI interrupts
+ * for other events, since the PCIe specification doesn't support using a mix of
+ * INTx and MSI/MSI-X, it is required to disable MSI interrupts to avoid port
+ * service drivers registering their respective ISRs for MSIs.
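+ *
+ * Hedged illustration of the effect: with dev->no_msi set by this
+ * quirk, a port service (or any other caller) requesting vectors with
+ *
+ *	nvecs = pci_alloc_irq_vectors(pdev, 1, 4, PCI_IRQ_ALL_TYPES);
+ *
+ * cannot be granted MSI or MSI-X and falls back to a single legacy
+ * INTx vector (nvecs == 1).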
+ */ +static void pci_quirk_nvidia_tegra_disable_rp_msi(struct pci_dev *dev) +{ + dev->no_msi = 1; +} +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x1ad0, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x1ad1, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x1ad2, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0bf0, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0bf1, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e1c, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e1d, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e12, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0e13, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0fae, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0faf, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x10e5, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); +DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_NVIDIA, 0x10e6, + PCI_CLASS_BRIDGE_PCI, 8, + pci_quirk_nvidia_tegra_disable_rp_msi); + /* * Some versions of the MCP55 bridge from Nvidia have a legacy IRQ routing * config register. This register controls the routing of legacy @@ -2925,6 +2977,24 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, 0x10a1, quirk_msi_intx_disable_qca_bug); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, 0xe091, quirk_msi_intx_disable_qca_bug); + +/* + * Amazon's Annapurna Labs 1c36:0031 Root Ports don't support MSI-X, so it + * should be disabled on platforms where the device (mistakenly) advertises it. + * + * Notice that this quirk also disables MSI (which may work, but hasn't been + * tested), since currently there is no standard way to disable only MSI-X. + * + * The 0031 device id is reused for other non Root Port device types, + * therefore the quirk is registered for the PCI_CLASS_BRIDGE_PCI class. + */ +static void quirk_al_msi_disable(struct pci_dev *dev) +{ + dev->no_msi = 1; + pci_warn(dev, "Disabling MSI/MSI-X\n"); +} +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, + PCI_CLASS_BRIDGE_PCI, 8, quirk_al_msi_disable); #endif /* CONFIG_PCI_MSI */ /* @@ -4366,6 +4436,24 @@ static int pci_quirk_qcom_rp_acs(struct pci_dev *dev, u16 acs_flags) return ret; } +static int pci_quirk_al_acs(struct pci_dev *dev, u16 acs_flags) +{ + if (pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) + return -ENOTTY; + + /* + * Amazon's Annapurna Labs root ports don't include an ACS capability, + * but do include ACS-like functionality. The hardware doesn't support + * peer-to-peer transactions via the root port and each has a unique + * segment number. + * + * Additionally, the root ports cannot send traffic to each other. + */ + acs_flags &= ~(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF); + + return acs_flags ? 
0 : 1; +} + /* * Sunrise Point PCH root ports implement ACS, but unfortunately as shown in * the datasheet (Intel 100 Series Chipset Family PCH Datasheet, Vol. 2, @@ -4466,6 +4554,19 @@ static int pci_quirk_mf_endpoint_acs(struct pci_dev *dev, u16 acs_flags) return acs_flags ? 0 : 1; } +static int pci_quirk_brcm_acs(struct pci_dev *dev, u16 acs_flags) +{ + /* + * iProc PAXB Root Ports don't advertise an ACS capability, but + * they do not allow peer-to-peer transactions between Root Ports. + * Allow each Root Port to be in a separate IOMMU group by masking + * SV/RR/CR/UF bits. + */ + acs_flags &= ~(PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF); + + return acs_flags ? 0 : 1; +} + static const struct pci_dev_acs_enabled { u16 vendor; u16 device; @@ -4559,6 +4660,9 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_AMPERE, 0xE00A, pci_quirk_xgene_acs }, { PCI_VENDOR_ID_AMPERE, 0xE00B, pci_quirk_xgene_acs }, { PCI_VENDOR_ID_AMPERE, 0xE00C, pci_quirk_xgene_acs }, + { PCI_VENDOR_ID_BROADCOM, 0xD714, pci_quirk_brcm_acs }, + /* Amazon Annapurna Labs */ + { PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, pci_quirk_al_acs }, { 0 } }; diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 7f4e65872b8d..bade14002fd8 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -15,7 +15,6 @@ #include "pci.h" DECLARE_RWSEM(pci_bus_sem); -EXPORT_SYMBOL_GPL(pci_bus_sem); /* * pci_for_each_dma_alias - Iterate over DMA aliases for a device diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 79b1fa6519be..e7dbe21705ba 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1662,8 +1662,8 @@ static int iov_resources_unassigned(struct pci_dev *dev, void *data) int i; bool *unassigned = data; - for (i = PCI_IOV_RESOURCES; i <= PCI_IOV_RESOURCE_END; i++) { - struct resource *r = &dev->resource[i]; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + struct resource *r = &dev->resource[i + PCI_IOV_RESOURCES]; struct pci_bus_region region; /* Not assigned or rejected by kernel? */ diff --git a/drivers/pci/vc.c b/drivers/pci/vc.c index 5acd9c02683a..5486f8768c86 100644 --- a/drivers/pci/vc.c +++ b/drivers/pci/vc.c @@ -13,6 +13,8 @@ #include #include +#include "pci.h" + /** * pci_vc_save_restore_dwords - Save or restore a series of dwords * @dev: device @@ -105,7 +107,7 @@ static void pci_vc_enable(struct pci_dev *dev, int pos, int res) struct pci_dev *link = NULL; /* Enable VCs from the downstream device */ - if (!dev->has_secondary_link) + if (!pci_is_pcie(dev) || !pcie_downstream_port(dev)) return; ctrl_pos = pos + PCI_VC_RES_CTRL + (res * PCI_CAP_VC_PER_VC_SIZEOF); @@ -409,7 +411,6 @@ void pci_restore_vc_state(struct pci_dev *dev) * For each type of VC capability, VC/VC9/MFVC, find the capability, size * it, and allocate a buffer for save/restore. */ - void pci_allocate_vc_save_buffers(struct pci_dev *dev) { int i; diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 4963c2e2bd4c..7915d10f9aa1 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -571,6 +571,12 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LSI_LOGIC, 0x005f, quirk_blacklist_vpd); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, PCI_ANY_ID, quirk_blacklist_vpd); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_QLOGIC, 0x2261, quirk_blacklist_vpd); +/* + * The Amazon Annapurna Labs 0x0031 device id is reused for other non Root Port + * device types, so the quirk is registered for the PCI_CLASS_BRIDGE_PCI class. 
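+ *
+ * A class-based fixup matches on (dev->class >> class_shift) == class,
+ * so registering with class_shift == 8 ignores the low
+ * programming-interface byte; roughly:
+ *
+ *	matches = (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI;
+ *
+ * which accepts any 0x0604xx bridge regardless of interface.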
+ */ +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, + PCI_CLASS_BRIDGE_PCI, 8, quirk_blacklist_vpd); /* * For Broadcom 5706, 5708, 5709 rev. A nics, any read beyond the diff --git a/drivers/phy/tegra/Kconfig b/drivers/phy/tegra/Kconfig index e516967d695b..f9817c3ae85f 100644 --- a/drivers/phy/tegra/Kconfig +++ b/drivers/phy/tegra/Kconfig @@ -7,3 +7,10 @@ config PHY_TEGRA_XUSB To compile this driver as a module, choose M here: the module will be called phy-tegra-xusb. + +config PHY_TEGRA194_P2U + tristate "NVIDIA Tegra194 PIPE2UPHY PHY driver" + depends on ARCH_TEGRA_194_SOC || COMPILE_TEST + select GENERIC_PHY + help + Enable this to support the P2U (PIPE to UPHY) that is part of Tegra 19x SOCs. diff --git a/drivers/phy/tegra/Makefile b/drivers/phy/tegra/Makefile index 64ccaeacb631..320dd389f34d 100644 --- a/drivers/phy/tegra/Makefile +++ b/drivers/phy/tegra/Makefile @@ -6,3 +6,4 @@ phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_124_SOC) += xusb-tegra124.o phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_132_SOC) += xusb-tegra124.o phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_210_SOC) += xusb-tegra210.o phy-tegra-xusb-$(CONFIG_ARCH_TEGRA_186_SOC) += xusb-tegra186.o +obj-$(CONFIG_PHY_TEGRA194_P2U) += phy-tegra194-p2u.o diff --git a/drivers/phy/tegra/phy-tegra194-p2u.c b/drivers/phy/tegra/phy-tegra194-p2u.c new file mode 100644 index 000000000000..7042bed9feaa --- /dev/null +++ b/drivers/phy/tegra/phy-tegra194-p2u.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * P2U (PIPE to UPHY) driver for Tegra T194 SoC + * + * Copyright (C) 2019 NVIDIA Corporation. + * + * Author: Vidya Sagar + */ + +#include +#include +#include +#include +#include +#include + +#define P2U_PERIODIC_EQ_CTRL_GEN3 0xc0 +#define P2U_PERIODIC_EQ_CTRL_GEN3_PERIODIC_EQ_EN BIT(0) +#define P2U_PERIODIC_EQ_CTRL_GEN3_INIT_PRESET_EQ_TRAIN_EN BIT(1) +#define P2U_PERIODIC_EQ_CTRL_GEN4 0xc4 +#define P2U_PERIODIC_EQ_CTRL_GEN4_INIT_PRESET_EQ_TRAIN_EN BIT(1) + +#define P2U_RX_DEBOUNCE_TIME 0xa4 +#define P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_MASK 0xffff +#define P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_VAL 160 + +struct tegra_p2u { + void __iomem *base; +}; + +static inline void p2u_writel(struct tegra_p2u *phy, const u32 value, + const u32 reg) +{ + writel_relaxed(value, phy->base + reg); +} + +static inline u32 p2u_readl(struct tegra_p2u *phy, const u32 reg) +{ + return readl_relaxed(phy->base + reg); +} + +static int tegra_p2u_power_on(struct phy *x) +{ + struct tegra_p2u *phy = phy_get_drvdata(x); + u32 val; + + val = p2u_readl(phy, P2U_PERIODIC_EQ_CTRL_GEN3); + val &= ~P2U_PERIODIC_EQ_CTRL_GEN3_PERIODIC_EQ_EN; + val |= P2U_PERIODIC_EQ_CTRL_GEN3_INIT_PRESET_EQ_TRAIN_EN; + p2u_writel(phy, val, P2U_PERIODIC_EQ_CTRL_GEN3); + + val = p2u_readl(phy, P2U_PERIODIC_EQ_CTRL_GEN4); + val |= P2U_PERIODIC_EQ_CTRL_GEN4_INIT_PRESET_EQ_TRAIN_EN; + p2u_writel(phy, val, P2U_PERIODIC_EQ_CTRL_GEN4); + + val = p2u_readl(phy, P2U_RX_DEBOUNCE_TIME); + val &= ~P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_MASK; + val |= P2U_RX_DEBOUNCE_TIME_DEBOUNCE_TIMER_VAL; + p2u_writel(phy, val, P2U_RX_DEBOUNCE_TIME); + + return 0; +} + +static const struct phy_ops ops = { + .power_on = tegra_p2u_power_on, + .owner = THIS_MODULE, +}; + +static int tegra_p2u_probe(struct platform_device *pdev) +{ + struct phy_provider *phy_provider; + struct device *dev = &pdev->dev; + struct phy *generic_phy; + struct tegra_p2u *phy; + struct resource *res; + + phy = devm_kzalloc(dev, sizeof(*phy), GFP_KERNEL); + if (!phy) + return -ENOMEM; + + res = platform_get_resource_byname(pdev, 
IORESOURCE_MEM, "ctl"); + phy->base = devm_ioremap_resource(dev, res); + if (IS_ERR(phy->base)) + return PTR_ERR(phy->base); + + platform_set_drvdata(pdev, phy); + + generic_phy = devm_phy_create(dev, NULL, &ops); + if (IS_ERR(generic_phy)) + return PTR_ERR(generic_phy); + + phy_set_drvdata(generic_phy, phy); + + phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate); + if (IS_ERR(phy_provider)) + return PTR_ERR(phy_provider); + + return 0; +} + +static const struct of_device_id tegra_p2u_id_table[] = { + { + .compatible = "nvidia,tegra194-p2u", + }, + {} +}; +MODULE_DEVICE_TABLE(of, tegra_p2u_id_table); + +static struct platform_driver tegra_p2u_driver = { + .probe = tegra_p2u_probe, + .driver = { + .name = "tegra194-p2u", + .of_match_table = tegra_p2u_id_table, + }, +}; +module_platform_driver(tegra_p2u_driver); + +MODULE_AUTHOR("Vidya Sagar "); +MODULE_DESCRIPTION("NVIDIA Tegra194 PIPE2UPHY PHY driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 1b67bb578f9f..ae21d08c65e8 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -674,6 +674,7 @@ config EEEPC_LAPTOP config ASUS_WMI tristate "ASUS WMI Driver" depends on ACPI_WMI + depends on ACPI_BATTERY depends on INPUT depends on HWMON depends on BACKLIGHT_CLASS_DEVICE diff --git a/drivers/platform/x86/i2c-multi-instantiate.c b/drivers/platform/x86/i2c-multi-instantiate.c index 61fe341a85aa..ea68f6ed66ae 100644 --- a/drivers/platform/x86/i2c-multi-instantiate.c +++ b/drivers/platform/x86/i2c-multi-instantiate.c @@ -90,7 +90,7 @@ static int i2c_multi_inst_probe(struct platform_device *pdev) for (i = 0; i < multi->num_clients && inst_data[i].type; i++) { memset(&board_info, 0, sizeof(board_info)); strlcpy(board_info.type, inst_data[i].type, I2C_NAME_SIZE); - snprintf(name, sizeof(name), "%s-%s.%d", match->id, + snprintf(name, sizeof(name), "%s-%s.%d", dev_name(dev), inst_data[i].type, i); board_info.dev_name = name; switch (inst_data[i].flags & IRQ_RESOURCE_TYPE) { diff --git a/drivers/platform/x86/pmc_atom.c b/drivers/platform/x86/pmc_atom.c index 9aca5e7ce6d0..07d1b911e72f 100644 --- a/drivers/platform/x86/pmc_atom.c +++ b/drivers/platform/x86/pmc_atom.c @@ -422,6 +422,13 @@ static const struct dmi_system_id critclk_systems[] = { DMI_MATCH(DMI_PRODUCT_VERSION, "6ES7647-8B"), }, }, + { + .ident = "SIMATIC IPC277E", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "SIEMENS AG"), + DMI_MATCH(DMI_PRODUCT_VERSION, "6AV7882-0"), + }, + }, { /*sentinel*/ } }; diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index 0005ec9285aa..b42a93736668 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -372,7 +372,7 @@ int ccwgroup_create_dev(struct device *parent, struct ccwgroup_driver *gdrv, goto error; } /* Check for trailing stuff. 
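* The parser can legitimately leave buf NULL once all num_devices ids
* have been consumed, which is why the test below now checks buf
* before calling strlen() on it.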
*/ - if (i == num_devices && strlen(buf) > 0) { + if (i == num_devices && buf && strlen(buf) > 0) { rc = -EINVAL; goto error; } diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 22c55816100b..1fbfb0a93f5f 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -1388,6 +1388,8 @@ device_initcall(cio_settle_init); int sch_is_pseudo_sch(struct subchannel *sch) { + if (!sch->dev.parent) + return 0; return sch == to_css(sch->dev.parent)->pseudo_subchannel; } diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c index d722458c5928..65841af15748 100644 --- a/drivers/s390/cio/device_ops.c +++ b/drivers/s390/cio/device_ops.c @@ -124,9 +124,7 @@ EXPORT_SYMBOL(ccw_device_is_multipath); /** * ccw_device_clear() - terminate I/O request processing * @cdev: target ccw device - * @intparm: interruption parameter; value is only used if no I/O is - * outstanding, otherwise the intparm associated with the I/O request - * is returned + * @intparm: interruption parameter to be returned upon conclusion of csch * * ccw_device_clear() calls csch on @cdev's subchannel. * Returns: @@ -179,6 +177,9 @@ int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm) * completed during the time specified by @expires. If a timeout occurs, the * channel program is terminated via xsch, hsch or csch, and the device's * interrupt handler will be called with an irb containing ERR_PTR(-%ETIMEDOUT). + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_halt() or ccw_device_clear(). * Returns: * %0, if the operation was successful; * -%EBUSY, if the device is busy, or status pending; @@ -256,6 +257,9 @@ int ccw_device_start_timeout_key(struct ccw_device *cdev, struct ccw1 *cpa, * Start a S/390 channel program. When the interrupt arrives, the * IRQ handler is called, either immediately, delayed (dev-end missing, * or sense required) or never (no IRQ handler registered). + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_halt() or ccw_device_clear(). * Returns: * %0, if the operation was successful; * -%EBUSY, if the device is busy, or status pending; @@ -287,6 +291,9 @@ int ccw_device_start_key(struct ccw_device *cdev, struct ccw1 *cpa, * Start a S/390 channel program. When the interrupt arrives, the * IRQ handler is called, either immediately, delayed (dev-end missing, * or sense required) or never (no IRQ handler registered). + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_halt() or ccw_device_clear(). * Returns: * %0, if the operation was successful; * -%EBUSY, if the device is busy, or status pending; @@ -322,6 +329,9 @@ int ccw_device_start(struct ccw_device *cdev, struct ccw1 *cpa, * completed during the time specified by @expires. If a timeout occurs, the * channel program is terminated via xsch, hsch or csch, and the device's * interrupt handler will be called with an irb containing ERR_PTR(-%ETIMEDOUT). + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_halt() or ccw_device_clear(). 
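+ *
+ * For illustration only (not part of the interface): a driver that
+ * tags each request could pass the request pointer as @intparm,
+ *
+ *	ccw_device_start_timeout(cdev, cpa, (unsigned long)req, lpm, 0, HZ);
+ *
+ * and recover it in its interrupt handler, whose signature is
+ * (struct ccw_device *, unsigned long intparm, struct irb *), by
+ * casting intparm back to the (hypothetical) request type.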
* Returns: * %0, if the operation was successful; * -%EBUSY, if the device is busy, or status pending; @@ -343,11 +353,12 @@ int ccw_device_start_timeout(struct ccw_device *cdev, struct ccw1 *cpa, /** * ccw_device_halt() - halt I/O request processing * @cdev: target ccw device - * @intparm: interruption parameter; value is only used if no I/O is - * outstanding, otherwise the intparm associated with the I/O request - * is returned + * @intparm: interruption parameter to be returned upon conclusion of hsch * * ccw_device_halt() calls hsch on @cdev's subchannel. + * The interruption handler will echo back the @intparm specified here, unless + * another interruption parameter is specified by a subsequent invocation of + * ccw_device_clear(). * Returns: * %0 on success, * -%ENODEV on device not operational, diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index a76b8a8bcbbb..a1915061932e 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -1322,24 +1322,24 @@ static int ap_get_compatible_type(ap_qid_t qid, int rawtype, unsigned int func) /* < CEX2A is not supported */ if (rawtype < AP_DEVICE_TYPE_CEX2A) return 0; - /* up to CEX6 known and fully supported */ - if (rawtype <= AP_DEVICE_TYPE_CEX6) + /* up to CEX7 known and fully supported */ + if (rawtype <= AP_DEVICE_TYPE_CEX7) return rawtype; /* - * unknown new type > CEX6, check for compatibility + * unknown new type > CEX7, check for compatibility * to the highest known and supported type which is - * currently CEX6 with the help of the QACT function. + * currently CEX7 with the help of the QACT function. */ if (ap_qact_available()) { struct ap_queue_status status; union ap_qact_ap_info apinfo = {0}; apinfo.mode = (func >> 26) & 0x07; - apinfo.cat = AP_DEVICE_TYPE_CEX6; + apinfo.cat = AP_DEVICE_TYPE_CEX7; status = ap_qact(qid, 0, &apinfo); if (status.response_code == AP_RESPONSE_NORMAL && apinfo.cat >= AP_DEVICE_TYPE_CEX2A - && apinfo.cat <= AP_DEVICE_TYPE_CEX6) + && apinfo.cat <= AP_DEVICE_TYPE_CEX7) comp_type = apinfo.cat; } if (!comp_type) diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 6f3cf37776ca..433b7b64368d 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * Copyright IBM Corp. 2006, 2012 + * Copyright IBM Corp. 2006, 2019 * Author(s): Cornelia Huck * Martin Schwidefsky * Ralph Wuerthner @@ -63,6 +63,7 @@ static inline int ap_test_bit(unsigned int *ptr, unsigned int nr) #define AP_DEVICE_TYPE_CEX4 10 #define AP_DEVICE_TYPE_CEX5 11 #define AP_DEVICE_TYPE_CEX6 12 +#define AP_DEVICE_TYPE_CEX7 13 /* * Known function facilities diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index f76a1d0f54c4..9de3d46b3253 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -1363,9 +1363,122 @@ static struct attribute_group ccadata_attr_group = { .bin_attrs = ccadata_attrs, }; +#define CCACIPHERTOKENSIZE (sizeof(struct cipherkeytoken) + 80) + +/* + * Sysfs attribute read function for all secure key ccacipher binary attributes. + * The implementation can not deal with partial reads, because a new random + * secure key blob is generated with each read. In case of partial reads + * (i.e. off != 0 or count < key blob size) -EINVAL is returned. 
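+ *
+ * Userspace therefore has to fetch a key blob with one large read, for
+ * instance (sysfs path abbreviated for illustration):
+ *
+ *   dd if=.../pkey/ccacipher/ccacipher_aes_256 bs=4096 count=1 of=key.blob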
+ */ +static ssize_t pkey_ccacipher_aes_attr_read(enum pkey_key_size keybits, + bool is_xts, char *buf, loff_t off, + size_t count) +{ + size_t keysize; + int rc; + + if (off != 0 || count < CCACIPHERTOKENSIZE) + return -EINVAL; + if (is_xts) + if (count < 2 * CCACIPHERTOKENSIZE) + return -EINVAL; + + keysize = CCACIPHERTOKENSIZE; + rc = cca_gencipherkey(-1, -1, keybits, 0, buf, &keysize); + if (rc) + return rc; + memset(buf + keysize, 0, CCACIPHERTOKENSIZE - keysize); + + if (is_xts) { + keysize = CCACIPHERTOKENSIZE; + rc = cca_gencipherkey(-1, -1, keybits, 0, + buf + CCACIPHERTOKENSIZE, &keysize); + if (rc) + return rc; + memset(buf + CCACIPHERTOKENSIZE + keysize, 0, + CCACIPHERTOKENSIZE - keysize); + + return 2 * CCACIPHERTOKENSIZE; + } + + return CCACIPHERTOKENSIZE; +} + +static ssize_t ccacipher_aes_128_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_128, false, buf, + off, count); +} + +static ssize_t ccacipher_aes_192_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_192, false, buf, + off, count); +} + +static ssize_t ccacipher_aes_256_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_256, false, buf, + off, count); +} + +static ssize_t ccacipher_aes_128_xts_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_128, true, buf, + off, count); +} + +static ssize_t ccacipher_aes_256_xts_read(struct file *filp, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return pkey_ccacipher_aes_attr_read(PKEY_SIZE_AES_256, true, buf, + off, count); +} + +static BIN_ATTR_RO(ccacipher_aes_128, CCACIPHERTOKENSIZE); +static BIN_ATTR_RO(ccacipher_aes_192, CCACIPHERTOKENSIZE); +static BIN_ATTR_RO(ccacipher_aes_256, CCACIPHERTOKENSIZE); +static BIN_ATTR_RO(ccacipher_aes_128_xts, 2 * CCACIPHERTOKENSIZE); +static BIN_ATTR_RO(ccacipher_aes_256_xts, 2 * CCACIPHERTOKENSIZE); + +static struct bin_attribute *ccacipher_attrs[] = { + &bin_attr_ccacipher_aes_128, + &bin_attr_ccacipher_aes_192, + &bin_attr_ccacipher_aes_256, + &bin_attr_ccacipher_aes_128_xts, + &bin_attr_ccacipher_aes_256_xts, + NULL +}; + +static struct attribute_group ccacipher_attr_group = { + .name = "ccacipher", + .bin_attrs = ccacipher_attrs, +}; + static const struct attribute_group *pkey_attr_groups[] = { &protkey_attr_group, &ccadata_attr_group, + &ccacipher_attr_group, NULL, }; diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c index 003662aa8060..be2520cc010b 100644 --- a/drivers/s390/crypto/vfio_ap_drv.c +++ b/drivers/s390/crypto/vfio_ap_drv.c @@ -36,6 +36,8 @@ static struct ap_device_id ap_queue_ids[] = { .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { .dev_type = AP_DEVICE_TYPE_CEX6, .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX7, + .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { /* end of sibling */ }, }; diff --git a/drivers/s390/crypto/zcrypt_api.h b/drivers/s390/crypto/zcrypt_api.h index 2d3f2732344f..d464618cd84f 100644 --- a/drivers/s390/crypto/zcrypt_api.h +++ b/drivers/s390/crypto/zcrypt_api.h @@ -1,6 +1,6 @@ /* 
SPDX-License-Identifier: GPL-2.0+ */ /* - * Copyright IBM Corp. 2001, 2018 + * Copyright IBM Corp. 2001, 2019 * Author(s): Robert Burroughs * Eric Rossman (edrossma@us.ibm.com) * Cornelia Huck @@ -29,6 +29,7 @@ #define ZCRYPT_CEX4 10 #define ZCRYPT_CEX5 11 #define ZCRYPT_CEX6 12 +#define ZCRYPT_CEX7 13 /** * Large random numbers are pulled in 4096 byte chunks from the crypto cards diff --git a/drivers/s390/crypto/zcrypt_cex4.c b/drivers/s390/crypto/zcrypt_cex4.c index f58d8dec19dc..442e3d6162f7 100644 --- a/drivers/s390/crypto/zcrypt_cex4.c +++ b/drivers/s390/crypto/zcrypt_cex4.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright IBM Corp. 2012 + * Copyright IBM Corp. 2012, 2019 * Author(s): Holger Dengler */ @@ -38,8 +38,8 @@ #define CEX4_CLEANUP_TIME (900*HZ) MODULE_AUTHOR("IBM Corporation"); -MODULE_DESCRIPTION("CEX4/CEX5/CEX6 Cryptographic Card device driver, " \ - "Copyright IBM Corp. 2018"); +MODULE_DESCRIPTION("CEX4/CEX5/CEX6/CEX7 Cryptographic Card device driver, " \ + "Copyright IBM Corp. 2019"); MODULE_LICENSE("GPL"); static struct ap_device_id zcrypt_cex4_card_ids[] = { @@ -49,6 +49,8 @@ static struct ap_device_id zcrypt_cex4_card_ids[] = { .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, { .dev_type = AP_DEVICE_TYPE_CEX6, .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX7, + .match_flags = AP_DEVICE_ID_MATCH_CARD_TYPE }, { /* end of list */ }, }; @@ -61,6 +63,8 @@ static struct ap_device_id zcrypt_cex4_queue_ids[] = { .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { .dev_type = AP_DEVICE_TYPE_CEX6, .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, + { .dev_type = AP_DEVICE_TYPE_CEX7, + .match_flags = AP_DEVICE_ID_MATCH_QUEUE_TYPE }, { /* end of list */ }, }; @@ -146,7 +150,7 @@ static const struct attribute_group cca_queue_attr_group = { }; /** - * Probe function for CEX4/CEX5/CEX6 card device. It always + * Probe function for CEX4/CEX5/CEX6/CEX7 card device. It always * accepts the AP device since the bus_match already checked * the hardware type. * @ap_dev: pointer to the AP device. 
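 * Note: the AP_DEVICE_TYPE_CEX7 ids added above mean bus_match now also
 * routes CEX7 cards and queues into these probe functions.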
@@ -158,25 +162,31 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) * MEX_1k, MEX_2k, MEX_4k, CRT_1k, CRT_2k, CRT_4k, RNG, SECKEY */ static const int CEX4A_SPEED_IDX[] = { - 14, 19, 249, 42, 228, 1458, 0, 0}; + 14, 19, 249, 42, 228, 1458, 0, 0}; static const int CEX5A_SPEED_IDX[] = { - 8, 9, 20, 18, 66, 458, 0, 0}; + 8, 9, 20, 18, 66, 458, 0, 0}; static const int CEX6A_SPEED_IDX[] = { - 6, 9, 20, 17, 65, 438, 0, 0}; + 6, 9, 20, 17, 65, 438, 0, 0}; + static const int CEX7A_SPEED_IDX[] = { + 6, 8, 17, 15, 54, 362, 0, 0}; static const int CEX4C_SPEED_IDX[] = { 59, 69, 308, 83, 278, 2204, 209, 40}; static const int CEX5C_SPEED_IDX[] = { - 24, 31, 50, 37, 90, 479, 27, 10}; + 24, 31, 50, 37, 90, 479, 27, 10}; static const int CEX6C_SPEED_IDX[] = { - 16, 20, 32, 27, 77, 455, 23, 9}; + 16, 20, 32, 27, 77, 455, 24, 9}; + static const int CEX7C_SPEED_IDX[] = { + 14, 16, 26, 23, 64, 376, 23, 8}; static const int CEX4P_SPEED_IDX[] = { - 224, 313, 3560, 359, 605, 2827, 0, 50}; + 0, 0, 0, 0, 0, 0, 0, 50}; static const int CEX5P_SPEED_IDX[] = { - 63, 84, 156, 83, 142, 533, 0, 10}; + 0, 0, 0, 0, 0, 0, 0, 10}; static const int CEX6P_SPEED_IDX[] = { - 55, 70, 121, 73, 129, 522, 0, 9}; + 0, 0, 0, 0, 0, 0, 0, 9}; + static const int CEX7P_SPEED_IDX[] = { + 0, 0, 0, 0, 0, 0, 0, 8}; struct ap_card *ac = to_ap_card(&ap_dev->device); struct zcrypt_card *zc; @@ -198,11 +208,19 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX5; memcpy(zc->speed_rating, CEX5A_SPEED_IDX, sizeof(CEX5A_SPEED_IDX)); - } else { + } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX6) { zc->type_string = "CEX6A"; zc->user_space_type = ZCRYPT_CEX6; memcpy(zc->speed_rating, CEX6A_SPEED_IDX, sizeof(CEX6A_SPEED_IDX)); + } else { + zc->type_string = "CEX7A"; + /* wrong user space type, just for compatibility + * with the ZCRYPT_STATUS_MASK ioctl. + */ + zc->user_space_type = ZCRYPT_CEX6; + memcpy(zc->speed_rating, CEX7A_SPEED_IDX, + sizeof(CEX7A_SPEED_IDX)); } zc->min_mod_size = CEX4A_MIN_MOD_SIZE; if (ap_test_bit(&ac->functions, AP_FUNC_MEX4K) && @@ -232,7 +250,7 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX3C; memcpy(zc->speed_rating, CEX5C_SPEED_IDX, sizeof(CEX5C_SPEED_IDX)); - } else { + } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX6) { zc->type_string = "CEX6C"; /* wrong user space type, must be CEX6 * just keep it for cca compatibility @@ -240,6 +258,14 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX3C; memcpy(zc->speed_rating, CEX6C_SPEED_IDX, sizeof(CEX6C_SPEED_IDX)); + } else { + zc->type_string = "CEX7C"; + /* wrong user space type, must be CEX7 + * just keep it for cca compatibility + */ + zc->user_space_type = ZCRYPT_CEX3C; + memcpy(zc->speed_rating, CEX7C_SPEED_IDX, + sizeof(CEX7C_SPEED_IDX)); } zc->min_mod_size = CEX4C_MIN_MOD_SIZE; zc->max_mod_size = CEX4C_MAX_MOD_SIZE; @@ -255,11 +281,19 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) zc->user_space_type = ZCRYPT_CEX5; memcpy(zc->speed_rating, CEX5P_SPEED_IDX, sizeof(CEX5P_SPEED_IDX)); - } else { + } else if (ac->ap_dev.device_type == AP_DEVICE_TYPE_CEX6) { zc->type_string = "CEX6P"; zc->user_space_type = ZCRYPT_CEX6; memcpy(zc->speed_rating, CEX6P_SPEED_IDX, sizeof(CEX6P_SPEED_IDX)); + } else { + zc->type_string = "CEX7P"; + /* wrong user space type, just for compatibility + * with the ZCRYPT_STATUS_MASK ioctl. 
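+ * Tools built before CEX7 support only know type codes up to
+ * ZCRYPT_CEX6, so the CEX6 code is reported through the ioctl while
+ * type_string above still identifies the card as CEX7P.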
+ */ + zc->user_space_type = ZCRYPT_CEX6; + memcpy(zc->speed_rating, CEX7P_SPEED_IDX, + sizeof(CEX7P_SPEED_IDX)); } zc->min_mod_size = CEX4C_MIN_MOD_SIZE; zc->max_mod_size = CEX4C_MAX_MOD_SIZE; @@ -289,8 +323,8 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev) } /** - * This is called to remove the CEX4/CEX5/CEX6 card driver information - * if an AP card device is removed. + * This is called to remove the CEX4/CEX5/CEX6/CEX7 card driver + * information if an AP card device is removed. */ static void zcrypt_cex4_card_remove(struct ap_device *ap_dev) { @@ -311,7 +345,7 @@ static struct ap_driver zcrypt_cex4_card_driver = { }; /** - * Probe function for CEX4/CEX5/CEX6 queue device. It always + * Probe function for CEX4/CEX5/CEX6/CEX7 queue device. It always * accepts the AP device since the bus_match already checked * the hardware type. * @ap_dev: pointer to the AP device. @@ -369,7 +403,7 @@ static int zcrypt_cex4_queue_probe(struct ap_device *ap_dev) } /** - * This is called to remove the CEX4/CEX5/CEX6 queue driver + * This is called to remove the CEX4/CEX5/CEX6/CEX7 queue driver * information if an AP queue device is removed. */ static void zcrypt_cex4_queue_remove(struct ap_device *ap_dev) diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 644f7f5c61a2..4a858789e6c5 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index 1bb6aada93fa..ac39ed79ccaa 100644 --- a/drivers/scsi/hpsa.c +++ b/drivers/scsi/hpsa.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index d0c2f8d6f2a2..c8e512ba6d39 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 7623196de9e3..50928bc266eb 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1211,9 +1211,6 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) dix = scsi_prot_sg_count(cmd); dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); - if (write && dix) - t10_pi_prepare(cmd->request, sdkp->protection_type); - if (dif || dix) protect = sd_setup_protect_cmnd(cmd, dix, dif); else @@ -2055,11 +2052,6 @@ static int sd_done(struct scsi_cmnd *SCpnt) "sd_done: completed %d of %d bytes\n", good_bytes, scsi_bufflen(SCpnt))); - if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt) && - good_bytes) - t10_pi_complete(SCpnt->request, sdkp->protection_type, - good_bytes / scsi_prot_interval(SCpnt)); - return good_bytes; } diff --git a/drivers/staging/android/ion/heaps/ion_system_heap.c b/drivers/staging/android/ion/heaps/ion_system_heap.c index 4b2fe985706b..792cd130e493 100644 --- a/drivers/staging/android/ion/heaps/ion_system_heap.c +++ b/drivers/staging/android/ion/heaps/ion_system_heap.c @@ -122,7 +122,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap, if (!page) goto free_pages; list_add_tail(&page->lru, &pages); - size_remaining -= PAGE_SIZE << compound_order(page); + size_remaining -= page_size(page); max_order = compound_order(page); i++; } @@ -135,7 +135,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap, sg = table->sgl; list_for_each_entry_safe(page, tmp_page, &pages, lru) { - 
sg_set_page(sg, page, PAGE_SIZE << compound_order(page), 0); + sg_set_page(sg, page, page_size(page), 0); sg = sg_next(sg); list_del(&page->lru); } diff --git a/drivers/target/tcm_fc/tfc_io.c b/drivers/target/tcm_fc/tfc_io.c index a254792d882c..1354a157e9af 100644 --- a/drivers/target/tcm_fc/tfc_io.c +++ b/drivers/target/tcm_fc/tfc_io.c @@ -136,8 +136,7 @@ int ft_queue_data_in(struct se_cmd *se_cmd) page, off_in_page, tlen); fr_len(fp) += tlen; fp_skb(fp)->data_len += tlen; - fp_skb(fp)->truesize += - PAGE_SIZE << compound_order(page); + fp_skb(fp)->truesize += page_size(page); } else { BUG_ON(!page); from = kmap_atomic(page + (mem_off >> PAGE_SHIFT)); diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 2da026fd12c9..09ddcd06c715 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -254,6 +254,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, shm->teedev = teedev; shm->ctx = ctx; shm->id = -1; + addr = untagged_addr(addr); start = rounddown(addr, PAGE_SIZE); shm->offset = addr - start; shm->size = length; diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 213ff03c8a9f..59d9d512dcda 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1451,9 +1452,9 @@ struct ffs_sb_fill_data { struct ffs_data *ffs_data; }; -static int ffs_sb_fill(struct super_block *sb, void *_data, int silent) +static int ffs_sb_fill(struct super_block *sb, struct fs_context *fc) { - struct ffs_sb_fill_data *data = _data; + struct ffs_sb_fill_data *data = fc->fs_private; struct inode *inode; struct ffs_data *ffs = data->ffs_data; @@ -1486,147 +1487,152 @@ static int ffs_sb_fill(struct super_block *sb, void *_data, int silent) return 0; } -static int ffs_fs_parse_opts(struct ffs_sb_fill_data *data, char *opts) +enum { + Opt_no_disconnect, + Opt_rmode, + Opt_fmode, + Opt_mode, + Opt_uid, + Opt_gid, +}; + +static const struct fs_parameter_spec ffs_fs_param_specs[] = { + fsparam_bool ("no_disconnect", Opt_no_disconnect), + fsparam_u32 ("rmode", Opt_rmode), + fsparam_u32 ("fmode", Opt_fmode), + fsparam_u32 ("mode", Opt_mode), + fsparam_u32 ("uid", Opt_uid), + fsparam_u32 ("gid", Opt_gid), + {} +}; + +static const struct fs_parameter_description ffs_fs_fs_parameters = { + .name = "kAFS", + .specs = ffs_fs_param_specs, +}; + +static int ffs_fs_parse_param(struct fs_context *fc, struct fs_parameter *param) { + struct ffs_sb_fill_data *data = fc->fs_private; + struct fs_parse_result result; + int opt; + ENTER(); - if (!opts || !*opts) - return 0; + opt = fs_parse(fc, &ffs_fs_fs_parameters, param, &result); + if (opt < 0) + return opt; - for (;;) { - unsigned long value; - char *eq, *comma; + switch (opt) { + case Opt_no_disconnect: + data->no_disconnect = result.boolean; + break; + case Opt_rmode: + data->root_mode = (result.uint_32 & 0555) | S_IFDIR; + break; + case Opt_fmode: + data->perms.mode = (result.uint_32 & 0666) | S_IFREG; + break; + case Opt_mode: + data->root_mode = (result.uint_32 & 0555) | S_IFDIR; + data->perms.mode = (result.uint_32 & 0666) | S_IFREG; + break; - /* Option limit */ - comma = strchr(opts, ','); - if (comma) - *comma = 0; + case Opt_uid: + data->perms.uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(data->perms.uid)) + goto unmapped_value; + break; + case Opt_gid: + data->perms.gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(data->perms.gid)) + 
goto unmapped_value; + break; - /* Value limit */ - eq = strchr(opts, '='); - if (unlikely(!eq)) { - pr_err("'=' missing in %s\n", opts); - return -EINVAL; - } - *eq = 0; - - /* Parse value */ - if (kstrtoul(eq + 1, 0, &value)) { - pr_err("%s: invalid value: %s\n", opts, eq + 1); - return -EINVAL; - } - - /* Interpret option */ - switch (eq - opts) { - case 13: - if (!memcmp(opts, "no_disconnect", 13)) - data->no_disconnect = !!value; - else - goto invalid; - break; - case 5: - if (!memcmp(opts, "rmode", 5)) - data->root_mode = (value & 0555) | S_IFDIR; - else if (!memcmp(opts, "fmode", 5)) - data->perms.mode = (value & 0666) | S_IFREG; - else - goto invalid; - break; - - case 4: - if (!memcmp(opts, "mode", 4)) { - data->root_mode = (value & 0555) | S_IFDIR; - data->perms.mode = (value & 0666) | S_IFREG; - } else { - goto invalid; - } - break; - - case 3: - if (!memcmp(opts, "uid", 3)) { - data->perms.uid = make_kuid(current_user_ns(), value); - if (!uid_valid(data->perms.uid)) { - pr_err("%s: unmapped value: %lu\n", opts, value); - return -EINVAL; - } - } else if (!memcmp(opts, "gid", 3)) { - data->perms.gid = make_kgid(current_user_ns(), value); - if (!gid_valid(data->perms.gid)) { - pr_err("%s: unmapped value: %lu\n", opts, value); - return -EINVAL; - } - } else { - goto invalid; - } - break; - - default: -invalid: - pr_err("%s: invalid option\n", opts); - return -EINVAL; - } - - /* Next iteration */ - if (!comma) - break; - opts = comma + 1; + default: + return -ENOPARAM; } return 0; + +unmapped_value: + return invalf(fc, "%s: unmapped value: %u", param->key, result.uint_32); } -/* "mount -t functionfs dev_name /dev/function" ends up here */ - -static struct dentry * -ffs_fs_mount(struct file_system_type *t, int flags, - const char *dev_name, void *opts) +/* + * Set up the superblock for a mount. 
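+ *
+ * With the fs_context API the VFS calls ->init_fs_context() first,
+ * feeds each mount option through ->parse_param(), and only then calls
+ * ->get_tree(), so the fill data in fc->fs_private is complete here.
+ * A mount like
+ *
+ *	mount -t functionfs -o uid=1000,mode=0660 my_dev /dev/function
+ *
+ * (device name chosen for illustration) arrives here with fc->source
+ * set to "my_dev".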
+ */ +static int ffs_fs_get_tree(struct fs_context *fc) { - struct ffs_sb_fill_data data = { - .perms = { - .mode = S_IFREG | 0600, - .uid = GLOBAL_ROOT_UID, - .gid = GLOBAL_ROOT_GID, - }, - .root_mode = S_IFDIR | 0500, - .no_disconnect = false, - }; - struct dentry *rv; - int ret; + struct ffs_sb_fill_data *ctx = fc->fs_private; void *ffs_dev; struct ffs_data *ffs; ENTER(); - ret = ffs_fs_parse_opts(&data, opts); - if (unlikely(ret < 0)) - return ERR_PTR(ret); + if (!fc->source) + return invalf(fc, "No source specified"); - ffs = ffs_data_new(dev_name); + ffs = ffs_data_new(fc->source); if (unlikely(!ffs)) - return ERR_PTR(-ENOMEM); - ffs->file_perms = data.perms; - ffs->no_disconnect = data.no_disconnect; + return -ENOMEM; + ffs->file_perms = ctx->perms; + ffs->no_disconnect = ctx->no_disconnect; - ffs->dev_name = kstrdup(dev_name, GFP_KERNEL); + ffs->dev_name = kstrdup(fc->source, GFP_KERNEL); if (unlikely(!ffs->dev_name)) { ffs_data_put(ffs); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } - ffs_dev = ffs_acquire_dev(dev_name); + ffs_dev = ffs_acquire_dev(ffs->dev_name); if (IS_ERR(ffs_dev)) { ffs_data_put(ffs); - return ERR_CAST(ffs_dev); + return PTR_ERR(ffs_dev); } - ffs->private_data = ffs_dev; - data.ffs_data = ffs; - rv = mount_nodev(t, flags, &data, ffs_sb_fill); - if (IS_ERR(rv) && data.ffs_data) { - ffs_release_dev(data.ffs_data); - ffs_data_put(data.ffs_data); + ffs->private_data = ffs_dev; + ctx->ffs_data = ffs; + return get_tree_nodev(fc, ffs_sb_fill); +} + +static void ffs_fs_free_fc(struct fs_context *fc) +{ + struct ffs_sb_fill_data *ctx = fc->fs_private; + + if (ctx) { + if (ctx->ffs_data) { + ffs_release_dev(ctx->ffs_data); + ffs_data_put(ctx->ffs_data); + } + + kfree(ctx); } - return rv; +} + +static const struct fs_context_operations ffs_fs_context_ops = { + .free = ffs_fs_free_fc, + .parse_param = ffs_fs_parse_param, + .get_tree = ffs_fs_get_tree, +}; + +static int ffs_fs_init_fs_context(struct fs_context *fc) +{ + struct ffs_sb_fill_data *ctx; + + ctx = kzalloc(sizeof(struct ffs_sb_fill_data), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->perms.mode = S_IFREG | 0600; + ctx->perms.uid = GLOBAL_ROOT_UID; + ctx->perms.gid = GLOBAL_ROOT_GID; + ctx->root_mode = S_IFDIR | 0500; + ctx->no_disconnect = false; + + fc->fs_private = ctx; + fc->ops = &ffs_fs_context_ops; + return 0; } static void @@ -1644,7 +1650,8 @@ ffs_fs_kill_sb(struct super_block *sb) static struct file_system_type ffs_fs_type = { .owner = THIS_MODULE, .name = "functionfs", - .mount = ffs_fs_mount, + .init_fs_context = ffs_fs_init_fs_context, + .parameters = &ffs_fs_fs_parameters, .kill_sb = ffs_fs_kill_sb, }; MODULE_ALIAS_FS("functionfs"); diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 3b18fa4d090a..26cef65b41e7 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -176,13 +176,13 @@ static long tce_iommu_register_pages(struct tce_container *container, } static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, - unsigned int page_shift) + unsigned int it_page_shift) { struct page *page; unsigned long size = 0; - if (mm_iommu_is_devmem(mm, hpa, page_shift, &size)) - return size == (1UL << page_shift); + if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size)) + return size == (1UL << it_page_shift); page = pfn_to_page(hpa >> PAGE_SHIFT); /* @@ -190,7 +190,7 @@ static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, * a page we just found. 
Otherwise the hardware can get access to * a bigger memory chunk that it should. */ - return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; + return page_shift(compound_head(page)) >= it_page_shift; } static inline bool tce_groups_attached(struct tce_container *container) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9a50b0558fa9..96fddc1dafc3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -375,6 +375,8 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); + vaddr = untagged_addr(vaddr); + vma = find_vma_intersection(mm, vaddr, vaddr + 1); if (vma && vma->vm_flags & VM_PFNMAP) { diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index 8b081d61773e..40676be2e46a 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -10,7 +10,6 @@ menu "Backlight & LCD device support" # config LCD_CLASS_DEVICE tristate "Lowlevel LCD controls" - default m help This framework adds support for low-level control of LCD. Some framebuffer devices connect to platform-specific LCD modules @@ -143,7 +142,6 @@ endif # LCD_CLASS_DEVICE # config BACKLIGHT_CLASS_DEVICE tristate "Lowlevel Backlight controls" - default m help This framework adds support for low-level control of the LCD backlight. This includes support for brightness and power. diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 5dc07106a59e..cac3e35d7630 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -32,6 +32,12 @@ static const char *const backlight_types[] = { [BACKLIGHT_FIRMWARE] = "firmware", }; +static const char *const backlight_scale_types[] = { + [BACKLIGHT_SCALE_UNKNOWN] = "unknown", + [BACKLIGHT_SCALE_LINEAR] = "linear", + [BACKLIGHT_SCALE_NON_LINEAR] = "non-linear", +}; + #if defined(CONFIG_FB) || (defined(CONFIG_FB_MODULE) && \ defined(CONFIG_BACKLIGHT_CLASS_DEVICE_MODULE)) /* This callback gets called when something important happens inside a @@ -246,6 +252,18 @@ static ssize_t actual_brightness_show(struct device *dev, } static DEVICE_ATTR_RO(actual_brightness); +static ssize_t scale_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct backlight_device *bd = to_backlight_device(dev); + + if (WARN_ON(bd->props.scale > BACKLIGHT_SCALE_NON_LINEAR)) + return sprintf(buf, "unknown\n"); + + return sprintf(buf, "%s\n", backlight_scale_types[bd->props.scale]); +} +static DEVICE_ATTR_RO(scale); + static struct class *backlight_class; #ifdef CONFIG_PM_SLEEP @@ -292,6 +310,7 @@ static struct attribute *bl_device_attrs[] = { &dev_attr_brightness.attr, &dev_attr_actual_brightness.attr, &dev_attr_max_brightness.attr, + &dev_attr_scale.attr, &dev_attr_type.attr, NULL, }; diff --git a/drivers/video/backlight/gpio_backlight.c b/drivers/video/backlight/gpio_backlight.c index e84f3087e29f..18e053e4716c 100644 --- a/drivers/video/backlight/gpio_backlight.c +++ b/drivers/video/backlight/gpio_backlight.c @@ -59,13 +59,11 @@ static int gpio_backlight_probe_dt(struct platform_device *pdev, struct gpio_backlight *gbl) { struct device *dev = &pdev->dev; - enum gpiod_flags flags; int ret; gbl->def_value = device_property_read_bool(dev, "default-on"); - flags = gbl->def_value ? 
GPIOD_OUT_HIGH : GPIOD_OUT_LOW; - gbl->gpiod = devm_gpiod_get(dev, NULL, flags); + gbl->gpiod = devm_gpiod_get(dev, NULL, GPIOD_ASIS); if (IS_ERR(gbl->gpiod)) { ret = PTR_ERR(gbl->gpiod); @@ -79,6 +77,22 @@ static int gpio_backlight_probe_dt(struct platform_device *pdev, return 0; } +static int gpio_backlight_initial_power_state(struct gpio_backlight *gbl) +{ + struct device_node *node = gbl->dev->of_node; + + /* Not booted with device tree or no phandle link to the node */ + if (!node || !node->phandle) + return gbl->def_value ? FB_BLANK_UNBLANK : FB_BLANK_POWERDOWN; + + /* if the enable GPIO is disabled, do not enable the backlight */ + if (gpiod_get_value_cansleep(gbl->gpiod) == 0) + return FB_BLANK_POWERDOWN; + + return FB_BLANK_UNBLANK; +} + + static int gpio_backlight_probe(struct platform_device *pdev) { struct gpio_backlight_platform_data *pdata = @@ -136,7 +150,9 @@ static int gpio_backlight_probe(struct platform_device *pdev) return PTR_ERR(bl); } - bl->props.brightness = gbl->def_value; + bl->props.power = gpio_backlight_initial_power_state(gbl); + bl->props.brightness = 1; + backlight_update_status(bl); platform_set_drvdata(pdev, bl); diff --git a/drivers/video/backlight/lm3630a_bl.c b/drivers/video/backlight/lm3630a_bl.c index b04b35d007a2..2d8e8192e4e2 100644 --- a/drivers/video/backlight/lm3630a_bl.c +++ b/drivers/video/backlight/lm3630a_bl.c @@ -377,8 +377,7 @@ static int lm3630a_parse_led_sources(struct fwnode_handle *node, u32 sources[LM3630A_NUM_SINKS]; int ret, num_sources, i; - num_sources = fwnode_property_read_u32_array(node, "led-sources", NULL, - 0); + num_sources = fwnode_property_count_u32(node, "led-sources"); if (num_sources < 0) return default_led_sources; else if (num_sources > ARRAY_SIZE(sources)) diff --git a/drivers/video/backlight/lms283gf05.c b/drivers/video/backlight/lms283gf05.c index 35bc012b22cc..0e45685bcc1c 100644 --- a/drivers/video/backlight/lms283gf05.c +++ b/drivers/video/backlight/lms283gf05.c @@ -158,7 +158,7 @@ static int lms283gf05_probe(struct spi_device *spi) ret = devm_gpio_request_one(&spi->dev, pdata->reset_gpio, GPIOF_DIR_OUT | (!pdata->reset_inverted ? GPIOF_INIT_HIGH : GPIOF_INIT_LOW), - "LMS285GF05 RESET"); + "LMS283GF05 RESET"); if (ret) return ret; } diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 2201b8c78641..746eebc411df 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -387,6 +387,31 @@ int pwm_backlight_brightness_default(struct device *dev, } #endif +static bool pwm_backlight_is_linear(struct platform_pwm_backlight_data *data) +{ + unsigned int nlevels = data->max_brightness + 1; + unsigned int min_val = data->levels[0]; + unsigned int max_val = data->levels[nlevels - 1]; + /* + * Multiplying by 128 means that even in pathological cases such + * as (max_val - min_val) == nlevels the error at max_val is less + * than 1%. 
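+ *
+ * Worked example: an identity table levels[i] = i with nlevels = 256
+ * gives slope = (128 * 255) / 256 = 127, and the prediction
+ * min_val + (i * 127) / 128 never deviates from levels[i] by more
+ * than 2, well inside the margin of 255 / 20 = 12, so such a table is
+ * classified as linear.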
+ */ + unsigned int slope = (128 * (max_val - min_val)) / nlevels; + unsigned int margin = (max_val - min_val) / 20; /* 5% */ + int i; + + for (i = 1; i < nlevels; i++) { + unsigned int linear_value = min_val + ((i * slope) / 128); + unsigned int delta = abs(linear_value - data->levels[i]); + + if (delta > margin) + return false; + } + + return true; +} + static int pwm_backlight_initial_power_state(const struct pwm_bl_data *pb) { struct device_node *node = pb->dev->of_node; @@ -536,6 +561,8 @@ static int pwm_backlight_probe(struct platform_device *pdev) goto err_alloc; } + memset(&props, 0, sizeof(struct backlight_properties)); + if (data->levels) { /* * For the DT case, only when brightness levels is defined @@ -548,6 +575,11 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->levels = data->levels; } + + if (pwm_backlight_is_linear(data)) + props.scale = BACKLIGHT_SCALE_LINEAR; + else + props.scale = BACKLIGHT_SCALE_NON_LINEAR; } else if (!data->max_brightness) { /* * If no brightness levels are provided and max_brightness is @@ -574,6 +606,8 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->levels = data->levels; } + + props.scale = BACKLIGHT_SCALE_NON_LINEAR; } else { /* * That only happens for the non-DT case, where platform data @@ -584,7 +618,6 @@ static int pwm_backlight_probe(struct platform_device *pdev) pb->lth_brightness = data->lth_brightness * (state.period / pb->scale); - memset(&props, 0, sizeof(struct backlight_properties)); props.type = BACKLIGHT_RAW; props.max_brightness = data->max_brightness; bl = backlight_device_register(dev_name(&pdev->dev), &pdev->dev, pb, diff --git a/drivers/video/backlight/rave-sp-backlight.c b/drivers/video/backlight/rave-sp-backlight.c index 462f14a1b19d..05b5f003a3d1 100644 --- a/drivers/video/backlight/rave-sp-backlight.c +++ b/drivers/video/backlight/rave-sp-backlight.c @@ -48,14 +48,20 @@ static int rave_sp_backlight_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct backlight_device *bd; - bd = devm_backlight_device_register(dev, pdev->name, dev->parent, + bd = devm_backlight_device_register(dev, pdev->name, dev, dev_get_drvdata(dev->parent), &rave_sp_backlight_ops, &rave_sp_backlight_props); if (IS_ERR(bd)) return PTR_ERR(bd); - backlight_update_status(bd); + /* + * If there is a phandle pointing to the device node we can + * assume that another device will manage the status changes. + * If not we make sure the backlight is in a consistent state. 
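+ * A consumer reference creating such a phandle would look like
+ * backlight = <&rave_sp_backlight>; in the device tree (node name
+ * chosen for illustration).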
+ */ + if (!dev->of_node->phandle) + backlight_update_status(bd); return 0; } diff --git a/drivers/video/backlight/tosa_lcd.c b/drivers/video/backlight/tosa_lcd.c index 65cb7578776f..29af8e27b6e5 100644 --- a/drivers/video/backlight/tosa_lcd.c +++ b/drivers/video/backlight/tosa_lcd.c @@ -222,8 +222,7 @@ static int tosa_lcd_remove(struct spi_device *spi) { struct tosa_lcd_data *data = spi_get_drvdata(spi); - if (data->i2c) - i2c_unregister_device(data->i2c); + i2c_unregister_device(data->i2c); tosa_lcd_tg_off(data); diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index c559f706ae7e..156360e37714 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -48,6 +48,7 @@ /* Includes */ #include <linux/acpi.h> /* For ACPI support */ +#include <linux/bits.h> /* For BIT() */ #include <linux/module.h> /* For module specific items */ #include <linux/moduleparam.h> /* For new moduleparam's */ #include <linux/types.h> /* For standard types (like size_t) */ @@ -215,6 +216,23 @@ static int update_no_reboot_bit_mem(void *priv, bool set) return 0; } +static int update_no_reboot_bit_cnt(void *priv, bool set) +{ + struct iTCO_wdt_private *p = priv; + u16 val, newval; + + val = inw(TCO1_CNT(p)); + if (set) + val |= BIT(0); + else + val &= ~BIT(0); + outw(val, TCO1_CNT(p)); + newval = inw(TCO1_CNT(p)); + + /* make sure the update is successful */ + return val != newval ? -EIO : 0; +} + static void iTCO_wdt_no_reboot_bit_setup(struct iTCO_wdt_private *p, struct itco_wdt_platform_data *pdata) { @@ -224,7 +242,9 @@ static void iTCO_wdt_no_reboot_bit_setup(struct iTCO_wdt_private *p, return; } - if (p->iTCO_version >= 2) + if (p->iTCO_version >= 6) + p->update_no_reboot_bit = update_no_reboot_bit_cnt; + else if (p->iTCO_version >= 2) p->update_no_reboot_bit = update_no_reboot_bit_mem; else if (p->iTCO_version == 1) p->update_no_reboot_bit = update_no_reboot_bit_pci; @@ -452,7 +472,8 @@ static int iTCO_wdt_probe(struct platform_device *pdev) * Get the Memory-Mapped GCS or PMC register, we need it for the * NO_REBOOT flag (TCO v2 and v3).
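* On v6 and later the NO_REBOOT bit lives in the TCO1_CNT I/O register
* instead and is flipped by update_no_reboot_bit_cnt() above, so no
* MMIO resource is needed in that case.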
*/ - if (p->iTCO_version >= 2 && !pdata->update_no_reboot_bit) { + if (p->iTCO_version >= 2 && p->iTCO_version < 6 && + !pdata->update_no_reboot_bit) { p->gcs_pmc_res = platform_get_resource(pdev, IORESOURCE_MEM, ICH_RES_MEM_GCS_PMC); @@ -502,6 +523,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev) /* Clear out the (probably old) status */ switch (p->iTCO_version) { + case 6: case 5: case 4: outw(0x0008, TCO1_STS(p)); /* Clear the Time Out Status bit */ diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 2e8570c09789..6c8843968a52 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -247,7 +247,7 @@ static void xen_irq_info_cleanup(struct irq_info *info) */ unsigned int evtchn_from_irq(unsigned irq) { - if (unlikely(WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq))) + if (WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq)) return 0; return info_for_irq(irq)->evtchn; diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 3eeb9bea7630..224df03ce42e 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -17,6 +17,8 @@ #include "../pci/pci.h" #ifdef CONFIG_PCI_MMCONFIG #include + +static int xen_mcfg_late(void); #endif static bool __read_mostly pci_seg_supported = true; @@ -28,7 +30,18 @@ static int xen_add_device(struct device *dev) #ifdef CONFIG_PCI_IOV struct pci_dev *physfn = pci_dev->physfn; #endif - +#ifdef CONFIG_PCI_MMCONFIG + static bool pci_mcfg_reserved = false; + /* + * Reserve MCFG areas in Xen on first invocation due to this being + * potentially called from inside of acpi_init immediately after + * MCFG table has been finally parsed. + */ + if (!pci_mcfg_reserved) { + xen_mcfg_late(); + pci_mcfg_reserved = true; + } +#endif if (pci_seg_supported) { struct { struct physdev_pci_device_add add; @@ -201,7 +214,7 @@ static int __init register_xen_pci_notifier(void) arch_initcall(register_xen_pci_notifier); #ifdef CONFIG_PCI_MMCONFIG -static int __init xen_mcfg_late(void) +static int xen_mcfg_late(void) { struct pci_mmcfg_region *cfg; int rc; @@ -240,8 +253,4 @@ static int __init xen_mcfg_late(void) } return 0; } -/* - * Needs to be done after acpi_init which are subsys_initcall. - */ -subsys_initcall_sync(xen_mcfg_late); #endif diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 58c9365fa217..bd3a10dfac15 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -39,6 +39,7 @@ #include #include +#define MAX_DMA_BITS 32 /* * Used to do a quick range check in swiotlb_tbl_unmap_single and * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this @@ -115,8 +116,6 @@ static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) return 0; } -static int max_dma_bits = 32; - static int xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) { @@ -136,7 +135,7 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) p + (i << IO_TLB_SHIFT), get_order(slabs << IO_TLB_SHIFT), dma_bits, &dma_handle); - } while (rc && dma_bits++ < max_dma_bits); + } while (rc && dma_bits++ < MAX_DMA_BITS); if (rc) return rc; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d4e11b2e04f6..ad4c6b1d5074 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -670,26 +670,6 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, * libraries. There is no binary dependent code anywhere else. 
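* (randomize_stack_top() and its STACK_RND_MASK fallback are deleted
* below because they moved to generic mm code in this cycle; the
* helper is now declared in <linux/mm.h>.)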
*/ -#ifndef STACK_RND_MASK -#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ -#endif - -static unsigned long randomize_stack_top(unsigned long stack_top) -{ - unsigned long random_variable = 0; - - if (current->flags & PF_RANDOMIZE) { - random_variable = get_random_long(); - random_variable &= STACK_RND_MASK; - random_variable <<= PAGE_SHIFT; - } -#ifdef CONFIG_STACK_GROWSUP - return PAGE_ALIGN(stack_top) + random_variable; -#else - return PAGE_ALIGN(stack_top) - random_variable; -#endif -} - static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ @@ -1141,7 +1121,8 @@ static int load_elf_binary(struct linux_binprm *bprm) * (since it grows up, and may collide early with the stack * growing down), and into the unused ELF_ET_DYN_BASE region. */ - if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && !interpreter) + if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && + loc->elf_ex.e_type == ET_DYN && !interpreter) current->mm->brk = current->mm->start_brk = ELF_ET_DYN_BASE; diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index a699e320393f..c1da294418d1 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ - export.o caps.o snap.o xattr.o quota.o \ + export.o caps.o snap.o xattr.o quota.o io.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b3c8b886bf64..7ab616601141 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = - &ceph_inode_to_client(inode)->client->osdc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int err = 0; u64 off = page_offset(page); u64 len = PAGE_SIZE; @@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); - err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - off, &len, + err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), + &ci->i_layout, off, &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); if (err == -ENOENT) @@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) if (err < 0) { SetPageError(page); ceph_fscache_readpage_cancel(inode, page); + if (err == -EBLACKLISTED) + fsc->blacklisted = true; goto out; } if (err < PAGE_SIZE) @@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req) int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); + if (rc == -EBLACKLISTED) + ceph_inode_to_client(inode)->blacklisted = true; /* unlock all pages, zeroing any data we didn't read */ osd_data = osd_req_op_extent_osd_data(req, 0); @@ -323,7 +326,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, /* caller of readpages does not hold buffer and read caps * (fadvise, madvise and readahead cases) */ int want = CEPH_CAP_FILE_CACHE; - ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, + true, &got); if (ret < 0) { dout("start_read %p, error getting cap\n", inode); } else if (!(got & want)) { @@ -569,7 +573,7 @@ static u64 get_writepages_data_length(struct inode *inode, /* * Write a single page, but leave the page locked. 
* - * If we get a write error, set the page error bit, but still adjust the + * If we get a write error, mark the mapping for error, but still adjust the * dirty page accounting (i.e., page is no longer dirty). */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) @@ -640,9 +644,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) end_page_writeback(page); return err; } + if (err == -EBLACKLISTED) + fsc->blacklisted = true; dout("writepage setting page/mapping error %d %p\n", err, page); - SetPageError(page); mapping_set_error(&inode->i_data, err); wbc->pages_skipped++; } else { @@ -679,23 +684,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) return err; } -/* - * lame release_pages helper. release_pages() isn't exported to - * modules. - */ -static void ceph_release_pages(struct page **pages, int num) -{ - struct pagevec pvec; - int i; - - pagevec_init(&pvec); - for (i = 0; i < num; i++) { - if (pagevec_add(&pvec, pages[i]) == 0) - pagevec_release(&pvec); - } - pagevec_release(&pvec); -} - /* * async writeback completion handler. * @@ -720,6 +708,8 @@ static void writepages_finish(struct ceph_osd_request *req) if (rc < 0) { mapping_set_error(mapping, rc); ceph_set_error_write(ci); + if (rc == -EBLACKLISTED) + fsc->blacklisted = true; } else { ceph_clear_error_write(ci); } @@ -769,7 +759,7 @@ static void writepages_finish(struct ceph_osd_request *req) dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", inode, osd_data->length, rc >= 0 ? num_pages : 0); - ceph_release_pages(osd_data->pages, num_pages); + release_pages(osd_data->pages, num_pages); } ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); @@ -1452,7 +1442,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, + &got, &pinned_page); if (err < 0) goto out_restore; @@ -1540,6 +1531,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) if (!prealloc_cf) return VM_FAULT_OOM; + sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); if (ci->i_inline_version != CEPH_INLINE_NONE) { @@ -1568,7 +1560,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got, NULL); if (err < 0) goto out_free; @@ -1614,6 +1606,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ceph_put_cap_refs(ci, got); out_free: ceph_restore_sigs(&oldset); + sb_end_pagefault(inode->i_sb); ceph_free_cap_flush(prealloc_cf); if (err < 0) ret = vmf_error(err); @@ -1946,12 +1939,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, if (err >= 0 || err == -ENOENT) have |= POOL_READ; - else if (err != -EPERM) + else if (err != -EPERM) { + if (err == -EBLACKLISTED) + fsc->blacklisted = true; goto out_unlock; + } if (err2 == 0 || err2 == -EEXIST) have |= POOL_WRITE; else if (err2 != -EPERM) { + if (err2 == -EBLACKLISTED) + fsc->blacklisted = true; err = err2; goto out_unlock; } @@ -1989,10 +1987,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, return err; } -int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) +int ceph_pool_perm_check(struct inode *inode, int need) { - s64 pool; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_string 
*pool_ns; + s64 pool; int ret, flags; if (ci->i_vino.snap != CEPH_NOSNAP) { @@ -2004,7 +2003,7 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) return 0; } - if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), + if (ceph_test_mount_opt(ceph_inode_to_client(inode), NOPOOLPERM)) return 0; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index bc90cf6ad7ed..b2ec29eeb4c4 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -6,6 +6,8 @@ * Written by Milosz Tanski (milosz@adfin.com) */ +#include + #include "super.h" #include "cache.h" diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ce0f5658720a..d3b9c9d5c1bd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -457,37 +457,6 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) return cap; } -/* - * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. - */ -static int __ceph_get_cap_mds(struct ceph_inode_info *ci) -{ - struct ceph_cap *cap; - int mds = -1; - struct rb_node *p; - - /* prefer mds with WR|BUFFER|EXCL caps */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - cap = rb_entry(p, struct ceph_cap, ci_node); - mds = cap->mds; - if (cap->issued & (CEPH_CAP_FILE_WR | - CEPH_CAP_FILE_BUFFER | - CEPH_CAP_FILE_EXCL)) - break; - } - return mds; -} - -int ceph_get_cap_mds(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int mds; - spin_lock(&ci->i_ceph_lock); - mds = __ceph_get_cap_mds(ceph_inode(inode)); - spin_unlock(&ci->i_ceph_lock); - return mds; -} - /* * Called under i_ceph_lock. */ @@ -628,7 +597,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, /* * Add a capability under the given MDS session. * - * Caller should hold session snap_rwsem (read) and s_mutex. + * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock * * @fmode is the open file mode, if we are opening a file, otherwise * it is < 0. (This is so we can atomically add the cap and add an @@ -645,6 +614,9 @@ void ceph_add_cap(struct inode *inode, struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; + u32 gen; + + lockdep_assert_held(&ci->i_ceph_lock); dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, session->s_mds, cap_id, ceph_cap_string(issued), seq); @@ -656,6 +628,10 @@ void ceph_add_cap(struct inode *inode, if (fmode >= 0) wanted |= ceph_caps_for_mode(fmode); + spin_lock(&session->s_gen_ttl_lock); + gen = session->s_cap_gen; + spin_unlock(&session->s_gen_ttl_lock); + cap = __get_cap_for_mds(ci, mds); if (!cap) { cap = *new_cap; @@ -681,7 +657,7 @@ void ceph_add_cap(struct inode *inode, list_move_tail(&cap->session_caps, &session->s_caps); spin_unlock(&session->s_cap_lock); - if (cap->cap_gen < session->s_cap_gen) + if (cap->cap_gen < gen) cap->issued = cap->implemented = CEPH_CAP_PIN; /* @@ -775,7 +751,7 @@ void ceph_add_cap(struct inode *inode, cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; - cap->cap_gen = session->s_cap_gen; + cap->cap_gen = gen; if (fmode >= 0) __ceph_get_fmode(ci, fmode); @@ -1284,10 +1260,6 @@ void __ceph_remove_caps(struct ceph_inode_info *ci) * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. * - * Make half-hearted attempt ot to invalidate page cache if we are - * dropping RDCACHE. Note that this will leave behind locked pages - * that we'll then need to deal with elsewhere. - * * Return non-zero if delayed release, or we experienced an error * such that the caller should requeue + retry later. 
* @@ -1746,11 +1718,11 @@ static bool __finish_cap_flush(struct ceph_mds_client *mdsc, * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * - * Called under i_ceph_lock. + * Called under i_ceph_lock. Returns the flush tid. */ -static int __mark_caps_flushing(struct inode *inode, +static u64 __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session, bool wake, - u64 *flush_tid, u64 *oldest_flush_tid) + u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); @@ -1789,8 +1761,7 @@ static int __mark_caps_flushing(struct inode *inode, list_add_tail(&cf->i_list, &ci->i_cap_flush_list); - *flush_tid = cf->tid; - return flushing; + return cf->tid; } /* @@ -2028,11 +1999,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, } ack: - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { - dout(" skipping %p I_NOFLUSH set\n", inode); - continue; - } - if (session && session != cap->session) { dout("oops, wrong session %p mutex\n", session); mutex_unlock(&session->s_mutex); @@ -2080,9 +2046,9 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { - flushing = __mark_caps_flushing(inode, session, false, - &flush_tid, - &oldest_flush_tid); + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, false, + &oldest_flush_tid); } else { flushing = 0; flush_tid = 0; @@ -2130,16 +2096,11 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) retry: spin_lock(&ci->i_ceph_lock); retry_locked: - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { - spin_unlock(&ci->i_ceph_lock); - dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); - goto out; - } if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; int delayed; - if (!session || session != cap->session) { + if (session != cap->session) { spin_unlock(&ci->i_ceph_lock); if (session) mutex_unlock(&session->s_mutex); @@ -2161,8 +2122,9 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) goto retry_locked; } - flushing = __mark_caps_flushing(inode, session, true, - &flush_tid, &oldest_flush_tid); + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, true, + &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, @@ -2261,35 +2223,45 @@ static int unsafe_request_wait(struct inode *inode) int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { + struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); u64 flush_tid; - int ret; + int ret, err; int dirty; dout("fsync %p%s\n", inode, datasync ? 
" datasync" : ""); ret = file_write_and_wait_range(file, start, end); - if (ret < 0) - goto out; - if (datasync) goto out; dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); - ret = unsafe_request_wait(inode); + err = unsafe_request_wait(inode); /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ - if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { - ret = wait_event_interruptible(ci->i_cap_wq, + if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { + err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } + + if (err < 0) + ret = err; + + if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) { + spin_lock(&file->f_lock); + err = errseq_check_and_advance(&ci->i_meta_err, + &fi->meta_err); + spin_unlock(&file->f_lock); + if (err < 0) + ret = err; + } out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; @@ -2560,10 +2532,15 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, * * FIXME: how does a 0 return differ from -EAGAIN? */ -static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - loff_t endoff, bool nonblock, int *got) +enum { + NON_BLOCKING = 1, + CHECK_FILELOCK = 2, +}; + +static int try_get_cap_refs(struct inode *inode, int need, int want, + loff_t endoff, int flags, int *got) { - struct inode *inode = &ci->vfs_inode; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; int ret = 0; int have, implemented; @@ -2576,6 +2553,13 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, again: spin_lock(&ci->i_ceph_lock); + if ((flags & CHECK_FILELOCK) && + (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { + dout("try_get_cap_refs %p error filelock\n", inode); + ret = -EIO; + goto out_unlock; + } + /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); if ((file_wanted & need) != need) { @@ -2637,7 +2621,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, * we can not call down_read() when * task isn't in TASK_RUNNING state */ - if (nonblock) { + if (flags & NON_BLOCKING) { ret = -EAGAIN; goto out_unlock; } @@ -2731,18 +2715,19 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } -int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, +int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got) { int ret; BUG_ON(need & ~CEPH_CAP_FILE_RD); BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); - ret = ceph_pool_perm_check(ci, need); + ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; - ret = try_get_cap_refs(ci, need, want, 0, nonblock, got); + ret = try_get_cap_refs(inode, need, want, 0, + (nonblock ? NON_BLOCKING : 0), got); return ret == -EAGAIN ? 0 : ret; } @@ -2751,30 +2736,40 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. 
*/ -int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, +int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { - int _got, ret; + struct ceph_file_info *fi = filp->private_data; + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + int ret, _got, flags; - ret = ceph_pool_perm_check(ci, need); + ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; + if ((fi->fmode & CEPH_FILE_MODE_WR) && + fi->filp_gen != READ_ONCE(fsc->filp_gen)) + return -EBADF; + while (true) { if (endoff > 0) - check_max_size(&ci->vfs_inode, endoff); + check_max_size(inode, endoff); + flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0; _got = 0; - ret = try_get_cap_refs(ci, need, want, endoff, - false, &_got); + ret = try_get_cap_refs(inode, need, want, endoff, + flags, &_got); if (ret == -EAGAIN) continue; if (!ret) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&ci->i_cap_wq, &wait); - while (!(ret = try_get_cap_refs(ci, need, want, endoff, - true, &_got))) { + flags |= NON_BLOCKING; + while (!(ret = try_get_cap_refs(inode, need, want, + endoff, flags, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -2786,10 +2781,18 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (ret == -EAGAIN) continue; } + + if ((fi->fmode & CEPH_FILE_MODE_WR) && + fi->filp_gen != READ_ONCE(fsc->filp_gen)) { + if (ret >= 0 && _got) + ceph_put_cap_refs(ci, _got); + return -EBADF; + } + if (ret < 0) { if (ret == -ESTALE) { /* session was killed, try renew caps */ - ret = ceph_renew_caps(&ci->vfs_inode); + ret = ceph_renew_caps(inode); if (ret == 0) continue; } @@ -2798,9 +2801,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (ci->i_inline_version != CEPH_INLINE_NONE && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - i_size_read(&ci->vfs_inode) > 0) { + i_size_read(inode) > 0) { struct page *page = - find_get_page(ci->vfs_inode.i_mapping, 0); + find_get_page(inode->i_mapping, 0); if (page) { if (PageUptodate(page)) { *pinned_page = page; @@ -2819,7 +2822,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, * getattr request will bring inline data into * page cache */ - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + ret = __ceph_do_getattr(inode, NULL, CEPH_STAT_CAP_INLINE_DATA, true); if (ret < 0) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 2eb88ed22993..facb387c2735 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -294,7 +294,6 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - return 0; } void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 15ff1b09cfa2..b6bfa94332c3 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -35,7 +35,7 @@ struct ceph_nfs_snapfh { static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { - const static int snap_handle_length = + static const int snap_handle_length = sizeof(struct ceph_nfs_snapfh) >> 2; struct ceph_nfs_snapfh *sfh = (void *)rawfh; u64 snapid = ceph_snap(inode); @@ -85,9 +85,9 @@ static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { - const static int handle_length = + static const int 
handle_length = sizeof(struct ceph_nfs_fh) >> 2; - const static int connected_handle_length = + static const int connected_handle_length = sizeof(struct ceph_nfs_confh) >> 2; int type; @@ -458,33 +458,33 @@ static int __get_snap_name(struct dentry *parent, char *name, if (err < 0) goto out; - rinfo = &req->r_reply_info; - for (i = 0; i < rinfo->dir_nr; i++) { - rde = rinfo->dir_entries + i; - BUG_ON(!rde->inode.in); - if (ceph_snap(inode) == - le64_to_cpu(rde->inode.in->snapid)) { - memcpy(name, rde->name, rde->name_len); - name[rde->name_len] = '\0'; - err = 0; - goto out; - } - } + rinfo = &req->r_reply_info; + for (i = 0; i < rinfo->dir_nr; i++) { + rde = rinfo->dir_entries + i; + BUG_ON(!rde->inode.in); + if (ceph_snap(inode) == + le64_to_cpu(rde->inode.in->snapid)) { + memcpy(name, rde->name, rde->name_len); + name[rde->name_len] = '\0'; + err = 0; + goto out; + } + } - if (rinfo->dir_end) - break; + if (rinfo->dir_end) + break; - BUG_ON(rinfo->dir_nr <= 0); - rde = rinfo->dir_entries + (rinfo->dir_nr - 1); - next_offset += rinfo->dir_nr; - last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); - if (!last_name) { - err = -ENOMEM; - goto out; - } + BUG_ON(rinfo->dir_nr <= 0); + rde = rinfo->dir_entries + (rinfo->dir_nr - 1); + next_offset += rinfo->dir_nr; + last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); + if (!last_name) { + err = -ENOMEM; + goto out; + } - ceph_mdsc_put_request(req); - req = NULL; + ceph_mdsc_put_request(req); + req = NULL; } err = -ENOENT; out: diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 685a03cc4b77..d277f71abe0b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -15,6 +15,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "io.h" static __le32 ceph_flags_sys2wire(u32 flags) { @@ -201,6 +202,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) static int ceph_init_file_info(struct inode *inode, struct file *file, int fmode, bool isdir) { + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, @@ -211,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, struct ceph_dir_file_info *dfi = kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); if (!dfi) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + ceph_put_fmode(ci, fmode); /* clean up */ return -ENOMEM; } @@ -222,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, } else { fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); if (!fi) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + ceph_put_fmode(ci, fmode); /* clean up */ return -ENOMEM; } @@ -232,6 +234,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, fi->fmode = fmode; spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); + fi->meta_err = errseq_sample(&ci->i_meta_err); + fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); return 0; } @@ -695,7 +699,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ceph_release_page_vector(pages, num_pages); } - if (ret <= 0 || off >= i_size || !more) + if (ret < 0) { + if (ret == -EBLACKLISTED) + fsc->blacklisted = true; + break; + } + + if (off >= i_size || !more) break; } @@ -921,7 +931,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct ceph_aio_request *aio_req = NULL; int num_pages = 0; int flags; - int ret; + int ret = 0; struct timespec64 mtime = current_time(inode); size_t 
count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; @@ -935,11 +945,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (write ? "write" : "read"), file, pos, (unsigned)count, snapc, snapc ? snapc->seq : 0); - ret = filemap_write_and_wait_range(inode->i_mapping, - pos, pos + count - 1); - if (ret < 0) - return ret; - if (write) { int ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, @@ -1260,7 +1265,8 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, + &got, &pinned_page); if (ret < 0) return ret; @@ -1274,12 +1280,16 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) if (ci->i_inline_version == CEPH_INLINE_NONE) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + ceph_start_io_direct(inode); ret = ceph_direct_read_write(iocb, to, NULL, NULL); + ceph_end_io_direct(inode); if (ret >= 0 && ret < len) retry_op = CHECK_EOF; } else { + ceph_start_io_read(inode); ret = ceph_sync_read(iocb, to, &retry_op); + ceph_end_io_read(inode); } } else { retry_op = READ_INLINE; @@ -1290,7 +1300,9 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); ceph_add_rw_context(fi, &rw_ctx); + ceph_start_io_read(inode); ret = generic_file_read_iter(iocb, to); + ceph_end_io_read(inode); ceph_del_rw_context(fi, &rw_ctx); } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", @@ -1399,7 +1411,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) return -ENOMEM; retry_snap: - inode_lock(inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_start_io_direct(inode); + else + ceph_start_io_write(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1457,7 +1472,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) else want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got, NULL); if (err < 0) goto out; @@ -1470,7 +1485,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { struct ceph_snap_context *snapc; struct iov_iter data; - inode_unlock(inode); spin_lock(&ci->i_ceph_lock); if (__ceph_have_pending_cap_snap(ci)) { @@ -1487,11 +1501,14 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) /* we might need to revert back to that point */ data = *from; - if (iocb->ki_flags & IOCB_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) { written = ceph_direct_read_write(iocb, &data, snapc, &prealloc_cf); - else + ceph_end_io_direct(inode); + } else { written = ceph_sync_write(iocb, &data, pos, snapc); + ceph_end_io_write(inode); + } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); @@ -1506,7 +1523,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - inode_unlock(inode); + ceph_end_io_write(inode); } if (written >= 0) { @@ -1541,9 +1558,11 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) } goto out_unlocked; - out: - 
inode_unlock(inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_write(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; @@ -1781,7 +1800,7 @@ static long ceph_fallocate(struct file *file, int mode, else want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); if (ret < 0) goto unlock; @@ -1810,16 +1829,15 @@ static long ceph_fallocate(struct file *file, int mode, * src_ci. Two attempts are made to obtain both caps, and an error is return if * this fails; zero is returned on success. */ -static int get_rd_wr_caps(struct ceph_inode_info *src_ci, - loff_t src_endoff, int *src_got, - struct ceph_inode_info *dst_ci, +static int get_rd_wr_caps(struct file *src_filp, int *src_got, + struct file *dst_filp, loff_t dst_endoff, int *dst_got) { int ret = 0; bool retrying = false; retry_caps: - ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, + ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, dst_endoff, dst_got, NULL); if (ret < 0) return ret; @@ -1829,24 +1847,24 @@ static int get_rd_wr_caps(struct ceph_inode_info *src_ci, * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some * retry dance instead to try to get both capabilities. */ - ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, + ret = ceph_try_get_caps(file_inode(src_filp), + CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, false, src_got); if (ret <= 0) { /* Start by dropping dst_ci caps and getting src_ci caps */ - ceph_put_cap_refs(dst_ci, *dst_got); + ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); if (retrying) { if (!ret) /* ceph_try_get_caps masks EAGAIN */ ret = -EAGAIN; return ret; } - ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, - CEPH_CAP_FILE_SHARED, src_endoff, - src_got, NULL); + ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, + CEPH_CAP_FILE_SHARED, -1, src_got, NULL); if (ret < 0) return ret; /*... drop src_ci caps too, and retry */ - ceph_put_cap_refs(src_ci, *src_got); + ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); retrying = true; goto retry_caps; } @@ -1904,6 +1922,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, struct ceph_inode_info *src_ci = ceph_inode(src_inode); struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); struct ceph_cap_flush *prealloc_cf; + struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); struct ceph_object_locator src_oloc, dst_oloc; struct ceph_object_id src_oid, dst_oid; loff_t endoff = 0, size; @@ -1913,10 +1932,16 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, int src_got = 0, dst_got = 0, err, dirty; bool do_final_copy = false; - if (src_inode == dst_inode) - return -EINVAL; - if (src_inode->i_sb != dst_inode->i_sb) - return -EXDEV; + if (src_inode->i_sb != dst_inode->i_sb) { + struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); + + if (ceph_fsid_compare(&src_fsc->client->fsid, + &dst_fsc->client->fsid)) { + dout("Copying files across clusters: src: %pU dst: %pU\n", + &src_fsc->client->fsid, &dst_fsc->client->fsid); + return -EXDEV; + } + } if (ceph_snap(dst_inode) != CEPH_NOSNAP) return -EROFS; @@ -1928,7 +1953,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * efficient). 
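
The get_rd_wr_caps() rework earlier in this hunk keeps the deadlock-avoidance scheme intact: take the destination caps first, only try for the source caps, and on failure drop everything, wait for the source once, then retry. The sketch below models it with two pthread mutexes standing in for the two inodes' caps; it is illustrative only, since caps are reference counts rather than locks.

/*
 * Model of the get_rd_wr_caps() retry dance: blocking acquire on one
 * resource, trylock on the other, full back-off on failure.  This
 * avoids the ABBA deadlock of two tasks blocking in opposite orders.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t src = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t dst = PTHREAD_MUTEX_INITIALIZER;

static int get_rd_wr_locks(void)
{
        int retrying = 0;

        for (;;) {
                pthread_mutex_lock(&dst);
                if (pthread_mutex_trylock(&src) == 0)
                        return 0;               /* got both */
                pthread_mutex_unlock(&dst);     /* back off completely */
                if (retrying)
                        return -1;              /* like -EAGAIN */
                pthread_mutex_lock(&src);       /* wait for src once */
                pthread_mutex_unlock(&src);
                retrying = 1;
        }
}

int main(void)
{
        if (!get_rd_wr_locks()) {
                printf("got both locks without risking ABBA deadlock\n");
                pthread_mutex_unlock(&src);
                pthread_mutex_unlock(&dst);
        }
        return 0;
}
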
*/ - if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM)) + if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) return -EOPNOTSUPP; if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || @@ -1960,8 +1985,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * clients may have dirty data in their caches. And OSDs know nothing * about caps, so they can't safely do the remote object copies. */ - err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, - dst_ci, (dst_off + len), &dst_got); + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); if (err < 0) { dout("get_rd_wr_caps returned %d\n", err); ret = -EOPNOTSUPP; @@ -2018,9 +2043,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, goto out; } len -= ret; - err = get_rd_wr_caps(src_ci, (src_off + len), - &src_got, dst_ci, - (dst_off + len), &dst_got); + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); if (err < 0) goto out; err = is_file_size_ok(src_inode, dst_inode, @@ -2044,7 +2068,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, dst_ci->i_vino.ino, dst_objnum); /* Do an object remote copy */ err = ceph_osdc_copy_from( - &ceph_inode_to_client(src_inode)->client->osdc, + &src_fsc->client->osdc, src_ci->i_vino.snap, 0, &src_oid, &src_oloc, CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 18500edefc56..9f135624ae47 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -515,6 +515,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ceph_fscache_inode_init(ci); + ci->i_meta_err = 0; + return &ci->vfs_inode; } @@ -801,7 +803,12 @@ static int fill_inode(struct inode *inode, struct page *locked_page, /* update inode */ inode->i_rdev = le32_to_cpu(info->rdev); - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + /* directories have fl_stripe_unit set to zero */ + if (le32_to_cpu(info->layout.fl_stripe_unit)) + inode->i_blkbits = + fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + else + inode->i_blkbits = CEPH_BLOCK_SHIFT; __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); @@ -1982,7 +1989,7 @@ static const struct inode_operations ceph_symlink_iops = { int __ceph_setattr(struct inode *inode, struct iattr *attr) { struct ceph_inode_info *ci = ceph_inode(inode); - const unsigned int ia_valid = attr->ia_valid; + unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf; @@ -2087,6 +2094,26 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } + if (ia_valid & ATTR_SIZE) { + dout("setattr %p size %lld -> %lld\n", inode, + inode->i_size, attr->ia_size); + if ((issued & CEPH_CAP_FILE_EXCL) && + attr->ia_size > inode->i_size) { + i_size_write(inode, attr->ia_size); + inode->i_blocks = calc_inode_blocks(attr->ia_size); + ci->i_reported_size = attr->ia_size; + dirtied |= CEPH_CAP_FILE_EXCL; + ia_valid |= ATTR_MTIME; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + attr->ia_size != inode->i_size) { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = + cpu_to_le64(inode->i_size); + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + } + } if (ia_valid & ATTR_MTIME) { dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, 
inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, @@ -2109,25 +2136,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } - if (ia_valid & ATTR_SIZE) { - dout("setattr %p size %lld -> %lld\n", inode, - inode->i_size, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && - attr->ia_size > inode->i_size) { - i_size_write(inode, attr->ia_size); - inode->i_blocks = calc_inode_blocks(attr->ia_size); - ci->i_reported_size = attr->ia_size; - dirtied |= CEPH_CAP_FILE_EXCL; - } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - attr->ia_size != inode->i_size) { - req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = - cpu_to_le64(inode->i_size); - mask |= CEPH_SETATTR_SIZE; - release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; - } - } /* these do nothing */ if (ia_valid & ATTR_CTIME) { diff --git a/fs/ceph/io.c b/fs/ceph/io.c new file mode 100644 index 000000000000..97602ea92ff4 --- /dev/null +++ b/fs/ceph/io.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Trond Myklebust + * Copyright (c) 2019 Jeff Layton + * + * I/O and data path helper functionality. + * + * Heavily borrowed from equivalent code in fs/nfs/io.c + */ + +#include + +#include +#include +#include +#include + +#include "super.h" +#include "io.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) +{ + lockdep_assert_held_write(&inode->i_rwsem); + + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + inode_dio_wait(inode); + } +} + +/** + * ceph_start_io_read - declare the file is being used for buffered reads + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +void +ceph_start_io_read(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + ceph_block_o_direct(ci, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * ceph_end_io_read - declare that the buffered read operation is done + * @inode: file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +ceph_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * ceph_start_io_write - declare the file is being used for buffered writes + * @inode: file inode + * + * Declare that a buffered write operation is about to start, and ensure + * that we block all direct I/O. 
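
ceph_start_io_read() above is the optimistic half of the buffered/direct exclusion: take i_rwsem shared, and fall back to the exclusive lock only when CEPH_I_ODIRECT needs clearing. POSIX rwlocks cannot downgrade write-to-read the way downgrade_write() can, so the model below simply drops the write lock and loops; the invariant it preserves is the same: the flag changes only under the exclusive lock, and readers proceed only while holding the shared lock with the flag clear.

/*
 * Userspace model of the ceph_start_io_read() fast/slow path.  Names
 * and the retry loop are illustrative; the kernel uses
 * down_write()/downgrade_write() instead of drop-and-retry.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t rwsem = PTHREAD_RWLOCK_INITIALIZER;
static int odirect;     /* stands in for CEPH_I_ODIRECT */

static void start_io_read(void)
{
        for (;;) {
                pthread_rwlock_rdlock(&rwsem);  /* be an optimist */
                if (!odirect)
                        return;                 /* fast path, lock held */
                pthread_rwlock_unlock(&rwsem);
                pthread_rwlock_wrlock(&rwsem);  /* slow path */
                odirect = 0;                    /* block direct I/O */
                /* kernel: inode_dio_wait(), then downgrade_write() */
                pthread_rwlock_unlock(&rwsem);
        }
}

int main(void)
{
        odirect = 1;
        start_io_read();        /* returns holding the read lock */
        printf("buffered read may proceed, odirect=%d\n", odirect);
        pthread_rwlock_unlock(&rwsem);
        return 0;
}
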
+ */ +void +ceph_start_io_write(struct inode *inode) +{ + down_write(&inode->i_rwsem); + ceph_block_o_direct(ceph_inode(inode), inode); +} + +/** + * ceph_end_io_write - declare that the buffered write operation is done + * @inode: file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void +ceph_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) +{ + lockdep_assert_held_write(&inode->i_rwsem); + + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + /* FIXME: unmap_mapping_range? */ + filemap_write_and_wait(inode->i_mapping); + } +} + +/** + * ceph_start_io_direct - declare the file is being used for direct i/o + * @inode: file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +void +ceph_start_io_direct(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + ceph_block_buffered(ci, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * ceph_end_io_direct - declare that the direct i/o operation is done + * @inode: file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem.
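
Related bookkeeping appears in the ceph_get_caps() hunk earlier in this series: every open file records the client-wide filp_gen, and a forced unmount bumps the client's copy so that stale file handles fail with -EBADF rather than touching a reconnected session. A toy model of that generation check follows; it uses plain ints, where the kernel uses READ_ONCE() on the shared counter, and all type names are stand-ins.

/*
 * Model of the filp_gen check: files sample a mount-wide generation
 * at open, and bumping the generation invalidates every open file.
 */
#include <errno.h>
#include <stdio.h>

struct fs_client { unsigned int filp_gen; };
struct file_info { unsigned int filp_gen; };

static int check_open_file(const struct fs_client *fsc,
                           const struct file_info *fi)
{
        return fi->filp_gen == fsc->filp_gen ? 0 : -EBADF;
}

int main(void)
{
        struct fs_client fsc = { .filp_gen = 1 };
        struct file_info fi = { .filp_gen = fsc.filp_gen }; /* at open */

        printf("before reconnect: %d\n", check_open_file(&fsc, &fi));
        fsc.filp_gen++;         /* like ceph_umount_begin() */
        printf("after reconnect:  %d\n", check_open_file(&fsc, &fi));
        return 0;
}
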
+ */ +void +ceph_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} diff --git a/fs/ceph/io.h b/fs/ceph/io.h new file mode 100644 index 000000000000..fa594cd77348 --- /dev/null +++ b/fs/ceph/io.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_IO_H +#define _FS_CEPH_IO_H + +void ceph_start_io_read(struct inode *inode); +void ceph_end_io_read(struct inode *inode); +void ceph_start_io_write(struct inode *inode); +void ceph_end_io_write(struct inode *inode); +void ceph_start_io_direct(struct inode *inode); +void ceph_end_io_direct(struct inode *inode); + +#endif /* FS_CEPH_IO_H */ diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 5083e238ad15..544e9e85b120 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -32,14 +32,18 @@ void __init ceph_flock_init(void) static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) { - struct inode *inode = file_inode(src->fl_file); + struct ceph_file_info *fi = dst->fl_file->private_data; + struct inode *inode = file_inode(dst->fl_file); atomic_inc(&ceph_inode(inode)->i_filelock_ref); + atomic_inc(&fi->num_locks); } static void ceph_fl_release_lock(struct file_lock *fl) { + struct ceph_file_info *fi = fl->fl_file->private_data; struct inode *inode = file_inode(fl->fl_file); struct ceph_inode_info *ci = ceph_inode(inode); + atomic_dec(&fi->num_locks); if (atomic_dec_and_test(&ci->i_filelock_ref)) { /* clear error when all locks are released */ spin_lock(&ci->i_ceph_lock); @@ -73,7 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, * window. Caller function will decrease the counter. */ fl->fl_ops = &ceph_fl_lock_ops; - atomic_inc(&ceph_inode(inode)->i_filelock_ref); + fl->fl_ops->fl_copy_lock(fl, NULL); } if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 920e9f048bd8..a8a8f84f3bbf 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -639,7 +639,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_renew_seq = 0; INIT_LIST_HEAD(&s->s_caps); s->s_nr_caps = 0; - s->s_trim_caps = 0; refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); @@ -1270,6 +1269,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, { struct ceph_mds_request *req; struct rb_node *p; + struct ceph_inode_info *ci; dout("cleanup_session_requests mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); @@ -1278,6 +1278,16 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, struct ceph_mds_request, r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); + if (req->r_target_inode) { + /* dropping unsafe change of inode's attributes */ + ci = ceph_inode(req->r_target_inode); + errseq_set(&ci->i_meta_err, -EIO); + } + if (req->r_unsafe_dir) { + /* dropping unsafe directory operation */ + ci = ceph_inode(req->r_unsafe_dir); + errseq_set(&ci->i_meta_err, -EIO); + } __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ @@ -1370,7 +1380,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); - bool drop = false; + bool dirty_dropped = false; bool invalidate = false; dout("removing cap %p, ci is %p, inode is %p\n", @@ -1383,9 +1393,12 @@ static int remove_session_caps_cb(struct inode 
*inode, struct ceph_cap *cap, struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = fsc->mdsc; - if (ci->i_wrbuffer_ref > 0 && - READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) - invalidate = true; + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (inode->i_data.nrpages > 0) + invalidate = true; + if (ci->i_wrbuffer_ref > 0) + mapping_set_error(&inode->i_data, -EIO); + } while (!list_empty(&ci->i_cap_flush_list)) { cf = list_first_entry(&ci->i_cap_flush_list, @@ -1405,7 +1418,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, inode, ceph_ino(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); - drop = true; + dirty_dropped = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited( @@ -1415,10 +1428,22 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; - drop = true; + dirty_dropped = true; } spin_unlock(&mdsc->cap_dirty_lock); + if (dirty_dropped) { + errseq_set(&ci->i_meta_err, -EIO); + + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } + if (atomic_read(&ci->i_filelock_ref) > 0) { /* make further file lock syscall return -EIO */ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; @@ -1430,15 +1455,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } - - if (drop && - ci->i_wrbuffer_ref_head == 0 && - ci->i_wr_ref == 0 && - ci->i_dirty_caps == 0 && - ci->i_flushing_caps == 0) { - ceph_put_snap_context(ci->i_head_snapc); - ci->i_head_snapc = NULL; - } } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { @@ -1452,7 +1468,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, wake_up_all(&ci->i_cap_wq); if (invalidate) ceph_queue_invalidate(inode); - if (drop) + if (dirty_dropped) iput(inode); return 0; } @@ -1705,11 +1721,11 @@ static bool drop_negative_children(struct dentry *dentry) */ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { - struct ceph_mds_session *session = arg; + int *remaining = arg; struct ceph_inode_info *ci = ceph_inode(inode); int used, wanted, oissued, mine; - if (session->s_trim_caps <= 0) + if (*remaining <= 0) return -1; spin_lock(&ci->i_ceph_lock); @@ -1746,7 +1762,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if (oissued) { /* we aren't the only cap.. 
just remove us */ __ceph_remove_cap(cap, true); - session->s_trim_caps--; + (*remaining)--; } else { struct dentry *dentry; /* try dropping referring dentries */ @@ -1758,7 +1774,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) d_prune_aliases(inode); count = atomic_read(&inode->i_count); if (count == 1) - session->s_trim_caps--; + (*remaining)--; dout("trim_caps_cb %p cap %p pruned, count now %d\n", inode, cap, count); } else { @@ -1784,12 +1800,12 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, dout("trim_caps mds%d start: %d / %d, trim %d\n", session->s_mds, session->s_nr_caps, max_caps, trim_caps); if (trim_caps > 0) { - session->s_trim_caps = trim_caps; - ceph_iterate_session_caps(session, trim_caps_cb, session); + int remaining = trim_caps; + + ceph_iterate_session_caps(session, trim_caps_cb, &remaining); dout("trim_caps mds%d done: %d / %d, trimmed %d\n", session->s_mds, session->s_nr_caps, max_caps, - trim_caps - session->s_trim_caps); - session->s_trim_caps = 0; + trim_caps - remaining); } ceph_flush_cap_releases(mdsc, session); @@ -3015,18 +3031,23 @@ static void handle_forward(struct ceph_mds_client *mdsc, pr_err("mdsc_handle_forward decode error err=%d\n", err); } -static int __decode_and_drop_session_metadata(void **p, void *end) +static int __decode_session_metadata(void **p, void *end, + bool *blacklisted) { /* map */ u32 n; + bool err_str; ceph_decode_32_safe(p, end, n, bad); while (n-- > 0) { u32 len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); + err_str = !strncmp(*p, "error_string", len); *p += len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); + if (err_str && strnstr(*p, "blacklisted", len)) + *blacklisted = true; *p += len; } return 0; @@ -3050,6 +3071,7 @@ static void handle_session(struct ceph_mds_session *session, u64 seq; unsigned long features = 0; int wake = 0; + bool blacklisted = false; /* decode */ ceph_decode_need(&p, end, sizeof(*h), bad); @@ -3062,7 +3084,7 @@ static void handle_session(struct ceph_mds_session *session, if (msg_version >= 3) { u32 len; /* version >= 2, metadata */ - if (__decode_and_drop_session_metadata(&p, end) < 0) + if (__decode_session_metadata(&p, end, &blacklisted) < 0) goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); @@ -3149,6 +3171,8 @@ static void handle_session(struct ceph_mds_session *session, session->s_state = CEPH_MDS_SESSION_REJECTED; cleanup_session_requests(mdsc, session); remove_session_caps(session); + if (blacklisted) + mdsc->fsc->blacklisted = true; wake = 2; /* for good measure */ break; @@ -3998,7 +4022,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc) mutex_unlock(&mdsc->mutex); } +static void maybe_recover_session(struct ceph_mds_client *mdsc) +{ + struct ceph_fs_client *fsc = mdsc->fsc; + if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) + return; + + if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) + return; + + if (!READ_ONCE(fsc->blacklisted)) + return; + + if (fsc->last_auto_reconnect && + time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) + return; + + pr_info("auto reconnect after blacklisted\n"); + fsc->last_auto_reconnect = jiffies; + ceph_force_reconnect(fsc->sb); +} /* * delayed work -- periodically trim expired leases, renew caps with mds @@ -4044,7 +4088,9 @@ static void delayed_work(struct work_struct *work) pr_info("mds%d hung\n", s->s_mds); } } - if (s->s_state < CEPH_MDS_SESSION_OPEN) { + if (s->s_state == CEPH_MDS_SESSION_NEW || + 
s->s_state == CEPH_MDS_SESSION_RESTARTING || + s->s_state == CEPH_MDS_SESSION_REJECTED) { /* this mds is failed or recovering, just wait */ ceph_put_mds_session(s); continue; @@ -4072,6 +4118,8 @@ static void delayed_work(struct work_struct *work) ceph_trim_snapid_map(mdsc); + maybe_recover_session(mdsc); + schedule_delayed(mdsc); } @@ -4355,7 +4403,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) session = __ceph_lookup_mds_session(mdsc, mds); if (!session) continue; + + if (session->s_state == CEPH_MDS_SESSION_REJECTED) + __unregister_session(mdsc, session); + __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); __close_session(mdsc, session); if (session->s_state == CEPH_MDS_SESSION_CLOSING) { @@ -4364,6 +4417,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); kick_requests(mdsc, mds); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index f7c8603484fe..5cd131b41d84 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -148,9 +148,9 @@ enum { CEPH_MDS_SESSION_OPENING = 2, CEPH_MDS_SESSION_OPEN = 3, CEPH_MDS_SESSION_HUNG = 4, - CEPH_MDS_SESSION_CLOSING = 5, - CEPH_MDS_SESSION_RESTARTING = 6, - CEPH_MDS_SESSION_RECONNECTING = 7, + CEPH_MDS_SESSION_RESTARTING = 5, + CEPH_MDS_SESSION_RECONNECTING = 6, + CEPH_MDS_SESSION_CLOSING = 7, CEPH_MDS_SESSION_REJECTED = 8, }; @@ -176,7 +176,7 @@ struct ceph_mds_session { spinlock_t s_cap_lock; struct list_head s_caps; /* all caps issued by this session */ struct ceph_cap *s_cap_iterator; - int s_nr_caps, s_trim_caps; + int s_nr_caps; int s_num_cap_releases; int s_cap_reconnect; int s_readonly; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 377fafc76f20..edfd643a8205 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -143,6 +143,7 @@ enum { Opt_snapdirname, Opt_mds_namespace, Opt_fscache_uniq, + Opt_recover_session, Opt_last_string, /* string args above */ Opt_dirstat, @@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = { /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, {Opt_mds_namespace, "mds_namespace=%s"}, + {Opt_recover_session, "recover_session=%s"}, {Opt_fscache_uniq, "fsc=%s"}, /* string args above */ {Opt_dirstat, "dirstat"}, @@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private) if (!fsopt->mds_namespace) return -ENOMEM; break; + case Opt_recover_session: + if (!strncmp(argstr[0].from, "no", + argstr[0].to - argstr[0].from)) { + fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; + } else if (!strncmp(argstr[0].from, "clean", + argstr[0].to - argstr[0].from)) { + fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; + } else { + return -EINVAL; + } + break; case Opt_fscache_uniq: kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = kstrndup(argstr[0].from, @@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->mds_namespace) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) + seq_show_option(m, "recover_session", "clean"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) @@ -664,6 +681,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->sb = NULL; fsc->mount_state = CEPH_MOUNT_MOUNTING; + fsc->filp_gen = 1; atomic_long_set(&fsc->writeback_count, 0); @@ -713,6 +731,7 @@ static void destroy_fs_client(struct 
ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); + ceph_mdsc_destroy(fsc); destroy_workqueue(fsc->inode_wq); destroy_workqueue(fsc->cap_wq); @@ -829,7 +848,7 @@ static void ceph_umount_begin(struct super_block *sb) fsc->mount_state = CEPH_MOUNT_SHUTDOWN; ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); ceph_mdsc_force_umount(fsc->mdsc); - return; + fsc->filp_gen++; // invalidate open files } static int ceph_remount(struct super_block *sb, int *flags, char *data) @@ -1089,7 +1108,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } if (ceph_sb_to_client(sb) != fsc) { - ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); fsc = ceph_sb_to_client(sb); dout("get_sb got existing client %p\n", fsc); @@ -1115,7 +1133,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, goto out_final; out: - ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); out_final: dout("ceph_mount fail %ld\n", PTR_ERR(res)); @@ -1139,8 +1156,6 @@ static void ceph_kill_sb(struct super_block *s) ceph_fscache_unregister_fs(fsc); - ceph_mdsc_destroy(fsc); - destroy_fs_client(fsc); free_anon_bdev(dev); } @@ -1154,6 +1169,33 @@ static struct file_system_type ceph_fs_type = { }; MODULE_ALIAS_FS("ceph"); +int ceph_force_reconnect(struct super_block *sb) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + int err = 0; + + ceph_umount_begin(sb); + + /* Make sure all page caches get invalidated. * see remove_session_caps_cb() */ + flush_workqueue(fsc->inode_wq); + + /* In case we were blacklisted. This also resets * all mon/osd connections */ + ceph_reset_client_addr(fsc->client); + + ceph_osdc_clear_abort_err(&fsc->client->osdc); + + fsc->blacklisted = false; + fsc->mount_state = CEPH_MOUNT_MOUNTED; + + if (sb->s_root) { + err = __ceph_do_getattr(d_inode(sb->s_root), NULL, + CEPH_STAT_CAP_INODE, true); + } + return err; +} + static int __init init_ceph(void) { int ret = init_caches(); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6b9f1ee7de85..f98d9247f9cb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -31,6 +32,7 @@ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) +#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reconnect (clean mode) after blacklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ @@ -101,6 +103,11 @@ struct ceph_fs_client { struct ceph_client *client; unsigned long mount_state; + + unsigned long last_auto_reconnect; + bool blacklisted; + + u32 filp_gen; loff_t max_file_size; struct ceph_mds_client *mdsc; @@ -395,6 +402,8 @@ struct ceph_inode_info { struct fscache_cookie *fscache; u32 i_fscache_gen; #endif + errseq_t i_meta_err; + struct inode vfs_inode; /* at end */ }; @@ -499,17 +508,16 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ #define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ -#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ -#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ -#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ -#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ -#define CEPH_I_CAP_DROPPED
(1 << 8) /* caps were forcibly dropped */ -#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ -#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ -#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ -#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ - +#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ +#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ +#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ +#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ +#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ +#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ +#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snaps */ +#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ +#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */ /* * Masks of ceph inode work. @@ -703,6 +711,10 @@ struct ceph_file_info { spinlock_t rw_contexts_lock; struct list_head rw_contexts; + + errseq_t meta_err; + u32 filp_gen; + atomic_t num_locks; }; struct ceph_dir_file_info { @@ -842,7 +854,8 @@ static inline int default_congestion_kb(void) } - +/* super.c */ +extern int ceph_force_reconnect(struct super_block *sb); /* snap.c */ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino); @@ -959,7 +972,10 @@ static inline bool ceph_security_xattr_wanted(struct inode *in) #ifdef CONFIG_CEPH_FS_SECURITY_LABEL extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx); -extern void ceph_security_invalidate_secctx(struct inode *inode); +static inline void ceph_security_invalidate_secctx(struct inode *inode) +{ + security_inode_invalidate_secctx(inode); +} #else static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx) @@ -1039,7 +1055,6 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); -extern int ceph_get_cap_mds(struct inode *inode); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, @@ -1058,9 +1073,9 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, struct inode *dir, int mds, int drop, int unless); -extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, +extern int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got, struct page **pinned_page); -extern int ceph_try_get_caps(struct ceph_inode_info *ci, +extern int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got); /* for counting open files by mode */ @@ -1071,7 +1086,7 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); extern const struct address_space_operations ceph_aops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); extern int ceph_uninline_data(struct file *filp, struct page *locked_page); -extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); +extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 939eab7aa219..cb18ee637cb7 100644 ---
a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -20,7 +20,8 @@ static int __remove_xattr(struct ceph_inode_info *ci, static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || + return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -892,7 +893,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, memcpy(value, xattr->val, xattr->val_len); if (current->journal_info && - !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN)) ci->i_ceph_flags |= CEPH_I_SEC_INITED; out: spin_unlock(&ci->i_ceph_lock); @@ -903,11 +905,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); bool len_only = (size == 0); u32 namelen; int err; - int i; spin_lock(&ci->i_ceph_lock); dout("listxattr %p ver=%lld index_ver=%lld\n", inode, @@ -936,33 +936,6 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) names = __copy_xattr_names(ci, names); size -= namelen; } - - - /* virtual xattr names, too */ - if (vxattrs) { - for (i = 0; vxattrs[i].name; i++) { - size_t this_len; - - if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN) - continue; - if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci)) - continue; - - this_len = strlen(vxattrs[i].name) + 1; - namelen += this_len; - if (len_only) - continue; - - if (this_len > size) { - err = -ERANGE; - goto out; - } - - memcpy(names, vxattrs[i].name, this_len); - names += this_len; - size -= this_len; - } - } err = namelen; out: spin_unlock(&ci->i_ceph_lock); @@ -1293,42 +1266,8 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, ceph_pagelist_release(pagelist); return err; } - -void ceph_security_invalidate_secctx(struct inode *inode) -{ - security_inode_invalidate_secctx(inode); -} - -static int ceph_xattr_set_security_label(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *key, const void *buf, - size_t buflen, int flags) -{ - if (security_ismaclabel(key)) { - const char *name = xattr_full_name(handler, key); - return __ceph_setxattr(inode, name, buf, buflen, flags); - } - return -EOPNOTSUPP; -} - -static int ceph_xattr_get_security_label(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *key, void *buf, size_t buflen) -{ - if (security_ismaclabel(key)) { - const char *name = xattr_full_name(handler, key); - return __ceph_getxattr(inode, name, buf, buflen); - } - return -EOPNOTSUPP; -} - -static const struct xattr_handler ceph_security_label_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .get = ceph_xattr_get_security_label, - .set = ceph_xattr_set_security_label, -}; -#endif -#endif +#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */ +#endif /* CONFIG_SECURITY */ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) { @@ -1351,9 +1290,6 @@ const struct xattr_handler *ceph_xattr_handlers[] = { #ifdef CONFIG_CEPH_FS_POSIX_ACL &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, -#endif -#ifdef CONFIG_CEPH_FS_SECURITY_LABEL - 
&ceph_security_label_handler, #endif &ceph_other_xattr_handler, NULL, diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 1bda2ab6745b..054acd9fd033 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -88,9 +88,7 @@ static int fat__get_entry(struct inode *dir, loff_t *pos, int err, offset; next: - if (*bh) - brelse(*bh); - + brelse(*bh); *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false); @@ -1100,8 +1098,11 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, err = -ENOMEM; goto error; } + /* Avoid race with userspace read via bdev */ + lock_buffer(bhs[n]); memset(bhs[n]->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bhs[n]); + unlock_buffer(bhs[n]); mark_buffer_dirty_inode(bhs[n], dir); n++; @@ -1158,6 +1159,8 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); de = (struct msdos_dir_entry *)bhs[0]->b_data; + /* Avoid race with userspace read via bdev */ + lock_buffer(bhs[0]); /* filling the new directory slots ("." and ".." entries) */ memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME); memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME); @@ -1180,6 +1183,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) de[0].size = de[1].size = 0; memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); set_buffer_uptodate(bhs[0]); + unlock_buffer(bhs[0]); mark_buffer_dirty_inode(bhs[0], dir); err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); @@ -1237,11 +1241,14 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, /* fill the directory entry */ copy = min(size, sb->s_blocksize); + /* Avoid race with userspace read via bdev */ + lock_buffer(bhs[n]); memcpy(bhs[n]->b_data, slots, copy); + set_buffer_uptodate(bhs[n]); + unlock_buffer(bhs[n]); + mark_buffer_dirty_inode(bhs[n], dir); slots += copy; size -= copy; - set_buffer_uptodate(bhs[n]); - mark_buffer_dirty_inode(bhs[n], dir); if (!size) break; n++; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 265983635f2b..3647c65a0f48 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -388,8 +388,11 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, err = -ENOMEM; goto error; } + /* Avoid race with userspace read via bdev */ + lock_buffer(c_bh); memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); set_buffer_uptodate(c_bh); + unlock_buffer(c_bh); mark_buffer_dirty_inode(c_bh, sbi->fat_inode); if (sb->s_flags & SB_SYNCHRONOUS) err = sync_dirty_buffer(c_bh); diff --git a/fs/fs_context.c b/fs/fs_context.c index 87c2c9687d90..138b5b4d621d 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -504,7 +504,6 @@ void put_fs_context(struct fs_context *fc) put_net(fc->net_ns); put_user_ns(fc->user_ns); put_cred(fc->cred); - kfree(fc->subtype); put_fc_log(fc); put_filesystem(fc->fs_type); kfree(fc->source); @@ -571,17 +570,6 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } - if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) && - strcmp(param->key, "subtype") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "VFS: Legacy: Non-string subtype"); - if (fc->subtype) - return invalf(fc, "VFS: Legacy: Multiple subtype"); - fc->subtype = param->string; - param->string = NULL; - return 0; - } - if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); @@ -738,8 +726,6 @@ void vfs_clean_context(struct fs_context *fc) 
fc->s_fs_info = NULL; fc->sb_flags = 0; security_free_mnt_opts(&fc->security); - kfree(fc->subtype); - fc->subtype = NULL; kfree(fc->source); fc->source = NULL; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index bab7a0db81dd..00015d851382 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -142,11 +142,10 @@ static int cuse_open(struct inode *inode, struct file *file) static int cuse_release(struct inode *inode, struct file *file) { - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - fuse_sync_release(fi, ff, file->f_flags); + fuse_sync_release(NULL, ff, file->f_flags); fuse_conn_put(fc); return 0; @@ -299,6 +298,14 @@ static void cuse_gendev_release(struct device *dev) kfree(dev); } +struct cuse_init_args { + struct fuse_args_pages ap; + struct cuse_init_in in; + struct cuse_init_out out; + struct page *page; + struct fuse_page_desc desc; +}; + /** * cuse_process_init_reply - finish initializing CUSE channel * @@ -306,21 +313,22 @@ static void cuse_gendev_release(struct device *dev) * required data structures for it. Please read the comment at the * top of this file for high level overview. */ -static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +static void cuse_process_init_reply(struct fuse_conn *fc, + struct fuse_args *args, int error) { + struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_args_pages *ap = &ia->ap; struct cuse_conn *cc = fc_to_cc(fc), *pos; - struct cuse_init_out *arg = req->out.args[0].value; - struct page *page = req->pages[0]; + struct cuse_init_out *arg = &ia->out; + struct page *page = ap->pages[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; dev_t devt; int rc, i; - if (req->out.h.error || - arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { + if (error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) goto err; - } fc->minor = arg->minor; fc->max_read = max_t(unsigned, arg->max_read, 4096); @@ -329,7 +337,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) /* parse init reply */ cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; - rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size, + rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size, &devinfo); if (rc) goto err; @@ -396,7 +404,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) dev_set_uevent_suppress(dev, 0); kobject_uevent(&dev->kobj, KOBJ_ADD); out: - kfree(arg); + kfree(ia); __free_page(page); return; @@ -415,55 +423,49 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) static int cuse_send_init(struct cuse_conn *cc) { int rc; - struct fuse_req *req; struct page *page; struct fuse_conn *fc = &cc->fc; - struct cuse_init_in *arg; - void *outarg; + struct cuse_init_args *ia; + struct fuse_args_pages *ap; BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); - req = fuse_get_req_for_background(fc, 1); - if (IS_ERR(req)) { - rc = PTR_ERR(req); - goto err; - } - rc = -ENOMEM; page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) - goto err_put_req; + goto err; - outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL); - if (!outarg) + ia = kzalloc(sizeof(*ia), GFP_KERNEL); + if (!ia) goto err_free_page; - arg = &req->misc.cuse_init_in; - arg->major = FUSE_KERNEL_VERSION; - arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->flags |= CUSE_UNRESTRICTED_IOCTL; - req->in.h.opcode = CUSE_INIT; - req->in.numargs 
= 1; - req->in.args[0].size = sizeof(struct cuse_init_in); - req->in.args[0].value = arg; - req->out.numargs = 2; - req->out.args[0].size = sizeof(struct cuse_init_out); - req->out.args[0].value = outarg; - req->out.args[1].size = CUSE_INIT_INFO_MAX; - req->out.argvar = 1; - req->out.argpages = 1; - req->pages[0] = page; - req->page_descs[0].length = req->out.args[1].size; - req->num_pages = 1; - req->end = cuse_process_init_reply; - fuse_request_send_background(fc, req); - - return 0; + ap = &ia->ap; + ia->in.major = FUSE_KERNEL_VERSION; + ia->in.minor = FUSE_KERNEL_MINOR_VERSION; + ia->in.flags |= CUSE_UNRESTRICTED_IOCTL; + ap->args.opcode = CUSE_INIT; + ap->args.in_numargs = 1; + ap->args.in_args[0].size = sizeof(ia->in); + ap->args.in_args[0].value = &ia->in; + ap->args.out_numargs = 2; + ap->args.out_args[0].size = sizeof(ia->out); + ap->args.out_args[0].value = &ia->out; + ap->args.out_args[1].size = CUSE_INIT_INFO_MAX; + ap->args.out_argvar = 1; + ap->args.out_pages = 1; + ap->num_pages = 1; + ap->pages = &ia->page; + ap->descs = &ia->desc; + ia->page = page; + ia->desc.length = ap->args.out_args[1].size; + ap->args.end = cuse_process_init_reply; + rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + if (rc) { + kfree(ia); err_free_page: - __free_page(page); -err_put_req: - fuse_put_request(fc, req); + __free_page(page); + } err: return rc; } @@ -504,9 +506,9 @@ static int cuse_channel_open(struct inode *inode, struct file *file) * Limit the cuse channel to requests that can * be represented in file->f_cred->user_ns. */ - fuse_conn_init(&cc->fc, file->f_cred->user_ns); + fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); - fud = fuse_dev_alloc(&cc->fc); + fud = fuse_dev_alloc_install(&cc->fc); if (!fud) { kfree(cc); return -ENOMEM; @@ -519,6 +521,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) rc = cuse_send_init(cc); if (rc) { fuse_dev_free(fud); + fuse_conn_put(&cc->fc); return rc; } file->private_data = fud; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ea8237513dfa..dadd617d826c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -40,107 +40,30 @@ static struct fuse_dev *fuse_get_dev(struct file *file) return READ_ONCE(file->private_data); } -static void fuse_request_init(struct fuse_req *req, struct page **pages, - struct fuse_page_desc *page_descs, - unsigned npages) +static void fuse_request_init(struct fuse_req *req) { INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); refcount_set(&req->count, 1); - req->pages = pages; - req->page_descs = page_descs; - req->max_pages = npages; __set_bit(FR_PENDING, &req->flags); } -static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc) -{ - struct page **pages; - - pages = kzalloc(npages * (sizeof(struct page *) + - sizeof(struct fuse_page_desc)), flags); - *desc = (void *) pages + npages * sizeof(struct page *); - - return pages; -} - -static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) +static struct fuse_req *fuse_request_alloc(gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); - if (req) { - struct page **pages = NULL; - struct fuse_page_desc *page_descs = NULL; + if (req) + fuse_request_init(req); - WARN_ON(npages > FUSE_MAX_MAX_PAGES); - if (npages > FUSE_REQ_INLINE_PAGES) { - pages = fuse_req_pages_alloc(npages, flags, - &page_descs); - if (!pages) { - kmem_cache_free(fuse_req_cachep, req); - return NULL; - } - } else if (npages) { 
- pages = req->inline_pages; - page_descs = req->inline_page_descs; - } - - fuse_request_init(req, pages, page_descs, npages); - } return req; } -struct fuse_req *fuse_request_alloc(unsigned npages) +static void fuse_request_free(struct fuse_req *req) { - return __fuse_request_alloc(npages, GFP_KERNEL); -} -EXPORT_SYMBOL_GPL(fuse_request_alloc); - -struct fuse_req *fuse_request_alloc_nofs(unsigned npages) -{ - return __fuse_request_alloc(npages, GFP_NOFS); -} - -static void fuse_req_pages_free(struct fuse_req *req) -{ - if (req->pages != req->inline_pages) - kfree(req->pages); -} - -bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, - gfp_t flags) -{ - struct page **pages; - struct fuse_page_desc *page_descs; - unsigned int npages = min_t(unsigned int, - max_t(unsigned int, req->max_pages * 2, - FUSE_DEFAULT_MAX_PAGES_PER_REQ), - fc->max_pages); - WARN_ON(npages <= req->max_pages); - - pages = fuse_req_pages_alloc(npages, flags, &page_descs); - if (!pages) - return false; - - memcpy(pages, req->pages, sizeof(struct page *) * req->max_pages); - memcpy(page_descs, req->page_descs, - sizeof(struct fuse_page_desc) * req->max_pages); - fuse_req_pages_free(req); - req->pages = pages; - req->page_descs = page_descs; - req->max_pages = npages; - - return true; -} - -void fuse_request_free(struct fuse_req *req) -{ - fuse_req_pages_free(req); kmem_cache_free(fuse_req_cachep, req); } -void __fuse_get_request(struct fuse_req *req) +static void __fuse_get_request(struct fuse_req *req) { refcount_inc(&req->count); } @@ -177,8 +100,9 @@ static void fuse_drop_waiting(struct fuse_conn *fc) } } -static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, - bool for_background) +static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); + +static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) { struct fuse_req *req; int err; @@ -201,7 +125,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, if (fc->conn_error) goto out; - req = fuse_request_alloc(npages); + req = fuse_request_alloc(GFP_KERNEL); err = -ENOMEM; if (!req) { if (for_background) @@ -229,101 +153,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, return ERR_PTR(err); } -struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) -{ - return __fuse_get_req(fc, npages, false); -} -EXPORT_SYMBOL_GPL(fuse_get_req); - -struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, - unsigned npages) -{ - return __fuse_get_req(fc, npages, true); -} -EXPORT_SYMBOL_GPL(fuse_get_req_for_background); - -/* - * Return request in fuse_file->reserved_req. However that may - * currently be in use. If that is the case, wait for it to become - * available. 
- */ -static struct fuse_req *get_reserved_req(struct fuse_conn *fc, - struct file *file) -{ - struct fuse_req *req = NULL; - struct fuse_inode *fi = get_fuse_inode(file_inode(file)); - struct fuse_file *ff = file->private_data; - - do { - wait_event(fc->reserved_req_waitq, ff->reserved_req); - spin_lock(&fi->lock); - if (ff->reserved_req) { - req = ff->reserved_req; - ff->reserved_req = NULL; - req->stolen_file = get_file(file); - } - spin_unlock(&fi->lock); - } while (!req); - - return req; -} - -/* - * Put stolen request back into fuse_file->reserved_req - */ -static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) -{ - struct file *file = req->stolen_file; - struct fuse_inode *fi = get_fuse_inode(file_inode(file)); - struct fuse_file *ff = file->private_data; - - WARN_ON(req->max_pages); - spin_lock(&fi->lock); - memset(req, 0, sizeof(*req)); - fuse_request_init(req, NULL, NULL, 0); - BUG_ON(ff->reserved_req); - ff->reserved_req = req; - wake_up_all(&fc->reserved_req_waitq); - spin_unlock(&fi->lock); - fput(file); -} - -/* - * Gets a requests for a file operation, always succeeds - * - * This is used for sending the FLUSH request, which must get to - * userspace, due to POSIX locks which may need to be unlocked. - * - * If allocation fails due to OOM, use the reserved request in - * fuse_file. - * - * This is very unlikely to deadlock accidentally, since the - * filesystem should not have it's own file open. If deadlock is - * intentional, it can still be broken by "aborting" the filesystem. - */ -struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, - struct file *file) -{ - struct fuse_req *req; - - atomic_inc(&fc->num_waiting); - wait_event(fc->blocked_waitq, fc->initialized); - /* Matches smp_wmb() in fuse_set_initialized() */ - smp_rmb(); - req = fuse_request_alloc(0); - if (!req) - req = get_reserved_req(fc, file); - - req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); - req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); - req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); - - __set_bit(FR_WAITING, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); - return req; -} - -void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (refcount_dec_and_test(&req->count)) { if (test_bit(FR_BACKGROUND, &req->flags)) { @@ -342,15 +172,11 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) fuse_drop_waiting(fc); } - if (req->stolen_file) - put_reserved_req(fc, req); - else - fuse_request_free(req); + fuse_request_free(req); } } -EXPORT_SYMBOL_GPL(fuse_put_request); -static unsigned len_args(unsigned numargs, struct fuse_arg *args) +unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) { unsigned nbytes = 0; unsigned i; @@ -360,25 +186,47 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) return nbytes; } +EXPORT_SYMBOL_GPL(fuse_len_args); -static u64 fuse_get_unique(struct fuse_iqueue *fiq) +u64 fuse_get_unique(struct fuse_iqueue *fiq) { fiq->reqctr += FUSE_REQ_ID_STEP; return fiq->reqctr; } +EXPORT_SYMBOL_GPL(fuse_get_unique); static unsigned int fuse_req_hash(u64 unique) { return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } -static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) +/** + * A new request is available, wake fiq->waitq + */ +static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + wake_up(&fiq->waitq); + 
kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + spin_unlock(&fiq->lock); +} + +const struct fuse_iqueue_ops fuse_dev_fiq_ops = { + .wake_forget_and_unlock = fuse_dev_wake_and_unlock, + .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, + .wake_pending_and_unlock = fuse_dev_wake_and_unlock, +}; +EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); + +static void queue_request_and_unlock(struct fuse_iqueue *fiq, + struct fuse_req *req) +__releases(fiq->lock) { req->in.h.len = sizeof(struct fuse_in_header) + - len_args(req->in.numargs, (struct fuse_arg *) req->in.args); + fuse_len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); list_add_tail(&req->list, &fiq->pending); - wake_up_locked(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + fiq->ops->wake_pending_and_unlock(fiq); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, @@ -389,16 +237,15 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (fiq->connected) { fiq->forget_list_tail->next = forget; fiq->forget_list_tail = forget; - wake_up_locked(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + fiq->ops->wake_forget_and_unlock(fiq); } else { kfree(forget); + spin_unlock(&fiq->lock); } - spin_unlock(&fiq->waitq.lock); } static void flush_bg_queue(struct fuse_conn *fc) @@ -412,10 +259,9 @@ static void flush_bg_queue(struct fuse_conn *fc) req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); req->in.h.unique = fuse_get_unique(fiq); - queue_request(fiq, req); - spin_unlock(&fiq->waitq.lock); + queue_request_and_unlock(fiq, req); } } @@ -427,9 +273,10 @@ static void flush_bg_queue(struct fuse_conn *fc) * the 'end' callback is called if given, else the reference to the * request is released */ -static void request_end(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) { struct fuse_iqueue *fiq = &fc->iq; + bool async = req->args->end; if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; @@ -439,9 +286,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) * smp_mb() from queue_interrupt(). 
*/ if (!list_empty(&req->intr_entry)) { - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); list_del_init(&req->intr_entry); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); } WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); @@ -475,18 +322,19 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); } - if (req->end) - req->end(fc, req); + if (async) + req->args->end(fc, req->args, req->out.h.error); put_request: fuse_put_request(fc, req); } +EXPORT_SYMBOL_GPL(fuse_request_end); static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return -EINVAL; } @@ -499,13 +347,13 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) smp_mb(); if (test_bit(FR_FINISHED, &req->flags)) { list_del_init(&req->intr_entry); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return 0; } - wake_up_locked(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + fiq->ops->wake_interrupt_and_unlock(fiq); + } else { + spin_unlock(&fiq->lock); } - spin_unlock(&fiq->waitq.lock); return 0; } @@ -535,16 +383,16 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) if (!err) return; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); /* Request is not yet in userspace, bail out */ if (test_bit(FR_PENDING, &req->flags)) { list_del(&req->list); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); __fuse_put_request(req); req->out.h.error = -EINTR; return; } - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); } /* @@ -559,101 +407,110 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) struct fuse_iqueue *fiq = &fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (!fiq->connected) { - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); req->out.h.error = -ENOTCONN; } else { req->in.h.unique = fuse_get_unique(fiq); - queue_request(fiq, req); /* acquire extra reference, since request is still needed - after request_end() */ + after fuse_request_end() */ __fuse_get_request(req); - spin_unlock(&fiq->waitq.lock); + queue_request_and_unlock(fiq, req); request_wait_answer(fc, req); - /* Pairs with smp_wmb() in request_end() */ + /* Pairs with smp_wmb() in fuse_request_end() */ smp_rmb(); } } -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) -{ - __set_bit(FR_ISREPLY, &req->flags); - if (!test_bit(FR_WAITING, &req->flags)) { - __set_bit(FR_WAITING, &req->flags); - atomic_inc(&fc->num_waiting); - } - __fuse_request_send(fc, req); -} -EXPORT_SYMBOL_GPL(fuse_request_send); - static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) { - if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS) - args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE; + if (fc->minor < 4 && args->opcode == FUSE_STATFS) + args->out_args[0].size = FUSE_COMPAT_STATFS_SIZE; if (fc->minor < 9) { - switch (args->in.h.opcode) { + switch (args->opcode) { case FUSE_LOOKUP: case FUSE_CREATE: case FUSE_MKNOD: case FUSE_MKDIR: case FUSE_SYMLINK: case FUSE_LINK: - args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + args->out_args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; break; case FUSE_GETATTR: case FUSE_SETATTR: - 
args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + args->out_args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; break; } } if (fc->minor < 12) { - switch (args->in.h.opcode) { + switch (args->opcode) { case FUSE_CREATE: - args->in.args[0].size = sizeof(struct fuse_open_in); + args->in_args[0].size = sizeof(struct fuse_open_in); break; case FUSE_MKNOD: - args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; + args->in_args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; break; } } } +static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) +{ + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); +} + +static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) +{ + req->in.h.opcode = args->opcode; + req->in.h.nodeid = args->nodeid; + req->args = args; +} + ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) { struct fuse_req *req; ssize_t ret; - req = fuse_get_req(fc, 0); - if (IS_ERR(req)) - return PTR_ERR(req); + if (args->force) { + atomic_inc(&fc->num_waiting); + req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL); + + if (!args->nocreds) + fuse_force_creds(fc, req); + + __set_bit(FR_WAITING, &req->flags); + __set_bit(FR_FORCE, &req->flags); + } else { + WARN_ON(args->nocreds); + req = fuse_get_req(fc, false); + if (IS_ERR(req)) + return PTR_ERR(req); + } /* Needs to be done after fuse_get_req() so that fc->minor is valid */ fuse_adjust_compat(fc, args); + fuse_args_to_req(req, args); - req->in.h.opcode = args->in.h.opcode; - req->in.h.nodeid = args->in.h.nodeid; - req->in.numargs = args->in.numargs; - memcpy(req->in.args, args->in.args, - args->in.numargs * sizeof(struct fuse_in_arg)); - req->out.argvar = args->out.argvar; - req->out.numargs = args->out.numargs; - memcpy(req->out.args, args->out.args, - args->out.numargs * sizeof(struct fuse_arg)); - fuse_request_send(fc, req); + if (!args->noreply) + __set_bit(FR_ISREPLY, &req->flags); + __fuse_request_send(fc, req); ret = req->out.h.error; - if (!ret && args->out.argvar) { - BUG_ON(args->out.numargs != 1); - ret = req->out.args[0].size; + if (!ret && args->out_argvar) { + BUG_ON(args->out_numargs == 0); + ret = args->out_args[args->out_numargs - 1].size; } fuse_put_request(fc, req); return ret; } -bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) +static bool fuse_request_queue_background(struct fuse_conn *fc, + struct fuse_req *req) { bool queued = false; @@ -681,56 +538,63 @@ bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) return queued; } -void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) +int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, + gfp_t gfp_flags) { - WARN_ON(!req->end); - if (!fuse_request_queue_background(fc, req)) { - req->out.h.error = -ENOTCONN; - req->end(fc, req); - fuse_put_request(fc, req); - } -} -EXPORT_SYMBOL_GPL(fuse_request_send_background); + struct fuse_req *req; -static int fuse_request_send_notify_reply(struct fuse_conn *fc, - struct fuse_req *req, u64 unique) + if (args->force) { + WARN_ON(!args->nocreds); + req = fuse_request_alloc(gfp_flags); + if (!req) + return -ENOMEM; + __set_bit(FR_BACKGROUND, &req->flags); + } else { + WARN_ON(args->nocreds); + req = fuse_get_req(fc, true); + if (IS_ERR(req)) + return PTR_ERR(req); + } + + fuse_args_to_req(req, args); + + if (!fuse_request_queue_background(fc, req)) { + 
fuse_put_request(fc, req); + return -ENOTCONN; + } + + return 0; +} +EXPORT_SYMBOL_GPL(fuse_simple_background); + +static int fuse_simple_notify_reply(struct fuse_conn *fc, + struct fuse_args *args, u64 unique) { - int err = -ENODEV; + struct fuse_req *req; struct fuse_iqueue *fiq = &fc->iq; + int err = 0; + + req = fuse_get_req(fc, false); + if (IS_ERR(req)) + return PTR_ERR(req); __clear_bit(FR_ISREPLY, &req->flags); req->in.h.unique = unique; - spin_lock(&fiq->waitq.lock); + + fuse_args_to_req(req, args); + + spin_lock(&fiq->lock); if (fiq->connected) { - queue_request(fiq, req); - err = 0; + queue_request_and_unlock(fiq, req); + } else { + err = -ENODEV; + spin_unlock(&fiq->lock); + fuse_put_request(fc, req); } - spin_unlock(&fiq->waitq.lock); return err; } -void fuse_force_forget(struct file *file, u64 nodeid) -{ - struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - struct fuse_forget_in inarg; - - memset(&inarg, 0, sizeof(inarg)); - inarg.nlookup = 1; - req = fuse_get_req_nofail_nopages(fc, file); - req->in.h.opcode = FUSE_FORGET; - req->in.h.nodeid = nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - __clear_bit(FR_ISREPLY, &req->flags); - __fuse_request_send(fc, req); - /* ignore errors */ - fuse_put_request(fc, req); -} - /* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already @@ -1084,14 +948,15 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, { unsigned i; struct fuse_req *req = cs->req; + struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); - for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { + + for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) { int err; - unsigned offset = req->page_descs[i].offset; - unsigned count = min(nbytes, req->page_descs[i].length); + unsigned int offset = ap->descs[i].offset; + unsigned int count = min(nbytes, ap->descs[i].length); - err = fuse_copy_page(cs, &req->pages[i], offset, count, - zeroing); + err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing); if (err) return err; @@ -1149,12 +1014,12 @@ static int request_pending(struct fuse_iqueue *fiq) * Unlike other requests this is assembled on demand, without a need * to allocate a separate fuse_req structure. * - * Called with fiq->waitq.lock held, releases it + * Called with fiq->lock held, releases it */ static int fuse_read_interrupt(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; @@ -1169,7 +1034,7 @@ __releases(fiq->waitq.lock) ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT); arg.unique = req->in.h.unique; - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); if (nbytes < reqsize) return -EINVAL; @@ -1181,9 +1046,9 @@ __releases(fiq->waitq.lock) return err ? 
err : reqsize; } -static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, - unsigned max, - unsigned *countp) +struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp) { struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; @@ -1202,14 +1067,15 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, return head; } +EXPORT_SYMBOL(fuse_dequeue_forget); static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { int err; - struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL); + struct fuse_forget_link *forget = fuse_dequeue_forget(fiq, 1, NULL); struct fuse_forget_in arg = { .nlookup = forget->forget_one.nlookup, }; @@ -1220,7 +1086,7 @@ __releases(fiq->waitq.lock) .len = sizeof(ih) + sizeof(arg), }; - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); kfree(forget); if (nbytes < ih.len) return -EINVAL; @@ -1238,7 +1104,7 @@ __releases(fiq->waitq.lock) static int fuse_read_batch_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { int err; unsigned max_forgets; @@ -1252,13 +1118,13 @@ __releases(fiq->waitq.lock) }; if (nbytes < ih.len) { - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return -EINVAL; } max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); - head = dequeue_forget(fiq, max_forgets, &count); - spin_unlock(&fiq->waitq.lock); + head = fuse_dequeue_forget(fiq, max_forgets, &count); + spin_unlock(&fiq->lock); arg.count = count; ih.len += count * sizeof(struct fuse_forget_one); @@ -1288,7 +1154,7 @@ __releases(fiq->waitq.lock) static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL) return fuse_read_single_forget(fiq, cs, nbytes); @@ -1302,7 +1168,7 @@ __releases(fiq->waitq.lock) * the pending list and copies request data to userspace buffer. If * no reply is needed (FORGET) or request has been aborted or there * was an error during the copying then it's finished by calling - * request_end(). Otherwise add it to the processing list, and set + * fuse_request_end(). Otherwise add it to the processing list, and set * the 'sent' flag. */ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, @@ -1313,21 +1179,42 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_iqueue *fiq = &fc->iq; struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; - struct fuse_in *in; + struct fuse_args *args; unsigned reqsize; unsigned int hash; - restart: - spin_lock(&fiq->waitq.lock); - err = -EAGAIN; - if ((file->f_flags & O_NONBLOCK) && fiq->connected && - !request_pending(fiq)) - goto err_unlock; + /* + * Require sane minimum read buffer - that has capacity for fixed part + * of any request header + negotiated max_write room for data. + * + * Historically libfuse reserves 4K for fixed header room, but e.g. + * GlusterFS reserves only 80 bytes + * + * = `sizeof(fuse_in_header) + sizeof(fuse_write_in)` + * + * which is the absolute minimum any sane filesystem should be using + * for header room. 
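+	 *
+	 * For example, with a negotiated max_write of 128K the bound
+	 * below works out to 40 + 40 + 131072 = 131152 bytes
+	 * (fuse_in_header and fuse_write_in are 40 bytes each in the
+	 * ABI), so a smaller read buffer is rejected with -EINVAL even
+	 * if it comfortably exceeds FUSE_MIN_READ_BUFFER (8192).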
+ */ + if (nbytes < max_t(size_t, FUSE_MIN_READ_BUFFER, + sizeof(struct fuse_in_header) + + sizeof(struct fuse_write_in) + + fc->max_write)) + return -EINVAL; - err = wait_event_interruptible_exclusive_locked(fiq->waitq, + restart: + for (;;) { + spin_lock(&fiq->lock); + if (!fiq->connected || request_pending(fiq)) + break; + spin_unlock(&fiq->lock); + + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + err = wait_event_interruptible_exclusive(fiq->waitq, !fiq->connected || request_pending(fiq)); - if (err) - goto err_unlock; + if (err) + return err; + } if (!fiq->connected) { err = fc->aborted ? -ECONNABORTED : -ENODEV; @@ -1351,28 +1238,28 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, req = list_entry(fiq->pending.next, struct fuse_req, list); clear_bit(FR_PENDING, &req->flags); list_del_init(&req->list); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); - in = &req->in; - reqsize = in->h.len; + args = req->args; + reqsize = req->in.h.len; /* If request is too large, reply with an error and restart the read */ if (nbytes < reqsize) { req->out.h.error = -EIO; /* SETXATTR is special, since it may contain too large data */ - if (in->h.opcode == FUSE_SETXATTR) + if (args->opcode == FUSE_SETXATTR) req->out.h.error = -E2BIG; - request_end(fc, req); + fuse_request_end(fc, req); goto restart; } spin_lock(&fpq->lock); list_add(&req->list, &fpq->io); spin_unlock(&fpq->lock); cs->req = req; - err = fuse_copy_one(cs, &in->h, sizeof(in->h)); + err = fuse_copy_one(cs, &req->in.h, sizeof(req->in.h)); if (!err) - err = fuse_copy_args(cs, in->numargs, in->argpages, - (struct fuse_arg *) in->args, 0); + err = fuse_copy_args(cs, args->in_numargs, args->in_pages, + (struct fuse_arg *) args->in_args, 0); fuse_copy_finish(cs); spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); @@ -1405,11 +1292,11 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, if (!test_bit(FR_PRIVATE, &req->flags)) list_del_init(&req->list); spin_unlock(&fpq->lock); - request_end(fc, req); + fuse_request_end(fc, req); return err; err_unlock: - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return err; } @@ -1728,9 +1615,19 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, return err; } -static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) +struct fuse_retrieve_args { + struct fuse_args_pages ap; + struct fuse_notify_retrieve_in inarg; +}; + +static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args, + int error) { - release_pages(req->pages, req->num_pages); + struct fuse_retrieve_args *ra = + container_of(args, typeof(*ra), ap.args); + + release_pages(ra->ap.pages, ra->ap.num_pages); + kfree(ra); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, @@ -1738,13 +1635,16 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, { int err; struct address_space *mapping = inode->i_mapping; - struct fuse_req *req; pgoff_t index; loff_t file_size; unsigned int num; unsigned int offset; size_t total_len = 0; unsigned int num_pages; + struct fuse_retrieve_args *ra; + size_t args_size = sizeof(*ra); + struct fuse_args_pages *ap; + struct fuse_args *args; offset = outarg->offset & ~PAGE_MASK; file_size = i_size_read(inode); @@ -1758,19 +1658,26 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); - req = fuse_get_req(fc, num_pages); - if (IS_ERR(req)) - 
return PTR_ERR(req); + args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0])); - req->in.h.opcode = FUSE_NOTIFY_REPLY; - req->in.h.nodeid = outarg->nodeid; - req->in.numargs = 2; - req->in.argpages = 1; - req->end = fuse_retrieve_end; + ra = kzalloc(args_size, GFP_KERNEL); + if (!ra) + return -ENOMEM; + + ap = &ra->ap; + ap->pages = (void *) (ra + 1); + ap->descs = (void *) (ap->pages + num_pages); + + args = &ap->args; + args->nodeid = outarg->nodeid; + args->opcode = FUSE_NOTIFY_REPLY; + args->in_numargs = 2; + args->in_pages = true; + args->end = fuse_retrieve_end; index = outarg->offset >> PAGE_SHIFT; - while (num && req->num_pages < num_pages) { + while (num && ap->num_pages < num_pages) { struct page *page; unsigned int this_num; @@ -1779,27 +1686,25 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, break; this_num = min_t(unsigned, num, PAGE_SIZE - offset); - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].offset = offset; - req->page_descs[req->num_pages].length = this_num; - req->num_pages++; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].offset = offset; + ap->descs[ap->num_pages].length = this_num; + ap->num_pages++; offset = 0; num -= this_num; total_len += this_num; index++; } - req->misc.retrieve_in.offset = outarg->offset; - req->misc.retrieve_in.size = total_len; - req->in.args[0].size = sizeof(req->misc.retrieve_in); - req->in.args[0].value = &req->misc.retrieve_in; - req->in.args[1].size = total_len; + ra->inarg.offset = outarg->offset; + ra->inarg.size = total_len; + args->in_args[0].size = sizeof(ra->inarg); + args->in_args[0].value = &ra->inarg; + args->in_args[1].size = total_len; - err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); - if (err) { - fuse_retrieve_end(fc, req); - fuse_put_request(fc, req); - } + err = fuse_simple_notify_reply(fc, args, outarg->notify_unique); + if (err) + fuse_retrieve_end(fc, args, err); return err; } @@ -1885,27 +1790,25 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) return NULL; } -static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, +static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, unsigned nbytes) { unsigned reqsize = sizeof(struct fuse_out_header); - if (out->h.error) - return nbytes != reqsize ? -EINVAL : 0; + reqsize += fuse_len_args(args->out_numargs, args->out_args); - reqsize += len_args(out->numargs, out->args); - - if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) + if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar)) return -EINVAL; else if (reqsize > nbytes) { - struct fuse_arg *lastarg = &out->args[out->numargs-1]; + struct fuse_arg *lastarg = &args->out_args[args->out_numargs-1]; unsigned diffsize = reqsize - nbytes; + if (diffsize > lastarg->size) return -EINVAL; lastarg->size -= diffsize; } - return fuse_copy_args(cs, out->numargs, out->argpages, out->args, - out->page_zeroing); + return fuse_copy_args(cs, args->out_numargs, args->out_pages, + args->out_args, args->page_zeroing); } /* @@ -1913,7 +1816,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, * the write buffer. The request is then searched on the processing * list by the unique ID found in the header. If found, then remove * it from the list and copy the rest of the buffer to the request. - * The request is finished by calling request_end() + * The request is finished by calling fuse_request_end(). 
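+ * Note that an error reply (fuse_out_header.error != 0) consists of
+ * the output header alone, so copy_out_args() is only called for
+ * successful replies; the oh.error check below enforces this.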
*/ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_copy_state *cs, size_t nbytes) @@ -1984,10 +1887,13 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, set_bit(FR_LOCKED, &req->flags); spin_unlock(&fpq->lock); cs->req = req; - if (!req->out.page_replace) + if (!req->args->page_replace) cs->move_pages = 0; - err = copy_out_args(cs, &req->out, nbytes); + if (oh.error) + err = nbytes != sizeof(oh) ? -EINVAL : 0; + else + err = copy_out_args(cs, req->args, nbytes); fuse_copy_finish(cs); spin_lock(&fpq->lock); @@ -2000,7 +1906,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, list_del_init(&req->list); spin_unlock(&fpq->lock); - request_end(fc, req); + fuse_request_end(fc, req); out: return err ? err : nbytes; @@ -2121,12 +2027,12 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) fiq = &fud->fc->iq; poll_wait(file, &fiq->waitq, wait); - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (!fiq->connected) mask = EPOLLERR; else if (request_pending(fiq)) mask |= EPOLLIN | EPOLLRDNORM; - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return mask; } @@ -2140,7 +2046,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req->out.h.error = -ECONNABORTED; clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - request_end(fc, req); + fuse_request_end(fc, req); } } @@ -2221,15 +2127,15 @@ void fuse_abort_conn(struct fuse_conn *fc) flush_bg_queue(fc); spin_unlock(&fc->bg_lock); - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); fiq->connected = 0; list_for_each_entry(req, &fiq->pending, list) clear_bit(FR_PENDING, &req->flags); list_splice_tail_init(&fiq->pending, &to_end); while (forget_pending(fiq)) - kfree(dequeue_forget(fiq, 1, NULL)); - wake_up_all_locked(&fiq->waitq); - spin_unlock(&fiq->waitq.lock); + kfree(fuse_dequeue_forget(fiq, 1, NULL)); + wake_up_all(&fiq->waitq); + spin_unlock(&fiq->lock); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); end_polls(fc); wake_up_all(&fc->blocked_waitq); @@ -2296,7 +2202,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) if (new->private_data) return -EINVAL; - fud = fuse_dev_alloc(fc); + fud = fuse_dev_alloc_install(fc); if (!fud) return -ENOMEM; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index dd0f64f7bc06..d572c900bb0f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -24,20 +24,54 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } +#if BITS_PER_LONG >= 64 +static inline void __fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_fsdata = (void *) time; +} + +static inline u64 fuse_dentry_time(const struct dentry *entry) +{ + return (u64)entry->d_fsdata; +} + +#else union fuse_dentry { u64 time; struct rcu_head rcu; }; -static inline void fuse_dentry_settime(struct dentry *entry, u64 time) +static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time) { - ((union fuse_dentry *) entry->d_fsdata)->time = time; + ((union fuse_dentry *) dentry->d_fsdata)->time = time; } -static inline u64 fuse_dentry_time(struct dentry *entry) +static inline u64 fuse_dentry_time(const struct dentry *entry) { return ((union fuse_dentry *) entry->d_fsdata)->time; } +#endif + +static void fuse_dentry_settime(struct dentry *dentry, u64 time) +{ + struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); + bool delete = !time && fc->delete_stale; + /* + * Mess with DCACHE_OP_DELETE because dput() will be faster without it. 
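+	 * A zero timeout with fc->delete_stale set marks the dentry as
+	 * stale: DCACHE_OP_DELETE is set so that fuse_dentry_delete()
+	 * can tell dput() to discard the dentry immediately, while live
+	 * dentries get the flag cleared instead.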
+ * Don't care about races, either way it's just an optimization + */ + if ((!delete && (dentry->d_flags & DCACHE_OP_DELETE)) || + (delete && !(dentry->d_flags & DCACHE_OP_DELETE))) { + spin_lock(&dentry->d_lock); + if (!delete) + dentry->d_flags &= ~DCACHE_OP_DELETE; + else + dentry->d_flags |= DCACHE_OP_DELETE; + spin_unlock(&dentry->d_lock); + } + + __fuse_dentry_settime(dentry, time); +} /* * FUSE caches dentries and attributes with separate timeout. The @@ -139,14 +173,14 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, struct fuse_entry_out *outarg) { memset(outarg, 0, sizeof(struct fuse_entry_out)); - args->in.h.opcode = FUSE_LOOKUP; - args->in.h.nodeid = nodeid; - args->in.numargs = 1; - args->in.args[0].size = name->len + 1; - args->in.args[0].value = name->name; - args->out.numargs = 1; - args->out.args[0].size = sizeof(struct fuse_entry_out); - args->out.args[0].value = outarg; + args->opcode = FUSE_LOOKUP; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = name->len + 1; + args->in_args[0].value = name->name; + args->out_numargs = 1; + args->out_args[0].size = sizeof(struct fuse_entry_out); + args->out_args[0].value = outarg; } /* @@ -242,9 +276,11 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto out; } +#if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { - dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); return dentry->d_fsdata ? 0 : -ENOMEM; } @@ -254,16 +290,27 @@ static void fuse_dentry_release(struct dentry *dentry) kfree_rcu(fd, rcu); } +#endif + +static int fuse_dentry_delete(const struct dentry *dentry) +{ + return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); +} const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, + .d_delete = fuse_dentry_delete, +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; const struct dentry_operations fuse_root_dentry_operations = { +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; int fuse_valid_type(int m) @@ -410,18 +457,18 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.flags = flags; inarg.mode = mode; inarg.umask = current_umask(); - args.in.h.opcode = FUSE_CREATE; - args.in.h.nodeid = get_node_id(dir); - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = entry->d_name.len + 1; - args.in.args[1].value = entry->d_name.name; - args.out.numargs = 2; - args.out.args[0].size = sizeof(outentry); - args.out.args[0].value = &outentry; - args.out.args[1].size = sizeof(outopen); - args.out.args[1].value = &outopen; + args.opcode = FUSE_CREATE; + args.nodeid = get_node_id(dir); + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + args.out_numargs = 2; + args.out_args[0].size = sizeof(outentry); + args.out_args[0].value = &outentry; + args.out_args[1].size = sizeof(outopen); + args.out_args[1].value = &outopen; err = fuse_simple_request(fc, &args); if (err) goto out_free_ff; @@ -526,10 +573,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, return -ENOMEM; memset(&outarg, 0, sizeof(outarg)); - args->in.h.nodeid = 
get_node_id(dir); - args->out.numargs = 1; - args->out.args[0].size = sizeof(outarg); - args->out.args[0].value = &outarg; + args->nodeid = get_node_id(dir); + args->out_numargs = 1; + args->out_args[0].size = sizeof(outarg); + args->out_args[0].value = &outarg; err = fuse_simple_request(fc, args); if (err) goto out_put_forget_req; @@ -582,12 +629,12 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, inarg.mode = mode; inarg.rdev = new_encode_dev(rdev); inarg.umask = current_umask(); - args.in.h.opcode = FUSE_MKNOD; - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = entry->d_name.len + 1; - args.in.args[1].value = entry->d_name.name; + args.opcode = FUSE_MKNOD; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; return create_new_entry(fc, &args, dir, entry, mode); } @@ -609,12 +656,12 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.umask = current_umask(); - args.in.h.opcode = FUSE_MKDIR; - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = entry->d_name.len + 1; - args.in.args[1].value = entry->d_name.name; + args.opcode = FUSE_MKDIR; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; return create_new_entry(fc, &args, dir, entry, S_IFDIR); } @@ -625,12 +672,12 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, unsigned len = strlen(link) + 1; FUSE_ARGS(args); - args.in.h.opcode = FUSE_SYMLINK; - args.in.numargs = 2; - args.in.args[0].size = entry->d_name.len + 1; - args.in.args[0].value = entry->d_name.name; - args.in.args[1].size = len; - args.in.args[1].value = link; + args.opcode = FUSE_SYMLINK; + args.in_numargs = 2; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; + args.in_args[1].size = len; + args.in_args[1].value = link; return create_new_entry(fc, &args, dir, entry, S_IFLNK); } @@ -648,11 +695,11 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); - args.in.h.opcode = FUSE_UNLINK; - args.in.h.nodeid = get_node_id(dir); - args.in.numargs = 1; - args.in.args[0].size = entry->d_name.len + 1; - args.in.args[0].value = entry->d_name.name; + args.opcode = FUSE_UNLINK; + args.nodeid = get_node_id(dir); + args.in_numargs = 1; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { struct inode *inode = d_inode(entry); @@ -684,11 +731,11 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); - args.in.h.opcode = FUSE_RMDIR; - args.in.h.nodeid = get_node_id(dir); - args.in.numargs = 1; - args.in.args[0].size = entry->d_name.len + 1; - args.in.args[0].value = entry->d_name.name; + args.opcode = FUSE_RMDIR; + args.nodeid = get_node_id(dir); + args.in_numargs = 1; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { clear_nlink(d_inode(entry)); @@ -711,15 +758,15 @@ static int 
fuse_rename_common(struct inode *olddir, struct dentry *oldent, memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); inarg.flags = flags; - args.in.h.opcode = opcode; - args.in.h.nodeid = get_node_id(olddir); - args.in.numargs = 3; - args.in.args[0].size = argsize; - args.in.args[0].value = &inarg; - args.in.args[1].size = oldent->d_name.len + 1; - args.in.args[1].value = oldent->d_name.name; - args.in.args[2].size = newent->d_name.len + 1; - args.in.args[2].value = newent->d_name.name; + args.opcode = opcode; + args.nodeid = get_node_id(olddir); + args.in_numargs = 3; + args.in_args[0].size = argsize; + args.in_args[0].value = &inarg; + args.in_args[1].size = oldent->d_name.len + 1; + args.in_args[1].value = oldent->d_name.name; + args.in_args[2].size = newent->d_name.len + 1; + args.in_args[2].value = newent->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { /* ctime changes */ @@ -796,12 +843,12 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); - args.in.h.opcode = FUSE_LINK; - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = newent->d_name.len + 1; - args.in.args[1].value = newent->d_name.name; + args.opcode = FUSE_LINK; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = newent->d_name.len + 1; + args.in_args[1].value = newent->d_name.name; err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); /* Contrary to "normal" filesystems it can happen that link makes two "logical" inodes point to the same "physical" @@ -884,14 +931,14 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } - args.in.h.opcode = FUSE_GETATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_GETATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) { if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { @@ -1056,11 +1103,11 @@ static int fuse_access(struct inode *inode, int mask) memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); - args.in.h.opcode = FUSE_ACCESS; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = FUSE_ACCESS; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_access = 1; @@ -1152,38 +1199,36 @@ static int fuse_permission(struct inode *inode, int mask) static int fuse_readlink_page(struct inode *inode, struct page *page) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - int err; + struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_args_pages ap = { + .num_pages = 1, + .pages = &page, + .descs = &desc, + }; + char *link; + ssize_t res; - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return 
PTR_ERR(req); + ap.args.opcode = FUSE_READLINK; + ap.args.nodeid = get_node_id(inode); + ap.args.out_pages = true; + ap.args.out_argvar = true; + ap.args.page_zeroing = true; + ap.args.out_numargs = 1; + ap.args.out_args[0].size = desc.length; + res = fuse_simple_request(fc, &ap.args); - req->out.page_zeroing = 1; - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = PAGE_SIZE - 1; - req->in.h.opcode = FUSE_READLINK; - req->in.h.nodeid = get_node_id(inode); - req->out.argvar = 1; - req->out.numargs = 1; - req->out.args[0].size = PAGE_SIZE - 1; - fuse_request_send(fc, req); - err = req->out.h.error; - - if (!err) { - char *link = page_address(page); - size_t len = req->out.args[0].size; - - BUG_ON(len >= PAGE_SIZE); - link[len] = '\0'; - } - - fuse_put_request(fc, req); fuse_invalidate_atime(inode); - return err; + if (res < 0) + return res; + + if (WARN_ON(res >= PAGE_SIZE)) + return -EIO; + + link = page_address(page); + link[res] = '\0'; + + return 0; } static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, @@ -1383,14 +1428,14 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, struct fuse_setattr_in *inarg_p, struct fuse_attr_out *outarg_p) { - args->in.h.opcode = FUSE_SETATTR; - args->in.h.nodeid = get_node_id(inode); - args->in.numargs = 1; - args->in.args[0].size = sizeof(*inarg_p); - args->in.args[0].value = inarg_p; - args->out.numargs = 1; - args->out.args[0].size = sizeof(*outarg_p); - args->out.args[0].value = outarg_p; + args->opcode = FUSE_SETATTR; + args->nodeid = get_node_id(inode); + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg_p); + args->in_args[0].value = inarg_p; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg_p); + args->out_args[0].value = outarg_p; } /* diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ae2828beb00..0f0225686aee 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,6 +19,18 @@ #include #include +static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) +{ + struct page **pages; + + pages = kzalloc(npages * (sizeof(struct page *) + + sizeof(struct fuse_page_desc)), flags); + *desc = (void *) (pages + npages); + + return pages; +} + static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, int opcode, struct fuse_open_out *outargp) { @@ -29,29 +41,36 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); if (!fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; - args.in.h.opcode = opcode; - args.in.h.nodeid = nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(*outargp); - args.out.args[0].value = outargp; + args.opcode = opcode; + args.nodeid = nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(*outargp); + args.out_args[0].value = outargp; return fuse_simple_request(fc, &args); } +struct fuse_release_args { + struct fuse_args args; + struct fuse_release_in inarg; + struct inode *inode; +}; + struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT); if (unlikely(!ff)) return NULL; ff->fc = fc; - ff->reserved_req = 
fuse_request_alloc(0); - if (unlikely(!ff->reserved_req)) { + ff->release_args = kzalloc(sizeof(*ff->release_args), + GFP_KERNEL_ACCOUNT); + if (!ff->release_args) { kfree(ff); return NULL; } @@ -69,7 +88,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) void fuse_file_free(struct fuse_file *ff) { - fuse_request_free(ff->reserved_req); + kfree(ff->release_args); mutex_destroy(&ff->readdir.lock); kfree(ff); } @@ -80,34 +99,31 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } -static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args, + int error) { - iput(req->misc.release.inode); + struct fuse_release_args *ra = container_of(args, typeof(*ra), args); + + iput(ra->inode); + kfree(ra); } static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_req *req = ff->reserved_req; + struct fuse_args *args = &ff->release_args->args; if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { - /* - * Drop the release request when client does not - * implement 'open' - */ - __clear_bit(FR_BACKGROUND, &req->flags); - iput(req->misc.release.inode); - fuse_put_request(ff->fc, req); + /* Do nothing when client does not implement 'open' */ + fuse_release_end(ff->fc, args, 0); } else if (sync) { - __set_bit(FR_FORCE, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); - fuse_request_send(ff->fc, req); - iput(req->misc.release.inode); - fuse_put_request(ff->fc, req); + fuse_simple_request(ff->fc, args); + fuse_release_end(ff->fc, args, 0); } else { - req->end = fuse_release_end; - __set_bit(FR_BACKGROUND, &req->flags); - fuse_request_send_background(ff->fc, req); + args->end = fuse_release_end; + if (fuse_simple_background(ff->fc, args, + GFP_KERNEL | __GFP_NOFAIL)) + fuse_release_end(ff->fc, args, -ENOTCONN); } kfree(ff); } @@ -227,8 +243,7 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, int flags, int opcode) { struct fuse_conn *fc = ff->fc; - struct fuse_req *req = ff->reserved_req; - struct fuse_release_in *inarg = &req->misc.release.in; + struct fuse_release_args *ra = ff->release_args; /* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) { @@ -243,32 +258,33 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, wake_up_interruptible_all(&ff->poll_wait); - inarg->fh = ff->fh; - inarg->flags = flags; - req->in.h.opcode = opcode; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct fuse_release_in); - req->in.args[0].value = inarg; + ra->inarg.fh = ff->fh; + ra->inarg.flags = flags; + ra->args.in_numargs = 1; + ra->args.in_args[0].size = sizeof(struct fuse_release_in); + ra->args.in_args[0].value = &ra->inarg; + ra->args.opcode = opcode; + ra->args.nodeid = ff->nodeid; + ra->args.force = true; + ra->args.nocreds = true; } void fuse_release_common(struct file *file, bool isdir) { struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; - struct fuse_req *req = ff->reserved_req; + struct fuse_release_args *ra = ff->release_args; int opcode = isdir ? 
FUSE_RELEASEDIR : FUSE_RELEASE; fuse_prepare_release(fi, ff, file->f_flags, opcode); if (ff->flock) { - struct fuse_release_in *inarg = &req->misc.release.in; - inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; - inarg->lock_owner = fuse_lock_owner_id(ff->fc, - (fl_owner_t) file); + ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; + ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc, + (fl_owner_t) file); } /* Hold inode until release is finished */ - req->misc.release.inode = igrab(file_inode(file)); + ra->inode = igrab(file_inode(file)); /* * Normally this will send the RELEASE request, however if @@ -279,7 +295,7 @@ void fuse_release_common(struct file *file, bool isdir) * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir); + fuse_file_put(ff, ff->fc->destroy, isdir); } static int fuse_open(struct inode *inode, struct file *file) @@ -335,19 +351,27 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) return (u64) v0 + ((u64) v1 << 32); } -static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi, +struct fuse_writepage_args { + struct fuse_io_args ia; + struct list_head writepages_entry; + struct list_head queue_entry; + struct fuse_writepage_args *next; + struct inode *inode; +}; + +static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, pgoff_t idx_from, pgoff_t idx_to) { - struct fuse_req *req; + struct fuse_writepage_args *wpa; - list_for_each_entry(req, &fi->writepages, writepages_entry) { + list_for_each_entry(wpa, &fi->writepages, writepages_entry) { pgoff_t curr_index; - WARN_ON(get_fuse_inode(req->inode) != fi); - curr_index = req->misc.write.in.offset >> PAGE_SHIFT; - if (idx_from < curr_index + req->num_pages && + WARN_ON(get_fuse_inode(wpa->inode) != fi); + curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; + if (idx_from < curr_index + wpa->ia.ap.num_pages && curr_index <= idx_to) { - return req; + return wpa; } } return NULL; @@ -383,12 +407,11 @@ static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) * Since fuse doesn't rely on the VM writeback tracking, this has to * use some other means. 
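+ * Concretely, this means sleeping on fi->page_waitq until
+ * fuse_page_is_writeback() no longer finds the page among the
+ * inode's queued writepage requests (fi->writepages).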
*/ -static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) +static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) { struct fuse_inode *fi = get_fuse_inode(inode); wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); - return 0; } /* @@ -411,8 +434,8 @@ static int fuse_flush(struct file *file, fl_owner_t id) struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; - struct fuse_req *req; struct fuse_flush_in inarg; + FUSE_ARGS(args); int err; if (is_bad_inode(inode)) @@ -433,19 +456,17 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); - req->in.h.opcode = FUSE_FLUSH; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - __set_bit(FR_FORCE, &req->flags); - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.opcode = FUSE_FLUSH; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_flush = 1; err = 0; @@ -465,11 +486,11 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0; - args.in.h.opcode = opcode; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = opcode; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; return fuse_simple_request(fc, &args); } @@ -523,35 +544,35 @@ static int fuse_fsync(struct file *file, loff_t start, loff_t end, return err; } -void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, - size_t count, int opcode) +void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, + size_t count, int opcode) { - struct fuse_read_in *inarg = &req->misc.read.in; struct fuse_file *ff = file->private_data; + struct fuse_args *args = &ia->ap.args; - inarg->fh = ff->fh; - inarg->offset = pos; - inarg->size = count; - inarg->flags = file->f_flags; - req->in.h.opcode = opcode; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct fuse_read_in); - req->in.args[0].value = inarg; - req->out.argvar = 1; - req->out.numargs = 1; - req->out.args[0].size = count; + ia->read.in.fh = ff->fh; + ia->read.in.offset = pos; + ia->read.in.size = count; + ia->read.in.flags = file->f_flags; + args->opcode = opcode; + args->nodeid = ff->nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(ia->read.in); + args->in_args[0].value = &ia->read.in; + args->out_argvar = true; + args->out_numargs = 1; + args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) +static void fuse_release_user_pages(struct fuse_args_pages *ap, + bool should_dirty) { - unsigned i; + unsigned int i; - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; + for (i = 0; i < ap->num_pages; i++) { if (should_dirty) - set_page_dirty_lock(page); - put_page(page); + 
set_page_dirty_lock(ap->pages[i]); + put_page(ap->pages[i]); } } @@ -621,64 +642,94 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) kref_put(&io->refcnt, fuse_io_release); } -static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) +static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, + unsigned int npages) { - struct fuse_io_priv *io = req->io; - ssize_t pos = -1; + struct fuse_io_args *ia; - fuse_release_user_pages(req, io->should_dirty); - - if (io->write) { - if (req->misc.write.in.size != req->misc.write.out.size) - pos = req->misc.write.in.offset - io->offset + - req->misc.write.out.size; - } else { - if (req->misc.read.in.size != req->out.args[0].size) - pos = req->misc.read.in.offset - io->offset + - req->out.args[0].size; + ia = kzalloc(sizeof(*ia), GFP_KERNEL); + if (ia) { + ia->io = io; + ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, + &ia->ap.descs); + if (!ia->ap.pages) { + kfree(ia); + ia = NULL; + } } - - fuse_aio_complete(io, req->out.h.error, pos); + return ia; } -static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, - size_t num_bytes, struct fuse_io_priv *io) +static void fuse_io_free(struct fuse_io_args *ia) { + kfree(ia->ap.pages); + kfree(ia); +} + +static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, + int err) +{ + struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_io_priv *io = ia->io; + ssize_t pos = -1; + + fuse_release_user_pages(&ia->ap, io->should_dirty); + + if (err) { + /* Nothing */ + } else if (io->write) { + if (ia->write.out.size > ia->write.in.size) { + err = -EIO; + } else if (ia->write.in.size != ia->write.out.size) { + pos = ia->write.in.offset - io->offset + + ia->write.out.size; + } + } else { + u32 outsize = args->out_args[0].size; + + if (ia->read.in.size != outsize) + pos = ia->read.in.offset - io->offset + outsize; + } + + fuse_aio_complete(io, err, pos); + fuse_io_free(ia); +} + +static ssize_t fuse_async_req_send(struct fuse_conn *fc, + struct fuse_io_args *ia, size_t num_bytes) +{ + ssize_t err; + struct fuse_io_priv *io = ia->io; + spin_lock(&io->lock); kref_get(&io->refcnt); io->size += num_bytes; io->reqs++; spin_unlock(&io->lock); - req->io = io; - req->end = fuse_aio_complete_req; + ia->ap.args.end = fuse_aio_complete_req; + err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); - __fuse_get_request(req); - fuse_request_send_background(fc, req); - - return num_bytes; + return err ?: num_bytes; } -static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io, - loff_t pos, size_t count, fl_owner_t owner) +static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, + fl_owner_t owner) { - struct file *file = io->iocb->ki_filp; + struct file *file = ia->io->iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - fuse_read_fill(req, file, pos, count, FUSE_READ); + fuse_read_args_fill(ia, file, pos, count, FUSE_READ); if (owner != NULL) { - struct fuse_read_in *inarg = &req->misc.read.in; - - inarg->read_flags |= FUSE_READ_LOCKOWNER; - inarg->lock_owner = fuse_lock_owner_id(fc, owner); + ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; + ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner); } - if (io->async) - return fuse_async_req_send(fc, req, count, io); + if (ia->io->async) + return fuse_async_req_send(fc, ia, count); - fuse_request_send(fc, req); - return req->out.args[0].size; + return fuse_simple_request(fc, 
&ia->ap.args); } static void fuse_read_update_size(struct inode *inode, loff_t size, @@ -696,10 +747,9 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, spin_unlock(&fi->lock); } -static void fuse_short_read(struct fuse_req *req, struct inode *inode, - u64 attr_ver) +static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, + struct fuse_args_pages *ap) { - size_t num_read = req->out.args[0].size; struct fuse_conn *fc = get_fuse_conn(inode); if (fc->writeback_cache) { @@ -712,28 +762,31 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, int start_idx = num_read >> PAGE_SHIFT; size_t off = num_read & (PAGE_SIZE - 1); - for (i = start_idx; i < req->num_pages; i++) { - zero_user_segment(req->pages[i], off, PAGE_SIZE); + for (i = start_idx; i < ap->num_pages; i++) { + zero_user_segment(ap->pages[i], off, PAGE_SIZE); off = 0; } } else { - loff_t pos = page_offset(req->pages[0]) + num_read; + loff_t pos = page_offset(ap->pages[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } static int fuse_do_readpage(struct file *file, struct page *page) { - struct kiocb iocb; - struct fuse_io_priv io; struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - size_t num_read; loff_t pos = page_offset(page); - size_t count = PAGE_SIZE; + struct fuse_page_desc desc = { .length = PAGE_SIZE }; + struct fuse_io_args ia = { + .ap.args.page_zeroing = true, + .ap.args.out_pages = true, + .ap.num_pages = 1, + .ap.pages = &page, + .ap.descs = &desc, + }; + ssize_t res; u64 attr_ver; - int err; /* * Page writeback can extend beyond the lifetime of the @@ -742,35 +795,21 @@ static int fuse_do_readpage(struct file *file, struct page *page) */ fuse_wait_on_page_writeback(inode, page->index); - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); - attr_ver = fuse_get_attr_version(fc); - req->out.page_zeroing = 1; - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = count; - init_sync_kiocb(&iocb, file); - io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb); - num_read = fuse_send_read(req, &io, pos, count, NULL); - err = req->out.h.error; + fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); + res = fuse_simple_request(fc, &ia.ap.args); + if (res < 0) + return res; + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (res < desc.length) + fuse_short_read(inode, attr_ver, res, &ia.ap); - if (!err) { - /* - * Short read means EOF. 
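With per-request state now wrapped around the argument block instead of hanging off fuse_req (the old req->io, req->misc and req->end fields), completion handlers recover their context with container_of(), exactly as fuse_aio_complete_req() above does and fuse_readpages_end() below will. The idiom on its own, as a sketch; example_end() is illustrative only:

	static void example_end(struct fuse_conn *fc, struct fuse_args *args,
				int error)
	{
		/* args is embedded as ia->ap.args, so walk back outward */
		struct fuse_io_args *ia = container_of(args, typeof(*ia),
						       ap.args);

		/* ia->io and ia->read/ia->write are reachable again,
		 * with no back-pointer stored in the request */
		fuse_io_free(ia);
	}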
If file size is larger, truncate it - */ - if (num_read < count) - fuse_short_read(req, inode, attr_ver); + SetPageUptodate(page); - SetPageUptodate(page); - } - - fuse_put_request(fc, req); - - return err; + return 0; } static int fuse_readpage(struct file *file, struct page *page) @@ -789,15 +828,18 @@ static int fuse_readpage(struct file *file, struct page *page) return err; } -static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, + int err) { int i; - size_t count = req->misc.read.in.size; - size_t num_read = req->out.args[0].size; + struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_args_pages *ap = &ia->ap; + size_t count = ia->read.in.size; + size_t num_read = args->out_args[0].size; struct address_space *mapping = NULL; - for (i = 0; mapping == NULL && i < req->num_pages; i++) - mapping = req->pages[i]->mapping; + for (i = 0; mapping == NULL && i < ap->num_pages; i++) + mapping = ap->pages[i]->mapping; if (mapping) { struct inode *inode = mapping->host; @@ -805,93 +847,97 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) /* * Short read means EOF. If file size is larger, truncate it */ - if (!req->out.h.error && num_read < count) - fuse_short_read(req, inode, req->misc.read.attr_ver); + if (!err && num_read < count) + fuse_short_read(inode, ia->read.attr_ver, num_read, ap); fuse_invalidate_atime(inode); } - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; - if (!req->out.h.error) + for (i = 0; i < ap->num_pages; i++) { + struct page *page = ap->pages[i]; + + if (!err) SetPageUptodate(page); else SetPageError(page); unlock_page(page); put_page(page); } - if (req->ff) - fuse_file_put(req->ff, false, false); + if (ia->ff) + fuse_file_put(ia->ff, false, false); + + fuse_io_free(ia); } -static void fuse_send_readpages(struct fuse_req *req, struct file *file) +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - loff_t pos = page_offset(req->pages[0]); - size_t count = req->num_pages << PAGE_SHIFT; + struct fuse_args_pages *ap = &ia->ap; + loff_t pos = page_offset(ap->pages[0]); + size_t count = ap->num_pages << PAGE_SHIFT; + int err; - req->out.argpages = 1; - req->out.page_zeroing = 1; - req->out.page_replace = 1; - fuse_read_fill(req, file, pos, count, FUSE_READ); - req->misc.read.attr_ver = fuse_get_attr_version(fc); + ap->args.out_pages = true; + ap->args.page_zeroing = true; + ap->args.page_replace = true; + fuse_read_args_fill(ia, file, pos, count, FUSE_READ); + ia->read.attr_ver = fuse_get_attr_version(fc); if (fc->async_read) { - req->ff = fuse_file_get(ff); - req->end = fuse_readpages_end; - fuse_request_send_background(fc, req); + ia->ff = fuse_file_get(ff); + ap->args.end = fuse_readpages_end; + err = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + if (!err) + return; } else { - fuse_request_send(fc, req); - fuse_readpages_end(fc, req); - fuse_put_request(fc, req); + err = fuse_simple_request(fc, &ap->args); } + fuse_readpages_end(fc, &ap->args, err); } struct fuse_fill_data { - struct fuse_req *req; + struct fuse_io_args *ia; struct file *file; struct inode *inode; - unsigned nr_pages; + unsigned int nr_pages; + unsigned int max_pages; }; static int fuse_readpages_fill(void *_data, struct page *page) { struct fuse_fill_data *data = _data; - struct fuse_req *req = data->req; + struct 
fuse_io_args *ia = data->ia; + struct fuse_args_pages *ap = &ia->ap; struct inode *inode = data->inode; struct fuse_conn *fc = get_fuse_conn(inode); fuse_wait_on_page_writeback(inode, page->index); - if (req->num_pages && - (req->num_pages == fc->max_pages || - (req->num_pages + 1) * PAGE_SIZE > fc->max_read || - req->pages[req->num_pages - 1]->index + 1 != page->index)) { - unsigned int nr_alloc = min_t(unsigned int, data->nr_pages, - fc->max_pages); - fuse_send_readpages(req, data->file); - if (fc->async_read) - req = fuse_get_req_for_background(fc, nr_alloc); - else - req = fuse_get_req(fc, nr_alloc); - - data->req = req; - if (IS_ERR(req)) { + if (ap->num_pages && + (ap->num_pages == fc->max_pages || + (ap->num_pages + 1) * PAGE_SIZE > fc->max_read || + ap->pages[ap->num_pages - 1]->index + 1 != page->index)) { + data->max_pages = min_t(unsigned int, data->nr_pages, + fc->max_pages); + fuse_send_readpages(ia, data->file); + data->ia = ia = fuse_io_alloc(NULL, data->max_pages); + if (!ia) { unlock_page(page); - return PTR_ERR(req); + return -ENOMEM; } + ap = &ia->ap; } - if (WARN_ON(req->num_pages >= req->max_pages)) { + if (WARN_ON(ap->num_pages >= data->max_pages)) { unlock_page(page); - fuse_put_request(fc, req); + fuse_io_free(ia); return -EIO; } get_page(page); - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].length = PAGE_SIZE; - req->num_pages++; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].length = PAGE_SIZE; + ap->num_pages++; data->nr_pages--; return 0; } @@ -903,7 +949,6 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_data data; int err; - unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages); err = -EIO; if (is_bad_inode(inode)) @@ -911,21 +956,19 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - if (fc->async_read) - data.req = fuse_get_req_for_background(fc, nr_alloc); - else - data.req = fuse_get_req(fc, nr_alloc); data.nr_pages = nr_pages; - err = PTR_ERR(data.req); - if (IS_ERR(data.req)) + data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages); + data.ia = fuse_io_alloc(NULL, data.max_pages); + err = -ENOMEM; + if (!data.ia) goto out; err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); if (!err) { - if (data.req->num_pages) - fuse_send_readpages(data.req, file); + if (data.ia->ap.num_pages) + fuse_send_readpages(data.ia, file); else - fuse_put_request(fc, data.req); + fuse_io_free(data.ia); } out: return err; @@ -952,54 +996,65 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) return generic_file_read_iter(iocb, to); } -static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, - loff_t pos, size_t count) +static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, + loff_t pos, size_t count) { - struct fuse_write_in *inarg = &req->misc.write.in; - struct fuse_write_out *outarg = &req->misc.write.out; + struct fuse_args *args = &ia->ap.args; - inarg->fh = ff->fh; - inarg->offset = pos; - inarg->size = count; - req->in.h.opcode = FUSE_WRITE; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 2; + ia->write.in.fh = ff->fh; + ia->write.in.offset = pos; + ia->write.in.size = count; + args->opcode = FUSE_WRITE; + args->nodeid = ff->nodeid; + args->in_numargs = 2; if (ff->fc->minor < 9) - req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; + args->in_args[0].size =
FUSE_COMPAT_WRITE_IN_SIZE; else - req->in.args[0].size = sizeof(struct fuse_write_in); - req->in.args[0].value = inarg; - req->in.args[1].size = count; - req->out.numargs = 1; - req->out.args[0].size = sizeof(struct fuse_write_out); - req->out.args[0].value = outarg; + args->in_args[0].size = sizeof(ia->write.in); + args->in_args[0].value = &ia->write.in; + args->in_args[1].size = count; + args->out_numargs = 1; + args->out_args[0].size = sizeof(ia->write.out); + args->out_args[0].value = &ia->write.out; } -static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, - loff_t pos, size_t count, fl_owner_t owner) +static unsigned int fuse_write_flags(struct kiocb *iocb) { - struct kiocb *iocb = io->iocb; + unsigned int flags = iocb->ki_filp->f_flags; + + if (iocb->ki_flags & IOCB_DSYNC) + flags |= O_DSYNC; + if (iocb->ki_flags & IOCB_SYNC) + flags |= O_SYNC; + + return flags; +} + +static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, + size_t count, fl_owner_t owner) +{ + struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - struct fuse_write_in *inarg = &req->misc.write.in; + struct fuse_write_in *inarg = &ia->write.in; + ssize_t err; - fuse_write_fill(req, ff, pos, count); - inarg->flags = file->f_flags; - if (iocb->ki_flags & IOCB_DSYNC) - inarg->flags |= O_DSYNC; - if (iocb->ki_flags & IOCB_SYNC) - inarg->flags |= O_SYNC; + fuse_write_args_fill(ia, ff, pos, count); + inarg->flags = fuse_write_flags(iocb); if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } - if (io->async) - return fuse_async_req_send(fc, req, count, io); + if (ia->io->async) + return fuse_async_req_send(fc, ia, count); - fuse_request_send(fc, req); - return req->misc.write.out.size; + err = fuse_simple_request(fc, &ia->ap.args); + if (!err && ia->write.out.size > count) + err = -EIO; + + return err ?: ia->write.out.size; } bool fuse_write_update_size(struct inode *inode, loff_t pos) @@ -1019,26 +1074,31 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos) return ret; } -static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, - struct inode *inode, loff_t pos, - size_t count) +static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, + struct kiocb *iocb, struct inode *inode, + loff_t pos, size_t count) { - size_t res; - unsigned offset; - unsigned i; - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + struct fuse_args_pages *ap = &ia->ap; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + unsigned int offset, i; + int err; - for (i = 0; i < req->num_pages; i++) - fuse_wait_on_page_writeback(inode, req->pages[i]->index); + for (i = 0; i < ap->num_pages; i++) + fuse_wait_on_page_writeback(inode, ap->pages[i]->index); - res = fuse_send_write(req, &io, pos, count, NULL); + fuse_write_args_fill(ia, ff, pos, count); + ia->write.in.flags = fuse_write_flags(iocb); - offset = req->page_descs[0].offset; - count = res; - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; + err = fuse_simple_request(fc, &ap->args); - if (!req->out.h.error && !offset && count >= PAGE_SIZE) + offset = ap->descs[0].offset; + count = ia->write.out.size; + for (i = 0; i < ap->num_pages; i++) { + struct page *page = ap->pages[i]; + + if (!err && !offset && count >= PAGE_SIZE) SetPageUptodate(page); if (count > PAGE_SIZE - offset) @@ 
-1051,20 +1111,21 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, put_page(page); } - return res; + return err; } -static ssize_t fuse_fill_write_pages(struct fuse_req *req, - struct address_space *mapping, - struct iov_iter *ii, loff_t pos) +static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos, + unsigned int max_pages) { struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); size_t count = 0; int err; - req->in.argpages = 1; - req->page_descs[0].offset = offset; + ap->args.in_pages = true; + ap->descs[0].offset = offset; do { size_t tmp; @@ -1100,9 +1161,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, } err = 0; - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].length = tmp; - req->num_pages++; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].length = tmp; + ap->num_pages++; count += tmp; pos += tmp; @@ -1113,7 +1174,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - req->num_pages < req->max_pages && offset == 0); + ap->num_pages < max_pages && offset == 0); return count > 0 ? count : err; } @@ -1141,27 +1202,27 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); do { - struct fuse_req *req; ssize_t count; + struct fuse_io_args ia = {}; + struct fuse_args_pages *ap = &ia.ap; unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), fc->max_pages); - req = fuse_get_req(fc, nr_pages); - if (IS_ERR(req)) { - err = PTR_ERR(req); + ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs); + if (!ap->pages) { + err = -ENOMEM; break; } - count = fuse_fill_write_pages(req, mapping, ii, pos); + count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages); if (count <= 0) { err = count; } else { - size_t num_written; - - num_written = fuse_send_write_pages(req, iocb, inode, - pos, count); - err = req->out.h.error; + err = fuse_send_write_pages(&ia, iocb, inode, + pos, count); if (!err) { + size_t num_written = ia.write.out.size; + res += num_written; pos += num_written; @@ -1170,7 +1231,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, err = -EIO; } } - fuse_put_request(fc, req); + kfree(ap->pages); } while (!err && iov_iter_count(ii)); if (res > 0) @@ -1258,14 +1319,14 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return written ? 
written : err; } -static inline void fuse_page_descs_length_init(struct fuse_req *req, - unsigned index, unsigned nr_pages) +static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, + unsigned int index, + unsigned int nr_pages) { int i; for (i = index; i < index + nr_pages; i++) - req->page_descs[i].length = PAGE_SIZE - - req->page_descs[i].offset; + descs[i].length = PAGE_SIZE - descs[i].offset; } static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) @@ -1279,8 +1340,9 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, return min(iov_iter_single_seg_count(ii), max_size); } -static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, - size_t *nbytesp, int write) +static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, + size_t *nbytesp, int write, + unsigned int max_pages) { size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; @@ -1291,21 +1353,21 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, size_t frag_size = fuse_get_frag_size(ii, *nbytesp); if (write) - req->in.args[1].value = (void *) user_addr; + ap->args.in_args[1].value = (void *) user_addr; else - req->out.args[0].value = (void *) user_addr; + ap->args.out_args[0].value = (void *) user_addr; iov_iter_advance(ii, frag_size); *nbytesp = frag_size; return 0; } - while (nbytes < *nbytesp && req->num_pages < req->max_pages) { + while (nbytes < *nbytesp && ap->num_pages < max_pages) { unsigned npages; size_t start; - ret = iov_iter_get_pages(ii, &req->pages[req->num_pages], + ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages], *nbytesp - nbytes, - req->max_pages - req->num_pages, + max_pages - ap->num_pages, &start); if (ret < 0) break; @@ -1316,18 +1378,18 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, ret += start; npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; - req->page_descs[req->num_pages].offset = start; - fuse_page_descs_length_init(req, req->num_pages, npages); + ap->descs[ap->num_pages].offset = start; + fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); - req->num_pages += npages; - req->page_descs[req->num_pages - 1].length -= + ap->num_pages += npages; + ap->descs[ap->num_pages - 1].length -= (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } if (write) - req->in.argpages = 1; + ap->args.in_pages = 1; else - req->out.argpages = 1; + ap->args.out_pages = 1; *nbytesp = nbytes; @@ -1349,17 +1411,16 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, pgoff_t idx_from = pos >> PAGE_SHIFT; pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT; ssize_t res = 0; - struct fuse_req *req; int err = 0; + struct fuse_io_args *ia; + unsigned int max_pages; - if (io->async) - req = fuse_get_req_for_background(fc, iov_iter_npages(iter, - fc->max_pages)); - else - req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages)); - if (IS_ERR(req)) - return PTR_ERR(req); + max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); + if (!ia) + return -ENOMEM; + ia->io = io; if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) inode_lock(inode); @@ -1370,54 +1431,49 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, io->should_dirty = !write && iter_is_iovec(iter); while (count) { - size_t nres; + ssize_t nres; fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); - err = fuse_get_user_pages(req, iter, &nbytes, write); + + err = fuse_get_user_pages(&ia->ap, iter, 
&nbytes, write, + max_pages); if (err && !nbytes) break; if (write) { - if (!capable(CAP_FSETID)) { - struct fuse_write_in *inarg; + if (!capable(CAP_FSETID)) + ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV; - inarg = &req->misc.write.in; - inarg->write_flags |= FUSE_WRITE_KILL_PRIV; - } - nres = fuse_send_write(req, io, pos, nbytes, owner); + nres = fuse_send_write(ia, pos, nbytes, owner); } else { - nres = fuse_send_read(req, io, pos, nbytes, owner); + nres = fuse_send_read(ia, pos, nbytes, owner); } - if (!io->async) - fuse_release_user_pages(req, io->should_dirty); - if (req->out.h.error) { - err = req->out.h.error; - break; - } else if (nres > nbytes) { - res = 0; - err = -EIO; + if (!io->async || nres < 0) { + fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_io_free(ia); + } + ia = NULL; + if (nres < 0) { + err = nres; break; } + WARN_ON(nres > nbytes); + count -= nres; res += nres; pos += nres; if (nres != nbytes) break; if (count) { - fuse_put_request(fc, req); - if (io->async) - req = fuse_get_req_for_background(fc, - iov_iter_npages(iter, fc->max_pages)); - else - req = fuse_get_req(fc, iov_iter_npages(iter, - fc->max_pages)); - if (IS_ERR(req)) + max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); + if (!ia) break; } } - if (!IS_ERR(req)) - fuse_put_request(fc, req); + if (ia) + fuse_io_free(ia); if (res > 0) *ppos = pos; @@ -1509,45 +1565,53 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return fuse_direct_write_iter(iocb, from); } -static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_free(struct fuse_writepage_args *wpa) { + struct fuse_args_pages *ap = &wpa->ia.ap; int i; - for (i = 0; i < req->num_pages; i++) - __free_page(req->pages[i]); + for (i = 0; i < ap->num_pages; i++) + __free_page(ap->pages[i]); - if (req->ff) - fuse_file_put(req->ff, false, false); + if (wpa->ia.ff) + fuse_file_put(wpa->ia.ff, false, false); + + kfree(ap->pages); + kfree(wpa); } -static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_finish(struct fuse_conn *fc, + struct fuse_writepage_args *wpa) { - struct inode *inode = req->inode; + struct fuse_args_pages *ap = &wpa->ia.ap; + struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - list_del(&req->writepages_entry); - for (i = 0; i < req->num_pages; i++) { + list_del(&wpa->writepages_entry); + for (i = 0; i < ap->num_pages; i++) { dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP); + dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); } wake_up(&fi->page_waitq); } /* Called under fi->lock, may release and reacquire it */ -static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, - loff_t size) +static void fuse_send_writepage(struct fuse_conn *fc, + struct fuse_writepage_args *wpa, loff_t size) __releases(fi->lock) __acquires(fi->lock) { - struct fuse_req *aux, *next; - struct fuse_inode *fi = get_fuse_inode(req->inode); - struct fuse_write_in *inarg = &req->misc.write.in; - __u64 data_size = req->num_pages * PAGE_SIZE; - bool queued; + struct fuse_writepage_args *aux, *next; + struct fuse_inode *fi = get_fuse_inode(wpa->inode); + struct fuse_write_in *inarg = &wpa->ia.write.in; + struct fuse_args *args = &wpa->ia.ap.args; + __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE; + int err; + 
fi->writectr++; if (inarg->offset + data_size <= size) { inarg->size = data_size; } else if (inarg->offset < size) { @@ -1557,29 +1621,36 @@ __acquires(fi->lock) goto out_free; } - req->in.args[1].size = inarg->size; - queued = fuse_request_queue_background(fc, req); + args->in_args[1].size = inarg->size; + args->force = true; + args->nocreds = true; + + err = fuse_simple_background(fc, args, GFP_ATOMIC); + if (err == -ENOMEM) { + spin_unlock(&fi->lock); + err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL); + spin_lock(&fi->lock); + } + /* Fails on broken connection only */ - if (unlikely(!queued)) + if (unlikely(err)) goto out_free; - fi->writectr++; return; out_free: - fuse_writepage_finish(fc, req); + fi->writectr--; + fuse_writepage_finish(fc, wpa); spin_unlock(&fi->lock); /* After fuse_writepage_finish() aux request list is private */ - for (aux = req->misc.write.next; aux; aux = next) { - next = aux->misc.write.next; - aux->misc.write.next = NULL; - fuse_writepage_free(fc, aux); - fuse_put_request(fc, aux); + for (aux = wpa->next; aux; aux = next) { + next = aux->next; + aux->next = NULL; + fuse_writepage_free(aux); } - fuse_writepage_free(fc, req); - fuse_put_request(fc, req); + fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1596,29 +1667,34 @@ __acquires(fi->lock) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); loff_t crop = i_size_read(inode); - struct fuse_req *req; + struct fuse_writepage_args *wpa; while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { - req = list_entry(fi->queued_writes.next, struct fuse_req, list); - list_del_init(&req->list); - fuse_send_writepage(fc, req, crop); + wpa = list_entry(fi->queued_writes.next, + struct fuse_writepage_args, queue_entry); + list_del_init(&wpa->queue_entry); + fuse_send_writepage(fc, wpa, crop); } } -static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, + int error) { - struct inode *inode = req->inode; + struct fuse_writepage_args *wpa = + container_of(args, typeof(*wpa), ia.ap.args); + struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - mapping_set_error(inode->i_mapping, req->out.h.error); + mapping_set_error(inode->i_mapping, error); spin_lock(&fi->lock); - while (req->misc.write.next) { + while (wpa->next) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_write_in *inarg = &req->misc.write.in; - struct fuse_req *next = req->misc.write.next; - req->misc.write.next = next->misc.write.next; - next->misc.write.next = NULL; - next->ff = fuse_file_get(req->ff); + struct fuse_write_in *inarg = &wpa->ia.write.in; + struct fuse_writepage_args *next = wpa->next; + + wpa->next = next->next; + next->next = NULL; + next->ia.ff = fuse_file_get(wpa->ia.ff); list_add(&next->writepages_entry, &fi->writepages); /* @@ -1647,9 +1723,9 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) fuse_send_writepage(fc, next, inarg->offset + inarg->size); } fi->writectr--; - fuse_writepage_finish(fc, req); + fuse_writepage_finish(fc, wpa); spin_unlock(&fi->lock); - fuse_writepage_free(fc, req); + fuse_writepage_free(wpa); } static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, @@ -1691,52 +1767,71 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } +static struct fuse_writepage_args *fuse_writepage_args_alloc(void) +{ + struct fuse_writepage_args *wpa; + struct 
fuse_args_pages *ap; + + wpa = kzalloc(sizeof(*wpa), GFP_NOFS); + if (wpa) { + ap = &wpa->ia.ap; + ap->num_pages = 0; + ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs); + if (!ap->pages) { + kfree(wpa); + wpa = NULL; + } + } + return wpa; + +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_req *req; + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; struct page *tmp_page; int error = -ENOMEM; set_page_writeback(page); - req = fuse_request_alloc_nofs(1); - if (!req) + wpa = fuse_writepage_args_alloc(); + if (!wpa) goto err; + ap = &wpa->ia.ap; - /* writeback always goes to bg_queue */ - __set_bit(FR_BACKGROUND, &req->flags); tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) goto err_free; error = -EIO; - req->ff = fuse_write_file_get(fc, fi); - if (!req->ff) + wpa->ia.ff = fuse_write_file_get(fc, fi); + if (!wpa->ia.ff) goto err_nofile; - fuse_write_fill(req, req->ff, page_offset(page), 0); + fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); copy_highpage(tmp_page, page); - req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; - req->misc.write.next = NULL; - req->in.argpages = 1; - req->num_pages = 1; - req->pages[0] = tmp_page; - req->page_descs[0].offset = 0; - req->page_descs[0].length = PAGE_SIZE; - req->end = fuse_writepage_end; - req->inode = inode; + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->next = NULL; + ap->args.in_pages = true; + ap->num_pages = 1; + ap->pages[0] = tmp_page; + ap->descs[0].offset = 0; + ap->descs[0].length = PAGE_SIZE; + ap->args.end = fuse_writepage_end; + wpa->inode = inode; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); - list_add(&req->writepages_entry, &fi->writepages); - list_add_tail(&req->list, &fi->queued_writes); + list_add(&wpa->writepages_entry, &fi->writepages); + list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -1747,7 +1842,7 @@ static int fuse_writepage_locked(struct page *page) err_nofile: __free_page(tmp_page); err_free: - fuse_request_free(req); + kfree(wpa); err: mapping_set_error(page->mapping, error); end_page_writeback(page); @@ -1767,6 +1862,7 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) WARN_ON(wbc->sync_mode == WB_SYNC_ALL); redirty_page_for_writepage(wbc, page); + unlock_page(page); return 0; } @@ -1777,23 +1873,50 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) } struct fuse_fill_wb_data { - struct fuse_req *req; + struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; struct page **orig_pages; + unsigned int max_pages; }; +static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) +{ + struct fuse_args_pages *ap = &data->wpa->ia.ap; + struct fuse_conn *fc = get_fuse_conn(data->inode); + struct page **pages; + struct fuse_page_desc *descs; + unsigned int npages = min_t(unsigned int, + max_t(unsigned int, data->max_pages * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), + fc->max_pages); + WARN_ON(npages <= data->max_pages); + + pages = fuse_pages_alloc(npages, GFP_NOFS, &descs); + if (!pages) + return false; + + memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages); + memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages); + 
kfree(ap->pages); + ap->pages = pages; + ap->descs = descs; + data->max_pages = npages; + + return true; +} + static void fuse_writepages_send(struct fuse_fill_wb_data *data) { - struct fuse_req *req = data->req; + struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_pages = req->num_pages; + int num_pages = wpa->ia.ap.num_pages; int i; - req->ff = fuse_file_get(data->ff); + wpa->ia.ff = fuse_file_get(data->ff); spin_lock(&fi->lock); - list_add_tail(&req->list, &fi->queued_writes); + list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -1808,54 +1931,52 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) * this new request onto the auxiliary list, otherwise reuse the existing one by * copying the new page contents over to the old temporary page. */ -static bool fuse_writepage_in_flight(struct fuse_req *new_req, +static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, struct page *page) { - struct fuse_conn *fc = get_fuse_conn(new_req->inode); - struct fuse_inode *fi = get_fuse_inode(new_req->inode); - struct fuse_req *tmp; - struct fuse_req *old_req; + struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); + struct fuse_writepage_args *tmp; + struct fuse_writepage_args *old_wpa; + struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - WARN_ON(new_req->num_pages != 0); + WARN_ON(new_ap->num_pages != 0); spin_lock(&fi->lock); - list_del(&new_req->writepages_entry); - old_req = fuse_find_writeback(fi, page->index, page->index); - if (!old_req) { - list_add(&new_req->writepages_entry, &fi->writepages); + list_del(&new_wpa->writepages_entry); + old_wpa = fuse_find_writeback(fi, page->index, page->index); + if (!old_wpa) { + list_add(&new_wpa->writepages_entry, &fi->writepages); spin_unlock(&fi->lock); return false; } - new_req->num_pages = 1; - for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) { + new_ap->num_pages = 1; + for (tmp = old_wpa->next; tmp; tmp = tmp->next) { pgoff_t curr_index; - WARN_ON(tmp->inode != new_req->inode); - curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT; + WARN_ON(tmp->inode != new_wpa->inode); + curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; if (curr_index == page->index) { - WARN_ON(tmp->num_pages != 1); - WARN_ON(!test_bit(FR_PENDING, &tmp->flags)); - swap(tmp->pages[0], new_req->pages[0]); + WARN_ON(tmp->ia.ap.num_pages != 1); + swap(tmp->ia.ap.pages[0], new_ap->pages[0]); break; } } if (!tmp) { - new_req->misc.write.next = old_req->misc.write.next; - old_req->misc.write.next = new_req; + new_wpa->next = old_wpa->next; + old_wpa->next = new_wpa; } spin_unlock(&fi->lock); if (tmp) { - struct backing_dev_info *bdi = inode_to_bdi(new_req->inode); + struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); + dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); - fuse_writepage_free(fc, new_req); - fuse_request_free(new_req); + fuse_writepage_free(new_wpa); } return true; @@ -1865,7 +1986,8 @@ static int fuse_writepages_fill(struct page *page, struct writeback_control *wbc, void *_data) { struct fuse_fill_wb_data *data = _data; - struct fuse_req *req = data->req; + struct fuse_writepage_args *wpa = data->wpa; + struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); 
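fuse_pages_realloc() above, like fuse_io_alloc() and fuse_writepage_args_alloc() elsewhere in this patch, relies on fuse_pages_alloc(), whose body is not part of this diff. The single kfree(ap->pages) in the teardown paths implies one allocation covering both the page-pointer array and the descriptor array, so presumably something along these lines (an assumption, not code from the patch):

	/* ASSUMED implementation, not shown in this diff: one allocation
	 * carrying npages page pointers followed by npages descriptors,
	 * so freeing ap->pages frees the descriptors with it. */
	struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
				       struct fuse_page_desc **desc)
	{
		struct page **pages;

		pages = kzalloc(npages * (sizeof(struct page *) +
					  sizeof(struct fuse_page_desc)),
				flags);
		if (!pages)
			return NULL;
		*desc = (void *)(pages + npages);
		return pages;
	}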
struct fuse_conn *fc = get_fuse_conn(inode); @@ -1888,16 +2010,16 @@ static int fuse_writepages_fill(struct page *page, */ is_writeback = fuse_page_is_writeback(inode, page->index); - if (req && req->num_pages && - (is_writeback || req->num_pages == fc->max_pages || - (req->num_pages + 1) * PAGE_SIZE > fc->max_write || - data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { + if (wpa && ap->num_pages && + (is_writeback || ap->num_pages == fc->max_pages || + (ap->num_pages + 1) * PAGE_SIZE > fc->max_write || + data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) { fuse_writepages_send(data); - data->req = NULL; - } else if (req && req->num_pages == req->max_pages) { - if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) { + data->wpa = NULL; + } else if (wpa && ap->num_pages == data->max_pages) { + if (!fuse_pages_realloc(data)) { fuse_writepages_send(data); - req = data->req = NULL; + data->wpa = NULL; } } @@ -1915,59 +2037,60 @@ static int fuse_writepages_fill(struct page *page, * This is ensured by holding the page lock in page_mkwrite() while * checking fuse_page_is_writeback(). We already hold the page lock * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment req->num_pages. + * request to the fi->writepages list and increment ap->num_pages. * After this fuse_page_is_writeback() will indicate that the page is * under writeback, so we can release the page lock. */ - if (data->req == NULL) { + if (data->wpa == NULL) { struct fuse_inode *fi = get_fuse_inode(inode); err = -ENOMEM; - req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES); - if (!req) { + wpa = fuse_writepage_args_alloc(); + if (!wpa) { __free_page(tmp_page); goto out_unlock; } + data->max_pages = 1; - fuse_write_fill(req, data->ff, page_offset(page), 0); - req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; - req->misc.write.next = NULL; - req->in.argpages = 1; - __set_bit(FR_BACKGROUND, &req->flags); - req->num_pages = 0; - req->end = fuse_writepage_end; - req->inode = inode; + ap = &wpa->ia.ap; + fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0); + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->next = NULL; + ap->args.in_pages = true; + ap->args.end = fuse_writepage_end; + ap->num_pages = 0; + wpa->inode = inode; spin_lock(&fi->lock); - list_add(&req->writepages_entry, &fi->writepages); + list_add(&wpa->writepages_entry, &fi->writepages); spin_unlock(&fi->lock); - data->req = req; + data->wpa = wpa; } set_page_writeback(page); copy_highpage(tmp_page, page); - req->pages[req->num_pages] = tmp_page; - req->page_descs[req->num_pages].offset = 0; - req->page_descs[req->num_pages].length = PAGE_SIZE; + ap->pages[ap->num_pages] = tmp_page; + ap->descs[ap->num_pages].offset = 0; + ap->descs[ap->num_pages].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; - if (is_writeback && fuse_writepage_in_flight(req, page)) { + if (is_writeback && fuse_writepage_in_flight(wpa, page)) { end_page_writeback(page); - data->req = NULL; + data->wpa = NULL; goto out_unlock; } - data->orig_pages[req->num_pages] = page; + data->orig_pages[ap->num_pages] = page; /* * Protected by fi->lock against concurrent access by * fuse_page_is_writeback(). 
*/ spin_lock(&fi->lock); - req->num_pages++; + ap->num_pages++; spin_unlock(&fi->lock); out_unlock: @@ -1989,7 +2112,7 @@ static int fuse_writepages(struct address_space *mapping, goto out; data.inode = inode; - data.req = NULL; + data.wpa = NULL; data.ff = NULL; err = -ENOMEM; @@ -2000,9 +2123,9 @@ static int fuse_writepages(struct address_space *mapping, goto out; err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); - if (data.req) { + if (data.wpa) { /* Ignore errors if we can write at least one page */ - BUG_ON(!data.req->num_pages); + WARN_ON(!data.wpa->ia.ap.num_pages); fuse_writepages_send(&data); err = 0; } @@ -2222,11 +2345,11 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, inarg->lk.pid = pid; if (flock) inarg->lk_flags |= FUSE_LK_FLOCK; - args->in.h.opcode = opcode; - args->in.h.nodeid = get_node_id(inode); - args->in.numargs = 1; - args->in.args[0].size = sizeof(*inarg); - args->in.args[0].value = inarg; + args->opcode = opcode; + args->nodeid = get_node_id(inode); + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; } static int fuse_getlk(struct file *file, struct file_lock *fl) @@ -2239,9 +2362,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) int err; fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) err = convert_fuse_file_lock(fc, &outarg.lk, fl); @@ -2336,14 +2459,14 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) memset(&inarg, 0, sizeof(inarg)); inarg.block = block; inarg.blocksize = inode->i_sb->s_blocksize; - args.in.h.opcode = FUSE_BMAP; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_BMAP; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) fc->no_bmap = 1; @@ -2368,14 +2491,14 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) if (fc->no_lseek) goto fallback; - args.in.h.opcode = FUSE_LSEEK; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_LSEEK; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (err) { if (err == -ENOSYS) { @@ -2573,14 +2696,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, .flags = flags }; struct fuse_ioctl_out outarg; - struct fuse_req *req = NULL; - struct page **pages = NULL; struct iovec *iov_page = NULL; struct iovec *in_iov = NULL, *out_iov = NULL; - unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; - size_t in_size, out_size, 
transferred, c; + unsigned int in_iovs = 0, out_iovs = 0, max_pages; + size_t in_size, out_size, c; + ssize_t transferred; int err, i; struct iov_iter ii; + struct fuse_args_pages ap = {}; #if BITS_PER_LONG == 32 inarg.flags |= FUSE_IOCTL_32BIT; @@ -2598,11 +2721,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL); + ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); - if (!pages || !iov_page) + if (!ap.pages || !iov_page) goto out; + fuse_page_descs_length_init(ap.descs, 0, fc->max_pages); + /* * If restricted, initialize IO parameters as encoded in @cmd. * RETRY from server is not allowed. @@ -2639,56 +2764,44 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -ENOMEM; if (max_pages > fc->max_pages) goto out; - while (num_pages < max_pages) { - pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!pages[num_pages]) + while (ap.num_pages < max_pages) { + ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!ap.pages[ap.num_pages]) goto out; - num_pages++; + ap.num_pages++; } - req = fuse_get_req(fc, num_pages); - if (IS_ERR(req)) { - err = PTR_ERR(req); - req = NULL; - goto out; - } - memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); - req->num_pages = num_pages; - fuse_page_descs_length_init(req, 0, req->num_pages); /* okay, let's send it to the client */ - req->in.h.opcode = FUSE_IOCTL; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; + ap.args.opcode = FUSE_IOCTL; + ap.args.nodeid = ff->nodeid; + ap.args.in_numargs = 1; + ap.args.in_args[0].size = sizeof(inarg); + ap.args.in_args[0].value = &inarg; if (in_size) { - req->in.numargs++; - req->in.args[1].size = in_size; - req->in.argpages = 1; + ap.args.in_numargs++; + ap.args.in_args[1].size = in_size; + ap.args.in_pages = true; err = -EFAULT; iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { - c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } } - req->out.numargs = 2; - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - req->out.args[1].size = out_size; - req->out.argpages = 1; - req->out.argvar = 1; + ap.args.out_numargs = 2; + ap.args.out_args[0].size = sizeof(outarg); + ap.args.out_args[0].value = &outarg; + ap.args.out_args[1].size = out_size; + ap.args.out_pages = true; + ap.args.out_argvar = true; - fuse_request_send(fc, req); - err = req->out.h.error; - transferred = req->out.args[1].size; - fuse_put_request(fc, req); - req = NULL; - if (err) + transferred = fuse_simple_request(fc, &ap.args); + err = transferred; + if (transferred < 0) goto out; /* did it ask for retry? 
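One contract worth spelling out in the ioctl conversion above: with out_argvar set, the last out argument is variable-length, and on success fuse_simple_request() returns the number of bytes the server actually produced for it ("transferred" here) rather than zero. Restated as a hedged sketch, with an invented helper name:

	static ssize_t example_ioctl_reply(struct fuse_conn *fc,
					   struct fuse_args_pages *ap,
					   struct fuse_ioctl_out *outarg,
					   size_t out_size)
	{
		ap->args.out_numargs = 2;
		ap->args.out_args[0].size = sizeof(*outarg);	/* fixed header */
		ap->args.out_args[0].value = outarg;
		ap->args.out_args[1].size = out_size;		/* upper bound only */
		ap->args.out_pages = true;
		ap->args.out_argvar = true;	/* server may return less */

		/* actual size of out_args[1] on success, else -errno */
		return fuse_simple_request(fc, &ap->args);
	}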
*/ @@ -2713,7 +2826,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - vaddr = kmap_atomic(pages[0]); + vaddr = kmap_atomic(ap.pages[0]); err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); @@ -2741,19 +2854,17 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -EFAULT; iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { - c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } err = 0; out: - if (req) - fuse_put_request(fc, req); free_page((unsigned long) iov_page); - while (num_pages) - __free_page(pages[--num_pages]); - kfree(pages); + while (ap.num_pages) + __free_page(ap.pages[--ap.num_pages]); + kfree(ap.pages); return err ? err : outarg.result; } @@ -2861,14 +2972,14 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) fuse_register_polled_file(fc, ff); } - args.in.h.opcode = FUSE_POLL; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_POLL; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) @@ -3076,11 +3187,11 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - args.in.h.opcode = FUSE_FALLOCATE; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = FUSE_FALLOCATE; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_fallocate = 1; @@ -3168,14 +3279,14 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); - args.in.h.opcode = FUSE_COPY_FILE_RANGE; - args.in.h.nodeid = ff_in->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_COPY_FILE_RANGE; + args.nodeid = ff_in->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_copy_file_range = 1; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 24dbca777775..fc89cb40e874 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -47,9 +47,6 @@ /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 -/** Number of page pointers embedded in fuse_req */ -#define FUSE_REQ_INLINE_PAGES 1 - /** List of active connections */ extern 
struct list_head fuse_conn_list; @@ -164,17 +161,15 @@ enum { }; struct fuse_conn; +struct fuse_release_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ struct fuse_conn *fc; - /* - * Request reserved for flush and release. - * Modified under relative fuse_inode::lock. - */ - struct fuse_req *reserved_req; + /* Argument space reserved for release */ + struct fuse_release_args *release_args; /** Kernel file handle guaranteed to be unique */ u64 kh; @@ -229,57 +224,12 @@ struct fuse_in_arg { const void *value; }; -/** The request input */ -struct fuse_in { - /** The request header */ - struct fuse_in_header h; - - /** True if the data for the last argument is in req->pages */ - unsigned argpages:1; - - /** Number of arguments */ - unsigned numargs; - - /** Array of arguments */ - struct fuse_in_arg args[3]; -}; - /** One output argument of a request */ struct fuse_arg { unsigned size; void *value; }; -/** The request output */ -struct fuse_out { - /** Header returned from userspace */ - struct fuse_out_header h; - - /* - * The following bitfields are not changed during the request - * processing - */ - - /** Last argument is variable length (can be shorter than - arg->size) */ - unsigned argvar:1; - - /** Last argument is a list of pages to copy data to */ - unsigned argpages:1; - - /** Zero partially or not copied pages */ - unsigned page_zeroing:1; - - /** Pages may be replaced with new ones */ - unsigned page_replace:1; - - /** Number or arguments */ - unsigned numargs; - - /** Array of arguments */ - struct fuse_arg args[2]; -}; - /** FUSE page descriptor */ struct fuse_page_desc { unsigned int length; @@ -287,20 +237,28 @@ struct fuse_page_desc { }; struct fuse_args { - struct { - struct { - uint32_t opcode; - uint64_t nodeid; - } h; - unsigned numargs; - struct fuse_in_arg args[3]; + uint64_t nodeid; + uint32_t opcode; + unsigned short in_numargs; + unsigned short out_numargs; + bool force:1; + bool noreply:1; + bool nocreds:1; + bool in_pages:1; + bool out_pages:1; + bool out_argvar:1; + bool page_zeroing:1; + bool page_replace:1; + struct fuse_in_arg in_args[3]; + struct fuse_arg out_args[2]; + void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error); +}; - } in; - struct { - unsigned argvar:1; - unsigned numargs; - struct fuse_arg args[2]; - } out; +struct fuse_args_pages { + struct fuse_args args; + struct page **pages; + struct fuse_page_desc *descs; + unsigned int num_pages; }; #define FUSE_ARGS(args) struct fuse_args args = {} @@ -373,83 +331,70 @@ struct fuse_req { /** Entry on the interrupts list */ struct list_head intr_entry; + /* Input/output arguments */ + struct fuse_args *args; + /** refcount */ refcount_t count; /* Request flags, updated with test/set/clear_bit() */ unsigned long flags; - /** The request input */ - struct fuse_in in; + /* The request input header */ + struct { + struct fuse_in_header h; + } in; - /** The request output */ - struct fuse_out out; + /* The request output header */ + struct { + struct fuse_out_header h; + } out; /** Used to wake up the task waiting for completion of request*/ wait_queue_head_t waitq; - /** Data for asynchronous requests */ - union { - struct { - struct fuse_release_in in; - struct inode *inode; - } release; - struct fuse_init_in init_in; - struct fuse_init_out init_out; - struct cuse_init_in cuse_init_in; - struct { - struct fuse_read_in in; - u64 attr_ver; - } read; - struct { - struct fuse_write_in in; - struct fuse_write_out out; - struct fuse_req *next; - } write; 
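The header changes above collapse the old fuse_in/fuse_out pair, each with its own header, bitfields and argument array, into one flat fuse_args; requests that carry data pages wrap it in fuse_args_pages, and operation-specific state wraps that again (fuse_io_args here, fuse_writepage_args in file.c). The layering, with the on-stack single-page read from fuse_do_readpage() earlier in this patch as the concrete instance:

	/* one object, three views:
	 *
	 *   struct fuse_io_args ia     read/write specific state
	 *     ia.ap                    fuse_args_pages: pages + descs
	 *       ia.ap.args             fuse_args: opcode, nodeid, in/out args
	 *
	 * so a one-page read needs no heap allocation at all
	 * ('page' stands for the page being read, as in fuse_do_readpage) */
	struct fuse_page_desc desc = { .length = PAGE_SIZE };
	struct fuse_io_args ia = {
		.ap.args.page_zeroing = true,
		.ap.args.out_pages = true,
		.ap.num_pages = 1,
		.ap.pages = &page,
		.ap.descs = &desc,
	};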
- struct fuse_notify_retrieve_in retrieve_in; - } misc; - - /** page vector */ - struct page **pages; - - /** page-descriptor vector */ - struct fuse_page_desc *page_descs; - - /** size of the 'pages' array */ - unsigned max_pages; - - /** inline page vector */ - struct page *inline_pages[FUSE_REQ_INLINE_PAGES]; - - /** inline page-descriptor vector */ - struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES]; - - /** number of pages in vector */ - unsigned num_pages; - - /** File used in the request (or NULL) */ - struct fuse_file *ff; - - /** Inode used in the request or NULL */ - struct inode *inode; - - /** AIO control block */ - struct fuse_io_priv *io; - - /** Link on fi->writepages */ - struct list_head writepages_entry; - - /** Request completion callback */ - void (*end)(struct fuse_conn *, struct fuse_req *); - - /** Request is stolen from fuse_file->reserved_req */ - struct file *stolen_file; }; +struct fuse_iqueue; + +/** + * Input queue callbacks + * + * Input queue signalling is device-specific. For example, the /dev/fuse file + * uses fiq->waitq and fasync to wake processes that are waiting on queue + * readiness. These callbacks allow other device types to respond to input + * queue activity. + */ +struct fuse_iqueue_ops { + /** + * Signal that a forget has been queued + */ + void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); + + /** + * Signal that an INTERRUPT request has been queued + */ + void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); + + /** + * Signal that a request has been queued + */ + void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); +}; + +/** /dev/fuse input queue operations */ +extern const struct fuse_iqueue_ops fuse_dev_fiq_ops; + struct fuse_iqueue { /** Connection established */ unsigned connected; + /** Lock protecting accesses to members of this structure */ + spinlock_t lock; + /** Readers of the connection are waiting on this */ wait_queue_head_t waitq; @@ -471,6 +416,12 @@ struct fuse_iqueue { /** O_ASYNC requests */ struct fasync_struct *fasync; + + /** Device-specific callbacks */ + const struct fuse_iqueue_ops *ops; + + /** Device-specific state */ + void *priv; }; #define FUSE_PQ_HASH_BITS 8 @@ -504,6 +455,29 @@ struct fuse_dev { struct list_head entry; }; +struct fuse_fs_context { + int fd; + unsigned int rootmode; + kuid_t user_id; + kgid_t group_id; + bool is_bdev:1; + bool fd_present:1; + bool rootmode_present:1; + bool user_id_present:1; + bool group_id_present:1; + bool default_permissions:1; + bool allow_other:1; + bool destroy:1; + bool no_control:1; + bool no_force_umount:1; + unsigned int max_read; + unsigned int blksize; + const char *subtype; + + /* fuse_dev pointer to fill in, should contain NULL on entry */ + void **fudptr; +}; + /** * A Fuse connection. * @@ -584,9 +558,6 @@ struct fuse_conn { /** waitq for blocked connection */ wait_queue_head_t blocked_waitq; - /** waitq for reserved requests */ - wait_queue_head_t reserved_req_waitq; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -721,6 +692,18 @@ struct fuse_conn { /** Does the filesystem support copy_file_range? 
*/ unsigned no_copy_file_range:1; + /* Send DESTROY request */ + unsigned int destroy:1; + + /* Delete dentries that have gone stale */ + unsigned int delete_stale:1; + + /** Do not create entry in fusectl fs */ + unsigned int no_control:1; + + /** Do not allow MNT_FORCE umount */ + unsigned int no_force_umount:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -742,9 +725,6 @@ struct fuse_conn { /** Key for lock owner ID scrambling */ u32 scramble_key[4]; - /** Reserved request for the DESTROY message */ - struct fuse_req *destroy_req; - /** Version counter for attribute changes */ atomic64_t attr_version; @@ -820,14 +800,32 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); -/* Used by READDIRPLUS */ -void fuse_force_forget(struct file *file, u64 nodeid); +struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp); -/** +/* * Initialize READ or READDIR request */ -void fuse_read_fill(struct fuse_req *req, struct file *file, - loff_t pos, size_t count, int opcode); +struct fuse_io_args { + union { + struct { + struct fuse_read_in in; + u64 attr_ver; + } read; + struct { + struct fuse_write_in in; + struct fuse_write_out out; + } write; + }; + struct fuse_args_pages ap; + struct fuse_io_priv *io; + struct fuse_file *ff; +}; + +void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, + size_t count, int opcode); + /** * Send OPEN or OPENDIR request @@ -899,62 +897,17 @@ void fuse_dev_cleanup(void); int fuse_ctl_init(void); void __exit fuse_ctl_cleanup(void); -/** - * Allocate a request - */ -struct fuse_req *fuse_request_alloc(unsigned npages); - -struct fuse_req *fuse_request_alloc_nofs(unsigned npages); - -bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, - gfp_t flags); - - -/** - * Free a request - */ -void fuse_request_free(struct fuse_req *req); - -/** - * Get a request, may fail with -ENOMEM, - * caller should specify # elements in req->pages[] explicitly - */ -struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); -struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, - unsigned npages); - -/* - * Increment reference count on request - */ -void __fuse_get_request(struct fuse_req *req); - -/** - * Gets a requests for a file operation, always succeeds - */ -struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, - struct file *file); - -/** - * Decrement reference count of a request. If count goes to zero free - * the request. 
- */ -void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); - -/** - * Send a request (synchronous) - */ -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); - /** * Simple request sending that does request allocation and freeing */ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); +int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, + gfp_t gfp_flags); /** - * Send a request in the background + * End a finished request */ -void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); -bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); @@ -980,15 +933,33 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** * Initialize fuse_conn */ -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns); +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, + const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv); /** * Release reference to fuse_conn */ void fuse_conn_put(struct fuse_conn *fc); -struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc(void); +void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); +void fuse_send_init(struct fuse_conn *fc); + +/** + * Fill in superblock and initialize fuse connection + * @sb: partially-initialized superblock to fill in + * @ctx: mount context + */ +int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); + +/** + * Disassociate fuse connection from superblock and kill the superblock + * + * Calls kill_anon_super(), do not use with bdev mounts. 
+ */ +void fuse_kill_sb_anon(struct super_block *sb); /** * Add connection to control filesystem @@ -1093,4 +1064,15 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); /* readdir.c */ int fuse_readdir(struct file *file, struct dir_context *ctx); +/** + * Return the number of bytes in an arguments list + */ +unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); + +/** + * Get the next unique ID for a request + */ +u64 fuse_get_unique(struct fuse_iqueue *fiq); +void fuse_free_conn(struct fuse_conn *fc); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4bb885b0f032..51cb471f4dc3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -59,24 +60,13 @@ MODULE_PARM_DESC(max_user_congthresh, /** Congestion starts at 75% of maximum */ #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) -struct fuse_mount_data { - int fd; - unsigned rootmode; - kuid_t user_id; - kgid_t group_id; - unsigned fd_present:1; - unsigned rootmode_present:1; - unsigned user_id_present:1; - unsigned group_id_present:1; - unsigned default_permissions:1; - unsigned allow_other:1; - unsigned max_read; - unsigned blksize; -}; +#ifdef CONFIG_BLOCK +static struct file_system_type fuseblk_fs_type; +#endif struct fuse_forget_link *fuse_alloc_forget(void) { - return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT); } static struct inode *fuse_alloc_inode(struct super_block *sb) @@ -374,19 +364,21 @@ void fuse_unlock_inode(struct inode *inode, bool locked) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb)); + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (!fc->no_force_umount) + fuse_abort_conn(fc); } static void fuse_send_destroy(struct fuse_conn *fc) { - struct fuse_req *req = fc->destroy_req; - if (req && fc->conn_init) { - fc->destroy_req = NULL; - req->in.h.opcode = FUSE_DESTROY; - __set_bit(FR_FORCE, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); - fuse_request_send(fc, req); - fuse_put_request(fc, req); + if (fc->conn_init) { + FUSE_ARGS(args); + + args.opcode = FUSE_DESTROY; + args.force = true; + args.nocreds = true; + fuse_simple_request(fc, &args); } } @@ -430,12 +422,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) } memset(&outarg, 0, sizeof(outarg)); - args.in.numargs = 0; - args.in.h.opcode = FUSE_STATFS; - args.in.h.nodeid = get_node_id(d_inode(dentry)); - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.in_numargs = 0; + args.opcode = FUSE_STATFS; + args.nodeid = get_node_id(d_inode(dentry)); + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); @@ -443,6 +435,8 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) } enum { + OPT_SOURCE, + OPT_SUBTYPE, OPT_FD, OPT_ROOTMODE, OPT_USER_ID, @@ -454,111 +448,109 @@ enum { OPT_ERR }; -static const match_table_t tokens = { - {OPT_FD, "fd=%u"}, - {OPT_ROOTMODE, "rootmode=%o"}, - {OPT_USER_ID, "user_id=%u"}, - {OPT_GROUP_ID, "group_id=%u"}, - {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, - {OPT_ALLOW_OTHER, "allow_other"}, - {OPT_MAX_READ, "max_read=%u"}, - {OPT_BLKSIZE, "blksize=%u"}, - {OPT_ERR, NULL} 
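The two hunks around this point swap fuse's hand-rolled match_table_t parsing (removed above) for a fs_parameter_spec table driven by fs_parse() (added below). For readers new to the mount API, a minimal sketch of the same pattern follows; "examplefs", its options, and struct examplefs_ctx are invented for illustration and are not part of this patch:

#include <linux/fs_context.h>
#include <linux/fs_parser.h>

enum { Opt_count, Opt_verbose };

static const struct fs_parameter_spec examplefs_param_specs[] = {
	fsparam_u32	("count",   Opt_count),
	fsparam_flag	("verbose", Opt_verbose),
	{}
};

static const struct fs_parameter_description examplefs_fs_parameters = {
	.name	= "examplefs",
	.specs	= examplefs_param_specs,
};

struct examplefs_ctx {
	unsigned int count;
	bool verbose;
};

static int examplefs_parse_param(struct fs_context *fc,
				 struct fs_parameter *param)
{
	struct examplefs_ctx *ctx = fc->fs_private;
	struct fs_parse_result result;
	int opt;

	/* fs_parse() tokenizes and type-checks the option, which the
	 * old match_table_t code had to do by hand in every case */
	opt = fs_parse(fc, &examplefs_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_count:
		if (result.uint_32 == 0)
			return invalf(fc, "examplefs: count must be nonzero");
		ctx->count = result.uint_32;
		break;
	case Opt_verbose:
		ctx->verbose = true;
		break;
	}
	return 0;
}

The practical gain, visible in the fuse conversion below, is that type validation moves into fs_parse(), so each case only sees an already-parsed fs_parse_result and can return a descriptive invalf() error instead of a bare failure.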
+static const struct fs_parameter_spec fuse_param_specs[] = { + fsparam_string ("source", OPT_SOURCE), + fsparam_u32 ("fd", OPT_FD), + fsparam_u32oct ("rootmode", OPT_ROOTMODE), + fsparam_u32 ("user_id", OPT_USER_ID), + fsparam_u32 ("group_id", OPT_GROUP_ID), + fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS), + fsparam_flag ("allow_other", OPT_ALLOW_OTHER), + fsparam_u32 ("max_read", OPT_MAX_READ), + fsparam_u32 ("blksize", OPT_BLKSIZE), + fsparam_string ("subtype", OPT_SUBTYPE), + {} }; -static int fuse_match_uint(substring_t *s, unsigned int *res) +static const struct fs_parameter_description fuse_fs_parameters = { + .name = "fuse", + .specs = fuse_param_specs, +}; + +static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) { - int err = -ENOMEM; - char *buf = match_strdup(s); - if (buf) { - err = kstrtouint(buf, 10, res); - kfree(buf); - } - return err; -} + struct fs_parse_result result; + struct fuse_fs_context *ctx = fc->fs_private; + int opt; -static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, - struct user_namespace *user_ns) -{ - char *p; - memset(d, 0, sizeof(struct fuse_mount_data)); - d->max_read = ~0; - d->blksize = FUSE_DEFAULT_BLKSIZE; + opt = fs_parse(fc, &fuse_fs_parameters, param, &result); + if (opt < 0) + return opt; - while ((p = strsep(&opt, ",")) != NULL) { - int token; - int value; - unsigned uv; - substring_t args[MAX_OPT_ARGS]; - if (!*p) - continue; + switch (opt) { + case OPT_SOURCE: + if (fc->source) + return invalf(fc, "fuse: Multiple sources specified"); + fc->source = param->string; + param->string = NULL; + break; - token = match_token(p, tokens, args); - switch (token) { - case OPT_FD: - if (match_int(&args[0], &value)) - return 0; - d->fd = value; - d->fd_present = 1; - break; - - case OPT_ROOTMODE: - if (match_octal(&args[0], &value)) - return 0; - if (!fuse_valid_type(value)) - return 0; - d->rootmode = value; - d->rootmode_present = 1; - break; - - case OPT_USER_ID: - if (fuse_match_uint(&args[0], &uv)) - return 0; - d->user_id = make_kuid(user_ns, uv); - if (!uid_valid(d->user_id)) - return 0; - d->user_id_present = 1; - break; - - case OPT_GROUP_ID: - if (fuse_match_uint(&args[0], &uv)) - return 0; - d->group_id = make_kgid(user_ns, uv); - if (!gid_valid(d->group_id)) - return 0; - d->group_id_present = 1; - break; - - case OPT_DEFAULT_PERMISSIONS: - d->default_permissions = 1; - break; - - case OPT_ALLOW_OTHER: - d->allow_other = 1; - break; - - case OPT_MAX_READ: - if (match_int(&args[0], &value)) - return 0; - d->max_read = value; - break; - - case OPT_BLKSIZE: - if (!is_bdev || match_int(&args[0], &value)) - return 0; - d->blksize = value; - break; - - default: - return 0; - } - } - - if (!d->fd_present || !d->rootmode_present || - !d->user_id_present || !d->group_id_present) + case OPT_SUBTYPE: + if (ctx->subtype) + return invalf(fc, "fuse: Multiple subtypes specified"); + ctx->subtype = param->string; + param->string = NULL; return 0; - return 1; + case OPT_FD: + ctx->fd = result.uint_32; + ctx->fd_present = 1; + break; + + case OPT_ROOTMODE: + if (!fuse_valid_type(result.uint_32)) + return invalf(fc, "fuse: Invalid rootmode"); + ctx->rootmode = result.uint_32; + ctx->rootmode_present = 1; + break; + + case OPT_USER_ID: + ctx->user_id = make_kuid(fc->user_ns, result.uint_32); + if (!uid_valid(ctx->user_id)) + return invalf(fc, "fuse: Invalid user_id"); + ctx->user_id_present = 1; + break; + + case OPT_GROUP_ID: + ctx->group_id = make_kgid(fc->user_ns, result.uint_32); + if 
(!gid_valid(ctx->group_id)) + return invalf(fc, "fuse: Invalid group_id"); + ctx->group_id_present = 1; + break; + + case OPT_DEFAULT_PERMISSIONS: + ctx->default_permissions = 1; + break; + + case OPT_ALLOW_OTHER: + ctx->allow_other = 1; + break; + + case OPT_MAX_READ: + ctx->max_read = result.uint_32; + break; + + case OPT_BLKSIZE: + if (!ctx->is_bdev) + return invalf(fc, "fuse: blksize only supported for fuseblk"); + ctx->blksize = result.uint_32; + break; + + default: + return -EINVAL; + } + + return 0; +} + +static void fuse_free_fc(struct fs_context *fc) +{ + struct fuse_fs_context *ctx = fc->fs_private; + + if (ctx) { + kfree(ctx->subtype); + kfree(ctx); + } } static int fuse_show_options(struct seq_file *m, struct dentry *root) @@ -579,14 +571,19 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void fuse_iqueue_init(struct fuse_iqueue *fiq) +static void fuse_iqueue_init(struct fuse_iqueue *fiq, + const struct fuse_iqueue_ops *ops, + void *priv) { memset(fiq, 0, sizeof(struct fuse_iqueue)); + spin_lock_init(&fiq->lock); init_waitqueue_head(&fiq->waitq); INIT_LIST_HEAD(&fiq->pending); INIT_LIST_HEAD(&fiq->interrupts); fiq->forget_list_tail = &fiq->forget_list_head; fiq->connected = 1; + fiq->ops = ops; + fiq->priv = priv; } static void fuse_pqueue_init(struct fuse_pqueue *fpq) @@ -600,7 +597,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) fpq->connected = 1; } -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, + const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); @@ -609,8 +607,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); - init_waitqueue_head(&fc->reserved_req_waitq); - fuse_iqueue_init(&fc->iq); + fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); INIT_LIST_HEAD(&fc->devices); @@ -633,8 +630,6 @@ EXPORT_SYMBOL_GPL(fuse_conn_init); void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { - if (fc->destroy_req) - fuse_request_free(fc->destroy_req); put_pid_ns(fc->pid_ns); put_user_ns(fc->user_ns); fc->release(fc); @@ -822,9 +817,12 @@ static const struct super_operations fuse_super_operations = { static void sanitize_global_limit(unsigned *limit) { + /* + * The default maximum number of async requests is calculated to consume + * 1/2^13 of the total memory, assuming 392 bytes per request. 
+ */ if (*limit == 0) - *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / - sizeof(struct fuse_req); + *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392; if (*limit >= 1 << 16) *limit = (1 << 16) - 1; @@ -870,11 +868,19 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) spin_unlock(&fc->bg_lock); } -static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) -{ - struct fuse_init_out *arg = &req->misc.init_out; +struct fuse_init_args { + struct fuse_args args; + struct fuse_init_in in; + struct fuse_init_out out; +}; - if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) +static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, + int error) +{ + struct fuse_init_args *ia = container_of(args, typeof(*ia), args); + struct fuse_init_out *arg = &ia->out; + + if (error || arg->major != FUSE_KERNEL_VERSION) fc->conn_error = 1; else { unsigned long ra_pages; @@ -951,18 +957,23 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } + kfree(ia); + fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); } -static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) +void fuse_send_init(struct fuse_conn *fc) { - struct fuse_init_in *arg = &req->misc.init_in; + struct fuse_init_args *ia; - arg->major = FUSE_KERNEL_VERSION; - arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; - arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL); + + ia->in.major = FUSE_KERNEL_VERSION; + ia->in.minor = FUSE_KERNEL_MINOR_VERSION; + ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; + ia->in.flags |= + FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | @@ -971,26 +982,32 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA; - req->in.h.opcode = FUSE_INIT; - req->in.numargs = 1; - req->in.args[0].size = sizeof(*arg); - req->in.args[0].value = arg; - req->out.numargs = 1; + ia->args.opcode = FUSE_INIT; + ia->args.in_numargs = 1; + ia->args.in_args[0].size = sizeof(ia->in); + ia->args.in_args[0].value = &ia->in; + ia->args.out_numargs = 1; /* Variable length argument used for backward compatibility with interface version < 7.5. 
Rest of init_out is zeroed by do_get_request(), so a short reply is not a problem */ - req->out.argvar = 1; - req->out.args[0].size = sizeof(struct fuse_init_out); - req->out.args[0].value = &req->misc.init_out; - req->end = process_init_reply; - fuse_request_send_background(fc, req); -} + ia->args.out_argvar = 1; + ia->args.out_args[0].size = sizeof(ia->out); + ia->args.out_args[0].value = &ia->out; + ia->args.force = true; + ia->args.nocreds = true; + ia->args.end = process_init_reply; -static void fuse_free_conn(struct fuse_conn *fc) + if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0) + process_init_reply(fc, &ia->args, -ENOTCONN); +} +EXPORT_SYMBOL_GPL(fuse_send_init); + +void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); kfree_rcu(fc, rcu); } +EXPORT_SYMBOL_GPL(fuse_free_conn); static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { @@ -1032,7 +1049,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) return 0; } -struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) +struct fuse_dev *fuse_dev_alloc(void) { struct fuse_dev *fud; struct list_head *pq; @@ -1048,17 +1065,34 @@ struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) } fud->pq.processing = pq; - fud->fc = fuse_conn_get(fc); fuse_pqueue_init(&fud->pq); - spin_lock(&fc->lock); - list_add_tail(&fud->entry, &fc->devices); - spin_unlock(&fc->lock); - return fud; } EXPORT_SYMBOL_GPL(fuse_dev_alloc); +void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc) +{ + fud->fc = fuse_conn_get(fc); + spin_lock(&fc->lock); + list_add_tail(&fud->entry, &fc->devices); + spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_dev_install); + +struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + + fud = fuse_dev_alloc(); + if (!fud) + return NULL; + + fuse_dev_install(fud, fc); + return fud; +} +EXPORT_SYMBOL_GPL(fuse_dev_alloc_install); + void fuse_dev_free(struct fuse_dev *fud) { struct fuse_conn *fc = fud->fc; @@ -1075,17 +1109,13 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); -static int fuse_fill_super(struct super_block *sb, void *data, int silent) +int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { struct fuse_dev *fud; - struct fuse_conn *fc; + struct fuse_conn *fc = get_fuse_conn_super(sb); struct inode *root; - struct fuse_mount_data d; - struct file *file; struct dentry *root_dentry; - struct fuse_req *init_req; int err; - int is_bdev = sb->s_bdev != NULL; err = -EINVAL; if (sb->s_flags & SB_MANDLOCK) @@ -1093,19 +1123,19 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) - goto err; - - if (is_bdev) { + if (ctx->is_bdev) { #ifdef CONFIG_BLOCK err = -EINVAL; - if (!sb_set_blocksize(sb, d.blksize)) + if (!sb_set_blocksize(sb, ctx->blksize)) goto err; #endif } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; } + + sb->s_subtype = ctx->subtype; + ctx->subtype = NULL; sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; sb->s_xattr = fuse_xattr_handlers; @@ -1116,19 +1146,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; - file = fget(d.fd); - err = -EINVAL; - if (!file) - goto err; - - /* - * Require mount to happen from the same user namespace which - * opened /dev/fuse to prevent potential 
attacks. - */ - if (file->f_op != &fuse_dev_operations || - file->f_cred->user_ns != sb->s_user_ns) - goto err_fput; - /* * If we are not in the initial user namespace posix * acls must be translated. @@ -1136,17 +1153,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_user_ns != &init_user_ns) sb->s_xattr = fuse_no_acl_xattr_handlers; - fc = kmalloc(sizeof(*fc), GFP_KERNEL); - err = -ENOMEM; - if (!fc) - goto err_fput; - - fuse_conn_init(fc, sb->s_user_ns); - fc->release = fuse_free_conn; - - fud = fuse_dev_alloc(fc); + fud = fuse_dev_alloc_install(fc); if (!fud) - goto err_put_conn; + goto err; fc->dev = sb->s_dev; fc->sb = sb; @@ -1159,17 +1168,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->dont_mask = 1; sb->s_flags |= SB_POSIXACL; - fc->default_permissions = d.default_permissions; - fc->allow_other = d.allow_other; - fc->user_id = d.user_id; - fc->group_id = d.group_id; - fc->max_read = max_t(unsigned, 4096, d.max_read); - - /* Used by get_root_inode() */ - sb->s_fs_info = fc; + fc->default_permissions = ctx->default_permissions; + fc->allow_other = ctx->allow_other; + fc->user_id = ctx->user_id; + fc->group_id = ctx->group_id; + fc->max_read = max_t(unsigned, 4096, ctx->max_read); + fc->destroy = ctx->destroy; + fc->no_control = ctx->no_control; + fc->no_force_umount = ctx->no_force_umount; err = -ENOMEM; - root = fuse_get_root_inode(sb, d.rootmode); + root = fuse_get_root_inode(sb, ctx->rootmode); sb->s_d_op = &fuse_root_dentry_operations; root_dentry = d_make_root(root); if (!root_dentry) @@ -1177,20 +1186,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) /* Root dentry doesn't have .d_revalidate */ sb->s_d_op = &fuse_dentry_operations; - init_req = fuse_request_alloc(0); - if (!init_req) - goto err_put_root; - __set_bit(FR_BACKGROUND, &init_req->flags); - - if (is_bdev) { - fc->destroy_req = fuse_request_alloc(0); - if (!fc->destroy_req) - goto err_free_init_req; - } - mutex_lock(&fuse_mutex); err = -EINVAL; - if (file->private_data) + if (*ctx->fudptr) goto err_unlock; err = fuse_ctl_add_conn(fc); @@ -1199,27 +1197,62 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - file->private_data = fud; + *ctx->fudptr = fud; mutex_unlock(&fuse_mutex); + return 0; + + err_unlock: + mutex_unlock(&fuse_mutex); + dput(root_dentry); + err_dev_free: + fuse_dev_free(fud); + err: + return err; +} +EXPORT_SYMBOL_GPL(fuse_fill_super_common); + +static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + struct file *file; + int err; + struct fuse_conn *fc; + + err = -EINVAL; + file = fget(ctx->fd); + if (!file) + goto err; + + /* + * Require mount to happen from the same user namespace which + * opened /dev/fuse to prevent potential attacks. 
+ */ + if ((file->f_op != &fuse_dev_operations) || + (file->f_cred->user_ns != sb->s_user_ns)) + goto err_fput; + ctx->fudptr = &file->private_data; + + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + err = -ENOMEM; + if (!fc) + goto err_fput; + + fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); + fc->release = fuse_free_conn; + sb->s_fs_info = fc; + + err = fuse_fill_super_common(sb, ctx); + if (err) + goto err_put_conn; /* * atomic_dec_and_test() in fput() provides the necessary * memory barrier for file->private_data to be visible on all * CPUs after this */ fput(file); - - fuse_send_init(fc, init_req); - + fuse_send_init(get_fuse_conn_super(sb)); return 0; - err_unlock: - mutex_unlock(&fuse_mutex); - err_free_init_req: - fuse_request_free(init_req); - err_put_root: - dput(root_dentry); - err_dev_free: - fuse_dev_free(fud); err_put_conn: fuse_conn_put(fc); sb->s_fs_info = NULL; @@ -1229,11 +1262,52 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) return err; } -static struct dentry *fuse_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *raw_data) +static int fuse_get_tree(struct fs_context *fc) { - return mount_nodev(fs_type, flags, raw_data, fuse_fill_super); + struct fuse_fs_context *ctx = fc->fs_private; + + if (!ctx->fd_present || !ctx->rootmode_present || + !ctx->user_id_present || !ctx->group_id_present) + return -EINVAL; + +#ifdef CONFIG_BLOCK + if (ctx->is_bdev) + return get_tree_bdev(fc, fuse_fill_super); +#endif + + return get_tree_nodev(fc, fuse_fill_super); +} + +static const struct fs_context_operations fuse_context_ops = { + .free = fuse_free_fc, + .parse_param = fuse_parse_param, + .get_tree = fuse_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int fuse_init_fs_context(struct fs_context *fc) +{ + struct fuse_fs_context *ctx; + + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max_read = ~0; + ctx->blksize = FUSE_DEFAULT_BLKSIZE; + +#ifdef CONFIG_BLOCK + if (fc->fs_type == &fuseblk_fs_type) { + ctx->is_bdev = true; + ctx->destroy = true; + } +#endif + + fc->fs_private = ctx; + fc->ops = &fuse_context_ops; + return 0; } static void fuse_sb_destroy(struct super_block *sb) @@ -1241,7 +1315,8 @@ static void fuse_sb_destroy(struct super_block *sb) struct fuse_conn *fc = get_fuse_conn_super(sb); if (fc) { - fuse_send_destroy(fc); + if (fc->destroy) + fuse_send_destroy(fc); fuse_abort_conn(fc); fuse_wait_aborted(fc); @@ -1252,29 +1327,24 @@ static void fuse_sb_destroy(struct super_block *sb) } } -static void fuse_kill_sb_anon(struct super_block *sb) +void fuse_kill_sb_anon(struct super_block *sb) { fuse_sb_destroy(sb); kill_anon_super(sb); } +EXPORT_SYMBOL_GPL(fuse_kill_sb_anon); static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, - .mount = fuse_mount, + .init_fs_context = fuse_init_fs_context, + .parameters = &fuse_fs_parameters, .kill_sb = fuse_kill_sb_anon, }; MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK -static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *raw_data) -{ - return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super); -} - static void fuse_kill_sb_blk(struct super_block *sb) { fuse_sb_destroy(sb); @@ -1284,7 +1354,8 @@ static void fuse_kill_sb_blk(struct super_block *sb) static struct file_system_type fuseblk_fs_type = { .owner = THIS_MODULE, .name = "fuseblk", - .mount = 
fuse_mount_blk, + .init_fs_context = fuse_init_fs_context, + .parameters = &fuse_fs_parameters, .kill_sb = fuse_kill_sb_blk, .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, }; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 574d03f8a573..5c38b9d84c6e 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -249,6 +249,27 @@ static int fuse_direntplus_link(struct file *file, return 0; } +static void fuse_force_forget(struct file *file, u64 nodeid) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_forget_in inarg; + FUSE_ARGS(args); + + memset(&inarg, 0, sizeof(inarg)); + inarg.nlookup = 1; + args.opcode = FUSE_FORGET; + args.nodeid = nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + args.noreply = true; + + fuse_simple_request(fc, &args); + /* ignore errors */ +} + static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, struct dir_context *ctx, u64 attr_version) { @@ -295,62 +316,55 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { - int plus, err; - size_t nbytes; + int plus; + ssize_t res; struct page *page; struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + struct fuse_io_args ia = {}; + struct fuse_args_pages *ap = &ia.ap; + struct fuse_page_desc desc = { .length = PAGE_SIZE }; u64 attr_version = 0; bool locked; - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); - page = alloc_page(GFP_KERNEL); - if (!page) { - fuse_put_request(fc, req); + if (!page) return -ENOMEM; - } plus = fuse_use_readdirplus(inode, ctx); - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = PAGE_SIZE; + ap->args.out_pages = 1; + ap->num_pages = 1; + ap->pages = &page; + ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fc); - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); + fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, + FUSE_READDIRPLUS); } else { - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); + fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, + FUSE_READDIR); } locked = fuse_lock_inode(inode); - fuse_request_send(fc, req); + res = fuse_simple_request(fc, &ap->args); fuse_unlock_inode(inode, locked); - nbytes = req->out.args[0].size; - err = req->out.h.error; - fuse_put_request(fc, req); - if (!err) { - if (!nbytes) { + if (res >= 0) { + if (!res) { struct fuse_file *ff = file->private_data; if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - err = parse_dirplusfile(page_address(page), nbytes, + res = parse_dirplusfile(page_address(page), res, file, ctx, attr_version); } else { - err = parse_dirfile(page_address(page), nbytes, file, + res = parse_dirfile(page_address(page), res, file, ctx); } } __free_page(page); fuse_invalidate_atime(inode); - return err; + return res; } enum fuse_parse_result { @@ -372,11 +386,13 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, for (;;) { struct fuse_dirent *dirent = addr + offset; unsigned int nbytes = size - offset; - size_t reclen = FUSE_DIRENT_SIZE(dirent); + size_t reclen; if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen) break; + reclen = FUSE_DIRENT_SIZE(dirent); /* derefs ->namelen */ + if (WARN_ON(dirent->namelen > FUSE_NAME_MAX)) return FOUND_ERR; 
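fuse_force_forget() and the readdir conversion above both follow the fuse_args calling convention this series introduces in place of open-coded fuse_req handling. In isolation the pattern looks like the sketch below; example_statfs() is invented for illustration (compare the real fuse_statfs() conversion in fs/fuse/inode.c earlier in this patch), and it assumes the fuse_i.h definitions added above:

static int example_statfs(struct fuse_conn *fc, u64 nodeid,
			  struct fuse_statfs_out *outarg)
{
	FUSE_ARGS(args);	/* zero-initialized struct fuse_args */

	args.opcode = FUSE_STATFS;
	args.nodeid = nodeid;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(*outarg);
	args.out_args[0].value = outarg;

	/*
	 * Returns 0 or -errno for a fixed-size reply. When the last
	 * output argument is variable length (out_argvar, as in
	 * fuse_getxattr() above), a non-negative return is its actual
	 * size instead - which is how fuse_readdir_uncached() consumes
	 * res directly as the number of bytes read.
	 */
	return fuse_simple_request(fc, &args);
}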
if (WARN_ON(reclen > nbytes)) diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 433717640f78..20d052e08b3b 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -25,15 +25,15 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, memset(&inarg, 0, sizeof(inarg)); inarg.size = size; inarg.flags = flags; - args.in.h.opcode = FUSE_SETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 3; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; - args.in.args[2].size = size; - args.in.args[2].value = value; + args.opcode = FUSE_SETXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 3; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; + args.in_args[2].size = size; + args.in_args[2].value = value; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_setxattr = 1; @@ -60,22 +60,22 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, memset(&inarg, 0, sizeof(inarg)); inarg.size = size; - args.in.h.opcode = FUSE_GETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; + args.opcode = FUSE_GETXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; /* This is really two different operations rolled into one */ - args.out.numargs = 1; + args.out_numargs = 1; if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = value; + args.out_argvar = true; + args.out_args[0].size = size; + args.out_args[0].value = value; } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; } ret = fuse_simple_request(fc, &args); if (!ret && !size) @@ -121,20 +121,20 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) memset(&inarg, 0, sizeof(inarg)); inarg.size = size; - args.in.h.opcode = FUSE_LISTXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = FUSE_LISTXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; /* This is really two different operations rolled into one */ - args.out.numargs = 1; + args.out_numargs = 1; if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = list; + args.out_argvar = true; + args.out_args[0].size = size; + args.out_args[0].value = list; } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; } ret = fuse_simple_request(fc, &args); if (!ret && !size) @@ -157,11 +157,11 @@ int fuse_removexattr(struct inode *inode, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - args.in.h.opcode = FUSE_REMOVEXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = strlen(name) + 1; - args.in.args[0].value = name; + args.opcode = FUSE_REMOVEXATTR; + args.nodeid = 
get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = strlen(name) + 1; + args.in_args[0].value = name; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_removexattr = 1; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 6b450065b9d5..5f89c515f5bb 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -584,10 +584,10 @@ struct gfs2_args { unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ unsigned int ar_loccookie:1; /* use location based readdir cookies */ - int ar_commit; /* Commit interval */ - int ar_statfs_quantum; /* The fast statfs interval */ - int ar_quota_quantum; /* The quota interval */ - int ar_statfs_percent; /* The % change to force sync */ + s32 ar_commit; /* Commit interval */ + s32 ar_statfs_quantum; /* The fast statfs interval */ + s32 ar_quota_quantum; /* The quota interval */ + s32 ar_statfs_percent; /* The % change to force sync */ }; struct gfs2_tune { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index f3fd5cd9d43f..681b44682b0d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -1031,16 +1032,17 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp) } /** - * fill_super - Read in superblock + * gfs2_fill_super - Read in superblock * @sb: The VFS superblock - * @data: Mount options + * @args: Mount options * @silent: Don't complain if it's not a GFS2 filesystem * - * Returns: errno + * Returns: -errno */ - -static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent) +static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) { + struct gfs2_args *args = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; struct gfs2_sbd *sdp; struct gfs2_holder mount_gh; int error; @@ -1205,161 +1207,411 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent return error; } -static int set_gfs2_super(struct super_block *s, void *data) +/** + * gfs2_get_tree - Get the GFS2 superblock and root directory + * @fc: The filesystem context + * + * Returns: 0 or -errno on error + */ +static int gfs2_get_tree(struct fs_context *fc) { - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + struct gfs2_args *args = fc->fs_private; + struct gfs2_sbd *sdp; + int error; + + error = get_tree_bdev(fc, gfs2_fill_super); + if (error) + return error; + + sdp = fc->root->d_sb->s_fs_info; + dput(fc->root); + if (args->ar_meta) + fc->root = dget(sdp->sd_master_dir); + else + fc->root = dget(sdp->sd_root_dir); return 0; } -static int test_gfs2_super(struct super_block *s, void *ptr) +static void gfs2_fc_free(struct fs_context *fc) { - struct block_device *bdev = ptr; - return (bdev == s->s_bdev); + struct gfs2_args *args = fc->fs_private; + + kfree(args); } -/** - * gfs2_mount - Get the GFS2 superblock - * @fs_type: The GFS2 filesystem type - * @flags: Mount flags - * @dev_name: The name of the device - * @data: The mount arguments - * - * Q. Why not use get_sb_bdev() ? - * A. 
We need to select one of two root directories to mount, independent - * of whether this is the initial, or subsequent, mount of this sb - * - * Returns: 0 or -ve on error - */ +enum gfs2_param { + Opt_lockproto, + Opt_locktable, + Opt_hostdata, + Opt_spectator, + Opt_ignore_local_fs, + Opt_localflocks, + Opt_localcaching, + Opt_debug, + Opt_upgrade, + Opt_acl, + Opt_quota, + Opt_suiddir, + Opt_data, + Opt_meta, + Opt_discard, + Opt_commit, + Opt_errors, + Opt_statfs_quantum, + Opt_statfs_percent, + Opt_quota_quantum, + Opt_barrier, + Opt_rgrplvb, + Opt_loccookie, +}; -static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +enum opt_quota { + Opt_quota_unset = 0, + Opt_quota_off, + Opt_quota_account, + Opt_quota_on, +}; + +static const unsigned int opt_quota_values[] = { + [Opt_quota_off] = GFS2_QUOTA_OFF, + [Opt_quota_account] = GFS2_QUOTA_ACCOUNT, + [Opt_quota_on] = GFS2_QUOTA_ON, +}; + +enum opt_data { + Opt_data_writeback = GFS2_DATA_WRITEBACK, + Opt_data_ordered = GFS2_DATA_ORDERED, +}; + +enum opt_errors { + Opt_errors_withdraw = GFS2_ERRORS_WITHDRAW, + Opt_errors_panic = GFS2_ERRORS_PANIC, +}; + +static const struct fs_parameter_spec gfs2_param_specs[] = { + fsparam_string ("lockproto", Opt_lockproto), + fsparam_string ("locktable", Opt_locktable), + fsparam_string ("hostdata", Opt_hostdata), + fsparam_flag ("spectator", Opt_spectator), + fsparam_flag ("norecovery", Opt_spectator), + fsparam_flag ("ignore_local_fs", Opt_ignore_local_fs), + fsparam_flag ("localflocks", Opt_localflocks), + fsparam_flag ("localcaching", Opt_localcaching), + fsparam_flag_no("debug", Opt_debug), + fsparam_flag ("upgrade", Opt_upgrade), + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("suiddir", Opt_suiddir), + fsparam_enum ("data", Opt_data), + fsparam_flag ("meta", Opt_meta), + fsparam_flag_no("discard", Opt_discard), + fsparam_s32 ("commit", Opt_commit), + fsparam_enum ("errors", Opt_errors), + fsparam_s32 ("statfs_quantum", Opt_statfs_quantum), + fsparam_s32 ("statfs_percent", Opt_statfs_percent), + fsparam_s32 ("quota_quantum", Opt_quota_quantum), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag_no("rgrplvb", Opt_rgrplvb), + fsparam_flag_no("loccookie", Opt_loccookie), + /* quota can be a flag or an enum so it gets special treatment */ + __fsparam(fs_param_is_enum, "quota", Opt_quota, fs_param_neg_with_no|fs_param_v_optional), + {} +}; + +static const struct fs_parameter_enum gfs2_param_enums[] = { + { Opt_quota, "off", Opt_quota_off }, + { Opt_quota, "account", Opt_quota_account }, + { Opt_quota, "on", Opt_quota_on }, + { Opt_data, "writeback", Opt_data_writeback }, + { Opt_data, "ordered", Opt_data_ordered }, + { Opt_errors, "withdraw", Opt_errors_withdraw }, + { Opt_errors, "panic", Opt_errors_panic }, + {} +}; + +const struct fs_parameter_description gfs2_fs_parameters = { + .name = "gfs2", + .specs = gfs2_param_specs, + .enums = gfs2_param_enums, +}; + +/* Parse a single mount parameter */ +static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct block_device *bdev; - struct super_block *s; - fmode_t mode = FMODE_READ | FMODE_EXCL; - int error; - struct gfs2_args args; - struct gfs2_sbd *sdp; + struct gfs2_args *args = fc->fs_private; + struct fs_parse_result result; + int o; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; + o = fs_parse(fc, &gfs2_fs_parameters, param, &result); + if (o < 0) + return o; - bdev = blkdev_get_by_path(dev_name, mode, fs_type); - if (IS_ERR(bdev)) - return 
ERR_CAST(bdev); - - /* - * once the super is inserted into the list by sget, s_umount - * will protect the lockfs code from trying to start a snapshot - * while we are mounting - */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - error = -EBUSY; - goto error_bdev; + switch (o) { + case Opt_lockproto: + strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_locktable: + strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_hostdata: + strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_spectator: + args->ar_spectator = 1; + break; + case Opt_ignore_local_fs: + /* Retained for backwards compat only */ + break; + case Opt_localflocks: + args->ar_localflocks = 1; + break; + case Opt_localcaching: + /* Retained for backwards compat only */ + break; + case Opt_debug: + if (result.boolean && args->ar_errors == GFS2_ERRORS_PANIC) + return invalf(fc, "gfs2: -o debug and -o errors=panic are mutually exclusive"); + args->ar_debug = result.boolean; + break; + case Opt_upgrade: + /* Retained for backwards compat only */ + break; + case Opt_acl: + args->ar_posix_acl = result.boolean; + break; + case Opt_quota: + /* The quota option can be a flag or an enum. A non-zero int_32 + result means that we have an enum index. Otherwise we have + to rely on the 'negated' flag to tell us whether 'quota' or + 'noquota' was specified. */ + if (result.negated) + args->ar_quota = GFS2_QUOTA_OFF; + else if (result.int_32 > 0) + args->ar_quota = opt_quota_values[result.int_32]; + else + args->ar_quota = GFS2_QUOTA_ON; + break; + case Opt_suiddir: + args->ar_suiddir = result.boolean; + break; + case Opt_data: + /* The uint_32 result maps directly to GFS2_DATA_* */ + args->ar_data = result.uint_32; + break; + case Opt_meta: + args->ar_meta = 1; + break; + case Opt_discard: + args->ar_discard = result.boolean; + break; + case Opt_commit: + if (result.int_32 <= 0) + return invalf(fc, "gfs2: commit mount option requires a positive numeric argument"); + args->ar_commit = result.int_32; + break; + case Opt_statfs_quantum: + if (result.int_32 < 0) + return invalf(fc, "gfs2: statfs_quantum mount option requires a non-negative numeric argument"); + args->ar_statfs_quantum = result.int_32; + break; + case Opt_quota_quantum: + if (result.int_32 <= 0) + return invalf(fc, "gfs2: quota_quantum mount option requires a positive numeric argument"); + args->ar_quota_quantum = result.int_32; + break; + case Opt_statfs_percent: + if (result.int_32 < 0 || result.int_32 > 100) + return invalf(fc, "gfs2: statfs_percent mount option requires a numeric argument between 0 and 100"); + args->ar_statfs_percent = result.int_32; + break; + case Opt_errors: + if (args->ar_debug && result.uint_32 == GFS2_ERRORS_PANIC) + return invalf(fc, "gfs2: -o debug and -o errors=panic are mutually exclusive"); + args->ar_errors = result.uint_32; + break; + case Opt_barrier: + args->ar_nobarrier = result.boolean; + break; + case Opt_rgrplvb: + args->ar_rgrplvb = result.boolean; + break; + case Opt_loccookie: + args->ar_loccookie = result.boolean; + break; + default: + return invalf(fc, "gfs2: invalid mount option: %s", param->key); } - s = sget(fs_type, test_gfs2_super, set_gfs2_super, flags, bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - error = PTR_ERR(s); - if (IS_ERR(s)) - goto error_bdev; + return 0; +} - if (s->s_root) { - /* - * s_umount nests inside bd_mutex during - * __invalidate_device(). 
blkdev_put() acquires - * bd_mutex and can't be called under s_umount. Drop - * s_umount temporarily. This is safe as we're - * holding an active reference. - */ - up_write(&s->s_umount); - blkdev_put(bdev, mode); - down_write(&s->s_umount); - } else { - /* s_mode must be set before deactivate_locked_super calls */ - s->s_mode = mode; - } +static int gfs2_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args *oldargs = &sdp->sd_args; + struct gfs2_args *newargs = fc->fs_private; + struct gfs2_tune *gt = &sdp->sd_tune; + int error = 0; - memset(&args, 0, sizeof(args)); - args.ar_quota = GFS2_QUOTA_DEFAULT; - args.ar_data = GFS2_DATA_DEFAULT; - args.ar_commit = 30; - args.ar_statfs_quantum = 30; - args.ar_quota_quantum = 60; - args.ar_errors = GFS2_ERRORS_DEFAULT; + sync_filesystem(sb); - error = gfs2_mount_args(&args, data); - if (error) { - pr_warn("can't parse mount arguments\n"); - goto error_super; - } - - if (s->s_root) { - error = -EBUSY; - if ((flags ^ s->s_flags) & SB_RDONLY) - goto error_super; - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, &args, flags & SB_SILENT ? 1 : 0); - if (error) - goto error_super; - s->s_flags |= SB_ACTIVE; - bdev->bd_super = s; - } - - sdp = s->s_fs_info; - if (args.ar_meta) - return dget(sdp->sd_master_dir); + spin_lock(&gt->gt_spin); + oldargs->ar_commit = gt->gt_logd_secs; + oldargs->ar_quota_quantum = gt->gt_quota_quantum; + if (gt->gt_statfs_slow) + oldargs->ar_statfs_quantum = 0; else - return dget(sdp->sd_root_dir); + oldargs->ar_statfs_quantum = gt->gt_statfs_quantum; + spin_unlock(&gt->gt_spin); -error_super: - deactivate_locked_super(s); - return ERR_PTR(error); -error_bdev: - blkdev_put(bdev, mode); - return ERR_PTR(error); + if (strcmp(newargs->ar_lockproto, oldargs->ar_lockproto)) { + errorf(fc, "gfs2: reconfiguration of locking protocol not allowed"); + return -EINVAL; + } + if (strcmp(newargs->ar_locktable, oldargs->ar_locktable)) { + errorf(fc, "gfs2: reconfiguration of lock table not allowed"); + return -EINVAL; + } + if (strcmp(newargs->ar_hostdata, oldargs->ar_hostdata)) { + errorf(fc, "gfs2: reconfiguration of host data not allowed"); + return -EINVAL; + } + if (newargs->ar_spectator != oldargs->ar_spectator) { + errorf(fc, "gfs2: reconfiguration of spectator mode not allowed"); + return -EINVAL; + } + if (newargs->ar_localflocks != oldargs->ar_localflocks) { + errorf(fc, "gfs2: reconfiguration of localflocks not allowed"); + return -EINVAL; + } + if (newargs->ar_meta != oldargs->ar_meta) { + errorf(fc, "gfs2: switching between gfs2 and gfs2meta not allowed"); + return -EINVAL; + } + if (oldargs->ar_spectator) + fc->sb_flags |= SB_RDONLY; + + if ((sb->s_flags ^ fc->sb_flags) & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { + error = gfs2_make_fs_ro(sdp); + if (error) + errorf(fc, "gfs2: unable to remount read-only"); + } else { + error = gfs2_make_fs_rw(sdp); + if (error) + errorf(fc, "gfs2: unable to remount read-write"); + } + } + sdp->sd_args = *newargs; + + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + else + clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); + spin_lock(&gt->gt_spin); + gt->gt_logd_secs = newargs->ar_commit; + gt->gt_quota_quantum = newargs->ar_quota_quantum; + if (newargs->ar_statfs_quantum) { + gt->gt_statfs_slow = 0; +
gt->gt_statfs_quantum = newargs->ar_statfs_quantum; + } + else { + gt->gt_statfs_slow = 1; + gt->gt_statfs_quantum = 30; + } + spin_unlock(&gt->gt_spin); + + gfs2_online_uevent(sdp); + return error; } -static int set_meta_super(struct super_block *s, void *ptr) +static const struct fs_context_operations gfs2_context_ops = { + .free = gfs2_fc_free, + .parse_param = gfs2_parse_param, + .get_tree = gfs2_get_tree, + .reconfigure = gfs2_reconfigure, +}; + +/* Set up the filesystem mount context */ +static int gfs2_init_fs_context(struct fs_context *fc) +{ + struct gfs2_args *args; + + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (args == NULL) + return -ENOMEM; + + args->ar_quota = GFS2_QUOTA_DEFAULT; + args->ar_data = GFS2_DATA_DEFAULT; + args->ar_commit = 30; + args->ar_statfs_quantum = 30; + args->ar_quota_quantum = 60; + args->ar_errors = GFS2_ERRORS_DEFAULT; + + fc->fs_private = args; + fc->ops = &gfs2_context_ops; + return 0; +} + +static int set_meta_super(struct super_block *s, struct fs_context *fc) { return -EINVAL; } -static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int test_meta_super(struct super_block *s, struct fs_context *fc) +{ + return (fc->sget_key == s->s_bdev); +} + +static int gfs2_meta_get_tree(struct fs_context *fc) { struct super_block *s; struct gfs2_sbd *sdp; struct path path; int error; - if (!dev_name || !*dev_name) - return ERR_PTR(-EINVAL); + if (!fc->source || !*fc->source) + return -EINVAL; - error = kern_path(dev_name, LOOKUP_FOLLOW, &path); + error = kern_path(fc->source, LOOKUP_FOLLOW, &path); if (error) { pr_warn("path_lookup on %s returned error %d\n", - dev_name, error); - return ERR_PTR(error); + fc->source, error); + return error; } - s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, - path.dentry->d_sb->s_bdev); + fc->fs_type = &gfs2_fs_type; + fc->sget_key = path.dentry->d_sb->s_bdev; + s = sget_fc(fc, test_meta_super, set_meta_super); path_put(&path); if (IS_ERR(s)) { pr_warn("gfs2 mount does not exist\n"); - return ERR_CAST(s); + return PTR_ERR(s); } - if ((flags ^ s->s_flags) & SB_RDONLY) { + if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); - return ERR_PTR(-EBUSY); + return -EBUSY; } sdp = s->s_fs_info; - return dget(sdp->sd_master_dir); + fc->root = dget(sdp->sd_master_dir); + return 0; +} + +static const struct fs_context_operations gfs2_meta_context_ops = { + .get_tree = gfs2_meta_get_tree, +}; + +static int gfs2_meta_init_fs_context(struct fs_context *fc) +{ + int ret = gfs2_init_fs_context(fc); + + if (ret) + return ret; + + fc->ops = &gfs2_meta_context_ops; + return 0; } static void gfs2_kill_sb(struct super_block *sb) @@ -1383,7 +1635,8 @@ static void gfs2_kill_sb(struct super_block *sb) struct file_system_type gfs2_fs_type = { .name = "gfs2", .fs_flags = FS_REQUIRES_DEV, - .mount = gfs2_mount, + .init_fs_context = gfs2_init_fs_context, + .parameters = &gfs2_fs_parameters, .kill_sb = gfs2_kill_sb, .owner = THIS_MODULE, }; @@ -1392,7 +1645,7 @@ MODULE_ALIAS_FS("gfs2"); struct file_system_type gfs2meta_fs_type = { .name = "gfs2meta", .fs_flags = FS_REQUIRES_DEV, - .mount = gfs2_mount_meta, + .init_fs_context = gfs2_meta_init_fs_context, .owner = THIS_MODULE, }; MODULE_ALIAS_FS("gfs2meta"); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 644c70ae09f7..5fa1eec4fb4f 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -44,258 +44,6 @@ #include "xattr.h" #include "lops.h" -#define args_neq(a1, a2, x) ((a1)->ar_##x !=
(a2)->ar_##x) - -enum { - Opt_lockproto, - Opt_locktable, - Opt_hostdata, - Opt_spectator, - Opt_ignore_local_fs, - Opt_localflocks, - Opt_localcaching, - Opt_debug, - Opt_nodebug, - Opt_upgrade, - Opt_acl, - Opt_noacl, - Opt_quota_off, - Opt_quota_account, - Opt_quota_on, - Opt_quota, - Opt_noquota, - Opt_suiddir, - Opt_nosuiddir, - Opt_data_writeback, - Opt_data_ordered, - Opt_meta, - Opt_discard, - Opt_nodiscard, - Opt_commit, - Opt_err_withdraw, - Opt_err_panic, - Opt_statfs_quantum, - Opt_statfs_percent, - Opt_quota_quantum, - Opt_barrier, - Opt_nobarrier, - Opt_rgrplvb, - Opt_norgrplvb, - Opt_loccookie, - Opt_noloccookie, - Opt_error, -}; - -static const match_table_t tokens = { - {Opt_lockproto, "lockproto=%s"}, - {Opt_locktable, "locktable=%s"}, - {Opt_hostdata, "hostdata=%s"}, - {Opt_spectator, "spectator"}, - {Opt_spectator, "norecovery"}, - {Opt_ignore_local_fs, "ignore_local_fs"}, - {Opt_localflocks, "localflocks"}, - {Opt_localcaching, "localcaching"}, - {Opt_debug, "debug"}, - {Opt_nodebug, "nodebug"}, - {Opt_upgrade, "upgrade"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_quota_off, "quota=off"}, - {Opt_quota_account, "quota=account"}, - {Opt_quota_on, "quota=on"}, - {Opt_quota, "quota"}, - {Opt_noquota, "noquota"}, - {Opt_suiddir, "suiddir"}, - {Opt_nosuiddir, "nosuiddir"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_meta, "meta"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_commit, "commit=%d"}, - {Opt_err_withdraw, "errors=withdraw"}, - {Opt_err_panic, "errors=panic"}, - {Opt_statfs_quantum, "statfs_quantum=%d"}, - {Opt_statfs_percent, "statfs_percent=%d"}, - {Opt_quota_quantum, "quota_quantum=%d"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_rgrplvb, "rgrplvb"}, - {Opt_norgrplvb, "norgrplvb"}, - {Opt_loccookie, "loccookie"}, - {Opt_noloccookie, "noloccookie"}, - {Opt_error, NULL} -}; - -/** - * gfs2_mount_args - Parse mount options - * @args: The structure into which the parsed options will be written - * @options: The options to parse - * - * Return: errno - */ - -int gfs2_mount_args(struct gfs2_args *args, char *options) -{ - char *o; - int token; - substring_t tmp[MAX_OPT_ARGS]; - int rv; - - /* Split the options into tokens with the "," character and - process them */ - - while (1) { - o = strsep(&options, ","); - if (o == NULL) - break; - if (*o == '\0') - continue; - - token = match_token(o, tokens, tmp); - switch (token) { - case Opt_lockproto: - match_strlcpy(args->ar_lockproto, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_locktable: - match_strlcpy(args->ar_locktable, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_hostdata: - match_strlcpy(args->ar_hostdata, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_spectator: - args->ar_spectator = 1; - break; - case Opt_ignore_local_fs: - /* Retained for backwards compat only */ - break; - case Opt_localflocks: - args->ar_localflocks = 1; - break; - case Opt_localcaching: - /* Retained for backwards compat only */ - break; - case Opt_debug: - if (args->ar_errors == GFS2_ERRORS_PANIC) { - pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); - return -EINVAL; - } - args->ar_debug = 1; - break; - case Opt_nodebug: - args->ar_debug = 0; - break; - case Opt_upgrade: - /* Retained for backwards compat only */ - break; - case Opt_acl: - args->ar_posix_acl = 1; - break; - case Opt_noacl: - args->ar_posix_acl = 0; - break; - case Opt_quota_off: - case Opt_noquota: - args->ar_quota = GFS2_QUOTA_OFF; - break; - 
case Opt_quota_account: - args->ar_quota = GFS2_QUOTA_ACCOUNT; - break; - case Opt_quota_on: - case Opt_quota: - args->ar_quota = GFS2_QUOTA_ON; - break; - case Opt_suiddir: - args->ar_suiddir = 1; - break; - case Opt_nosuiddir: - args->ar_suiddir = 0; - break; - case Opt_data_writeback: - args->ar_data = GFS2_DATA_WRITEBACK; - break; - case Opt_data_ordered: - args->ar_data = GFS2_DATA_ORDERED; - break; - case Opt_meta: - args->ar_meta = 1; - break; - case Opt_discard: - args->ar_discard = 1; - break; - case Opt_nodiscard: - args->ar_discard = 0; - break; - case Opt_commit: - rv = match_int(&tmp[0], &args->ar_commit); - if (rv || args->ar_commit <= 0) { - pr_warn("commit mount option requires a positive numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_statfs_quantum: - rv = match_int(&tmp[0], &args->ar_statfs_quantum); - if (rv || args->ar_statfs_quantum < 0) { - pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_quota_quantum: - rv = match_int(&tmp[0], &args->ar_quota_quantum); - if (rv || args->ar_quota_quantum <= 0) { - pr_warn("quota_quantum mount option requires a positive numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_statfs_percent: - rv = match_int(&tmp[0], &args->ar_statfs_percent); - if (rv || args->ar_statfs_percent < 0 || - args->ar_statfs_percent > 100) { - pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_err_withdraw: - args->ar_errors = GFS2_ERRORS_WITHDRAW; - break; - case Opt_err_panic: - if (args->ar_debug) { - pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); - return -EINVAL; - } - args->ar_errors = GFS2_ERRORS_PANIC; - break; - case Opt_barrier: - args->ar_nobarrier = 0; - break; - case Opt_nobarrier: - args->ar_nobarrier = 1; - break; - case Opt_rgrplvb: - args->ar_rgrplvb = 1; - break; - case Opt_norgrplvb: - args->ar_rgrplvb = 0; - break; - case Opt_loccookie: - args->ar_loccookie = 1; - break; - case Opt_noloccookie: - args->ar_loccookie = 0; - break; - case Opt_error: - default: - pr_warn("invalid mount option: %s\n", o); - return -EINVAL; - } - } - - return 0; -} - /** * gfs2_jindex_free - Clear all the journal index information * @sdp: The GFS2 superblock @@ -847,7 +595,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) * Returns: errno */ -static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +int gfs2_make_fs_ro(struct gfs2_sbd *sdp) { struct gfs2_holder freeze_gh; int error; @@ -1226,84 +974,6 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -/** - * gfs2_remount_fs - called when the FS is remounted - * @sb: the filesystem - * @flags: the remount flags - * @data: extra data passed in (not used right now) - * - * Returns: errno - */ - -static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_args args = sdp->sd_args; /* Default to current settings */ - struct gfs2_tune *gt = &sdp->sd_tune; - int error; - - sync_filesystem(sb); - - spin_lock(&gt->gt_spin); - args.ar_commit = gt->gt_logd_secs; - args.ar_quota_quantum = gt->gt_quota_quantum; - if (gt->gt_statfs_slow) - args.ar_statfs_quantum = 0; - else - args.ar_statfs_quantum = gt->gt_statfs_quantum; - spin_unlock(&gt->gt_spin); - error = gfs2_mount_args(&args, data); - if (error) - return error; - - /* Not allowed to change locking details */ - if
(strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || - strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || - strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) - return -EINVAL; - - /* Some flags must not be changed */ - if (args_neq(&args, &sdp->sd_args, spectator) || - args_neq(&args, &sdp->sd_args, localflocks) || - args_neq(&args, &sdp->sd_args, meta)) - return -EINVAL; - - if (sdp->sd_args.ar_spectator) - *flags |= SB_RDONLY; - - if ((sb->s_flags ^ *flags) & SB_RDONLY) { - if (*flags & SB_RDONLY) - error = gfs2_make_fs_ro(sdp); - else - error = gfs2_make_fs_rw(sdp); - } - - sdp->sd_args = args; - if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= SB_POSIXACL; - else - sb->s_flags &= ~SB_POSIXACL; - if (sdp->sd_args.ar_nobarrier) - set_bit(SDF_NOBARRIERS, &sdp->sd_flags); - else - clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); - spin_lock(>->gt_spin); - gt->gt_logd_secs = args.ar_commit; - gt->gt_quota_quantum = args.ar_quota_quantum; - if (args.ar_statfs_quantum) { - gt->gt_statfs_slow = 0; - gt->gt_statfs_quantum = args.ar_statfs_quantum; - } - else { - gt->gt_statfs_slow = 1; - gt->gt_statfs_quantum = 30; - } - spin_unlock(>->gt_spin); - - gfs2_online_uevent(sdp); - return error; -} - /** * gfs2_drop_inode - Drop an inode (test for remote unlink) * @inode: The inode to drop @@ -1748,7 +1418,6 @@ const struct super_operations gfs2_super_ops = { .freeze_super = gfs2_freeze, .thaw_super = gfs2_unfreeze, .statfs = gfs2_statfs, - .remount_fs = gfs2_remount_fs, .drop_inode = gfs2_drop_inode, .show_options = gfs2_show_options, }; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 9d49eaadb9d9..b8bf811a1305 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -24,8 +24,6 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) extern void gfs2_jindex_free(struct gfs2_sbd *sdp); -extern int gfs2_mount_args(struct gfs2_args *args, char *data); - extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); @@ -33,6 +31,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, struct gfs2_inode **ipp); extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +extern int gfs2_make_fs_ro(struct gfs2_sbd *sdp); extern void gfs2_online_uevent(struct gfs2_sbd *sdp); extern int gfs2_statfs_init(struct gfs2_sbd *sdp); extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, diff --git a/fs/inode.c b/fs/inode.c index f731409d3a8e..853ce7aa6301 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -181,6 +181,9 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->flags = 0; mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_set(&mapping->nr_thps, 0); +#endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->writeback_index = 0; diff --git a/fs/io_uring.c b/fs/io_uring.c index 0dadbdbead0f..dd094b387cab 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -200,6 +200,7 @@ struct io_ring_ctx { struct io_uring_sqe *sq_sqes; struct list_head defer_list; + struct list_head timeout_list; } ____cacheline_aligned_in_smp; /* IO offload */ @@ -216,6 +217,7 @@ struct io_ring_ctx { struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; struct eventfd_ctx *cq_ev_fd; + atomic_t cq_timeouts; } ____cacheline_aligned_in_smp; struct io_rings *rings; @@ -283,6 +285,11 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_timeout { + struct file *file; + struct 
hrtimer timer; +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -294,6 +301,7 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; + struct io_timeout timeout; }; struct sqe_submit submit; @@ -313,6 +321,7 @@ struct io_kiocb { #define REQ_F_LINK_DONE 128 /* linked sqes done */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ #define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ +#define REQ_F_TIMEOUT 1024 /* timeout request */ u64 user_data; u32 result; u32 sequence; @@ -344,6 +353,8 @@ struct io_submit_state { }; static void io_sq_wq_submit_work(struct work_struct *work); +static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res); static void __io_free_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -400,26 +411,30 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->cancel_list); INIT_LIST_HEAD(&ctx->defer_list); + INIT_LIST_HEAD(&ctx->timeout_list); return ctx; } static inline bool io_sequence_defer(struct io_ring_ctx *ctx, struct io_kiocb *req) { - if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) + /* timeout requests always honor sequence */ + if (!(req->flags & REQ_F_TIMEOUT) && + (req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped; } -static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) +static struct io_kiocb *__io_get_deferred_req(struct io_ring_ctx *ctx, + struct list_head *list) { struct io_kiocb *req; - if (list_empty(&ctx->defer_list)) + if (list_empty(list)) return NULL; - req = list_first_entry(&ctx->defer_list, struct io_kiocb, list); + req = list_first_entry(list, struct io_kiocb, list); if (!io_sequence_defer(ctx, req)) { list_del_init(&req->list); return req; @@ -428,6 +443,16 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) return NULL; } +static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) +{ + return __io_get_deferred_req(ctx, &ctx->defer_list); +} + +static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) +{ + return __io_get_deferred_req(ctx, &ctx->timeout_list); +} + static void __io_commit_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; @@ -446,25 +471,50 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) static inline void io_queue_async_work(struct io_ring_ctx *ctx, struct io_kiocb *req) { - int rw; + int rw = 0; - switch (req->submit.sqe->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - rw = !(req->rw.ki_flags & IOCB_DIRECT); - break; - default: - rw = 0; - break; + if (req->submit.sqe) { + switch (req->submit.sqe->opcode) { + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + rw = !(req->rw.ki_flags & IOCB_DIRECT); + break; + } } queue_work(ctx->sqo_wq[rw], &req->work); } +static void io_kill_timeout(struct io_kiocb *req) +{ + int ret; + + ret = hrtimer_try_to_cancel(&req->timeout.timer); + if (ret != -1) { + atomic_inc(&req->ctx->cq_timeouts); + list_del(&req->list); + io_cqring_fill_event(req->ctx, req->user_data, 0); + __io_free_req(req); + } +} + +static void io_kill_timeouts(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req, *tmp; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) + io_kill_timeout(req); + 
spin_unlock_irq(&ctx->completion_lock); +} + static void io_commit_cqring(struct io_ring_ctx *ctx) { struct io_kiocb *req; + while ((req = io_get_timeout_req(ctx)) != NULL) + io_kill_timeout(req); + __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { @@ -1248,6 +1298,51 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) } } +/* + * For files that don't have ->read_iter() and ->write_iter(), handle them + * by looping over ->read() or ->write() manually. + */ +static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, + struct iov_iter *iter) +{ + ssize_t ret = 0; + + /* + * Don't support polled IO through this interface, and we can't + * support non-blocking either. For the latter, this just causes + * the kiocb to be handled from an async context. + */ + if (kiocb->ki_flags & IOCB_HIPRI) + return -EOPNOTSUPP; + if (kiocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + while (iov_iter_count(iter)) { + struct iovec iovec = iov_iter_iovec(iter); + ssize_t nr; + + if (rw == READ) { + nr = file->f_op->read(file, iovec.iov_base, + iovec.iov_len, &kiocb->ki_pos); + } else { + nr = file->f_op->write(file, iovec.iov_base, + iovec.iov_len, &kiocb->ki_pos); + } + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (nr != iovec.iov_len) + break; + iov_iter_advance(iter, nr); + } + + return ret; +} + static int io_read(struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock) { @@ -1265,8 +1360,6 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; - if (unlikely(!file->f_op->read_iter)) - return -EINVAL; ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); if (ret < 0) @@ -1281,7 +1374,11 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, if (!ret) { ssize_t ret2; - ret2 = call_read_iter(file, kiocb, &iter); + if (file->f_op->read_iter) + ret2 = call_read_iter(file, kiocb, &iter); + else + ret2 = loop_rw_iter(READ, file, kiocb, &iter); + /* * In case of a short read, punt to async. This can happen * if we have data partially cached. 
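loop_rw_iter() lets io_uring drive files that lack ->read_iter()/->write_iter() by calling plain ->read()/->write() once per iovec segment and stopping at the first short or failed transfer while preserving partial progress. A runnable userspace analog of the same loop (a sketch, not the kernel code):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/uio.h>
    #include <unistd.h>

    /* Walk an iovec with plain read(), stopping on a short or failed
     * transfer but keeping partial progress, like loop_rw_iter(). */
    static ssize_t loop_read(int fd, const struct iovec *iov, int iovcnt)
    {
        ssize_t total = 0;
        int i;

        for (i = 0; i < iovcnt; i++) {
            ssize_t nr = read(fd, iov[i].iov_base, iov[i].iov_len);

            if (nr < 0)
                return total ? total : -1;   /* report progress if any */
            total += nr;
            if ((size_t)nr != iov[i].iov_len)
                break;                       /* short read: stop here */
        }
        return total;
    }

    int main(void)
    {
        char a[8], b[8];
        struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        int fd = open("/etc/os-release", O_RDONLY);

        if (fd < 0)
            return 1;
        printf("read %zd bytes\n", loop_read(fd, iov, 2));
        close(fd);
        return 0;
    }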
Alternatively we can @@ -1326,8 +1423,6 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, file = kiocb->ki_filp; if (unlikely(!(file->f_mode & FMODE_WRITE))) return -EBADF; - if (unlikely(!file->f_op->write_iter)) - return -EINVAL; ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); if (ret < 0) @@ -1365,7 +1460,10 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, } kiocb->ki_flags |= IOCB_WRITE; - ret2 = call_write_iter(file, kiocb, &iter); + if (file->f_op->write_iter) + ret2 = call_write_iter(file, kiocb, &iter); + else + ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) { io_rw_done(kiocb, ret2); } else { @@ -1714,6 +1812,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!poll->file) return -EBADF; + req->submit.sqe = NULL; INIT_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; @@ -1765,6 +1864,81 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ipt.error; } +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) +{ + struct io_ring_ctx *ctx; + struct io_kiocb *req; + unsigned long flags; + + req = container_of(timer, struct io_kiocb, timeout.timer); + ctx = req->ctx; + atomic_inc(&ctx->cq_timeouts); + + spin_lock_irqsave(&ctx->completion_lock, flags); + list_del(&req->list); + + io_cqring_fill_event(ctx, req->user_data, -ETIME); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + + io_put_req(req); + return HRTIMER_NORESTART; +} + +static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned count, req_dist, tail_index; + struct io_ring_ctx *ctx = req->ctx; + struct list_head *entry; + struct timespec ts; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags || + sqe->len != 1) + return -EINVAL; + if (copy_from_user(&ts, (void __user *) (unsigned long) sqe->addr, + sizeof(ts))) + return -EFAULT; + + /* + * sqe->off holds how many events need to occur for this + * timeout event to be satisfied. + */ + count = READ_ONCE(sqe->off); + if (!count) + count = 1; + + req->sequence = ctx->cached_sq_head + count - 1; + req->flags |= REQ_F_TIMEOUT; + + /* + * Insertion sort, ensuring the first entry in the list is always + * the one we need first.
+ */ + tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped; + req_dist = req->sequence - tail_index; + spin_lock_irq(&ctx->completion_lock); + list_for_each_prev(entry, &ctx->timeout_list) { + struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); + unsigned dist; + + dist = nxt->sequence - tail_index; + if (req_dist >= dist) + break; + } + list_add(&req->list, entry); + spin_unlock_irq(&ctx->completion_lock); + + hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + req->timeout.timer.function = io_timeout_fn; + hrtimer_start(&req->timeout.timer, timespec_to_ktime(ts), + HRTIMER_MODE_REL); + return 0; +} + static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -1842,6 +2016,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_RECVMSG: ret = io_recvmsg(req, s->sqe, force_nonblock); break; + case IORING_OP_TIMEOUT: + ret = io_timeout(req, s->sqe); + break; default: ret = -EINVAL; break; @@ -2098,13 +2275,11 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { struct io_uring_sqe *sqe_copy; - sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); + sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); if (sqe_copy) { struct async_list *list; - memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy)); s->sqe = sqe_copy; - memcpy(&req->submit, s, sizeof(*s)); list = io_async_list_from_sqe(ctx, s->sqe); if (!io_add_to_prev_work(list, req)) { @@ -2359,18 +2534,22 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, io_queue_link_head(ctx, link, &link->submit, shadow_req, true); link = NULL; + shadow_req = NULL; } prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) { if (!shadow_req) { shadow_req = io_get_req(ctx, NULL); + if (unlikely(!shadow_req)) + goto out; shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); refcount_dec(&shadow_req->refs); } shadow_req->sequence = sqes[i].sequence; } +out: if (unlikely(mm_fault)) { io_cqring_add_event(ctx, sqes[i].sqe->user_data, -EFAULT); @@ -2436,7 +2615,7 @@ static int io_sq_thread(void *data) * to sleep. */ if (inflight || !time_after(jiffies, timeout)) { - cpu_relax(); + cond_resched(); continue; } @@ -2545,18 +2724,22 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, io_queue_link_head(ctx, link, &link->submit, shadow_req, force_nonblock); link = NULL; + shadow_req = NULL; } prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { if (!shadow_req) { shadow_req = io_get_req(ctx, NULL); + if (unlikely(!shadow_req)) + goto out; shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); refcount_dec(&shadow_req->refs); } shadow_req->sequence = s.sequence; } +out: s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; @@ -2593,6 +2776,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { struct io_rings *rings = ctx->rings; + unsigned nr_timeouts; int ret; if (io_cqring_events(rings) >= min_events) @@ -2611,7 +2795,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events); + nr_timeouts = atomic_read(&ctx->cq_timeouts); + /* + * Return if we have enough events, or if a timeout occurred since + * we started waiting.
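Note the unsigned arithmetic above: distances such as req->sequence - tail_index order correctly even if the sequence counters wrap. From userspace, the new IORING_OP_TIMEOUT can be exercised roughly as below; this is a hedged sketch that assumes a 5.4-era kernel plus liburing, whose io_uring_prep_timeout() wraps this opcode. With nothing else in flight the request completes with -ETIME:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <liburing.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

        if (io_uring_queue_init(4, &ring, 0) < 0)
            return 1;

        sqe = io_uring_get_sqe(&ring);
        /* count == 0 is treated as 1 by the kernel side shown above */
        io_uring_prep_timeout(sqe, &ts, 0, 0);
        io_uring_submit(&ring);

        if (io_uring_wait_cqe(&ring, &cqe) == 0) {
            printf("timeout res = %d (%s)\n", cqe->res,
                   cqe->res == -ETIME ? "ETIME, as expected"
                                      : strerror(-cqe->res));
            io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return 0;
    }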
For timeouts, we always want to return to + * userspace. + */ + ret = wait_event_interruptible(ctx->wait, + io_cqring_events(rings) >= min_events || + atomic_read(&ctx->cq_timeouts) != nr_timeouts); restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; @@ -3282,6 +3474,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); + io_kill_timeouts(ctx); io_poll_remove_all(ctx); io_iopoll_reap_events(ctx); wait_for_completion(&ctx->ctx_done); @@ -3319,7 +3512,7 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) } page = virt_to_head_page(ptr); - if (sz > (PAGE_SIZE << compound_order(page))) + if (sz > page_size(page)) return -EINVAL; pfn = virt_to_phys(ptr) >> PAGE_SHIFT; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 10517cea9682..1fc28c2da279 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -24,7 +24,7 @@ struct iomap_dio { struct kiocb *iocb; - iomap_dio_end_io_t *end_io; + const struct iomap_dio_ops *dops; loff_t i_size; loff_t size; atomic_t ref; @@ -72,18 +72,14 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, static ssize_t iomap_dio_complete(struct iomap_dio *dio) { + const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; - ssize_t ret; + ssize_t ret = dio->error; - if (dio->end_io) { - ret = dio->end_io(iocb, - dio->error ? dio->error : dio->size, - dio->flags); - } else { - ret = dio->error; - } + if (dops && dops->end_io) + ret = dops->end_io(iocb, dio->size, ret, dio->flags); if (likely(!ret)) { ret = dio->size; @@ -101,9 +97,9 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) * one is a pretty crazy thing to do, so we don't support it 100%. If * this invalidation fails, tough, the write still worked... * - * And this page cache invalidation has to be after dio->end_io(), as - * some filesystems convert unwritten extents to real allocations in - * end_io() when necessary, otherwise a racing buffer read would cache + * And this page cache invalidation has to be after ->end_io(), as some + * filesystems convert unwritten extents to real allocations in + * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. 
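With the new iomap_dio_ops table, ->end_io() receives the transfer size and the error as separate arguments instead of the old merged size-or-error value. A filesystem converting to it might look roughly like this; the myfs_* names are invented for illustration and this is a sketch of the contract, not a buildable module:

    /* Illustrative only: hypothetical myfs_* names, sketching the new
     * iomap_dio_ops contract where ->end_io() gets size and error
     * separately instead of one size-or-error value. */
    static int myfs_dio_end_io(struct kiocb *iocb, ssize_t size,
                               int error, unsigned int flags)
    {
        if (error)
            return error;    /* propagate; nothing to convert */
        /* e.g. convert unwritten extents covering [ki_pos, ki_pos + size) */
        return 0;
    }

    static const struct iomap_dio_ops myfs_dio_ops = {
        .end_io = myfs_dio_end_io,
    };

    /* caller side: */
    /* ret = iomap_dio_rw(iocb, iter, &myfs_iomap_ops, &myfs_dio_ops); */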
*/ if (!dio->error && @@ -396,7 +392,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, */ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, - const struct iomap_ops *ops, iomap_dio_end_io_t end_io) + const struct iomap_ops *ops, const struct iomap_dio_ops *dops) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); @@ -421,7 +417,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, atomic_set(&dio->ref, 1); dio->size = 0; dio->i_size = i_size_read(inode); - dio->end_io = end_io; + dio->dops = dops; dio->error = 0; dio->flags = 0; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 953990eb70a9..1c58859aa592 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -89,8 +89,6 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); -EXPORT_SYMBOL(jbd2_journal_inode_add_write); -EXPORT_SYMBOL(jbd2_journal_inode_add_wait); EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index afc06daee5bb..bee8498d7792 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2622,18 +2622,6 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, return 0; } -int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode) -{ - return jbd2_journal_file_inode(handle, jinode, - JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX); -} - -int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode) -{ - return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0, - LLONG_MAX); -} - int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *jinode, loff_t start_byte, loff_t length) { diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index cbe70637c117..0e6406c4f362 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -163,13 +163,11 @@ static const struct export_operations jffs2_export_ops = { * Opt_rp_size: size of reserved pool in KiB */ enum { - Opt_source, Opt_override_compr, Opt_rp_size, }; static const struct fs_parameter_spec jffs2_param_specs[] = { - fsparam_string ("source", Opt_source), fsparam_enum ("compr", Opt_override_compr), fsparam_u32 ("rp_size", Opt_rp_size), {} diff --git a/fs/namespace.c b/fs/namespace.c index 17f352cb87e8..b7eb5f242b86 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2832,8 +2832,6 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, put_filesystem(type); return -EINVAL; } - } else { - subtype = ""; } } @@ -3058,7 +3056,7 @@ void *copy_mount_options(const void __user * data) * the remainder of the page. */ /* copy_from_user cannot cross TASK_SIZE ! */ - size = TASK_SIZE - (unsigned long)data; + size = TASK_SIZE - (unsigned long)untagged_addr(data); if (size > PAGE_SIZE) size = PAGE_SIZE; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 0adfd8840110..e180033e35cf 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1669,10 +1669,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) #endif /* CONFIG_NFSV4 */ -/* - * Code common to create, mkdir, and mknod. 
- */ -int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, +struct dentry * +nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { @@ -1680,13 +1678,10 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, struct inode *dir = d_inode(parent); struct inode *inode; struct dentry *d; - int error = -EACCES; + int error; d_drop(dentry); - /* We may have been initialized further down */ - if (d_really_is_positive(dentry)) - goto out; if (fhandle->size == 0) { error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); if (error) @@ -1702,18 +1697,32 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, } inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); d = d_splice_alias(inode, dentry); - if (IS_ERR(d)) { - error = PTR_ERR(d); - goto out_error; - } - dput(d); out: dput(parent); - return 0; + return d; out_error: nfs_mark_for_revalidate(dir); - dput(parent); - return error; + d = ERR_PTR(error); + goto out; +} +EXPORT_SYMBOL_GPL(nfs_add_or_obtain); + +/* + * Code common to create, mkdir, and mknod. + */ +int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + struct dentry *d; + + d = nfs_add_or_obtain(dentry, fhandle, fattr, label); + if (IS_ERR(d)) + return PTR_ERR(d); + + /* Callers don't care */ + dput(d); + return 0; } EXPORT_SYMBOL_GPL(nfs_instantiate); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 3cb073c50fa6..c9b605f6c9cb 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, + .flags = PNFS_LAYOUTGET_ON_OPEN, .max_layoutget_response = 4096, /* 1 page or so... */ .alloc_layout_hdr = filelayout_alloc_layout_hdr, .free_layout_hdr = filelayout_free_layout_hdr, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e64f810223be..447a3c17fa8e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -16,14 +16,6 @@ extern const struct export_operations nfs_export_ops; struct nfs_string; -/* Maximum number of readahead requests - * FIXME: this should really be a sysctl so that users may tune it to suit - * their needs. People that do NFS over a slow network, might for - * instance want to reduce it to something closer to 1 for improved - * interactive response. 
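nfs_add_or_obtain() now hands back whatever dentry d_splice_alias() settled on instead of dropping it, which is what lets the NFSv3 create paths below switch to the alias. Condensed, the caller contract is (a sketch reusing names from this patch, not standalone code):

    /* Sketch of the nfs_add_or_obtain() caller contract (patch names):
     *   ERR_PTR(err)  -> hard failure
     *   NULL          -> the dentry passed in was instantiated directly
     *   other         -> a referenced alias; use it and dput() it later
     */
    struct dentry *d = nfs_add_or_obtain(dentry, fhandle, fattr, label);

    if (IS_ERR(d))
        return PTR_ERR(d);
    if (d)
        dentry = d;        /* operate on the alias from here on */
    /* ... setattr/ACL work on dentry ... */
    dput(d);               /* dput(NULL) is a no-op */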
- */ -#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) - static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) { if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index a3ad2d46fd42..9eb2f1a503ab 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -279,15 +279,17 @@ static struct nfs3_createdata *nfs3_alloc_createdata(void) return data; } -static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +static struct dentry * +nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) { int status; status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); nfs_post_op_update_inode(dir, data->res.dir_attr); - if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); - return status; + if (status != 0) + return ERR_PTR(status); + + return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr, NULL); } static void nfs3_free_createdata(struct nfs3_createdata *data) @@ -304,6 +306,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call create %pd\n", dentry); @@ -330,7 +333,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; for (;;) { - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); if (status != -ENOTSUPP) break; @@ -355,6 +359,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. 
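The PTR_ERR_OR_ZERO() conversions rely on the kernel's pointer-or-errno encoding, where errno values live in the last 4095 bytes of the address space so one return value can carry either a valid pointer or an error. A runnable userspace re-implementation of the idea, simplified from include/linux/err.h:

    #include <stdio.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }
    static inline long PTR_ERR_OR_ZERO(const void *ptr)
    {
        return IS_ERR(ptr) ? PTR_ERR(ptr) : 0;
    }

    int main(void)
    {
        int x = 42;
        void *ok = &x;
        void *bad = ERR_PTR(-17);   /* -EEXIST */

        printf("ok:  err=%ld\n", PTR_ERR_OR_ZERO(ok));   /* 0 */
        printf("bad: err=%ld\n", PTR_ERR_OR_ZERO(bad));  /* -17 */
        return 0;
    }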
*/ if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { @@ -372,11 +379,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_post_op_update_inode(d_inode(dentry), data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); if (status != 0) - goto out_release_acls; + goto out_dput; } status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); +out_dput: + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); @@ -504,6 +513,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; if (len > NFS3_MAXPATHLEN) @@ -522,7 +532,11 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, data->arg.symlink.pathlen = len; data->arg.symlink.sattr = sattr; - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); + + if (status == 0) + dput(d_alias); nfs3_free_createdata(data); out: @@ -535,6 +549,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); @@ -553,12 +568,18 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) data->arg.mkdir.len = dentry->d_name.len; data->arg.mkdir.sattr = sattr; - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); + if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); @@ -660,6 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call mknod %pd %u:%u\n", dentry, @@ -698,12 +720,17 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; } - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 3564da1ba8a1..16b2e5cc3e94 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -491,8 +491,6 @@ extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, const struct nfs_lock_context *, nfs4_stateid *, const struct cred **); -extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst, - struct nfs4_state *state); extern bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state); @@ -574,6 +572,15 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; } +static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1) +{ + u32 seqid = be32_to_cpu(s1->seqid); + + if (++seqid == 0) + ++seqid; + s1->seqid = cpu_to_be32(seqid); +} + static inline bool 
nfs4_valid_open_stateid(const struct nfs4_state *state) { return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1406858bae6c..11eafcfc490b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1073,14 +1073,26 @@ static const struct rpc_call_ops nfs40_call_sync_ops = { .rpc_call_done = nfs40_call_sync_done, }; +static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup) +{ + int ret; + struct rpc_task *task; + + task = rpc_run_task(task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + + ret = task->tk_status; + rpc_put_task(task); + return ret; +} + static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res) { - int ret; - struct rpc_task *task; struct nfs_client *clp = server->nfs_client; struct nfs4_call_sync_data data = { .seq_server = server, @@ -1094,14 +1106,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, .callback_data = &data }; - task = rpc_run_task(&task_setup); - if (IS_ERR(task)) - ret = PTR_ERR(task); - else { - ret = task->tk_status; - rpc_put_task(task); - } - return ret; + return nfs4_call_sync_custom(&task_setup); } int nfs4_call_sync(struct rpc_clnt *clnt, @@ -3308,6 +3313,75 @@ nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task) return pnfs_wait_on_layoutreturn(inode, task); } +/* + * Update the seqid of an open stateid + */ +static void nfs4_sync_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state) +{ + __be32 seqid_open; + u32 dst_seqid; + int seq; + + for (;;) { + if (!nfs4_valid_open_stateid(state)) + break; + seq = read_seqbegin(&state->seqlock); + if (!nfs4_state_match_open_stateid_other(state, dst)) { + nfs4_stateid_copy(dst, &state->open_stateid); + if (read_seqretry(&state->seqlock, seq)) + continue; + break; + } + seqid_open = state->open_stateid.seqid; + if (read_seqretry(&state->seqlock, seq)) + continue; + + dst_seqid = be32_to_cpu(dst->seqid); + if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) < 0) + dst->seqid = seqid_open; + break; + } +} + +/* + * Update the seqid of an open stateid after receiving + * NFS4ERR_OLD_STATEID + */ +static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, + struct nfs4_state *state) +{ + __be32 seqid_open; + u32 dst_seqid; + bool ret; + int seq; + + for (;;) { + ret = false; + if (!nfs4_valid_open_stateid(state)) + break; + seq = read_seqbegin(&state->seqlock); + if (!nfs4_state_match_open_stateid_other(state, dst)) { + if (read_seqretry(&state->seqlock, seq)) + continue; + break; + } + seqid_open = state->open_stateid.seqid; + if (read_seqretry(&state->seqlock, seq)) + continue; + + dst_seqid = be32_to_cpu(dst->seqid); + if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) >= 0) + dst->seqid = cpu_to_be32(dst_seqid + 1); + else + dst->seqid = seqid_open; + ret = true; + break; + } + + return ret; +} + struct nfs4_closedata { struct inode *inode; struct nfs4_state *state; @@ -3358,32 +3432,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); /* Handle Layoutreturn errors */ - if (calldata->arg.lr_args && task->tk_status != 0) { - switch (calldata->res.lr_ret) { - default: - calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - break; - case 0: - calldata->arg.lr_args = NULL; - calldata->res.lr_res = NULL; - break; - case -NFS4ERR_OLD_STATEID: - if 
(nfs4_layoutreturn_refresh_stateid(&calldata->arg.lr_args->stateid, - &calldata->arg.lr_args->range, - calldata->inode)) - goto lr_restart; - /* Fallthrough */ - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_UNKNOWN_LAYOUTTYPE: - case -NFS4ERR_WRONG_CRED: - calldata->arg.lr_args = NULL; - calldata->res.lr_res = NULL; - goto lr_restart; - } - } + if (pnfs_roc_done(task, calldata->inode, + &calldata->arg.lr_args, + &calldata->res.lr_res, + &calldata->res.lr_ret) == -EAGAIN) + goto out_restart; /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors @@ -3403,7 +3456,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) break; case -NFS4ERR_OLD_STATEID: /* Did we race with OPEN? */ - if (nfs4_refresh_open_stateid(&calldata->arg.stateid, + if (nfs4_refresh_open_old_stateid(&calldata->arg.stateid, state)) goto out_restart; goto out_release; @@ -3415,7 +3468,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) task->tk_msg.rpc_cred); /* Fallthrough */ case -NFS4ERR_BAD_STATEID: - break; + if (calldata->arg.fmode == 0) + break; + /* Fallthrough */ default: task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); @@ -3430,8 +3485,6 @@ static void nfs4_close_done(struct rpc_task *task, void *data) nfs_refresh_inode(calldata->inode, &calldata->fattr); dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); return; -lr_restart: - calldata->res.lr_ret = 0; out_restart: task->tk_status = 0; rpc_restart_call_prepare(task); @@ -3472,8 +3525,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) } else if (is_rdwr) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; - if (!nfs4_valid_open_stateid(state) || - !nfs4_refresh_open_stateid(&calldata->arg.stateid, state)) + nfs4_sync_open_stateid(&calldata->arg.stateid, state); + if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); @@ -6018,7 +6071,6 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, .rpc_resp = res, .rpc_cred = cred, }; - struct rpc_task *task; struct rpc_task_setup task_setup_data = { .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, @@ -6051,17 +6103,12 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, dprintk("NFS call setclientid auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_owner_id); - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out; - } - status = task->tk_status; + + status = nfs4_call_sync_custom(&task_setup_data); if (setclientid.sc_cred) { clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred); put_rpccred(setclientid.sc_cred); } - rpc_put_task(task); out: trace_nfs4_setclientid(clp, status); dprintk("NFS reply setclientid: %d\n", status); @@ -6129,32 +6176,11 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); /* Handle Layoutreturn errors */ - if (data->args.lr_args && task->tk_status != 0) { - switch(data->res.lr_ret) { - default: - data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - break; - case 0: - data->args.lr_args = NULL; - data->res.lr_res = NULL; - break; - case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&data->args.lr_args->stateid, - &data->args.lr_args->range, - data->inode)) - goto lr_restart; - /* Fallthrough */ - case 
-NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_UNKNOWN_LAYOUTTYPE: - case -NFS4ERR_WRONG_CRED: - data->args.lr_args = NULL; - data->res.lr_res = NULL; - goto lr_restart; - } - } + if (pnfs_roc_done(task, data->inode, + &data->args.lr_args, + &data->res.lr_res, + &data->res.lr_ret) == -EAGAIN) + goto out_restart; switch (task->tk_status) { case 0: @@ -6192,8 +6218,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) } data->rpc_status = task->tk_status; return; -lr_restart: - data->res.lr_ret = 0; out_restart: task->tk_status = 0; rpc_restart_call_prepare(task); @@ -6386,6 +6410,42 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } +/* + * Update the seqid of a lock stateid after receiving + * NFS4ERR_OLD_STATEID + */ +static bool nfs4_refresh_lock_old_stateid(nfs4_stateid *dst, + struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret = false; + + spin_lock(&state->state_lock); + if (!nfs4_stateid_match_other(dst, &lsp->ls_stateid)) + goto out; + if (!nfs4_stateid_is_newer(&lsp->ls_stateid, dst)) + nfs4_stateid_seqid_inc(dst); + else + dst->seqid = lsp->ls_stateid.seqid; + ret = true; +out: + spin_unlock(&state->state_lock); + return ret; +} + +static bool nfs4_sync_lock_stateid(nfs4_stateid *dst, + struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret; + + spin_lock(&state->state_lock); + ret = !nfs4_stateid_match_other(dst, &lsp->ls_stateid); + nfs4_stateid_copy(dst, &lsp->ls_stateid); + spin_unlock(&state->state_lock); + return ret; +} + struct nfs4_unlockdata { struct nfs_locku_args arg; struct nfs_locku_res res; @@ -6403,7 +6463,8 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs_seqid *seqid) { struct nfs4_unlockdata *p; - struct inode *inode = lsp->ls_state->inode; + struct nfs4_state *state = lsp->ls_state; + struct inode *inode = state->inode; p = kzalloc(sizeof(*p), GFP_NOFS); if (p == NULL) @@ -6419,6 +6480,9 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); + spin_lock(&state->state_lock); + nfs4_stateid_copy(&p->arg.stateid, &lsp->ls_stateid); + spin_unlock(&state->state_lock); return p; } @@ -6457,10 +6521,14 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) task->tk_msg.rpc_cred); /* Fall through */ case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - if (!nfs4_stateid_match(&calldata->arg.stateid, - &calldata->lsp->ls_stateid)) + if (nfs4_sync_lock_stateid(&calldata->arg.stateid, + calldata->lsp)) + rpc_restart_call_prepare(task); + break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_lock_old_stateid(&calldata->arg.stateid, + calldata->lsp)) rpc_restart_call_prepare(task); break; default: @@ -6483,7 +6551,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; - nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ goto out_no_action; @@ -7645,6 +7712,8 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred) static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool 
use_integrity) { int status; + struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct nfs_client *clp = NFS_SERVER(dir)->nfs_client; struct nfs4_secinfo_arg args = { .dir_fh = NFS_FH(dir), .name = name, @@ -7657,26 +7726,37 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct .rpc_argp = &args, .rpc_resp = &res, }; - struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct nfs4_call_sync_data data = { + .seq_server = NFS_SERVER(dir), + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = clp->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; const struct cred *cred = NULL; if (use_integrity) { - clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient; - cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); + clnt = clp->cl_rpcclient; + task_setup.rpc_client = clnt; + + cred = nfs4_get_clid_cred(clp); msg.rpc_cred = cred; } dprintk("NFS call secinfo %s\n", name->name); - nfs4_state_protect(NFS_SERVER(dir)->nfs_client, - NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + nfs4_state_protect(clp, NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_call_sync_custom(&task_setup); - status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, - &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); dprintk("NFS reply secinfo: %d\n", status); put_cred(cred); - return status; } @@ -8344,7 +8424,6 @@ static const struct rpc_call_ops nfs4_get_lease_time_ops = { int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) { - struct rpc_task *task; struct nfs4_get_lease_time_args args; struct nfs4_get_lease_time_res res = { .lr_fsinfo = fsinfo, @@ -8366,17 +8445,9 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) .callback_data = &data, .flags = RPC_TASK_TIMEOUT, }; - int status; nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1); - task = rpc_run_task(&task_setup); - - if (IS_ERR(task)) - return PTR_ERR(task); - - status = task->tk_status; - rpc_put_task(task); - return status; + return nfs4_call_sync_custom(&task_setup); } #ifdef CONFIG_NFS_V4_1 @@ -8845,7 +8916,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, const struct cred *cred) { struct nfs4_reclaim_complete_data *calldata; - struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], .rpc_cred = cred, @@ -8854,7 +8924,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs4_reclaim_complete_call_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_NO_ROUND_ROBIN, + .flags = RPC_TASK_NO_ROUND_ROBIN, }; int status = -ENOMEM; @@ -8869,15 +8939,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out; - } - status = rpc_wait_for_completion_task(task); - if (status == 0) - status = task->tk_status; - rpc_put_task(task); + status = nfs4_call_sync_custom(&task_setup_data); out: dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -9103,10 +9165,19 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) if (!nfs41_sequence_process(task, &lrp->res.seq_res)) return; + /* + * 
Was there an RPC level error? Assume the call succeeded, + * and that we need to release the layout + */ + if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) { + lrp->res.lrs_present = 0; + return; + } + server = NFS_SERVER(lrp->args.inode); switch (task->tk_status) { case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&lrp->args.stateid, + if (nfs4_layout_refresh_old_stateid(&lrp->args.stateid, &lrp->args.range, lrp->args.inode)) goto out_restart; @@ -9362,18 +9433,32 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; struct rpc_clnt *clnt = server->client; + struct nfs4_call_sync_data data = { + .seq_server = server, + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = server->nfs_client->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; const struct cred *cred = NULL; int status; if (use_integrity) { clnt = server->nfs_client->cl_rpcclient; + task_setup.rpc_client = clnt; + cred = nfs4_get_clid_cred(server->nfs_client); msg.rpc_cred = cred; } dprintk("--> %s\n", __func__); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_call_sync_custom(&task_setup); dprintk("<-- %s status=%d\n", __func__, status); put_cred(cred); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cad4e064b328..0c6d53dc3672 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1015,22 +1015,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst, return ret; } -bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) -{ - bool ret; - int seq; - - do { - ret = false; - seq = read_seqbegin(&state->seqlock); - if (nfs4_state_match_open_stateid_other(state, dst)) { - dst->seqid = state->open_stateid.seqid; - ret = true; - } - } while (read_seqretry(&state->seqlock, seq)); - return ret; -} - bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { bool ret; @@ -2095,8 +2079,10 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred } status = nfs4_begin_drain_session(clp); - if (status != 0) - return status; + if (status != 0) { + result = status; + goto out; + } status = nfs4_replace_transport(server, locations); if (status != 0) { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 46a8d636d151..ab07db0f07cd 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1174,7 +1174,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } - if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { + if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) { *p++ = cpu_to_be32(label->lfs); *p++ = cpu_to_be32(label->pi); *p++ = cpu_to_be32(label->len); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4525d5acae38..bb80034a7661 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -359,9 +359,10 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, } /* - * Update the seqid of a layout stateid + * Update the seqid of a layout stateid after receiving + * NFS4ERR_OLD_STATEID */ -bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode) { @@ -377,7 +378,15 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, 
spin_lock(&inode->i_lock); lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + if (lo && pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + /* Is our call using the most recent seqid? If so, bump it */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) { + nfs4_stateid_seqid_inc(dst); + ret = true; + goto out; + } + /* Try to update the seqid to the most recent */ err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); if (err != -EBUSY) { dst->seqid = lo->plh_stateid.seqid; @@ -385,6 +394,7 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, ret = true; } } +out: spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); return ret; @@ -1440,6 +1450,52 @@ bool pnfs_roc(struct inode *ino, return false; } +int pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret) +{ + struct nfs4_layoutreturn_args *arg = *argpp; + int retval = -EAGAIN; + + if (!arg) + return 0; + /* Handle Layoutreturn errors */ + switch (*ret) { + case 0: + retval = 0; + break; + case -NFS4ERR_NOMATCHING_LAYOUT: + /* Was there an RPC level error? If not, retry */ + if (task->tk_rpc_status == 0) + break; + /* If the call was not sent, let caller handle it */ + if (!RPC_WAS_SENT(task)) + return 0; + /* + * Otherwise, assume the call succeeded and + * that we need to release the layout + */ + *ret = 0; + (*respp)->lrs_present = 0; + retval = 0; + break; + case -NFS4ERR_DELAY: + /* Let the caller handle the retry */ + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return 0; + case -NFS4ERR_OLD_STATEID: + if (!nfs4_layout_refresh_old_stateid(&arg->stateid, + &arg->range, inode)) + break; + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return -EAGAIN; + } + *argpp = NULL; + *respp = NULL; + return retval; +} + void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, int ret) @@ -1449,10 +1505,15 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, const nfs4_stateid *res_stateid = NULL; struct nfs4_xdr_opaque_data *ld_private = args->ld_private; - if (ret == 0) { - arg_stateid = &args->stateid; + switch (ret) { + case -NFS4ERR_NOMATCHING_LAYOUT: + break; + case 0: if (res->lrs_present) res_stateid = &res->stateid; + /* Fallthrough */ + default: + arg_stateid = &args->stateid; } pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, res_stateid); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f15609c003d8..f8a38065c7e4 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -261,7 +261,7 @@ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, bool is_recall); int pnfs_destroy_layouts_byclid(struct nfs_client *clp, bool is_recall); -bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); @@ -282,6 +282,10 @@ bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, const struct cred *cred); +int pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret); void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, int ret); @@ -701,6 +705,15 @@ pnfs_roc(struct inode *ino, return false; } +static inline int +pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct 
nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret) +{ + return 0; +} + static inline void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, @@ -785,7 +798,7 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) { } -static inline bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode) { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 19a76cfa8b1f..a84df7d63403 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2645,6 +2645,13 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, } EXPORT_SYMBOL_GPL(nfs_clone_sb_security); +static void nfs_set_readahead(struct backing_dev_info *bdi, + unsigned long iomax_pages) +{ + bdi->ra_pages = VM_READAHEAD_PAGES; + bdi->io_pages = iomax_pages; +} + struct dentry *nfs_fs_mount_common(struct nfs_server *server, int flags, const char *dev_name, struct nfs_mount_info *mount_info, @@ -2687,7 +2694,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, mntroot = ERR_PTR(error); goto error_splat_super; } - s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD; + nfs_set_readahead(s->s_bdi, server->rpages); server->super = s; } diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index c03758c91481..7a42c2ebe28d 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -279,6 +280,17 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) goto out_err; } + /* + * convert the userspace DN_* "arg" to the internal FS_* + * defined in fsnotify + */ + mask = convert_arg(arg); + + error = security_path_notify(&filp->f_path, mask, + FSNOTIFY_OBJ_TYPE_INODE); + if (error) + goto out_err; + /* expect most fcntl to add new rather than augment old */ dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); if (!dn) { @@ -293,9 +305,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) goto out_err; } - /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ - mask = convert_arg(arg); - /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; fsnotify_init_mark(new_fsn_mark, dnotify_group); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 09c67262a832..2e667cb8d69e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -528,7 +528,8 @@ static const struct file_operations fanotify_fops = { }; static int fanotify_find_path(int dfd, const char __user *filename, - struct path *path, unsigned int flags) + struct path *path, unsigned int flags, __u64 mask, + unsigned int obj_type) { int ret; @@ -567,8 +568,15 @@ static int fanotify_find_path(int dfd, const char __user *filename, /* you can only watch an inode if you have read permissions on it */ ret = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ); + if (ret) { + path_put(path); + goto out; + } + + ret = security_path_notify(path, mask, obj_type); if (ret) path_put(path); + out: return ret; } @@ -947,6 +955,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, __kernel_fsid_t __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + unsigned int obj_type; int ret; pr_debug("%s: 
fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", @@ -961,8 +970,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, switch (mark_type) { case FAN_MARK_INODE: + obj_type = FSNOTIFY_OBJ_TYPE_INODE; + break; case FAN_MARK_MOUNT: + obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT; + break; case FAN_MARK_FILESYSTEM: + obj_type = FSNOTIFY_OBJ_TYPE_SB; break; default: return -EINVAL; @@ -1030,7 +1044,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, goto fput_and_out; } - ret = fanotify_find_path(dfd, pathname, &path, flags); + ret = fanotify_find_path(dfd, pathname, &path, flags, + (mask & ALL_FSNOTIFY_EVENTS), obj_type); if (ret) goto fput_and_out; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index cade62c6219d..bd16ec03b5fd 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "inotify.h" #include "../fdinfo.h" @@ -331,7 +332,8 @@ static const struct file_operations inotify_fops = { /* * find_inode - resolve a user-given path to a specific inode */ -static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags) +static int inotify_find_inode(const char __user *dirname, struct path *path, + unsigned int flags, __u64 mask) { int error; @@ -340,8 +342,15 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns return error; /* you can only watch an inode if you have read permissions on it */ error = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ); + if (error) { + path_put(path); + return error; + } + error = security_path_notify(path, mask, + FSNOTIFY_OBJ_TYPE_INODE); if (error) path_put(path); + return error; } @@ -735,7 +744,8 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, if (mask & IN_ONLYDIR) flags |= LOOKUP_DIRECTORY; - ret = inotify_find_inode(pathname, &path, flags); + ret = inotify_find_inode(pathname, &path, flags, + (mask & IN_ALL_EVENTS)); if (ret) goto fput_and_out; diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 20c841a906f2..3aac5c917afe 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -71,7 +71,7 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) } /* Read, map, and pin the page. */ page = ntfs_map_page(mft_vi->i_mapping, index); - if (likely(!IS_ERR(page))) { + if (!IS_ERR(page)) { /* Catch multi sector transfer fixup errors. */ if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + ofs)))) { @@ -154,7 +154,7 @@ MFT_RECORD *map_mft_record(ntfs_inode *ni) mutex_lock(&ni->mrec_lock); m = map_mft_record_page(ni); - if (likely(!IS_ERR(m))) + if (!IS_ERR(m)) return m; mutex_unlock(&ni->mrec_lock); @@ -271,7 +271,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, m = map_mft_record(ni); /* map_mft_record() has incremented this on success. */ atomic_dec(&ni->count); - if (likely(!IS_ERR(m))) { + if (!IS_ERR(m)) { /* Verify the sequence number. 
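These ntfs hunks drop likely()/unlikely() around IS_ERR() because IS_ERR() already wraps its comparison in unlikely(); a second hint on top is redundant. Reduced to userspace, the hint is plain __builtin_expect (illustrative sketch):

    #include <stdio.h>

    /* How the kernel's branch hints boil down to __builtin_expect().
     * Since is_err() below already embeds unlikely(), wrapping a call
     * site in likely(!is_err(p)) adds no information for the compiler. */
    #define likely(x)   __builtin_expect(!!(x), 1)
    #define unlikely(x) __builtin_expect(!!(x), 0)

    static int is_err(const void *ptr)
    {
        return unlikely((unsigned long)ptr >= (unsigned long)-4095L);
    }

    int main(void)
    {
        int x = 42;

        if (!is_err(&x))    /* no outer likely() needed */
            printf("valid pointer\n");
        return 0;
    }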
*/ if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { ntfs_debug("Done 1."); @@ -1303,7 +1303,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, (ll - 1) >> vol->cluster_size_bits, NULL); - if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { up_write(&mftbmp_ni->runlist.lock); ntfs_error(vol->sb, "Failed to determine last allocated " "cluster of mft bitmap attribute."); @@ -1734,7 +1734,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) read_unlock_irqrestore(&mft_ni->size_lock, flags); rl = ntfs_attr_find_vcn_nolock(mft_ni, (ll - 1) >> vol->cluster_size_bits, NULL); - if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { up_write(&mft_ni->runlist.lock); ntfs_error(vol->sb, "Failed to determine last allocated " "cluster of mft data attribute."); @@ -1776,7 +1776,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) do { rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, true); - if (likely(!IS_ERR(rl2))) + if (!IS_ERR(rl2)) break; if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { ntfs_error(vol->sb, "Failed to allocate the minimal " diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 2d3cc9e3395d..4e6a44bc654c 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -115,7 +115,7 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, dent_ino = MREF(mref); ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); dent_inode = ntfs_iget(vol->sb, dent_ino); - if (likely(!IS_ERR(dent_inode))) { + if (!IS_ERR(dent_inode)) { /* Consistency check. */ if (is_bad_inode(dent_inode) || MSEQNO(mref) == NTFS_I(dent_inode)->seq_no || diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c index 508744a93180..97932fb5179c 100644 --- a/fs/ntfs/runlist.c +++ b/fs/ntfs/runlist.c @@ -951,7 +951,7 @@ runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, } /* Now combine the new and old runlists checking for overlaps. */ old_rl = ntfs_runlists_merge(old_rl, rl); - if (likely(!IS_ERR(old_rl))) + if (!IS_ERR(old_rl)) return old_rl; ntfs_free(rl); ntfs_error(vol->sb, "Failed to merge runlists."); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 29621d40f448..7dc3bc604f78 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1475,7 +1475,7 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol) kfree(name); /* Get the inode. */ tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) { + if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) { if (!IS_ERR(tmp_ino)) iput(tmp_ino); ntfs_error(vol->sb, "Failed to load $UsnJrnl."); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0c335b51043d..f9baefc76cf9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5993,6 +5993,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct buffer_head *data_alloc_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; + struct ocfs2_journal *journal = osb->journal; BUG_ON(inode_trylock(tl_inode)); @@ -6013,6 +6014,20 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } + /* Appending to the truncate log (TA) and flushing the truncate log (TF) + * are two separate transactions. They can both be committed but not + * checkpointed.
If a crash occurs at that point, both + * transactions are replayed, and clusters that were already released + * to the global bitmap are freed again when the truncate log replay + * runs, resulting in a cluster double free. + */ + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) { + mlog_errno(status); + goto out; + } + data_alloc_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -6792,6 +6807,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, struct page *page, int zero, u64 *phys) { int ret, partial = 0; + loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from; + loff_t length = to - from; ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); if (ret) @@ -6811,7 +6828,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, if (ret < 0) mlog_errno(ret); else if (ocfs2_should_order_data(inode)) { - ret = ocfs2_jbd2_file_inode(handle, inode); + ret = ocfs2_jbd2_inode_add_write(handle, inode, + start_byte, length); if (ret < 0) mlog_errno(ret); } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a4c905d6b575..8de1c9d644f6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -942,7 +942,8 @@ static void ocfs2_write_failure(struct inode *inode, if (tmppage && page_has_buffers(tmppage)) { if (ocfs2_should_order_data(inode)) - ocfs2_jbd2_file_inode(wc->w_handle, inode); + ocfs2_jbd2_inode_add_write(wc->w_handle, inode, + user_pos, user_len); block_commit_write(tmppage, from, to); } @@ -2023,8 +2024,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, } if (page_has_buffers(tmppage)) { - if (handle && ocfs2_should_order_data(inode)) - ocfs2_jbd2_file_inode(handle, inode); + if (handle && ocfs2_should_order_data(inode)) { + loff_t start_byte = + ((loff_t)tmppage->index << PAGE_SHIFT) + + from; + loff_t length = to - from; + ocfs2_jbd2_inode_add_write(handle, inode, + start_byte, length); + } block_commit_write(tmppage, from, to); } } diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index 429e6a8359a5..eaf042feaf5e 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -231,14 +231,6 @@ static int blockcheck_u64_get(void *data, u64 *val) } DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); -static struct dentry *blockcheck_debugfs_create(const char *name, - struct dentry *parent, - u64 *value) -{ - return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value, - &blockcheck_fops); -} - static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) { if (stats) { @@ -250,16 +242,20 @@ static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, struct dentry *parent) { - stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); + struct dentry *dir; - blockcheck_debugfs_create("blocks_checked", stats->b_debug_dir, - &stats->b_check_count); + dir = debugfs_create_dir("blockcheck", parent); + stats->b_debug_dir = dir; - blockcheck_debugfs_create("checksums_failed", stats->b_debug_dir, - &stats->b_failure_count); + debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir, + &stats->b_check_count, &blockcheck_fops); + + debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir, + &stats->b_failure_count, &blockcheck_fops); + + debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir, + &stats->b_recover_count, 
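/*
 * Illustrative sketch, not part of the patch: the checkpoint-before-release
 * pattern the __ocfs2_flush_truncate_log() hunk above relies on. Flushing
 * the journal makes the earlier truncate-log append (TA) durable before the
 * flush transaction (TF) starts returning clusters to the global bitmap, so
 * replay after a crash cannot free the same cluster twice. The function name
 * is made up and error handling is reduced to the minimum.
 */
static int sketch_checkpoint_journal(journal_t *j)
{
	int err;

	jbd2_journal_lock_updates(j);	/* block new handles */
	err = jbd2_journal_flush(j);	/* commit and checkpoint everything */
	jbd2_journal_unlock_updates(j);

	return err;
}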
&blockcheck_fops); - blockcheck_debugfs_create("ecc_recoveries", stats->b_debug_dir, - &stats->b_recover_count); } #else static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f1b613327ac8..a368350d4c27 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -225,10 +225,6 @@ struct o2hb_region { unsigned int hr_region_num; struct dentry *hr_debug_dir; - struct dentry *hr_debug_livenodes; - struct dentry *hr_debug_regnum; - struct dentry *hr_debug_elapsed_time; - struct dentry *hr_debug_pinned; struct o2hb_debug_buf *hr_db_livenodes; struct o2hb_debug_buf *hr_db_regnum; struct o2hb_debug_buf *hr_db_elapsed_time; @@ -1394,21 +1390,20 @@ void o2hb_exit(void) kfree(o2hb_db_failedregions); } -static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, - struct o2hb_debug_buf **db, int db_len, - int type, int size, int len, void *data) +static void o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, int type, + int size, int len, void *data) { *db = kmalloc(db_len, GFP_KERNEL); if (!*db) - return NULL; + return; (*db)->db_type = type; (*db)->db_size = size; (*db)->db_len = len; (*db)->db_data = data; - return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, - &o2hb_debug_fops); + debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops); } static void o2hb_debug_init(void) @@ -1525,11 +1520,7 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slots); - debugfs_remove(reg->hr_debug_livenodes); - debugfs_remove(reg->hr_debug_regnum); - debugfs_remove(reg->hr_debug_elapsed_time); - debugfs_remove(reg->hr_debug_pinned); - debugfs_remove(reg->hr_debug_dir); + debugfs_remove_recursive(reg->hr_debug_dir); kfree(reg->hr_db_livenodes); kfree(reg->hr_db_regnum); kfree(reg->hr_db_elapsed_time); @@ -1988,69 +1979,33 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } -static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +static void o2hb_debug_region_init(struct o2hb_region *reg, + struct dentry *parent) { - int ret = -ENOMEM; + struct dentry *dir; - reg->hr_debug_dir = - debugfs_create_dir(config_item_name(®->hr_item), dir); - if (!reg->hr_debug_dir) { - mlog_errno(ret); - goto bail; - } + dir = debugfs_create_dir(config_item_name(®->hr_item), parent); + reg->hr_debug_dir = dir; - reg->hr_debug_livenodes = - o2hb_debug_create(O2HB_DEBUG_LIVENODES, - reg->hr_debug_dir, - &(reg->hr_db_livenodes), - sizeof(*(reg->hr_db_livenodes)), - O2HB_DB_TYPE_REGION_LIVENODES, - sizeof(reg->hr_live_node_bitmap), - O2NM_MAX_NODES, reg); - if (!reg->hr_debug_livenodes) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes), + sizeof(*(reg->hr_db_livenodes)), + O2HB_DB_TYPE_REGION_LIVENODES, + sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES, + reg); - reg->hr_debug_regnum = - o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, - reg->hr_debug_dir, - &(reg->hr_db_regnum), - sizeof(*(reg->hr_db_regnum)), - O2HB_DB_TYPE_REGION_NUMBER, - 0, O2NM_MAX_NODES, reg); - if (!reg->hr_debug_regnum) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum), + sizeof(*(reg->hr_db_regnum)), + O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg); - reg->hr_debug_elapsed_time = - o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, - 
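/*
 * Illustrative sketch, not part of the patch: the conversion pattern used
 * throughout this series. debugfs_create_file() needs no error checking --
 * if it fails, or debugfs is disabled, the resulting dentry is simply
 * tolerated by later debugfs calls -- so install helpers can return void
 * and no per-file dentries need to be stored for cleanup. Names here are
 * made up; blockcheck_fops is the attribute from the hunk above.
 */
static void sketch_debug_install(struct dentry *parent, u64 *counter)
{
	struct dentry *dir = debugfs_create_dir("stats", parent);

	/* Return value deliberately ignored. */
	debugfs_create_file("count", S_IFREG | S_IRUSR, dir, counter,
			    &blockcheck_fops);
}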
reg->hr_debug_dir, - &(reg->hr_db_elapsed_time), - sizeof(*(reg->hr_db_elapsed_time)), - O2HB_DB_TYPE_REGION_ELAPSED_TIME, - 0, 0, reg); - if (!reg->hr_debug_elapsed_time) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir, + &(reg->hr_db_elapsed_time), + sizeof(*(reg->hr_db_elapsed_time)), + O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg); - reg->hr_debug_pinned = - o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, - reg->hr_debug_dir, - &(reg->hr_db_pinned), - sizeof(*(reg->hr_db_pinned)), - O2HB_DB_TYPE_REGION_PINNED, - 0, 0, reg); - if (!reg->hr_debug_pinned) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned), + sizeof(*(reg->hr_db_pinned)), + O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg); - ret = 0; -bail: - return ret; } static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, @@ -2106,11 +2061,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g if (ret) goto unregister_handler; - ret = o2hb_debug_region_init(reg, o2hb_debug_dir); - if (ret) { - config_item_put(®->hr_item); - goto unregister_handler; - } + o2hb_debug_region_init(reg, o2hb_debug_dir); return ®->hr_item; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 784426dee56c..bdef72c0f099 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3636,7 +3636,7 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, int i, j, num_used; u32 major_hash; struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; - struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; + struct ocfs2_dx_entry_list *orig_list, *tmp_list; struct ocfs2_dx_entry *dx_entry; tmp_list = &tmp_dx_leaf->dl_list; @@ -3645,7 +3645,6 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; orig_list = &orig_dx_leaf->dl_list; new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; - new_list = &new_dx_leaf->dl_list; num_used = le16_to_cpu(orig_list->de_num_used); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 69a429b625cc..aaf24548b02a 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -142,7 +142,6 @@ struct dlm_ctxt atomic_t res_tot_count; atomic_t res_cur_count; - struct dlm_debug_ctxt *dlm_debug_ctxt; struct dentry *dlm_debugfs_subroot; /* NOTE: Next three are protected by dlm_domain_lock */ diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index a4b58ba99927..4d0b452012b2 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -853,67 +853,34 @@ static const struct file_operations debug_state_fops = { /* files in subroot */ void dlm_debug_init(struct dlm_ctxt *dlm) { - struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; - /* for dumping dlm_ctxt */ - dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_state_fops); + debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_state_fops); /* for dumping lockres */ - dc->debug_lockres_dentry = - debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_lockres_fops); + debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops); /* for dumping mles */ - dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - 
dlm, &debug_mle_fops); + debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops); /* for dumping lockres on the purge list */ - dc->debug_purgelist_dentry = - debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_purgelist_fops); -} - -void dlm_debug_shutdown(struct dlm_ctxt *dlm) -{ - struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; - - if (dc) { - debugfs_remove(dc->debug_purgelist_dentry); - debugfs_remove(dc->debug_mle_dentry); - debugfs_remove(dc->debug_lockres_dentry); - debugfs_remove(dc->debug_state_dentry); - kfree(dc); - dc = NULL; - } + debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, + &debug_purgelist_fops); } /* subroot - domain dir */ -int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { - dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), - GFP_KERNEL); - if (!dlm->dlm_debug_ctxt) { - mlog_errno(-ENOMEM); - return -ENOMEM; - } - dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, dlm_debugfs_root); - return 0; } void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { - debugfs_remove(dlm->dlm_debugfs_subroot); + debugfs_remove_recursive(dlm->dlm_debugfs_subroot); } /* debugfs root */ diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 7d0c7c9013ce..f8fd8680a4b6 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -14,13 +14,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle); #ifdef CONFIG_DEBUG_FS -struct dlm_debug_ctxt { - struct dentry *debug_state_dentry; - struct dentry *debug_lockres_dentry; - struct dentry *debug_mle_dentry; - struct dentry *debug_purgelist_dentry; -}; - struct debug_lockres { int dl_len; char *dl_buf; @@ -29,9 +22,8 @@ struct debug_lockres { }; void dlm_debug_init(struct dlm_ctxt *dlm); -void dlm_debug_shutdown(struct dlm_ctxt *dlm); -int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); +void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); void dlm_create_debugfs_root(void); @@ -42,13 +34,9 @@ void dlm_destroy_debugfs_root(void); static inline void dlm_debug_init(struct dlm_ctxt *dlm) { } -static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) +static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { } -static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) -{ - return 0; -} static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7338b5d4647c..ee6f459f9770 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -387,7 +387,6 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) { dlm_unregister_domain_handlers(dlm); - dlm_debug_shutdown(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); dlm_destroy_dlm_worker(dlm); @@ -1938,7 +1937,6 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) if (status) { dlm_unregister_domain_handlers(dlm); - dlm_debug_shutdown(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); dlm_destroy_dlm_worker(dlm); @@ -1992,9 +1990,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->key = key; dlm->node_num = o2nm_this_node(); - ret = dlm_create_debugfs_subroot(dlm); - if (ret < 0) - goto leave; + dlm_create_debugfs_subroot(dlm); spin_lock_init(&dlm->spinlock); 
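/*
 * Illustrative sketch, not part of the patch: the matching teardown. With
 * no per-file dentries saved, cleanup collapses to one recursive removal
 * of the directory, which is exactly what dlm_destroy_debugfs_subroot()
 * and o2hb_region_release() now do. The function name is made up.
 */
static void sketch_debug_teardown(struct dentry *debug_dir)
{
	/* Removes the directory and every file created beneath it;
	 * NULL and error-pointer dentries are tolerated.
	 */
	debugfs_remove_recursive(debug_dir);
}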
spin_lock_init(&dlm->master_lock); @@ -2056,6 +2052,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, mlog(0, "context init: refcount %u\n", kref_read(&dlm->dlm_refs)); + ret = 0; leave: if (ret < 0 && dlm) { if (dlm->master_hash) diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index e78657742bd8..3883633e82eb 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -90,7 +90,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, enum dlm_status status; int actions = 0; int in_use; - u8 owner; + u8 owner; + int recovery_wait = 0; mlog(0, "master_node = %d, valblk = %d\n", master_node, flags & LKM_VALBLK); @@ -193,9 +194,12 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, } if (flags & LKM_CANCEL) lock->cancel_pending = 0; - else - lock->unlock_pending = 0; - + else { + if (!lock->unlock_pending) + recovery_wait = 1; + else + lock->unlock_pending = 0; + } } /* get an extra ref on lock. if we are just switching @@ -229,6 +233,17 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, spin_unlock(&res->spinlock); wake_up(&res->wq); + if (recovery_wait) { + spin_lock(&res->spinlock); + /* An unlock request succeeds immediately once the owner dies, + * and the lock has already been removed from the grant list. + * We have to wait until the RECOVERING flag clears, or we miss + * the chance to purge the lockres, since its removal is much + * faster than the recovery process. + */ + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING); + spin_unlock(&res->spinlock); + } + /* let the caller's final dlm_lock_put handle the actual kfree */ if (actions & DLM_UNLOCK_FREE_LOCK) { /* this should always be coupled with list removal */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 14207234fa3d..6e774c5ea13b 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2508,9 +2508,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, ocfs2_inode_unlock(inode, ex); } - if (local_bh) - brelse(local_bh); - + brelse(local_bh); return status; } @@ -2593,8 +2591,7 @@ int ocfs2_inode_lock_atime(struct inode *inode, *level = 1; if (ocfs2_should_update_atime(inode, vfsmnt)) ocfs2_update_inode_atime(inode, bh); - if (bh) - brelse(bh); + brelse(bh); } else *level = 0; @@ -3012,8 +3009,6 @@ struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) kref_init(&dlm_debug->d_refcnt); INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); - dlm_debug->d_locking_state = NULL; - dlm_debug->d_locking_filter = NULL; dlm_debug->d_filter_secs = 0; out: return dlm_debug; @@ -3282,27 +3277,19 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb) { struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; - dlm_debug->d_locking_state = debugfs_create_file("locking_state", - S_IFREG|S_IRUSR, - osb->osb_debug_root, - osb, - &ocfs2_dlm_debug_fops); + debugfs_create_file("locking_state", S_IFREG|S_IRUSR, + osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops); - dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter", - 0600, - osb->osb_debug_root, - &dlm_debug->d_filter_secs); + debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, + &dlm_debug->d_filter_secs); } static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) { struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; - if (dlm_debug) { - debugfs_remove(dlm_debug->d_locking_state); - debugfs_remove(dlm_debug->d_locking_filter); + if (dlm_debug) ocfs2_put_dlm_debug(dlm_debug); - } } int ocfs2_dlm_init(struct ocfs2_super *osb) diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 
e66a249fe07c..e3e2d1b2af51 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -590,8 +590,7 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, *extent_flags = rec->e_flags; } out: - if (eb_bh) - brelse(eb_bh); + brelse(eb_bh); return ret; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4435df3e5adb..2e982db3e1ae 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -706,7 +706,9 @@ static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, * Thus, we need to explicitly order the zeroed pages. */ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, - struct buffer_head *di_bh) + struct buffer_head *di_bh, + loff_t start_byte, + loff_t length) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; @@ -722,7 +724,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, goto out; } - ret = ocfs2_jbd2_file_inode(handle, inode); + ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret < 0) { mlog_errno(ret); goto out; @@ -761,7 +763,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); - handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); + handle = ocfs2_zero_start_ordered_transaction(inode, di_bh, + abs_from, + abs_to - abs_from); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -2126,7 +2130,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); struct buffer_head *di_bh = NULL; - loff_t end; /* * We start with a read level meta lock and only jump to an ex @@ -2190,8 +2193,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, } } - end = pos + count; - ret = ocfs2_check_range_for_refcount(inode, pos, count); if (ret == 1) { ocfs2_inode_unlock(inode, meta_level); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7ad9d6590818..7c9dfd50c1c1 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -534,7 +534,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, */ mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), - "Inode %llu: system file state is ambigous\n", + "Inode %llu: system file state is ambiguous\n", (unsigned long long)args->fi_blkno); if (S_ISCHR(le16_to_cpu(fe->i_mode)) || diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index c0fe6ed08ab1..3103ba7f97a2 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -144,7 +144,6 @@ static inline void ocfs2_ci_set_new(struct ocfs2_super *osb, void ocfs2_orphan_scan_init(struct ocfs2_super *osb); void ocfs2_orphan_scan_start(struct ocfs2_super *osb); void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); -void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); void ocfs2_complete_recovery(struct work_struct *work); void ocfs2_wait_for_recovery(struct ocfs2_super *osb); @@ -232,8 +231,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) * ocfs2_journal_access_*() unless you intend to * manage the checksum by hand. * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. - * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before - * the current handle commits. + * ocfs2_jbd2_inode_add_write - Mark a range of an inode so that its data + * goes out before the current handle commits. 
*/ /* You must always start_trans with a number of buffs > 0, but it's @@ -441,7 +440,7 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, * previous dirblock update in the free list */ static inline int ocfs2_link_credits(struct super_block *sb) { - return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + + return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 + ocfs2_quota_trans_credits(sb); } @@ -575,37 +574,12 @@ static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb) return ocfs2_extent_recs_per_gd(sb); } -static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, - unsigned int clusters_to_del, - struct ocfs2_dinode *fe, - struct ocfs2_extent_list *last_el) +static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode, + loff_t start_byte, loff_t length) { - /* for dinode + all headers in this pass + update to next leaf */ - u16 next_free = le16_to_cpu(last_el->l_next_free_rec); - u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); - int credits = 1 + tree_depth + 1; - int i; - - i = next_free - 1; - BUG_ON(i < 0); - - /* We may be deleting metadata blocks, so metadata alloc dinode + - one desc. block for each possible delete. */ - if (tree_depth && next_free == 1 && - ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) - credits += 1 + tree_depth; - - /* update to the truncate log. */ - credits += OCFS2_TRUNCATE_LOG_UPDATE; - - credits += ocfs2_quota_trans_credits(sb); - - return credits; -} - -static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) -{ - return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode); + return jbd2_journal_inode_ranged_write(handle, + &OCFS2_I(inode)->ip_jinode, + start_byte, length); } static inline int ocfs2_begin_ordered_truncate(struct inode *inode, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6f8e1c4fdb9c..8ea51cf27b97 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2486,7 +2486,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, struct inode *inode = NULL; struct inode *orphan_dir = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); - struct ocfs2_dinode *di = NULL; handle_t *handle = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; struct buffer_head *parent_di_bh = NULL; @@ -2552,7 +2551,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - di = (struct ocfs2_dinode *)new_di_bh->b_data; status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, &orphan_insert, orphan_dir, false); if (status < 0) { diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fddbbd60f434..9150cfa4df7d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -223,8 +223,6 @@ struct ocfs2_orphan_scan { struct ocfs2_dlm_debug { struct kref d_refcnt; - struct dentry *d_locking_state; - struct dentry *d_locking_filter; u32 d_filter_secs; struct list_head d_lockres_tracking; }; @@ -401,7 +399,6 @@ struct ocfs2_super struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; - struct dentry *osb_ctxt; wait_queue_head_t recovery_event; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8b2f39506648..c81e86c62380 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1080,10 +1080,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); - osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, - osb->osb_debug_root, - osb, - &ocfs2_osb_debug_fops); + debugfs_create_file("fs_state", 
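/*
 * Illustrative sketch, not part of the patch: how callers of the new
 * ocfs2_jbd2_inode_add_write() derive the byte range from a page, mirroring
 * the ocfs2_map_and_dirty_page() and ocfs2_write_end_nolock() hunks above.
 * Attaching only the written range (rather than the whole inode, as the old
 * ocfs2_jbd2_file_inode() effectively did) bounds what ordered-mode
 * writeback must flush before the transaction commits. The function name is
 * made up.
 */
static int sketch_order_page_range(handle_t *handle, struct inode *inode,
				   struct page *page,
				   unsigned int from, unsigned int to)
{
	loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from;
	loff_t length = to - from;

	return ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
}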
S_IFREG|S_IRUSR, osb->osb_debug_root, + osb, &ocfs2_osb_debug_fops); if (ocfs2_meta_ecc(osb)) ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, @@ -1861,8 +1859,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) kset_unregister(osb->osb_dev_kset); - debugfs_remove(osb->osb_ctxt); - /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); @@ -1918,7 +1914,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_dlm_shutdown(osb, hangup_needed); ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); - debugfs_remove(osb->osb_debug_root); + debugfs_remove_recursive(osb->osb_debug_root); if (hangup_needed) ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); diff --git a/fs/open.c b/fs/open.c index bb89771dd2a6..681affa0cb37 100644 --- a/fs/open.c +++ b/fs/open.c @@ -788,7 +788,7 @@ static int do_dentry_open(struct file *f, f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); - if (unlikely(WARN_ON(!f->f_op))) { + if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } @@ -830,6 +830,14 @@ static int do_dentry_open(struct file *f, if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) return -EINVAL; } + + /* + * XXX: Huge page cache doesn't support writing yet. Drop all page + * cache for this file before processing writes. + */ + if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping)) + truncate_pagecache(inode, 0); + return 0; cleanup_all: diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 465ea0153b2a..ac9247371871 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -106,9 +105,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_zone_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); -#ifdef CONFIG_QUICKLIST - show_val_kb(m, "Quicklists: ", quicklist_total_size()); -#endif show_val_kb(m, "NFS_Unstable: ", global_node_page_state(NR_UNSTABLE_NFS)); @@ -136,6 +132,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); + show_val_kb(m, "FileHugePages: ", + global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR); + show_val_kb(m, "FilePmdMapped: ", + global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); #endif #ifdef CONFIG_CMA diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 45b0f652c49a..1ce1cb08a5d3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -474,6 +474,7 @@ struct mem_size_stats { unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; + unsigned long file_thp; unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; @@ -518,7 +519,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked) { - int i, nr = compound ? 1 << compound_order(page) : 1; + int i, nr = compound ? 
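/*
 * Illustrative sketch, not part of the patch: compound_nr(), used by the
 * smaps_account() hunk here, is the helper that names the
 * "1 << compound_order(page)" computation for the number of base pages in
 * a compound page. A minimal open-coded use, assuming @page is a head
 * page; the function name is made up.
 */
static inline unsigned long sketch_nr_base_pages(struct page *page)
{
	return PageCompound(page) ? compound_nr(page) : 1;
}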
compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; /* @@ -645,7 +646,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, else if (is_zone_device_page(page)) /* pass */; else - VM_BUG_ON_PAGE(1, page); + mss->file_thp += HPAGE_PMD_SIZE; smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); } #else @@ -866,6 +867,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", mss->private_hugetlb >> 10, 7); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 8fde658fdb11..5b8d065fa83c 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -88,7 +88,7 @@ static inline void mangle(struct seq_file *m, const char *s) static void show_type(struct seq_file *m, struct super_block *sb) { mangle(m, sb->s_type->name); - if (sb->s_subtype && sb->s_subtype[0]) { + if (sb->s_subtype) { seq_putc(m, '.'); mangle(m, sb->s_subtype); } diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 9c02d96d3a42..4075e41408b4 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -239,10 +239,8 @@ static int balance_leaf_when_delete_left(struct tree_balance *tb) static int balance_leaf_when_delete(struct tree_balance *tb, int flag) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int item_pos = PATH_LAST_POSITION(tb->tb_path); struct buffer_info bi; int n; - struct item_head *ih; RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, "vs- 12000: level: wrong FR %z", tb->FR[0]); @@ -251,7 +249,6 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag) RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), "PAP-12010: tree can not be empty"); - ih = item_head(tbS0, item_pos); buffer_info_init_tbS0(tb, &bi); /* Delete or truncate the item */ @@ -298,7 +295,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb, if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { /* part of new item falls into L[0] */ int new_item_len, shift; - int version; ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1); @@ -317,8 +313,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb, leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, min_t(int, tb->zeroes_num, ih_item_len(ih))); - version = ih_version(ih); - /* * Calculate key component, item length and body to * insert into S[0] @@ -632,7 +626,6 @@ static void balance_leaf_insert_right(struct tree_balance *tb, struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tbS0); struct buffer_info bi; - int ret; /* new item or part of it doesn't fall into R[0] */ if (n - tb->rnum[0] >= tb->item_pos) { @@ -646,13 +639,11 @@ static void balance_leaf_insert_right(struct tree_balance *tb, if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { loff_t old_key_comp, old_len, r_zeroes_number; const char *r_body; - int version, shift; + int shift; loff_t offset; leaf_shift_right(tb, tb->rnum[0] - 1, -1); - version = ih_version(ih); - /* Remember key component and item length */ old_key_comp = le_ih_k_offset(ih); old_len = ih_item_len(ih); @@ -698,7 +689,7 @@ static void balance_leaf_insert_right(struct tree_balance *tb, /* whole new item falls into R[0] */ /* Shift 
rnum[0]-1 items to R[0] */ - ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); + leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); /* Insert new item into R[0] */ buffer_info_init_right(tb, &bi); @@ -950,14 +941,12 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb, if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) { int old_key_comp, old_len, r_zeroes_number; const char *r_body; - int version; /* Move snum[i]-1 items from S[0] to S_new[i] */ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1, tb->S_new[i]); /* Remember key component and item length */ - version = ih_version(ih); old_key_comp = le_ih_k_offset(ih); old_len = ih_item_len(ih); diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 6b0ddb2a9091..117092224111 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -376,7 +376,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, int to, int to_bytes, short *snum012, int flow) { int i; - int cur_free; int units; struct virtual_node *vn = tb->tb_vn; int total_node_size, max_node_size, current_item_size; @@ -438,7 +437,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, /* leaf level */ needed_nodes = 1; total_node_size = 0; - cur_free = max_node_size; /* start from 'from'-th item */ start_item = from; @@ -1734,14 +1732,12 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) * and Fh is its father. */ struct buffer_head *Sh, *Fh; - int maxsize, ret; + int ret; int lfree, rfree /* free space in L and R */ ; Sh = PATH_H_PBUFFER(tb->tb_path, h); Fh = PATH_H_PPARENT(tb->tb_path, h); - maxsize = MAX_CHILD_SIZE(Sh); - /* * using tb->insert_size[h], which is negative in this case, * create_virtual_node calculates: diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4517a1394c6f..4b3e3e73b512 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -891,7 +891,6 @@ static int flush_older_commits(struct super_block *s, struct list_head *entry; unsigned int trans_id = jl->j_trans_id; unsigned int other_trans_id; - unsigned int first_trans_id; find_first: /* @@ -914,8 +913,6 @@ static int flush_older_commits(struct super_block *s, return 0; } - first_trans_id = first_jl->j_trans_id; - entry = &first_jl->j_list; while (1) { other_jl = JOURNAL_LIST_ENTRY(entry); @@ -1351,7 +1348,7 @@ static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { struct reiserfs_journal_list *pjl; - struct reiserfs_journal_cnode *cn, *last; + struct reiserfs_journal_cnode *cn; int count; int was_jwait = 0; int was_dirty = 0; @@ -1509,7 +1506,6 @@ static int flush_journal_list(struct super_block *s, b_blocknr, __func__); } free_cnode: - last = cn; cn = cn->next; if (saved_bh) { /* @@ -1792,7 +1788,6 @@ static int flush_used_journal_lists(struct super_block *s, { unsigned long len = 0; unsigned long cur_len; - int ret; int i; int limit = 256; struct reiserfs_journal_list *tjl; @@ -1829,9 +1824,9 @@ static int flush_used_journal_lists(struct super_block *s, * transactions, but only bother if we've actually spanned * across multiple lists */ - if (flush_jl != jl) { - ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); - } + if (flush_jl != jl) + kupdate_transactions(s, jl, &tjl, &trans_id, len, i); + flush_journal_list(s, flush_jl, 1); put_journal_list(s, flush_jl); put_journal_list(s, jl); @@ -1911,7 +1906,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb, int error) { 
struct reiserfs_transaction_handle myth; - int flushed = 0; struct reiserfs_journal *journal = SB_JOURNAL(sb); /* @@ -1933,7 +1927,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, 1); journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); do_journal_end(&myth, FLUSH_ALL); - flushed = 1; } } @@ -3444,9 +3437,8 @@ static int remove_from_transaction(struct super_block *sb, if (cn == journal->j_last) { journal->j_last = cn->prev; } - if (bh) - remove_journal_hash(sb, journal->j_hash_table, NULL, - bh->b_blocknr, 0); + remove_journal_hash(sb, journal->j_hash_table, NULL, + bh->b_blocknr, 0); clear_buffer_journaled(bh); /* don't log this one */ if (!already_cleaned) { @@ -3988,7 +3980,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) struct buffer_head *c_bh; /* commit bh */ struct buffer_head *d_bh; /* desc bh */ int cur_write_start = 0; /* start index of current log write */ - int old_start; int i; int flush; int wait_on_commit; @@ -4245,7 +4236,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) journal->j_num_work_lists++; /* reset journal values for the next transaction */ - old_start = journal->j_start; journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(sb); diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index f5cebd70d903..7f868569d4d0 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -1322,7 +1322,7 @@ void leaf_paste_entries(struct buffer_info *bi, char *item; struct reiserfs_de_head *deh; char *insert_point; - int i, old_entry_num; + int i; struct buffer_head *bh = bi->bi_bh; if (new_entry_count == 0) @@ -1362,7 +1362,6 @@ void leaf_paste_entries(struct buffer_info *bi, put_deh_location(&deh[i], deh_location(&deh[i]) + paste_size); - old_entry_num = ih_entry_count(ih); put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); /* prepare space for pasted records */ diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 415d66ca87d1..34baf5c0f265 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -183,13 +183,12 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s) int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2; int old_max = sb_oid_maxsize(disk_sb); struct reiserfs_super_block_v1 *disk_sb_v1; - __le32 *objectid_map, *new_objectid_map; + __le32 *objectid_map; int i; disk_sb_v1 = (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); objectid_map = (__le32 *) (disk_sb_v1 + 1); - new_objectid_map = (__le32 *) (disk_sb + 1); if (cur_size > new_size) { /* diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 9fed1c05f1f4..500f2000eb41 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -746,9 +746,6 @@ static void check_leaf_block_head(struct buffer_head *bh) static void check_internal_block_head(struct buffer_head *bh) { - struct block_head *blkh; - - blkh = B_BLK_HEAD(bh); if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh); diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 0037aea97d39..da9ebe33882b 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -593,7 +593,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, struct buffer_head *bh; struct path_element *last_element; int node_level, retval; - int right_neighbor_of_leaf_node; int fs_gen; struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; b_blocknr_t 
reada_blocks[SEARCH_BY_KEY_READA]; @@ -614,8 +613,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, pathrelse(search_path); - right_neighbor_of_leaf_node = 0; - /* * With each iteration of this loop we search through the items in the * current node, and calculate the next current node(next path element) @@ -701,7 +698,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, */ block_number = SB_ROOT_BLOCK(sb); expected_level = -1; - right_neighbor_of_leaf_node = 0; /* repeat search from the root */ continue; diff --git a/fs/super.c b/fs/super.c index 8020974b2a68..f627b7c53d2b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1555,11 +1555,6 @@ int vfs_get_tree(struct fs_context *fc) sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); - if (fc->subtype && !sb->s_subtype) { - sb->s_subtype = fc->subtype; - fc->subtype = NULL; - } - /* * Write barrier is for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is the diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 33f2614210e0..fc08951daa62 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1273,21 +1273,23 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, } static __always_inline int validate_range(struct mm_struct *mm, - __u64 start, __u64 len) + __u64 *start, __u64 len) { __u64 task_size = mm->task_size; - if (start & ~PAGE_MASK) + *start = untagged_addr(*start); + + if (*start & ~PAGE_MASK) return -EINVAL; if (len & ~PAGE_MASK) return -EINVAL; if (!len) return -EINVAL; - if (start < mmap_min_addr) + if (*start < mmap_min_addr) return -EINVAL; - if (start >= task_size) + if (*start >= task_size) return -EINVAL; - if (len > task_size - start) + if (len > task_size - *start) return -EINVAL; return 0; } @@ -1337,7 +1339,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; } - ret = validate_range(mm, uffdio_register.range.start, + ret = validate_range(mm, &uffdio_register.range.start, uffdio_register.range.len); if (ret) goto out; @@ -1527,7 +1529,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) goto out; - ret = validate_range(mm, uffdio_unregister.start, + ret = validate_range(mm, &uffdio_unregister.start, uffdio_unregister.len); if (ret) goto out; @@ -1679,7 +1681,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) goto out; - ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len); if (ret) goto out; @@ -1719,7 +1721,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, sizeof(uffdio_copy)-sizeof(__s64))) goto out; - ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len); if (ret) goto out; /* @@ -1775,7 +1777,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, sizeof(uffdio_zeropage)-sizeof(__s64))) goto out; - ret = validate_range(ctx->mm, uffdio_zeropage.range.start, + ret = validate_range(ctx->mm, &uffdio_zeropage.range.start, uffdio_zeropage.range.len); if (ret) goto out; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 58fa85cec325..d6ed5d2c07c2 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -81,9 +81,10 @@ typedef struct xfs_alloc_arg { /* * Defines for datatype */ -#define XFS_ALLOC_INITIAL_USER_DATA (1 << 
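/*
 * Illustrative sketch, not part of the patch: how xfs_bmapi_allocate()
 * classifies an allocation with the renumbered datatype flags in this
 * hunk. Offset zero of the data fork keeps the special "initial user
 * data" marking; every other data-fork allocation is now tagged with the
 * new XFS_ALLOC_USERDATA bit. The function name is made up.
 */
static int sketch_classify_alloc(int whichfork, xfs_fileoff_t offset)
{
	int datatype = XFS_ALLOC_NOBUSY;

	if (whichfork == XFS_DATA_FORK)
		datatype |= (offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA
					  : XFS_ALLOC_USERDATA;

	return datatype;
}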
0)/* special case start of file */ -#define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */ -#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ +#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ +#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ +#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ +#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ static inline bool xfs_alloc_is_userdata(int datatype) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 054b4ce30033..4edc25a2ba80 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4042,8 +4042,12 @@ xfs_bmapi_allocate( */ if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK && bma->offset == 0) - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + if (whichfork == XFS_DATA_FORK) { + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + else + bma->datatype |= XFS_ALLOC_USERDATA; + } if (bma->flags & XFS_BMAPI_ZERO) bma->datatype |= XFS_ALLOC_USERDATA_ZERO; } @@ -5621,6 +5625,11 @@ xfs_bmse_merge( if (error) return error; + /* change to extent format if required after extent removal */ + error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork); + if (error) + return error; + done: xfs_iext_remove(ip, icur, 0); xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a08dd8f40346..ac6cdca63e15 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -928,7 +928,7 @@ xfs_log_sb( xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); - xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); + xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } /* diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index a43d1813c4ff..5533e48e605d 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -97,7 +97,6 @@ xchk_allocbt_rec( xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t bno; xfs_extlen_t len; - int error = 0; bno = be32_to_cpu(rec->alloc.ar_startblock); len = be32_to_cpu(rec->alloc.ar_blockcount); @@ -109,7 +108,7 @@ xchk_allocbt_rec( xchk_allocbt_xref(bs->sc, bno, len); - return error; + return 0; } /* Scrub the freespace btrees for some AG. 
*/ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 120ef99d09e8..21c243622a79 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2097,7 +2097,7 @@ xfs_verify_magic( int idx; idx = xfs_sb_version_hascrc(&mp->m_sb); - if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) + if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) return false; return dmagic == bp->b_ops->magic[idx]; } @@ -2115,7 +2115,7 @@ xfs_verify_magic16( int idx; idx = xfs_sb_version_hascrc(&mp->m_sb); - if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) + if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) return false; return dmagic == bp->b_ops->magic16[idx]; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d952d5962e93..1ffb179f35d2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -370,21 +370,23 @@ static int xfs_dio_write_end_io( struct kiocb *iocb, ssize_t size, + int error, unsigned flags) { struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); loff_t offset = iocb->ki_pos; unsigned int nofs_flag; - int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (size <= 0) - return size; + if (error) + return error; + if (!size) + return 0; /* * Capture amount written on completion as we can't reliably account @@ -441,6 +443,10 @@ xfs_dio_write_end_io( return error; } +static const struct iomap_dio_ops xfs_dio_write_ops = { + .end_io = xfs_dio_write_end_io, +}; + /* * xfs_file_dio_aio_write - handle direct IO writes * @@ -541,7 +547,7 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); - ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops); /* * If unaligned, this is the only IO in-flight. If it has not yet diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index ddd0bf7a4740..f1bc88f4367c 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -63,19 +63,6 @@ static const struct sysfs_ops xfs_sysfs_ops = { .store = xfs_sysfs_object_store, }; -/* - * xfs_mount kobject. The mp kobject also serves as the per-mount parent object - * that is identified by the fsname under sysfs. 
- */ - -static inline struct xfs_mount * -to_mp(struct kobject *kobject) -{ - struct xfs_kobj *kobj = to_kobj(kobject); - - return container_of(kobj, struct xfs_mount, m_kobj); -} - static struct attribute *xfs_mp_attrs[] = { NULL, }; diff --git a/include/Kbuild b/include/Kbuild index 4ae65e13c3f0..ffba79483cc5 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -312,7 +312,6 @@ header-test- += linux/mfd/as3711.h header-test- += linux/mfd/as3722.h header-test- += linux/mfd/da903x.h header-test- += linux/mfd/da9055/pdata.h -header-test- += linux/mfd/da9063/pdata.h header-test- += linux/mfd/db8500-prcmu.h header-test- += linux/mfd/dbx500-prcmu.h header-test- += linux/mfd/dln2.h diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 7357a3c942a0..384b5c835ced 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -10,6 +10,7 @@ #define BUGFLAG_WARNING (1 << 0) #define BUGFLAG_ONCE (1 << 1) #define BUGFLAG_DONE (1 << 2) +#define BUGFLAG_NO_CUT_HERE (1 << 3) /* CUT_HERE already sent */ #define BUGFLAG_TAINT(taint) ((taint) << 8) #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif @@ -61,18 +62,6 @@ struct bug_entry { #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0) #endif -#ifdef __WARN_FLAGS -#define __WARN_TAINT(taint) __WARN_FLAGS(BUGFLAG_TAINT(taint)) -#define __WARN_ONCE_TAINT(taint) __WARN_FLAGS(BUGFLAG_ONCE|BUGFLAG_TAINT(taint)) - -#define WARN_ON_ONCE(condition) ({ \ - int __ret_warn_on = !!(condition); \ - if (unlikely(__ret_warn_on)) \ - __WARN_ONCE_TAINT(TAINT_WARN); \ - unlikely(__ret_warn_on); \ -}) -#endif - /* * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report * significant kernel issues that need prompt attention if they should ever @@ -89,27 +78,27 @@ struct bug_entry { * * Use the versions with printk format strings to provide better diagnostics. */ -#ifndef __WARN_TAINT -extern __printf(3, 4) -void warn_slowpath_fmt(const char *file, const int line, - const char *fmt, ...); +#ifndef __WARN_FLAGS extern __printf(4, 5) -void warn_slowpath_fmt_taint(const char *file, const int line, unsigned taint, - const char *fmt, ...); -extern void warn_slowpath_null(const char *file, const int line); -#define WANT_WARN_ON_SLOWPATH -#define __WARN() warn_slowpath_null(__FILE__, __LINE__) -#define __WARN_printf(arg...) warn_slowpath_fmt(__FILE__, __LINE__, arg) -#define __WARN_printf_taint(taint, arg...) \ - warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg) +void warn_slowpath_fmt(const char *file, const int line, unsigned taint, + const char *fmt, ...); +#define __WARN() __WARN_printf(TAINT_WARN, NULL) +#define __WARN_printf(taint, arg...) \ + warn_slowpath_fmt(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); -#define __WARN() do { \ - printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \ -} while (0) -#define __WARN_printf(arg...) __WARN_printf_taint(TAINT_WARN, arg) -#define __WARN_printf_taint(taint, arg...) \ - do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) +#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)) +#define __WARN_printf(taint, arg...) 
do { \ + __warn_printk(arg); \ + __WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ + } while (0) +#define WARN_ON_ONCE(condition) ({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + __WARN_FLAGS(BUGFLAG_ONCE | \ + BUGFLAG_TAINT(TAINT_WARN)); \ + unlikely(__ret_warn_on); \ +}) #endif /* used internally by panic.c */ @@ -132,7 +121,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #define WARN(condition, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_printf(format); \ + __WARN_printf(TAINT_WARN, format); \ unlikely(__ret_warn_on); \ }) #endif @@ -140,7 +129,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #define WARN_TAINT(condition, taint, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_printf_taint(taint, format); \ + __WARN_printf(taint, format); \ unlikely(__ret_warn_on); \ }) diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 8476175c07e7..73f7421413cb 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -49,7 +49,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) * @mm: the mm_struct of the current context * @gfp: GFP flags to use for the allocation * - * Allocates a page and runs the pgtable_page_ctor(). + * Allocates a page and runs the pgtable_pte_page_ctor(). * * This function is intended for architectures that need * anything beyond simple page allocation or must have custom GFP flags. @@ -63,7 +63,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) pte = alloc_page(gfp); if (!pte) return NULL; - if (!pgtable_page_ctor(pte)) { + if (!pgtable_pte_page_ctor(pte)) { __free_page(pte); return NULL; } @@ -76,7 +76,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) * pte_alloc_one - allocate a page for PTE-level user page table * @mm: the mm_struct of the current context * - * Allocates a page and runs the pgtable_page_ctor(). + * Allocates a page and runs the pgtable_pte_page_ctor(). * * Return: `struct page` initialized as page table or %NULL on error */ @@ -98,15 +98,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) */ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { - pgtable_page_dtor(pte_page); + pgtable_pte_page_dtor(pte_page); __free_page(pte_page); } -#else /* CONFIG_MMU */ - -/* This is enough for a nommu architecture */ -#define check_pgt_cache() do { } while (0) - #endif /* CONFIG_MMU */ #endif /* __ASM_GENERIC_PGALLOC_H */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 75d9d68a6de7..818691846c90 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1002,9 +1002,8 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) * need this). If THP is not enabled, the pmd can't go away under the * code even if MADV_DONTNEED runs, but if THP is enabled we need to * run a pmd_trans_unstable before walking the ptes after - * split_huge_page_pmd returns (because it may have run when the pmd - * become null, but then a page fault can map in a THP and not a - * regular page). + * split_huge_pmd returns (because it may have run when the pmd become + * null, but then a page fault can map in a THP and not a regular page). 
*/ static inline int pmd_trans_unstable(pmd_t *pmd) { @@ -1126,7 +1125,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, static inline void init_espfix_bsp(void) { } #endif -extern void __init pgd_cache_init(void); +extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 0b5897446dca..c7d6b2e8c3b5 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -46,6 +46,12 @@ enum backlight_notification { BACKLIGHT_UNREGISTERED, }; +enum backlight_scale { + BACKLIGHT_SCALE_UNKNOWN = 0, + BACKLIGHT_SCALE_LINEAR, + BACKLIGHT_SCALE_NON_LINEAR, +}; + struct backlight_device; struct fb_info; @@ -80,6 +86,8 @@ struct backlight_properties { enum backlight_type type; /* Flags used to signal drivers of state changes */ unsigned int state; + /* Type of the brightness scale (linear, non-linear, ...) */ + enum backlight_scale scale; #define BL_CORE_SUSPENDED (1 << 0) /* backlight is suspended */ #define BL_CORE_FBBLANK (1 << 1) /* backlight is under an fb blank event */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d9db32fb75ee..f3ea78b0c91c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1524,10 +1524,14 @@ struct blk_integrity_iter { }; typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); +typedef void (integrity_prepare_fn) (struct request *); +typedef void (integrity_complete_fn) (struct request *, unsigned int); struct blk_integrity_profile { integrity_processing_fn *generate_fn; integrity_processing_fn *verify_fn; + integrity_prepare_fn *prepare_fn; + integrity_complete_fn *complete_fn; const char *name; }; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 82156da3c650..b9dbda1c26aa 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -293,6 +293,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private); struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); u64 ceph_client_gid(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); +extern void ceph_reset_client_addr(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, unsigned long started); extern int ceph_open_session(struct ceph_client *client); diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 23895d178149..c4458dc6a757 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -337,6 +337,7 @@ extern void ceph_msgr_flush(void); extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr); extern void ceph_messenger_fini(struct ceph_messenger *msgr); +extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr); extern void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index b4d134d3312a..dbb8a6959a73 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -109,6 +109,7 @@ extern int ceph_monmap_contains(struct ceph_monmap *m, extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); extern void ceph_monc_stop(struct ceph_mon_client *monc); +extern void ceph_monc_reopen_session(struct ceph_mon_client *monc); enum { 
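/*
 * Illustrative sketch, not part of the patch: a backlight driver
 * advertising its brightness curve through the new scale member added to
 * struct backlight_properties above. A PWM backlight with a plain linear
 * duty-cycle table would report BACKLIGHT_SCALE_LINEAR; the values and
 * function name are made up.
 */
static void sketch_init_backlight_props(struct backlight_properties *props)
{
	props->type = BACKLIGHT_RAW;
	props->max_brightness = 255;
	props->scale = BACKLIGHT_SCALE_LINEAR;	/* exported as sysfs "scale" */
}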
CEPH_SUB_MONMAP = 0, diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ad7fe5d10dcd..eaffbdddf89a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -381,6 +381,7 @@ extern void ceph_osdc_cleanup(void); extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern void ceph_osdc_stop(struct ceph_osd_client *osdc); +extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc); extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg); @@ -388,6 +389,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); +void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc); #define osd_req_op_data(oreq, whch, typ, fld) \ ({ \ diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 9569e7c786d3..4b898cdbdf05 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -129,11 +129,8 @@ static inline bool compaction_failed(enum compact_result result) return false; } -/* - * Compaction has backed off for some reason. It might be throttling or - * lock contention. Retrying is still worthwhile. - */ -static inline bool compaction_withdrawn(enum compact_result result) +/* Compaction needs reclaim to be performed first, so it can continue. */ +static inline bool compaction_needs_reclaim(enum compact_result result) { /* * Compaction backed off due to watermark checks for order-0 @@ -142,6 +139,16 @@ static inline bool compaction_withdrawn(enum compact_result result) if (result == COMPACT_SKIPPED) return true; + return false; +} + +/* + * Compaction has backed off for some reason after doing some work or none + * at all. It might be throttling or lock contention. Retrying might still be + * worthwhile, but with a higher priority if allowed. + */ +static inline bool compaction_withdrawn(enum compact_result result) +{ /* * If compaction is deferred for high-order allocations, it is * because sync compaction recently failed. 
If this is the case @@ -207,6 +214,11 @@ static inline bool compaction_failed(enum compact_result result) return false; } +static inline bool compaction_needs_reclaim(enum compact_result result) +{ + return false; +} + static inline bool compaction_withdrawn(enum compact_result result) { return true; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index b5a5a1ed9efd..78a73eba64dd 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -200,8 +200,8 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node) for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_wrap(cpu, mask, start) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) -#define for_each_cpu_and(cpu, mask, and) \ - for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and) +#define for_each_cpu_and(cpu, mask1, mask2) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) #else /** * cpumask_first - get the first cpu in a cpumask @@ -290,20 +290,20 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator - * @mask: the first cpumask pointer - * @and: the second cpumask pointer + * @mask1: the first cpumask pointer + * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; - * cpumask_and(&tmp, &mask, &and); + * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ -#define for_each_cpu_and(cpu, mask, and) \ +#define for_each_cpu_and(cpu, mask1, mask2) \ for ((cpu) = -1; \ - (cpu) = cpumask_next_and((cpu), (mask), (and)), \ + (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \ (cpu) < nr_cpu_ids;) #endif /* SMP */ diff --git a/include/linux/cred.h b/include/linux/cred.h index f7a30e0099be..18639c069263 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -386,7 +386,6 @@ static inline void put_cred(const struct cred *_cred) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) -#define current_security() (current_cred_xxx(security)) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS diff --git a/include/linux/fs.h b/include/linux/fs.h index bcc2ab162126..875c899c8556 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -429,6 +429,7 @@ int pagecache_write_end(struct file *, struct address_space *mapping, * @i_pages: Cached pages. * @gfp_mask: Memory allocation flags to use for allocating pages. * @i_mmap_writable: Number of VM_SHARED mappings. + * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. 
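[ Editor's note: a minimal usage sketch for the renamed for_each_cpu_and() parameters in the cpumask.h hunk above, inserted between hunks for illustration only. The helper below is hypothetical; cpumask_of_node() and pr_info() are existing kernel APIs, and the iterator behaves exactly as the kernel-doc describes. ]

#include <linux/cpumask.h>
#include <linux/topology.h>
#include <linux/printk.h>

/* Walk the intersection of two masks without allocating a temporary cpumask. */
static void report_cpus_on_node(const struct cpumask *allowed, int node)
{
	unsigned int cpu, n = 0;

	/* Visits only CPUs present in both @allowed and the node's mask. */
	for_each_cpu_and(cpu, allowed, cpumask_of_node(node))
		n++;
	pr_info("node %d: %u allowed CPUs\n", node, n);
}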
@@ -446,6 +447,10 @@ struct address_space { struct xarray i_pages; gfp_t gfp_mask; atomic_t i_mmap_writable; +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + /* number of thp, only for non-shmem files */ + atomic_t nr_thps; +#endif struct rb_root_cached i_mmap; struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; @@ -2818,6 +2823,33 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) return errseq_sample(&mapping->wb_err); } +static inline int filemap_nr_thps(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + return atomic_read(&mapping->nr_thps); +#else + return 0; +#endif +} + +static inline void filemap_nr_thps_inc(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_inc(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + +static inline void filemap_nr_thps_dec(struct address_space *mapping) +{ +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_dec(&mapping->nr_thps); +#else + WARN_ON_ONCE(1); +#endif +} + extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 0424df7f6e6b..e5c14e2c53d3 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -95,7 +95,6 @@ struct fs_context { const struct cred *cred; /* The mounter's credentials */ struct fc_log *log; /* Logging buffer */ const char *source; /* The source name (eg. dev path) */ - const char *subtype; /* The subtype to set on the superblock */ void *security; /* Linux S&M options */ void *s_fs_info; /* Proposed s_fs_info */ unsigned int sb_flags; /* Proposed superblock flags (SB_*) */ diff --git a/include/linux/hid.h b/include/linux/hid.h index d770ab1a0479..cd41f209043f 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1154,29 +1154,32 @@ int hid_pidff_init(struct hid_device *hid); #define hid_pidff_init NULL #endif -#define dbg_hid(format, arg...) \ +#define dbg_hid(fmt, ...) \ do { \ if (hid_debug) \ - printk(KERN_DEBUG "%s: " format, __FILE__, ##arg); \ + printk(KERN_DEBUG "%s: " fmt, __FILE__, ##__VA_ARGS__); \ } while (0) -#define hid_printk(level, hid, fmt, arg...) \ - dev_printk(level, &(hid)->dev, fmt, ##arg) -#define hid_emerg(hid, fmt, arg...) \ - dev_emerg(&(hid)->dev, fmt, ##arg) -#define hid_crit(hid, fmt, arg...) \ - dev_crit(&(hid)->dev, fmt, ##arg) -#define hid_alert(hid, fmt, arg...) \ - dev_alert(&(hid)->dev, fmt, ##arg) -#define hid_err(hid, fmt, arg...) \ - dev_err(&(hid)->dev, fmt, ##arg) -#define hid_notice(hid, fmt, arg...) \ - dev_notice(&(hid)->dev, fmt, ##arg) -#define hid_warn(hid, fmt, arg...) \ - dev_warn(&(hid)->dev, fmt, ##arg) -#define hid_info(hid, fmt, arg...) \ - dev_info(&(hid)->dev, fmt, ##arg) -#define hid_dbg(hid, fmt, arg...) \ - dev_dbg(&(hid)->dev, fmt, ##arg) +#define hid_err(hid, fmt, ...) \ + dev_err(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_notice(hid, fmt, ...) \ + dev_notice(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_warn(hid, fmt, ...) \ + dev_warn(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_info(hid, fmt, ...) \ + dev_info(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_dbg(hid, fmt, ...) \ + dev_dbg(&(hid)->dev, fmt, ##__VA_ARGS__) + +#define hid_err_once(hid, fmt, ...) \ + dev_err_once(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_notice_once(hid, fmt, ...) \ + dev_notice_once(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_warn_once(hid, fmt, ...) 
\ + dev_warn_once(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_info_once(hid, fmt, ...) \ + dev_info_once(&(hid)->dev, fmt, ##__VA_ARGS__) +#define hid_dbg_once(hid, fmt, ...) \ + dev_dbg_once(&(hid)->dev, fmt, ##__VA_ARGS__) #endif diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 45ede62aa85b..61c9ffd89b05 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -267,6 +267,15 @@ static inline bool thp_migration_supported(void) return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } +static inline struct list_head *page_deferred_list(struct page *page) +{ + /* + * Global or memcg deferred list in the second tail pages is + * occupied by compound_head. + */ + return &page[2].deferred_list; +} + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index edfca4278319..53fc34f930d0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -454,7 +454,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, static inline struct hstate *page_hstate(struct page *page) { VM_BUG_ON_PAGE(!PageHuge(page), page); - return size_to_hstate(PAGE_SIZE << compound_order(page)); + return size_to_hstate(page_size(page)); } static inline unsigned hstate_index_to_shift(unsigned index) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 2afe6fdc1dda..b4a017093b69 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -245,7 +245,10 @@ struct vmbus_channel_offer { } pipe; } u; /* - * The sub_channel_index is defined in win8. + * The sub_channel_index is defined in Win8: a value of zero means a + * primary channel and a value of non-zero means a sub-channel. + * + * Before Win8, the field is reserved, meaning it's always zero. */ u16 sub_channel_index; u16 reserved3; @@ -423,6 +426,9 @@ enum vmbus_channel_message_type { CHANNELMSG_COUNT }; +/* Hyper-V supports about 2048 channels, and the RELIDs start with 1. 
*/ +#define INVALID_RELID U32_MAX + struct vmbus_channel_message_header { enum vmbus_channel_message_type msgtype; u32 padding; @@ -934,6 +940,11 @@ static inline bool is_hvsock_channel(const struct vmbus_channel *c) VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); } +static inline bool is_sub_channel(const struct vmbus_channel *c) +{ + return c->offermsg.offer.sub_channel_index != 0; +} + static inline void set_channel_affinity_state(struct vmbus_channel *c, enum hv_numa_policy policy) { @@ -1149,6 +1160,9 @@ struct hv_driver { int (*remove)(struct hv_device *); void (*shutdown)(struct hv_device *); + int (*suspend)(struct hv_device *); + int (*resume)(struct hv_device *); + }; /* Base device object */ diff --git a/include/linux/i2c.h b/include/linux/i2c.h index c0a78c069117..1361637c369d 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -473,7 +473,7 @@ extern struct i2c_client * devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adap, u16 address); extern struct i2c_client * -i2c_new_secondary_device(struct i2c_client *client, +i2c_new_ancillary_device(struct i2c_client *client, const char *name, u16 default_addr); diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index 855476145fe1..aaa8a0767aa3 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -30,26 +30,8 @@ \ /* Callbacks for augmented rbtree insert and remove */ \ \ -static inline ITTYPE ITPREFIX ## _compute_subtree_last(ITSTRUCT *node) \ -{ \ - ITTYPE max = ITLAST(node), subtree_last; \ - if (node->ITRB.rb_left) { \ - subtree_last = rb_entry(node->ITRB.rb_left, \ - ITSTRUCT, ITRB)->ITSUBTREE; \ - if (max < subtree_last) \ - max = subtree_last; \ - } \ - if (node->ITRB.rb_right) { \ - subtree_last = rb_entry(node->ITRB.rb_right, \ - ITSTRUCT, ITRB)->ITSUBTREE; \ - if (max < subtree_last) \ - max = subtree_last; \ - } \ - return max; \ -} \ - \ -RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB, \ - ITTYPE, ITSUBTREE, ITPREFIX ## _compute_subtree_last) \ +RB_DECLARE_CALLBACKS_MAX(static, ITPREFIX ## _augment, \ + ITSTRUCT, ITRB, ITTYPE, ITSUBTREE, ITLAST) \ \ /* Insert / remove interval nodes from the tree */ \ \ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index bc499ceae392..7aa5d6117936 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -188,10 +188,14 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, */ #define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */ #define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */ -typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret, - unsigned flags); + +struct iomap_dio_ops { + int (*end_io)(struct kiocb *iocb, ssize_t size, int error, + unsigned flags); +}; + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, - const struct iomap_ops *ops, iomap_dio_end_io_t end_io); + const struct iomap_ops *ops, const struct iomap_dio_ops *dops); int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); #ifdef CONFIG_SWAP diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index df03825ad1a1..603fbc4e2f70 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1410,8 +1410,6 @@ extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); -extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode 
*inode); -extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode); extern int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index f0b809258ed3..cc162f3e6461 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -183,6 +183,8 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, bool get_value); void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len); void * __weak arch_kexec_kernel_image_load(struct kimage *image); int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index fbf144aaa749..b072aeb1fd78 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -326,8 +326,10 @@ extern atomic_t kgdb_active; (raw_smp_processor_id() == atomic_read(&kgdb_active)) extern bool dbg_is_early; extern void __init dbg_late_init(void); +extern void kgdb_panic(const char *msg); #else /* ! CONFIG_KGDB */ #define in_dbg_master() (0) #define dbg_late_init() +static inline void kgdb_panic(const char *msg) {} #endif /* ! CONFIG_KGDB */ #endif /* _KGDB_H_ */ diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 082d1d2a5216..bc45ea1efbf7 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,6 +15,14 @@ extern int __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags); +#ifdef CONFIG_SHMEM +extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); +#else +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ +} +#endif #define khugepaged_enabled() \ (transparent_hugepage_flags & \ @@ -73,6 +81,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, { return 0; } +static inline void collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index df1318d85f7d..3fced5824aee 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -339,6 +339,9 @@ * Check for permission to change root directory. * @path contains the path structure. * Return 0 if permission is granted. + * @path_notify: + * Check permissions before setting a watch on events as defined by @mask, + * on an object at @path, whose type is defined by @obj_type. * @inode_readlink: * Check the permission to read the symbolic link. * @dentry contains the dentry structure for the file link. 
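[ Editor's note: a hedged sketch of a module wiring up the new path_notify hook documented above; the module name and its permissive policy are hypothetical, only the hook signature comes from this patch. ]

#include <linux/lsm_hooks.h>
#include <linux/init.h>
#include <linux/kernel.h>

static int example_path_notify(const struct path *path, u64 mask,
			       unsigned int obj_type)
{
	/*
	 * A real LSM would check the fsnotify event @mask and the watched
	 * object type @obj_type (FSNOTIFY_OBJ_TYPE_*) against its policy
	 * and return -EPERM to refuse the watch; this sketch allows all.
	 */
	return 0;
}

static struct security_hook_list example_hooks[] __lsm_ro_after_init = {
	LSM_HOOK_INIT(path_notify, example_path_notify),
};

static int __init example_lsm_init(void)
{
	security_add_hooks(example_hooks, ARRAY_SIZE(example_hooks), "example");
	return 0;
}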
@@ -1535,7 +1538,9 @@ union security_list_options { int (*path_chown)(const struct path *path, kuid_t uid, kgid_t gid); int (*path_chroot)(const struct path *path); #endif - + /* Needed for inode based security check */ + int (*path_notify)(const struct path *path, u64 mask, + unsigned int obj_type); int (*inode_alloc_security)(struct inode *inode); void (*inode_free_security)(struct inode *inode); int (*inode_init_security)(struct inode *inode, struct inode *dir, @@ -1860,6 +1865,8 @@ struct security_hook_heads { struct hlist_head path_chown; struct hlist_head path_chroot; #endif + /* Needed for inode based modules as well */ + struct hlist_head path_notify; struct hlist_head inode_alloc_security; struct hlist_head inode_free_security; struct hlist_head inode_init_security; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ad8f1a397ae4..9b60863429cc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -128,9 +128,8 @@ struct mem_cgroup_per_node { struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; -#ifdef CONFIG_MEMCG_KMEM struct memcg_shrinker_map __rcu *shrinker_map; -#endif + struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ @@ -331,6 +330,10 @@ struct mem_cgroup { struct list_head event_list; spinlock_t event_list_lock; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct deferred_split deferred_split_queue; +#endif + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; @@ -1311,6 +1314,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) } while ((memcg = parent_mem_cgroup(memcg))); return false; } + +extern int memcg_expand_shrinker_maps(int new_id); + +extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; @@ -1319,6 +1327,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; } + +static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id) +{ +} #endif struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); @@ -1390,10 +1403,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? 
memcg->kmemcg_id : -1; } -extern int memcg_expand_shrinker_maps(int new_id); - -extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id); #else static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) @@ -1435,8 +1444,6 @@ static inline void memcg_put_cache_ids(void) { } -static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id) { } #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/memory.h b/include/linux/memory.h index 02e633f3ede0..0ebb105eb261 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -25,7 +25,6 @@ struct memory_block { unsigned long start_section_nr; - unsigned long end_section_nr; unsigned long state; /* serialized by the dev->lock */ int section_count; /* serialized by mem_sysfs_mutex */ int online_type; /* for passing data to online routine */ @@ -80,9 +79,9 @@ struct mem_section; #define IPC_CALLBACK_PRI 10 #ifndef CONFIG_MEMORY_HOTPLUG_SPARSE -static inline int memory_dev_init(void) +static inline void memory_dev_init(void) { - return 0; + return; } static inline int register_memory_notifier(struct notifier_block *nb) { @@ -113,7 +112,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size); void remove_memory_block_devices(unsigned long start, unsigned long size); -extern int memory_dev_init(void); +extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); extern int memory_isolate_notify(unsigned long val, void *v); extern struct memory_block *find_memory_block(struct mem_section *); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index fb2a0bd826b9..bef51e35d8d2 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -111,7 +111,6 @@ struct dev_pagemap { struct completion done; enum memory_type type; unsigned int flags; - u64 pci_p2pdma_bus_offset; const struct dev_pagemap_ops *ops; }; diff --git a/include/linux/mfd/da9063/pdata.h b/include/linux/mfd/da9063/pdata.h deleted file mode 100644 index 085edbf7601b..000000000000 --- a/include/linux/mfd/da9063/pdata.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Platform configuration options for DA9063 - * - * Copyright 2012 Dialog Semiconductor Ltd. - * - * Author: Michal Hajduk, Dialog Semiconductor - * Author: Krystian Garbaciak, Dialog Semiconductor - */ - -#ifndef __MFD_DA9063_PDATA_H__ -#define __MFD_DA9063_PDATA_H__ - -/* - * RGB LED configuration - */ -/* LED IDs for flags in struct led_info. */ -enum { - DA9063_GPIO11_LED, - DA9063_GPIO14_LED, - DA9063_GPIO15_LED, - - DA9063_LED_NUM -}; -#define DA9063_LED_ID_MASK 0x3 - -/* LED polarity for flags in struct led_info. */ -#define DA9063_LED_HIGH_LEVEL_ACTIVE 0x0 -#define DA9063_LED_LOW_LEVEL_ACTIVE 0x4 - - -/* - * General PMIC configuration - */ -/* HWMON ADC channels configuration */ -#define DA9063_FLG_FORCE_IN0_MANUAL_MODE 0x0010 -#define DA9063_FLG_FORCE_IN0_AUTO_MODE 0x0020 -#define DA9063_FLG_FORCE_IN1_MANUAL_MODE 0x0040 -#define DA9063_FLG_FORCE_IN1_AUTO_MODE 0x0080 -#define DA9063_FLG_FORCE_IN2_MANUAL_MODE 0x0100 -#define DA9063_FLG_FORCE_IN2_AUTO_MODE 0x0200 -#define DA9063_FLG_FORCE_IN3_MANUAL_MODE 0x0400 -#define DA9063_FLG_FORCE_IN3_AUTO_MODE 0x0800 - -/* Disable register caching. 
*/ -#define DA9063_FLG_NO_CACHE 0x0008 - -struct da9063; - -/* DA9063 platform data */ -struct da9063_pdata { - int (*init)(struct da9063 *da9063); - int irq_base; - bool key_power; - unsigned flags; - struct da9063_regulators_pdata *regulators_pdata; - struct led_platform_data *leds_pdata; -}; - -#endif /* __MFD_DA9063_PDATA_H__ */ diff --git a/include/linux/mfd/intel_soc_pmic_mrfld.h b/include/linux/mfd/intel_soc_pmic_mrfld.h new file mode 100644 index 000000000000..4daecd682275 --- /dev/null +++ b/include/linux/mfd/intel_soc_pmic_mrfld.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Header file for Intel Merrifield Basin Cove PMIC + * + * Copyright (C) 2019 Intel Corporation. All rights reserved. + */ + +#ifndef __INTEL_SOC_PMIC_MRFLD_H__ +#define __INTEL_SOC_PMIC_MRFLD_H__ + +#include <linux/bits.h> + +#define BCOVE_ID 0x00 + +#define BCOVE_ID_MINREV0 GENMASK(2, 0) +#define BCOVE_ID_MAJREV0 GENMASK(5, 3) +#define BCOVE_ID_VENDID0 GENMASK(7, 6) + +#define BCOVE_MINOR(x) (unsigned int)(((x) & BCOVE_ID_MINREV0) >> 0) +#define BCOVE_MAJOR(x) (unsigned int)(((x) & BCOVE_ID_MAJREV0) >> 3) +#define BCOVE_VENDOR(x) (unsigned int)(((x) & BCOVE_ID_VENDID0) >> 6) + +#define BCOVE_IRQLVL1 0x01 + +#define BCOVE_PBIRQ 0x02 +#define BCOVE_TMUIRQ 0x03 +#define BCOVE_THRMIRQ 0x04 +#define BCOVE_BCUIRQ 0x05 +#define BCOVE_ADCIRQ 0x06 +#define BCOVE_CHGRIRQ0 0x07 +#define BCOVE_CHGRIRQ1 0x08 +#define BCOVE_GPIOIRQ 0x09 +#define BCOVE_CRITIRQ 0x0B + +#define BCOVE_MIRQLVL1 0x0C + +#define BCOVE_MPBIRQ 0x0D +#define BCOVE_MTMUIRQ 0x0E +#define BCOVE_MTHRMIRQ 0x0F +#define BCOVE_MBCUIRQ 0x10 +#define BCOVE_MADCIRQ 0x11 +#define BCOVE_MCHGRIRQ0 0x12 +#define BCOVE_MCHGRIRQ1 0x13 +#define BCOVE_MGPIOIRQ 0x14 +#define BCOVE_MCRITIRQ 0x16 +#define BCOVE_SCHGRIRQ0 0x4E +#define BCOVE_SCHGRIRQ1 0x4F + +/* Level 1 IRQs */ +#define BCOVE_LVL1_PWRBTN BIT(0) /* power button */ +#define BCOVE_LVL1_TMU BIT(1) /* time management unit */ +#define BCOVE_LVL1_THRM BIT(2) /* thermal */ +#define BCOVE_LVL1_BCU BIT(3) /* burst control unit */ +#define BCOVE_LVL1_ADC BIT(4) /* ADC */ +#define BCOVE_LVL1_CHGR BIT(5) /* charger */ +#define BCOVE_LVL1_GPIO BIT(6) /* GPIO */ +#define BCOVE_LVL1_CRIT BIT(7) /* critical event */ + +/* Level 2 IRQs: power button */ +#define BCOVE_PBIRQ_PBTN BIT(0) +#define BCOVE_PBIRQ_UBTN BIT(1) + +/* Level 2 IRQs: ADC */ +#define BCOVE_ADCIRQ_BATTEMP BIT(2) +#define BCOVE_ADCIRQ_SYSTEMP BIT(3) +#define BCOVE_ADCIRQ_BATTID BIT(4) +#define BCOVE_ADCIRQ_VIBATT BIT(5) +#define BCOVE_ADCIRQ_CCTICK BIT(7) + +/* Level 2 IRQs: charger */ +#define BCOVE_CHGRIRQ_BAT0ALRT BIT(4) +#define BCOVE_CHGRIRQ_BAT1ALRT BIT(5) +#define BCOVE_CHGRIRQ_BATCRIT BIT(6) + +#define BCOVE_CHGRIRQ_VBUSDET BIT(0) +#define BCOVE_CHGRIRQ_DCDET BIT(1) +#define BCOVE_CHGRIRQ_BATTDET BIT(2) +#define BCOVE_CHGRIRQ_USBIDDET BIT(3) + +#endif /* __INTEL_SOC_PMIC_MRFLD_H__ */ diff --git a/include/linux/mfd/mt6397/core.h b/include/linux/mfd/mt6397/core.h index 25a95e72179b..fc88d315bdde 100644 --- a/include/linux/mfd/mt6397/core.h +++ b/include/linux/mfd/mt6397/core.h @@ -7,6 +7,14 @@ #ifndef __MFD_MT6397_CORE_H__ #define __MFD_MT6397_CORE_H__ +#include + +enum chip_id { + MT6323_CHIP_ID = 0x23, + MT6391_CHIP_ID = 0x91, + MT6397_CHIP_ID = 0x97, +}; + enum mt6397_irq_numbers { MT6397_IRQ_SPKL_AB = 0, MT6397_IRQ_SPKR_AB, @@ -54,6 +62,9 @@ struct mt6397_chip { u16 irq_masks_cache[2]; u16 int_con[2]; u16 int_status[2]; + u16 chip_id; }; +int mt6397_irq_init(struct mt6397_chip *chip); + #endif /* __MFD_MT6397_CORE_H__ */ diff --git
a/include/linux/mm.h b/include/linux/mm.h index 0da0c6b43d3b..3d0c800fe574 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -805,6 +805,24 @@ static inline void set_compound_order(struct page *page, unsigned int order) page[1].compound_order = order; } +/* Returns the number of pages in this potentially compound page. */ +static inline unsigned long compound_nr(struct page *page) +{ + return 1UL << compound_order(page); +} + +/* Returns the number of bytes in this potentially compound page. */ +static inline unsigned long page_size(struct page *page) +{ + return PAGE_SIZE << compound_order(page); +} + +/* Returns the number of bits needed for the number of bytes in a page */ +static inline unsigned int page_shift(struct page *page) +{ + return PAGE_SHIFT + compound_order(page); +} + void free_compound_page(struct page *page); #ifdef CONFIG_MMU @@ -1057,8 +1075,9 @@ static inline void put_user_page(struct page *page) put_page(page); } -void put_user_pages_dirty(struct page **pages, unsigned long npages); -void put_user_pages_dirty_lock(struct page **pages, unsigned long npages); +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty); + void put_user_pages(struct page **pages, unsigned long npages); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -1405,7 +1424,11 @@ extern void pagefault_out_of_memory(void); extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); +#ifdef CONFIG_MMU extern bool can_do_mlock(void); +#else +static inline bool can_do_mlock(void) { return false; } +#endif extern int user_shm_lock(size_t, struct user_struct *); extern void user_shm_unlock(size_t, struct user_struct *); @@ -1926,7 +1949,7 @@ static inline void pgtable_init(void) pgtable_cache_init(); } -static inline bool pgtable_page_ctor(struct page *page) +static inline bool pgtable_pte_page_ctor(struct page *page) { if (!ptlock_init(page)) return false; @@ -1935,7 +1958,7 @@ static inline bool pgtable_page_ctor(struct page *page) return true; } -static inline void pgtable_page_dtor(struct page *page) +static inline void pgtable_pte_page_dtor(struct page *page) { ptlock_free(page); __ClearPageTable(page); @@ -2305,6 +2328,8 @@ extern int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); +unsigned long randomize_stack_top(unsigned long stack_top); + extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long mmap_region(struct file *file, unsigned long addr, @@ -2568,6 +2593,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ +#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ /* * NOTE on FOLL_LONGTERM: @@ -2845,5 +2871,12 @@ void __init setup_nr_node_ids(void); static inline void setup_nr_node_ids(void) {} #endif +extern int memcmp_pages(struct page *page1, struct page *page2); + +static inline int pages_identical(struct page *page1, struct page *page2) +{ + return !memcmp_pages(page1, page2); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c72a2a6da735..660b9f8ba9e4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -138,6 +138,7 @@ struct page { struct 
{ /* Second tail page of compound page */ unsigned long _compound_pad_1; /* compound_head */ unsigned long _compound_pad_2; + /* For both global and memcg */ struct list_head deferred_list; }; struct { /* Page table pages */ diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index d7016dcb245e..c1bc6731125c 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -36,6 +36,10 @@ struct vmacache { struct vm_area_struct *vmas[VMACACHE_SIZE]; }; +/* + * When updating this, please also update struct resident_page_types[] in + * kernel/fork.c + */ enum { MM_FILEPAGES, /* Resident file mapping pages */ MM_ANONPAGES, /* Resident anonymous pages */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3f38c30d2f13..bda20282746b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -235,6 +235,8 @@ enum node_stat_item { NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, + NR_FILE_THPS, + NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_VMSCAN_WRITE, @@ -677,6 +679,14 @@ struct zonelist { extern struct page *mem_map; #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +struct deferred_split { + spinlock_t split_queue_lock; + struct list_head split_queue; + unsigned long split_queue_len; +}; +#endif + /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which @@ -756,9 +766,7 @@ typedef struct pglist_data { #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE - spinlock_t split_queue_lock; - struct list_head split_queue; - unsigned long split_queue_len; + struct deferred_split deferred_split_queue; #endif /* Fields commonly accessed by the page reclaim scanner */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 0a11712a80e3..570a60c2f4f4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -490,6 +490,9 @@ extern const struct file_operations nfs_dir_operations; extern const struct dentry_operations nfs_dentry_operations; extern void nfs_force_lookup_revalidate(struct inode *dir); +extern struct dentry *nfs_add_or_obtain(struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr, + struct nfs4_label *label); extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label); extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags); diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 09592951725c..682fd465df06 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -18,6 +18,7 @@ struct page_ext_operations { enum page_ext_flags { PAGE_EXT_OWNER, + PAGE_EXT_OWNER_ACTIVE, #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c7552459a15f..37a4d9e32cd3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -333,6 +333,16 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } +static inline struct page *find_subpage(struct page *page, pgoff_t offset) +{ + if (PageHuge(page)) + return page; + + VM_BUG_ON_PAGE(PageTail(page), page); + + return page + (offset & (compound_nr(page) - 1)); +} + struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); struct page *find_lock_entry(struct address_space *mapping, 
pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, diff --git a/include/linux/pci-aspm.h b/include/linux/pci-aspm.h deleted file mode 100644 index 67064145d76e..000000000000 --- a/include/linux/pci-aspm.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * aspm.h - * - * PCI Express ASPM defines and function prototypes - * - * Copyright (C) 2007 Intel Corp. - * Zhang Yanmin (yanmin.zhang@intel.com) - * Shaohua Li (shaohua.li@intel.com) - * - * For more information, please consult the following manuals (look at - * http://www.pcisig.com/ for how to get them): - * - * PCI Express Specification - */ - -#ifndef LINUX_ASPM_H -#define LINUX_ASPM_H - -#include <linux/pci.h> - -#define PCIE_LINK_STATE_L0S 1 -#define PCIE_LINK_STATE_L1 2 -#define PCIE_LINK_STATE_CLKPM 4 - -#ifdef CONFIG_PCIEASPM -int pci_disable_link_state(struct pci_dev *pdev, int state); -int pci_disable_link_state_locked(struct pci_dev *pdev, int state); -void pcie_no_aspm(void); -#else -static inline int pci_disable_link_state(struct pci_dev *pdev, int state) -{ return 0; } -static inline void pcie_no_aspm(void) { } -#endif - -#endif /* LINUX_ASPM_H */ diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index bca9bc3e5be7..8318a97c9c61 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -30,8 +30,10 @@ struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev, unsigned int *nents, u32 length); void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl); void pci_p2pmem_publish(struct pci_dev *pdev, bool publish); -int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir); +int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs); +void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs); int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, @@ -81,11 +83,17 @@ static inline void pci_p2pmem_free_sgl(struct pci_dev *pdev, static inline void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) { } -static inline int pci_p2pdma_map_sg(struct device *dev, - struct scatterlist *sg, int nents, enum dma_data_direction dir) +static inline int pci_p2pdma_map_sg_attrs(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir, + unsigned long attrs) { return 0; } +static inline void pci_p2pdma_unmap_sg_attrs(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir, + unsigned long attrs) +{ +} static inline int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma) { @@ -111,4 +119,16 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client) return pci_p2pmem_find_many(&client, 1); } +static inline int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ + return pci_p2pdma_map_sg_attrs(dev, sg, nents, dir, 0); +} + +static inline void pci_p2pdma_unmap_sg(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir) +{ + pci_p2pdma_unmap_sg_attrs(dev, sg, nents, dir, 0); +} + #endif /* _LINUX_PCI_P2P_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 82e4cd1b7ac3..f9088c89a534 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h
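[ Editor's note: an illustrative caller for the pci-p2pdma.h change above: existing pci_p2pdma_map_sg()/pci_p2pdma_unmap_sg() users keep compiling because those are now wrappers passing attrs == 0. The function below is a hypothetical sketch, not part of the patch. ]

#include <linux/pci-p2pdma.h>
#include <linux/scatterlist.h>
#include <linux/dma-direction.h>

static int example_p2p_transfer(struct device *dev, struct scatterlist *sgl,
				int nents)
{
	/* Equivalent to pci_p2pdma_map_sg_attrs(dev, sgl, nents, dir, 0). */
	int mapped = pci_p2pdma_map_sg(dev, sgl, nents, DMA_TO_DEVICE);

	if (!mapped)
		return -EIO;
	/* ... program the device with the mapped list ... */
	pci_p2pdma_unmap_sg(dev, sgl, nents, DMA_TO_DEVICE);
	return 0;
}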
@@ -6,12 +6,18 @@ * Copyright 1994, Drew Eckhardt * Copyright 1997--1999 Martin Mares * + * PCI Express ASPM defines and function prototypes + * Copyright (c) 2007 Intel Corp. + * Zhang Yanmin (yanmin.zhang@intel.com) + * Shaohua Li (shaohua.li@intel.com) + * * For more information, please consult the following manuals (look at * http://www.pcisig.com/ for how to get them): * * PCI BIOS Specification * PCI Local Bus Specification * PCI to PCI Bridge Specification + * PCI Express Specification * PCI System Design Guide */ #ifndef LINUX_PCI_H @@ -145,11 +151,6 @@ static inline const char *pci_power_name(pci_power_t state) return pci_power_names[1 + (__force int) state]; } -#define PCI_PM_D2_DELAY 200 -#define PCI_PM_D3_WAIT 10 -#define PCI_PM_D3COLD_WAIT 100 -#define PCI_PM_BUS_WAIT 50 - /** * typedef pci_channel_state_t * @@ -418,7 +419,6 @@ struct pci_dev { unsigned int broken_intx_masking:1; /* INTx masking can't be used */ unsigned int io_window_1k:1; /* Intel bridge 1K I/O windows */ unsigned int irq_managed:1; - unsigned int has_secondary_link:1; unsigned int non_compliant_bars:1; /* Broken BARs; ignore them */ unsigned int is_probed:1; /* Device probing in progress */ unsigned int link_active_reporting:1;/* Device capable of reporting link active */ @@ -649,9 +649,6 @@ static inline struct pci_dev *pci_upstream_bridge(struct pci_dev *dev) return dev->bus->self; } -struct device *pci_get_host_bridge_device(struct pci_dev *dev); -void pci_put_host_bridge_device(struct device *dev); - #ifdef CONFIG_PCI_MSI static inline bool pci_dev_msi_enabled(struct pci_dev *pci_dev) { @@ -925,6 +922,11 @@ enum { PCI_SCAN_ALL_PCIE_DEVS = 0x00000040, /* Scan all, not just dev 0 */ }; +#define PCI_IRQ_LEGACY (1 << 0) /* Allow legacy interrupts */ +#define PCI_IRQ_MSI (1 << 1) /* Allow MSI interrupts */ +#define PCI_IRQ_MSIX (1 << 2) /* Allow MSI-X interrupts */ +#define PCI_IRQ_AFFINITY (1 << 3) /* Auto-assign affinity */ + /* These external functions are only available when PCI support is enabled */ #ifdef CONFIG_PCI @@ -969,7 +971,7 @@ resource_size_t pcibios_align_resource(void *, const struct resource *, resource_size_t, resource_size_t); -/* Weak but can be overriden by arch */ +/* Weak but can be overridden by arch */ void pci_fixup_cardbus(struct pci_bus *); /* Generic PCI functions used internally */ @@ -995,7 +997,6 @@ struct pci_bus *pci_scan_root_bus(struct device *parent, int bus, int pci_scan_root_bus_bridge(struct pci_host_bridge *bridge); struct pci_bus *pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr); -void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr, const char *name, struct hotplug_slot *hotplug); @@ -1241,19 +1242,12 @@ int pci_wake_from_d3(struct pci_dev *dev, bool enable); int pci_prepare_to_sleep(struct pci_dev *dev); int pci_back_from_sleep(struct pci_dev *dev); bool pci_dev_run_wake(struct pci_dev *dev); -bool pci_check_pme_status(struct pci_dev *dev); -void pci_pme_wakeup_bus(struct pci_bus *bus); void pci_d3cold_enable(struct pci_dev *dev); void pci_d3cold_disable(struct pci_dev *dev); bool pcie_relaxed_ordering_enabled(struct pci_dev *dev); void pci_wakeup_bus(struct pci_bus *bus); void pci_bus_set_current_state(struct pci_bus *bus, pci_power_t state); -/* PCI Virtual Channel */ -int pci_save_vc_state(struct pci_dev *dev); -void pci_restore_vc_state(struct pci_dev *dev); -void pci_allocate_vc_save_buffers(struct pci_dev *dev); - /* For use by arch with custom probe 
code */ void set_pcie_port_type(struct pci_dev *pdev); void set_pcie_hotplug_bridge(struct pci_dev *pdev); @@ -1297,8 +1291,6 @@ int pci_request_selected_regions_exclusive(struct pci_dev *, int, const char *); void pci_release_selected_regions(struct pci_dev *, int); /* drivers/pci/bus.c */ -struct pci_bus *pci_bus_get(struct pci_bus *bus); -void pci_bus_put(struct pci_bus *bus); void pci_add_resource(struct list_head *resources, struct resource *res); void pci_add_resource_offset(struct list_head *resources, struct resource *res, resource_size_t offset); @@ -1408,11 +1400,6 @@ resource_size_t pcibios_window_alignment(struct pci_bus *bus, int pci_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags); -#define PCI_IRQ_LEGACY (1 << 0) /* Allow legacy interrupts */ -#define PCI_IRQ_MSI (1 << 1) /* Allow MSI interrupts */ -#define PCI_IRQ_MSIX (1 << 2) /* Allow MSI-X interrupts */ -#define PCI_IRQ_AFFINITY (1 << 3) /* Auto-assign affinity */ - /* * Virtual interrupts allow for more interrupts to be allocated * than the device has interrupts for. These are not programmed @@ -1517,14 +1504,6 @@ static inline int pci_irq_get_node(struct pci_dev *pdev, int vec) } #endif -static inline int -pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, - unsigned int max_vecs, unsigned int flags) -{ - return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags, - NULL); -} - /** * pci_irqd_intx_xlate() - Translate PCI INTx value to an IRQ domain hwirq * @d: the INTx IRQ domain @@ -1565,10 +1544,22 @@ extern bool pcie_ports_native; #define pcie_ports_native false #endif +#define PCIE_LINK_STATE_L0S 1 +#define PCIE_LINK_STATE_L1 2 +#define PCIE_LINK_STATE_CLKPM 4 + #ifdef CONFIG_PCIEASPM +int pci_disable_link_state(struct pci_dev *pdev, int state); +int pci_disable_link_state_locked(struct pci_dev *pdev, int state); +void pcie_no_aspm(void); bool pcie_aspm_support_enabled(void); bool pcie_aspm_enabled(struct pci_dev *pdev); #else +static inline int pci_disable_link_state(struct pci_dev *pdev, int state) +{ return 0; } +static inline int pci_disable_link_state_locked(struct pci_dev *pdev, int state) +{ return 0; } +static inline void pcie_no_aspm(void) { } static inline bool pcie_aspm_support_enabled(void) { return false; } static inline bool pcie_aspm_enabled(struct pci_dev *pdev) { return false; } #endif @@ -1579,23 +1570,8 @@ bool pci_aer_available(void); static inline bool pci_aer_available(void) { return false; } #endif -#ifdef CONFIG_PCIE_ECRC -void pcie_set_ecrc_checking(struct pci_dev *dev); -void pcie_ecrc_get_policy(char *str); -#else -static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } -static inline void pcie_ecrc_get_policy(char *str) { } -#endif - bool pci_ats_disabled(void); -#ifdef CONFIG_PCIE_PTM -int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); -#else -static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) -{ return -EINVAL; } -#endif - void pci_cfg_access_lock(struct pci_dev *dev); bool pci_cfg_access_trylock(struct pci_dev *dev); void pci_cfg_access_unlock(struct pci_dev *dev); @@ -1749,11 +1725,6 @@ static inline void pci_release_regions(struct pci_dev *dev) { } static inline unsigned long pci_address_to_pio(phys_addr_t addr) { return -1; } -static inline void pci_block_cfg_access(struct pci_dev *dev) { } -static inline int pci_block_cfg_access_in_atomic(struct pci_dev *dev) -{ return 0; } -static inline void pci_unblock_cfg_access(struct pci_dev *dev) { } - static inline struct pci_bus 
*pci_find_next_bus(const struct pci_bus *from) { return NULL; } static inline struct pci_dev *pci_get_slot(struct pci_bus *bus, @@ -1782,17 +1753,36 @@ static inline const struct pci_device_id *pci_match_id(const struct pci_device_i struct pci_dev *dev) { return NULL; } static inline bool pci_ats_disabled(void) { return true; } + +static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr) +{ + return -EINVAL; +} + +static inline int +pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags, + struct irq_affinity *aff_desc) +{ + return -ENOSPC; +} #endif /* CONFIG_PCI */ +static inline int +pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, + unsigned int max_vecs, unsigned int flags) +{ + return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags, + NULL); +} + #ifdef CONFIG_PCI_ATS /* Address Translation Service */ -void pci_ats_init(struct pci_dev *dev); int pci_enable_ats(struct pci_dev *dev, int ps); void pci_disable_ats(struct pci_dev *dev); int pci_ats_queue_depth(struct pci_dev *dev); int pci_ats_page_aligned(struct pci_dev *dev); #else -static inline void pci_ats_init(struct pci_dev *d) { } static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; } static inline void pci_disable_ats(struct pci_dev *d) { } static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; } @@ -1803,7 +1793,7 @@ static inline int pci_ats_page_aligned(struct pci_dev *dev) { return 0; } #include <asm/pci.h> -/* These two functions provide almost identical functionality. Depennding +/* These two functions provide almost identical functionality. Depending * on the architecture, one will be implemented as a wrapper around the * other (in drivers/pci/mmap.c). * @@ -1872,25 +1862,9 @@ static inline const char *pci_name(const struct pci_dev *pdev) { return dev_name(&pdev->dev); } - -/* - * Some archs don't want to expose struct resource to userland as-is - * in sysfs and /proc - */ -#ifdef HAVE_ARCH_PCI_RESOURCE_TO_USER void pci_resource_to_user(const struct pci_dev *dev, int bar, const struct resource *rsrc, resource_size_t *start, resource_size_t *end); -#else -static inline void pci_resource_to_user(const struct pci_dev *dev, int bar, - const struct resource *rsrc, resource_size_t *start, - resource_size_t *end) -{ - *start = rsrc->start; - *end = rsrc->end; -} -#endif /* HAVE_ARCH_PCI_RESOURCE_TO_USER */ - /* * The world is not perfect and supplies us with broken PCI devices.
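[ Editor's note: a sketch of the probe-time pattern served by pci_alloc_irq_vectors(), which the hunk above moves outside the CONFIG_PCI guard so the wrapper is always visible; the driver function below is hypothetical, the flags and helpers are from this header. ]

#include <linux/pci.h>
#include <linux/printk.h>

static int example_setup_irqs(struct pci_dev *pdev)
{
	int nvecs, i;

	/* Prefer MSI-X, fall back to MSI, then to a single legacy IRQ. */
	nvecs = pci_alloc_irq_vectors(pdev, 1, 8,
				      PCI_IRQ_MSIX | PCI_IRQ_MSI | PCI_IRQ_LEGACY);
	if (nvecs < 0)
		return nvecs;	/* the stubs yield -ENOSPC when CONFIG_PCI=n */

	for (i = 0; i < nvecs; i++)
		pr_debug("vector %d -> Linux IRQ %d\n", i,
			 pci_irq_vector(pdev, i));
	return 0;
}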
@@ -2032,10 +2006,6 @@ extern unsigned long pci_cardbus_mem_size; extern u8 pci_dfl_cache_line_size; extern u8 pci_cache_line_size; -extern unsigned long pci_hotplug_io_size; -extern unsigned long pci_hotplug_mem_size; -extern unsigned long pci_hotplug_bus_size; - /* Architecture-specific versions may override these (weak) */ void pcibios_disable_device(struct pci_dev *dev); void pcibios_set_master(struct pci_dev *dev); @@ -2305,10 +2275,6 @@ int pci_vpd_find_info_keyword(const u8 *buf, unsigned int off, #ifdef CONFIG_OF struct device_node; struct irq_domain; -void pci_set_of_node(struct pci_dev *dev); -void pci_release_of_node(struct pci_dev *dev); -void pci_set_bus_of_node(struct pci_bus *bus); -void pci_release_bus_of_node(struct pci_bus *bus); struct irq_domain *pci_host_bridge_of_msi_domain(struct pci_bus *bus); int pci_parse_request_of_pci_ranges(struct device *dev, struct list_head *resources, @@ -2318,10 +2284,6 @@ int pci_parse_request_of_pci_ranges(struct device *dev, struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus); #else /* CONFIG_OF */ -static inline void pci_set_of_node(struct pci_dev *dev) { } -static inline void pci_release_of_node(struct pci_dev *dev) { } -static inline void pci_set_bus_of_node(struct pci_bus *bus) { } -static inline void pci_release_bus_of_node(struct pci_bus *bus) { } static inline struct irq_domain * pci_host_bridge_of_msi_domain(struct pci_bus *bus) { return NULL; } static inline int pci_parse_request_of_pci_ranges(struct device *dev, @@ -2435,4 +2397,7 @@ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type); #define pci_notice_ratelimited(pdev, fmt, arg...) \ dev_notice_ratelimited(&(pdev)->dev, fmt, ##arg) +#define pci_info_ratelimited(pdev, fmt, arg...) \ + dev_info_ratelimited(&(pdev)->dev, fmt, ##arg) + #endif /* LINUX_PCI_H */ diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index f694eb2ca978..b482e42d7153 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -86,114 +86,14 @@ void pci_hp_deregister(struct hotplug_slot *slot); #define pci_hp_initialize(slot, bus, nr, name) \ __pci_hp_initialize(slot, bus, nr, name, THIS_MODULE, KBUILD_MODNAME) -/* PCI Setting Record (Type 0) */ -struct hpp_type0 { - u32 revision; - u8 cache_line_size; - u8 latency_timer; - u8 enable_serr; - u8 enable_perr; -}; - -/* PCI-X Setting Record (Type 1) */ -struct hpp_type1 { - u32 revision; - u8 max_mem_read; - u8 avg_max_split; - u16 tot_max_split; -}; - -/* PCI Express Setting Record (Type 2) */ -struct hpp_type2 { - u32 revision; - u32 unc_err_mask_and; - u32 unc_err_mask_or; - u32 unc_err_sever_and; - u32 unc_err_sever_or; - u32 cor_err_mask_and; - u32 cor_err_mask_or; - u32 adv_err_cap_and; - u32 adv_err_cap_or; - u16 pci_exp_devctl_and; - u16 pci_exp_devctl_or; - u16 pci_exp_lnkctl_and; - u16 pci_exp_lnkctl_or; - u32 sec_unc_err_sever_and; - u32 sec_unc_err_sever_or; - u32 sec_unc_err_mask_and; - u32 sec_unc_err_mask_or; -}; - -/* - * _HPX PCI Express Setting Record (Type 3) - */ -struct hpx_type3 { - u16 device_type; - u16 function_type; - u16 config_space_location; - u16 pci_exp_cap_id; - u16 pci_exp_cap_ver; - u16 pci_exp_vendor_id; - u16 dvsec_id; - u16 dvsec_rev; - u16 match_offset; - u32 match_mask_and; - u32 match_value; - u16 reg_offset; - u32 reg_mask_and; - u32 reg_mask_or; -}; - -struct hotplug_program_ops { - void (*program_type0)(struct pci_dev *dev, struct hpp_type0 *hpp); - void (*program_type1)(struct pci_dev *dev, struct hpp_type1 *hpp); - void 
(*program_type2)(struct pci_dev *dev, struct hpp_type2 *hpp); - void (*program_type3)(struct pci_dev *dev, struct hpx_type3 *hpp); -}; - -enum hpx_type3_dev_type { - HPX_TYPE_ENDPOINT = BIT(0), - HPX_TYPE_LEG_END = BIT(1), - HPX_TYPE_RC_END = BIT(2), - HPX_TYPE_RC_EC = BIT(3), - HPX_TYPE_ROOT_PORT = BIT(4), - HPX_TYPE_UPSTREAM = BIT(5), - HPX_TYPE_DOWNSTREAM = BIT(6), - HPX_TYPE_PCI_BRIDGE = BIT(7), - HPX_TYPE_PCIE_BRIDGE = BIT(8), -}; - -enum hpx_type3_fn_type { - HPX_FN_NORMAL = BIT(0), - HPX_FN_SRIOV_PHYS = BIT(1), - HPX_FN_SRIOV_VIRT = BIT(2), -}; - -enum hpx_type3_cfg_loc { - HPX_CFG_PCICFG = 0, - HPX_CFG_PCIE_CAP = 1, - HPX_CFG_PCIE_CAP_EXT = 2, - HPX_CFG_VEND_CAP = 3, - HPX_CFG_DVSEC = 4, - HPX_CFG_MAX, -}; - #ifdef CONFIG_ACPI #include <linux/acpi.h> -int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops); bool pciehp_is_native(struct pci_dev *bridge); int acpi_get_hp_hw_control_from_firmware(struct pci_dev *bridge); bool shpchp_is_native(struct pci_dev *bridge); int acpi_pci_check_ejectable(struct pci_bus *pbus, acpi_handle handle); int acpi_pci_detect_ejectable(acpi_handle handle); #else -static inline int pci_acpi_program_hp_params(struct pci_dev *dev, - const struct hotplug_program_ops *hp_ops) -{ - return -ENODEV; -} - static inline int acpi_get_hp_hw_control_from_firmware(struct pci_dev *bridge) { return 0; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index de1b75e963ef..21a572469a4e 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2134,6 +2134,7 @@ #define PCI_VENDOR_ID_MYRICOM 0x14c1 #define PCI_VENDOR_ID_MEDIATEK 0x14c3 +#define PCI_DEVICE_ID_MEDIATEK_7629 0x7629 #define PCI_VENDOR_ID_TITAN 0x14D2 #define PCI_DEVICE_ID_TITAN_010L 0x8001 @@ -2574,6 +2575,8 @@ #define PCI_VENDOR_ID_ASMEDIA 0x1b21 +#define PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS 0x1c36 + #define PCI_VENDOR_ID_CIRCUITCO 0x1cc8 #define PCI_SUBSYSTEM_ID_CIRCUITCO_MINNOWBOARD 0x0001 diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index 7ccb8757b79d..98415686cbfa 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -5513,6 +5513,18 @@ struct ec_params_fp_seed { uint8_t seed[FP_CONTEXT_TPM_BYTES]; } __ec_align4; +#define EC_CMD_FP_ENC_STATUS 0x0409 + +/* FP TPM seed has been set or not */ +#define FP_ENC_STATUS_SEED_SET BIT(0) + +struct ec_response_fp_encryption_status { + /* Used bits in encryption engine status */ + uint32_t valid_flags; + /* Encryption engine status */ + uint32_t status; +} __ec_align4; + /*****************************************************************************/ /* Touchpad MCU commands: range 0x0500-0x05FF */ diff --git a/include/linux/printk.h b/include/linux/printk.h index cefd374c47b1..c09d67edda3a 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -488,13 +488,6 @@ extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, extern void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); -#if defined(CONFIG_DYNAMIC_DEBUG) -#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ - dynamic_hex_dump(prefix_str, prefix_type, 16, 1, buf, len, true) -#else -extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type, - const void *buf, size_t len); -#endif /* defined(CONFIG_DYNAMIC_DEBUG) */ #else static inline void print_hex_dump(const char *level, const char
*prefix_str, int prefix_type, int rowsize, int groupsize, @@ -526,4 +519,19 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, } #endif +/** + * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params + * @prefix_str: string to prefix each line with; + * caller supplies trailing spaces for alignment if desired + * @prefix_type: controls whether prefix of an offset, address, or none + * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) + * @buf: data blob to dump + * @len: number of bytes in the @buf + * + * Calls print_hex_dump(), with log level of KERN_DEBUG, + * rowsize of 16, groupsize of 1, and ASCII output included. + */ +#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ + print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true) + #endif diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h deleted file mode 100644 index 034982c98c8b..000000000000 --- a/include/linux/quicklist.h +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_QUICKLIST_H -#define LINUX_QUICKLIST_H -/* - * Fast allocations and disposal of pages. Pages must be in the condition - * as needed after allocation when they are freed. Per cpu lists of pages - * are kept that only contain node local pages. - * - * (C) 2007, SGI. Christoph Lameter - */ -#include <linux/kernel.h> -#include <linux/gfp.h> -#include <linux/percpu.h> - -#ifdef CONFIG_QUICKLIST - -struct quicklist { - void *page; - int nr_pages; -}; - -DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; - -/* - * The two key functions quicklist_alloc and quicklist_free are inline so - * that they may be custom compiled for the platform. - * Specifying a NULL ctor can remove constructor support. Specifying - * a constant quicklist allows the determination of the exact address - * in the per cpu area. - * - * The fast patch in quicklist_alloc touched only a per cpu cacheline and - * the first cacheline of the page itself. There is minmal overhead involved.
- */ -static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *)) -{ - struct quicklist *q; - void **p = NULL; - - q =&get_cpu_var(quicklist)[nr]; - p = q->page; - if (likely(p)) { - q->page = p[0]; - p[0] = NULL; - q->nr_pages--; - } - put_cpu_var(quicklist); - if (likely(p)) - return p; - - p = (void *)__get_free_page(flags | __GFP_ZERO); - if (ctor && p) - ctor(p); - return p; -} - -static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p, - struct page *page) -{ - struct quicklist *q; - - q = &get_cpu_var(quicklist)[nr]; - *(void **)p = q->page; - q->page = p; - q->nr_pages++; - put_cpu_var(quicklist); -} - -static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp) -{ - __quicklist_free(nr, dtor, pp, virt_to_page(pp)); -} - -static inline void quicklist_free_page(int nr, void (*dtor)(void *), - struct page *page) -{ - __quicklist_free(nr, dtor, page_address(page), page); -} - -void quicklist_trim(int nr, void (*dtor)(void *), - unsigned long min_pages, unsigned long max_free); - -unsigned long quicklist_total_size(void); - -#else - -static inline unsigned long quicklist_total_size(void) -{ - return 0; -} - -#endif - -#endif /* LINUX_QUICKLIST_H */ - diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 179faab29f52..fdd421b8d9ae 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -60,41 +60,87 @@ rb_insert_augmented_cached(struct rb_node *node, rb_insert_augmented(node, &root->rb_root, augment); } -#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ - rbtype, rbaugmented, rbcompute) \ +/* + * Template for declaring augmented rbtree callbacks (generic case) + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data + */ + +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ -rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ - rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ - rbtype augmented = rbcompute(node); \ - if (node->rbaugmented == augmented) \ + RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ + if (RBCOMPUTE(node, true)) \ break; \ - node->rbaugmented = augmented; \ - rb = rb_parent(&node->rbfield); \ + rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ -rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ -rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ - old->rbaugmented = rbcompute(old); \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, 
RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ + RBCOMPUTE(old, false); \ } \ -rbstatic const struct rb_augment_callbacks rbname = { \ - .propagate = rbname ## _propagate, \ - .copy = rbname ## _copy, \ - .rotate = rbname ## _rotate \ +RBSTATIC const struct rb_augment_callbacks RBNAME = { \ + .propagate = RBNAME ## _propagate, \ + .copy = RBNAME ## _copy, \ + .rotate = RBNAME ## _rotate \ }; +/* + * Template for declaring augmented rbtree callbacks, + * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar + */ + +#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ +{ \ + RBSTRUCT *child; \ + RBTYPE max = RBCOMPUTE(node); \ + if (node->RBFIELD.rb_left) { \ + child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + if (node->RBFIELD.rb_right) { \ + child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + if (exit && node->RBAUGMENTED == max) \ + return true; \ + node->RBAUGMENTED = max; \ + return false; \ +} \ +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) + #define RB_RED 0 #define RB_BLACK 1 diff --git a/include/linux/security.h b/include/linux/security.h index 5f7441abbf42..ace6fdb604f9 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -259,7 +259,8 @@ int security_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new); - +int security_path_notify(const struct path *path, u64 mask, + unsigned int obj_type); int security_inode_alloc(struct inode *inode); void security_inode_free(struct inode *inode); int security_inode_init_security(struct inode *inode, struct inode *dir, @@ -387,7 +388,6 @@ int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(char *secdata, u32 seclen); - void security_inode_invalidate_secctx(struct inode *inode); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); @@ -621,6 +621,12 @@ static inline int security_move_mount(const struct path *from_path, return 0; } +static inline int security_path_notify(const struct path *path, u64 mask, + unsigned int obj_type) +{ + return 0; +} + static inline int security_inode_alloc(struct inode *inode) { return 0; diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 9443cafd1969..0f80123650e2 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -69,7 +69,7 @@ struct shrinker { /* These are for internal use */ struct list_head list; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG /* ID in shrinker_idr */ int id; #endif @@ -81,6 +81,11 @@ struct shrinker { /* Flags 
*/ #define SHRINKER_NUMA_AWARE (1 << 0) #define SHRINKER_MEMCG_AWARE (1 << 1) +/* + * It just makes sense when the shrinker is also MEMCG_AWARE for now, + * non-MEMCG_AWARE shrinker should not have this flag set. + */ +#define SHRINKER_NONSLAB (1 << 2) extern int prealloc_shrinker(struct shrinker *shrinker); extern void register_shrinker_prepared(struct shrinker *shrinker); diff --git a/include/linux/slab.h b/include/linux/slab.h index 56c9c7eed34e..ab2b98ad76e1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -595,68 +595,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) return __kmalloc_node(size, flags, node); } -struct memcg_cache_array { - struct rcu_head rcu; - struct kmem_cache *entries[0]; -}; - -/* - * This is the main placeholder for memcg-related information in kmem caches. - * Both the root cache and the child caches will have it. For the root cache, - * this will hold a dynamically allocated array large enough to hold - * information about the currently limited memcgs in the system. To allow the - * array to be accessed without taking any locks, on relocation we free the old - * version only after a grace period. - * - * Root and child caches hold different metadata. - * - * @root_cache: Common to root and child caches. NULL for root, pointer to - * the root cache for children. - * - * The following fields are specific to root caches. - * - * @memcg_caches: kmemcg ID indexed table of child caches. This table is - * used to index child cachces during allocation and cleared - * early during shutdown. - * - * @root_caches_node: List node for slab_root_caches list. - * - * @children: List of all child caches. While the child caches are also - * reachable through @memcg_caches, a child cache remains on - * this list until it is actually destroyed. - * - * The following fields are specific to child caches. - * - * @memcg: Pointer to the memcg this cache belongs to. - * - * @children_node: List node for @root_cache->children list. - * - * @kmem_caches_node: List node for @memcg->kmem_caches list. - */ -struct memcg_cache_params { - struct kmem_cache *root_cache; - union { - struct { - struct memcg_cache_array __rcu *memcg_caches; - struct list_head __root_caches_node; - struct list_head children; - bool dying; - }; - struct { - struct mem_cgroup *memcg; - struct list_head children_node; - struct list_head kmem_caches_node; - struct percpu_ref refcnt; - - void (*work_fn)(struct kmem_cache *); - union { - struct rcu_head rcu_head; - struct work_struct work; - }; - }; - }; -}; - int memcg_update_all_caches(int num_memcgs); /** diff --git a/include/linux/string.h b/include/linux/string.h index 4deb11f7976b..b2f9df7f0761 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -474,8 +474,9 @@ static inline void memcpy_and_pad(void *dest, size_t dest_len, * But this can lead to bugs due to typos, or if prefix is a pointer * and not a constant. Instead use str_has_prefix(). 
* - * Returns: 0 if @str does not start with @prefix - strlen(@prefix) if @str does start with @prefix + * Returns: + * * strlen(@prefix) if @str starts with @prefix + * * 0 if @str does not start with @prefix */ static __always_inline size_t str_has_prefix(const char *str, const char *prefix) { diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 27536b961552..a6ef35184ef1 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -242,9 +242,6 @@ void rpc_sleep_on_priority_timeout(struct rpc_wait_queue *queue, void rpc_sleep_on_priority(struct rpc_wait_queue *, struct rpc_task *, int priority); -void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, - struct rpc_task *task); void rpc_wake_up_queued_task(struct rpc_wait_queue *, struct rpc_task *); void rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *, diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 8a87d8bcb197..f33e5013bdfb 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -186,7 +186,7 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p) extern void xdr_shift_buf(struct xdr_buf *, size_t); extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); -extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, unsigned int); +extern int xdr_buf_read_mic(struct xdr_buf *, struct xdr_netobj *, unsigned int); extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 13e108bcc9eb..d783e15ba898 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -352,6 +352,7 @@ bool xprt_prepare_transmit(struct rpc_task *task); void xprt_request_enqueue_transmit(struct rpc_task *task); void xprt_request_enqueue_receive(struct rpc_task *task); void xprt_request_wait_receive(struct rpc_task *task); +void xprt_request_dequeue_xprt(struct rpc_task *task); bool xprt_request_need_retransmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); void xprt_end_transmit(struct rpc_task *task); diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 86fc38ff0355..16c239e0d6dd 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -49,9 +49,9 @@ * fully-chunked NFS message (read chunks are the largest). Note only * a single chunk type per message is supported currently. 
*/ -#define RPCRDMA_MIN_SLOT_TABLE (2U) +#define RPCRDMA_MIN_SLOT_TABLE (4U) #define RPCRDMA_DEF_SLOT_TABLE (128U) -#define RPCRDMA_MAX_SLOT_TABLE (256U) +#define RPCRDMA_MAX_SLOT_TABLE (16384U) #define RPCRDMA_MIN_INLINE (1024) /* min inline thresh */ #define RPCRDMA_DEF_INLINE (4096) /* default inline thresh */ diff --git a/include/linux/swap.h b/include/linux/swap.h index de2c67a33b7e..063c0c1e112b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -340,6 +340,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); +extern void deactivate_page(struct page *page); extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); @@ -364,6 +365,7 @@ extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); extern unsigned long vm_total_pages; +extern unsigned long reclaim_pages(struct list_head *page_list); #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int sysctl_min_unmapped_ratio; diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 3e2a80cc7b56..96305a64a5a7 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -53,18 +53,4 @@ extern const struct blk_integrity_profile t10_pi_type1_ip; extern const struct blk_integrity_profile t10_pi_type3_crc; extern const struct blk_integrity_profile t10_pi_type3_ip; -#ifdef CONFIG_BLK_DEV_INTEGRITY -extern void t10_pi_prepare(struct request *rq, u8 protection_type); -extern void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals); -#else -static inline void t10_pi_complete(struct request *rq, u8 protection_type, - unsigned int intervals) -{ -} -static inline void t10_pi_prepare(struct request *rq, u8 protection_type) -{ -} -#endif - #endif diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 8d8821b3689a..659a4400517b 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -134,7 +134,7 @@ static inline void copy_overflow(int size, unsigned long count) WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } -static __always_inline bool +static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) { int sz = __compiletime_object_size(addr); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 34a038563d97..70bbdc38dc37 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -55,7 +55,7 @@ * as usual) and both source and destination can trigger faults. */ -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); @@ -63,7 +63,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) return raw_copy_from_user(to, from, n); } -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); @@ -85,7 +85,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. 
*/ -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); @@ -93,7 +93,7 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) return raw_copy_to_user(to, from, n); } -static __always_inline unsigned long +static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); @@ -103,7 +103,7 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) } #ifdef INLINE_COPY_FROM_USER -static inline unsigned long +static inline __must_check unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; @@ -117,12 +117,12 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) return res; } #else -extern unsigned long +extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); #endif #ifdef INLINE_COPY_TO_USER -static inline unsigned long +static inline __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); @@ -133,7 +133,7 @@ _copy_to_user(void __user *to, const void *from, unsigned long n) return n; } #else -extern unsigned long +extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); #endif @@ -222,8 +222,9 @@ static inline bool pagefault_disabled(void) #ifndef ARCH_HAS_NOCACHE_UACCESS -static inline unsigned long __copy_from_user_inatomic_nocache(void *to, - const void __user *from, unsigned long n) +static inline __must_check unsigned long +__copy_from_user_inatomic_nocache(void *to, const void __user *from, + unsigned long n) { return __copy_from_user_inatomic(to, from, n); } diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index dfa718ffdd4f..4e7809408073 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -53,15 +53,21 @@ struct vmap_area { unsigned long va_start; unsigned long va_end; - /* - * Largest available free size in subtree. 
- */ - unsigned long subtree_max_size; - unsigned long flags; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ - struct llist_node purge_list; /* "lazy purge" list */ - struct vm_struct *vm; + + /* + * The following three variables can be packed, because + * a vmap_area object is always in one of three states: + * 1) in "free" tree (root is free_vmap_area_root) + * 2) in "busy" tree (root is vmap_area_root) + * 3) in purge list (head is vmap_purge_list) + */ + union { + unsigned long subtree_max_size; /* in "free" tree */ + struct vm_struct *vm; /* in "busy" tree */ + struct llist_node purge_list; /* in purge list */ + }; }; /* diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 7238865e75b0..51bf43076165 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -46,6 +46,8 @@ const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); +bool zpool_malloc_support_movable(struct zpool *pool); + int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, unsigned long *handle); @@ -90,6 +92,7 @@ struct zpool_driver { struct zpool *zpool); void (*destroy)(void *pool); + bool malloc_support_movable; int (*malloc)(void *pool, size_t size, gfp_t gfp, unsigned long *handle); void (*free)(void *pool, unsigned long handle); diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index f6a4eaa85a3e..a13830616107 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -451,20 +451,81 @@ TRACE_EVENT(xprtrdma_createmrs, TP_STRUCT__entry( __field(const void *, r_xprt) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) __field(unsigned int, count) ), TP_fast_assign( __entry->r_xprt = r_xprt; __entry->count = count; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("r_xprt=%p: created %u MRs", - __entry->r_xprt, __entry->count + TP_printk("peer=[%s]:%s r_xprt=%p: created %u MRs", + __get_str(addr), __get_str(port), __entry->r_xprt, + __entry->count ) ); -DEFINE_RXPRT_EVENT(xprtrdma_nomrs); +TRACE_EVENT(xprtrdma_mr_get, + TP_PROTO( + const struct rpcrdma_req *req + ), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(const void *, req) + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + ), + + TP_fast_assign( + const struct rpc_rqst *rqst = &req->rl_slot; + + __entry->req = req; + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + ), + + TP_printk("task:%u@%u xid=0x%08x req=%p", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->req + ) +); + +TRACE_EVENT(xprtrdma_nomrs, + TP_PROTO( + const struct rpcrdma_req *req + ), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(const void *, req) + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + ), + + TP_fast_assign( + const struct rpc_rqst *rqst = &req->rl_slot; + + __entry->req = req; + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + ), + + TP_printk("task:%u@%u xid=0x%08x req=%p", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->req + ) +); DEFINE_RDCH_EVENT(read); DEFINE_WRCH_EVENT(write); @@ -623,21 +684,21 @@ TRACE_EVENT(xprtrdma_post_send, TRACE_EVENT(xprtrdma_post_recv, TP_PROTO( - const
struct rpcrdma_rep *rep ), - TP_ARGS(cqe), + TP_ARGS(rep), TP_STRUCT__entry( - __field(const void *, cqe) + __field(const void *, rep) ), TP_fast_assign( - __entry->cqe = cqe; + __entry->rep = rep; ), - TP_printk("cqe=%p", - __entry->cqe + TP_printk("rep=%p", + __entry->rep ) ); @@ -715,14 +776,15 @@ TRACE_EVENT(xprtrdma_wc_receive, TP_ARGS(wc), TP_STRUCT__entry( - __field(const void *, cqe) + __field(const void *, rep) __field(u32, byte_len) __field(unsigned int, status) __field(u32, vendor_err) ), TP_fast_assign( - __entry->cqe = wc->wr_cqe; + __entry->rep = container_of(wc->wr_cqe, struct rpcrdma_rep, + rr_cqe); __entry->status = wc->status; if (wc->status) { __entry->byte_len = 0; @@ -733,8 +795,8 @@ TRACE_EVENT(xprtrdma_wc_receive, } ), - TP_printk("cqe=%p %u bytes: %s (%u/0x%x)", - __entry->cqe, __entry->byte_len, + TP_printk("rep=%p %u bytes: %s (%u/0x%x)", + __entry->rep, __entry->byte_len, rdma_show_wc_status(__entry->status), __entry->status, __entry->vendor_err ) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 3a27335fce2c..c2ce6480b4b1 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -66,8 +66,9 @@ DECLARE_EVENT_CLASS(writeback_page_template, ), TP_fast_assign( - strncpy(__entry->name, - mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", 32); + strscpy_pad(__entry->name, + mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", + 32); __entry->ino = mapping ? mapping->host->i_ino : 0; __entry->index = page->index; ), @@ -110,8 +111,8 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ - strncpy(__entry->name, - bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); + strscpy_pad(__entry->name, + bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->flags = flags; @@ -316,8 +317,8 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, ), TP_fast_assign( - strncpy(__entry->name, - dev_name(inode_to_bdi(inode)->dev), 32); + strscpy_pad(__entry->name, + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); @@ -360,8 +361,9 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(unsigned int, cgroup_ino) ), TP_fast_assign( - strncpy(__entry->name, - wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32); + strscpy_pad(__entry->name, + wb->bdi->dev ? dev_name(wb->bdi->dev) : + "(unknown)", 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? 
work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; @@ -414,7 +416,7 @@ DECLARE_EVENT_CLASS(writeback_class, __field(unsigned int, cgroup_ino) ), TP_fast_assign( - strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%u", @@ -436,7 +438,7 @@ TRACE_EVENT(writeback_bdi_register, __array(char, name, 32) ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(bdi->dev), 32); ), TP_printk("bdi %s", __entry->name @@ -461,7 +463,7 @@ DECLARE_EVENT_CLASS(wbc_class, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(bdi->dev), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; @@ -512,7 +514,7 @@ TRACE_EVENT(writeback_queue_io, ), TP_fast_assign( unsigned long *older_than_this = work->older_than_this; - strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32); __entry->older = older_than_this ? *older_than_this : 0; __entry->age = older_than_this ? (jiffies - *older_than_this) * 1000 / HZ : -1; @@ -598,7 +600,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, ), TP_fast_assign( - strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); @@ -663,7 +665,7 @@ TRACE_EVENT(balance_dirty_pages, TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; - strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); + strscpy_pad(__entry->bdi, dev_name(wb->bdi->dev), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + @@ -723,8 +725,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue, ), TP_fast_assign( - strncpy(__entry->name, - dev_name(inode_to_bdi(inode)->dev), 32); + strscpy_pad(__entry->name, + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; @@ -797,8 +799,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, ), TP_fast_assign( - strncpy(__entry->name, - dev_name(inode_to_bdi(inode)->dev), 32); + strscpy_pad(__entry->name, + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 63b1f506ea67..c160a5354eb6 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -67,6 +67,9 @@ #define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ +#define MADV_COLD 20 /* deactivate these pages */ +#define MADV_PAGEOUT 21 /* reclaim these pages */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/uapi/linux/coff.h b/include/uapi/linux/coff.h index e4a79f80b9a0..ab5c7e847eed 100644 --- a/include/uapi/linux/coff.h +++ b/include/uapi/linux/coff.h @@ -11,6 +11,9 @@ more information about COFF, then O'Reilly has a very excellent book. 
*/ +#ifndef _UAPI_LINUX_COFF_H +#define _UAPI_LINUX_COFF_H + #define E_SYMNMLEN 8 /* Number of characters in a symbol name */ #define E_FILNMLEN 14 /* Number of characters in a file name */ #define E_DIMNUM 4 /* Number of array dimensions in auxiliary entry */ @@ -350,3 +353,5 @@ struct COFF_reloc { /* For new sections we haven't heard of before */ #define COFF_DEF_SECTION_ALIGNMENT 4 + +#endif /* _UAPI_LINUX_COFF_H */ diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 2971d29a42e4..df2e12fb3381 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -425,6 +425,10 @@ enum fuse_opcode { /* CUSE specific operations */ CUSE_INIT = 4096, + + /* Reserved opcodes: helpful to detect structure endian-ness */ + CUSE_INIT_BSWAP_RESERVED = 1048576, /* CUSE_INIT << 8 */ + FUSE_INIT_BSWAP_RESERVED = 436207616, /* FUSE_INIT << 24 */ }; enum fuse_notify_code { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 96ee9d94b73e..ea57526a5b89 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -28,6 +28,7 @@ struct io_uring_sqe { __u16 poll_events; __u32 sync_range_flags; __u32 msg_flags; + __u32 timeout_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -61,6 +62,7 @@ struct io_uring_sqe { #define IORING_OP_SYNC_FILE_RANGE 8 #define IORING_OP_SENDMSG 9 #define IORING_OP_RECVMSG 10 +#define IORING_OP_TIMEOUT 11 /* * sqe->fsync_flags diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f28e562d7ca8..29d6e93fd15e 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -591,6 +591,7 @@ #define PCI_EXP_SLTCTL_CCIE 0x0010 /* Command Completed Interrupt Enable */ #define PCI_EXP_SLTCTL_HPIE 0x0020 /* Hot-Plug Interrupt Enable */ #define PCI_EXP_SLTCTL_AIC 0x00c0 /* Attention Indicator Control */ +#define PCI_EXP_SLTCTL_ATTN_IND_SHIFT 6 /* Attention Indicator shift */ #define PCI_EXP_SLTCTL_ATTN_IND_ON 0x0040 /* Attention Indicator on */ #define PCI_EXP_SLTCTL_ATTN_IND_BLINK 0x0080 /* Attention Indicator blinking */ #define PCI_EXP_SLTCTL_ATTN_IND_OFF 0x00c0 /* Attention Indicator off */ @@ -713,7 +714,9 @@ #define PCI_EXT_CAP_ID_DPC 0x1D /* Downstream Port Containment */ #define PCI_EXT_CAP_ID_L1SS 0x1E /* L1 PM Substates */ #define PCI_EXT_CAP_ID_PTM 0x1F /* Precision Time Measurement */ -#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PTM +#define PCI_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ +#define PCI_EXT_CAP_ID_PL_16GT 0x26 /* Physical Layer 16.0 GT/s */ +#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_16GT #define PCI_EXT_CAP_DSN_SIZEOF 12 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40 @@ -1053,4 +1056,14 @@ #define PCI_L1SS_CTL1_LTR_L12_TH_SCALE 0xe0000000 /* LTR_L1.2_THRESHOLD_Scale */ #define PCI_L1SS_CTL2 0x0c /* Control 2 Register */ +/* Data Link Feature */ +#define PCI_DLF_CAP 0x04 /* Capabilities Register */ +#define PCI_DLF_EXCHANGE_ENABLE 0x80000000 /* Data Link Feature Exchange Enable */ + +/* Physical Layer 16.0 GT/s */ +#define PCI_PL_16GT_LE_CTRL 0x20 /* Lane Equalization Control Register */ +#define PCI_PL_16GT_LE_CTRL_DSP_TX_PRESET_MASK 0x0000000F +#define PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK 0x000000F0 +#define PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT 4 + #endif /* LINUX_PCI_REGS_H */ diff --git a/init/main.c b/init/main.c index 653693da8da6..208b8fa1808e 100644 --- a/init/main.c +++ b/init/main.c @@ -507,7 +507,7 @@ void __init __weak mem_encrypt_init(void) { } void __init __weak poking_init(void) { } -void 
__init __weak pgd_cache_init(void) { } +void __init __weak pgtable_cache_init(void) { } bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); @@ -556,6 +556,7 @@ static void __init mm_init(void) report_meminit(); mem_init(); kmem_cache_init(); + kmemleak_init(); pgtable_init(); debug_objects_mem_init(); vmalloc_init(); @@ -564,7 +565,6 @@ static void __init mm_init(void) init_espfix_bsp(); /* Should be run after espfix64 is set up. */ pti_init(); - pgd_cache_init(); } void __init __weak arch_call_rest_init(void) @@ -594,7 +594,6 @@ asmlinkage __visible void __init start_kernel(void) page_address_init(); pr_notice("%s", linux_banner); setup_arch(&command_line); - mm_init_cpumask(&init_mm); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); @@ -740,7 +739,6 @@ asmlinkage __visible void __init start_kernel(void) initrd_start = 0; } #endif - kmemleak_init(); setup_per_cpu_pageset(); numa_policy_init(); acpi_early_init(); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 19055aeb06ea..948d116f3446 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1240,15 +1240,14 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) /* create the notify skb */ nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL); - if (!nc) { - ret = -ENOMEM; - goto out; - } + if (!nc) + return -ENOMEM; + if (copy_from_user(nc->data, notification->sigev_value.sival_ptr, NOTIFY_COOKIE_LEN)) { ret = -EFAULT; - goto out; + goto free_skb; } /* TODO: add a header? */ @@ -1264,8 +1263,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) fdput(f); if (IS_ERR(sock)) { ret = PTR_ERR(sock); - sock = NULL; - goto out; + goto free_skb; } timeo = MAX_SCHEDULE_TIMEOUT; @@ -1274,11 +1272,8 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) sock = NULL; goto retry; } - if (ret) { - sock = NULL; - nc = NULL; - goto out; - } + if (ret) + return ret; } } @@ -1333,7 +1328,8 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) out: if (sock) netlink_detachskb(sock, nc); - else if (nc) + else +free_skb: dev_kfree_skb(nc); return ret; diff --git a/ipc/sem.c b/ipc/sem.c index 7da4504bcc7c..ec97a7072413 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1852,7 +1852,8 @@ static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid) { struct sem_undo *un; - list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) { + list_for_each_entry_rcu(un, &ulp->list_proc, list_proc, + spin_is_locked(&ulp->lock)) { if (un->semid == semid) return un; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index cc0d0cf114e3..a70f7209cda3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -14,8 +14,9 @@ #include #include #include +#include +#include #include -#include #include #include #include @@ -583,58 +584,52 @@ static const struct super_operations bpf_super_ops = { enum { OPT_MODE, - OPT_ERR, }; -static const match_table_t bpf_mount_tokens = { - { OPT_MODE, "mode=%o" }, - { OPT_ERR, NULL }, +static const struct fs_parameter_spec bpf_param_specs[] = { + fsparam_u32oct ("mode", OPT_MODE), + {} +}; + +static const struct fs_parameter_description bpf_fs_parameters = { + .name = "bpf", + .specs = bpf_param_specs, }; struct bpf_mount_opts { umode_t mode; }; -static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { - substring_t args[MAX_OPT_ARGS]; - int option, token; - char *ptr; + struct bpf_mount_opts *opts = fc->fs_private; 
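+ /* Each "key=value" mount option reaches ->parse_param() one at a time under the new mount API; fs_parse() matches the key against bpf_param_specs and parses the value into @result. */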
+ struct fs_parse_result result; + int opt; - opts->mode = S_IRWXUGO; - - while ((ptr = strsep(&data, ",")) != NULL) { - if (!*ptr) - continue; - - token = match_token(ptr, bpf_mount_tokens, args); - switch (token) { - case OPT_MODE: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->mode = option & S_IALLUGO; - break; + opt = fs_parse(fc, &bpf_fs_parameters, param, &result); + if (opt < 0) /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd * better continue to ignore non-existing options for bpf. */ - } + return opt == -ENOPARAM ? 0 : opt; + + switch (opt) { + case OPT_MODE: + opts->mode = result.uint_32 & S_IALLUGO; + break; } return 0; } -static int bpf_fill_super(struct super_block *sb, void *data, int silent) +static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; - struct bpf_mount_opts opts; + struct bpf_mount_opts *opts = fc->fs_private; struct inode *inode; int ret; - ret = bpf_parse_options(data, &opts); - if (ret) - return ret; - ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -644,21 +639,50 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= S_ISVTX | opts.mode; + inode->i_mode |= S_ISVTX | opts->mode; return 0; } -static struct dentry *bpf_mount(struct file_system_type *type, int flags, - const char *dev_name, void *data) +static int bpf_get_tree(struct fs_context *fc) { - return mount_nodev(type, flags, data, bpf_fill_super); + return get_tree_nodev(fc, bpf_fill_super); +} + +static void bpf_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations bpf_context_ops = { + .free = bpf_free_fc, + .parse_param = bpf_parse_param, + .get_tree = bpf_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int bpf_init_fs_context(struct fs_context *fc) +{ + struct bpf_mount_opts *opts; + + opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + opts->mode = S_IRWXUGO; + + fc->fs_private = opts; + fc->ops = &bpf_context_ops; + return 0; } static struct file_system_type bpf_fs_type = { .owner = THIS_MODULE, .name = "bpf", - .mount = bpf_mount, + .init_fs_context = bpf_init_fs_context, + .parameters = &bpf_fs_parameters, .kill_sb = kill_litter_super, }; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 10f1187b3907..f76d6f77dd5e 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -893,29 +893,24 @@ static struct sysrq_key_op sysrq_dbg_op = { }; #endif -static int kgdb_panic_event(struct notifier_block *self, - unsigned long val, - void *data) +void kgdb_panic(const char *msg) { + if (!kgdb_io_module_registered) + return; + /* - * Avoid entering the debugger if we were triggered due to a panic - * We don't want to get stuck waiting for input from user in such case. - * panic_timeout indicates the system should automatically + * We don't want to get stuck waiting for input from user if + * "panic_timeout" indicates the system should automatically * reboot on panic. 
*/ if (panic_timeout) - return NOTIFY_DONE; + return; if (dbg_kdb_mode) - kdb_printf("PANIC: %s\n", (char *)data); - kgdb_breakpoint(); - return NOTIFY_DONE; -} + kdb_printf("PANIC: %s\n", msg); -static struct notifier_block kgdb_panic_event_nb = { - .notifier_call = kgdb_panic_event, - .priority = INT_MAX, -}; + kgdb_breakpoint(); +} void __weak kgdb_arch_late(void) { @@ -965,8 +960,6 @@ static void kgdb_register_callbacks(void) kgdb_arch_late(); register_module_notifier(&dbg_module_load_nb); register_reboot_notifier(&dbg_reboot_notifier); - atomic_notifier_chain_register(&panic_notifier_list, - &kgdb_panic_event_nb); #ifdef CONFIG_MAGIC_SYSRQ register_sysrq_key('g', &sysrq_dbg_op); #endif @@ -980,16 +973,14 @@ static void kgdb_register_callbacks(void) static void kgdb_unregister_callbacks(void) { /* - * When this routine is called KGDB should unregister from the - * panic handler and clean up, making sure it is not handling any + * When this routine is called KGDB should unregister from + * handlers and clean up, making sure it is not handling any * break exceptions at the time. */ if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; unregister_reboot_notifier(&dbg_reboot_notifier); unregister_module_notifier(&dbg_module_load_nb); - atomic_notifier_chain_unregister(&panic_notifier_list, - &kgdb_panic_event_nb); kgdb_arch_exit(); #ifdef CONFIG_MAGIC_SYSRQ unregister_sysrq_key('g', &sysrq_dbg_op); diff --git a/kernel/elfcore.c b/kernel/elfcore.c index fc482c8e0bd8..57fb4dcff434 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -3,6 +3,7 @@ #include #include #include +#include Elf_Half __weak elf_core_extra_phdrs(void) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 84fa00497c49..94d38a39d72e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -143,17 +144,19 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) * * @vma: vma that holds the pte pointing to page * @addr: address the old @page is mapped at - * @page: the cowed page we are replacing by kpage - * @kpage: the modified page we replace page by + * @old_page: the page we are replacing by new_page + * @new_page: the modified page we replace page by * - * Returns 0 on success, -EFAULT on failure. + * If @new_page is NULL, only unmap @old_page. + * + * Returns 0 on success, negative error code otherwise. 
*/ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; struct page_vma_mapped_walk pvmw = { - .page = old_page, + .page = compound_head(old_page), .vma = vma, .address = addr, }; @@ -164,12 +167,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); - VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); - - err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, - false); - if (err) - return err; + if (new_page) { + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, + &memcg, false); + if (err) + return err; + } /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(old_page); @@ -177,15 +180,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; if (!page_vma_mapped_walk(&pvmw)) { - mem_cgroup_cancel_charge(new_page, memcg, false); + if (new_page) + mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; } VM_BUG_ON_PAGE(addr != pvmw.address, old_page); - get_page(new_page); - page_add_new_anon_rmap(new_page, vma, addr, false); - mem_cgroup_commit_charge(new_page, memcg, false, false); - lru_cache_add_active_or_unevictable(new_page, vma); + if (new_page) { + get_page(new_page); + page_add_new_anon_rmap(new_page, vma, addr, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); + lru_cache_add_active_or_unevictable(new_page, vma); + } else + /* no new page, just dec_mm_counter for old_page */ + dec_mm_counter(mm, MM_ANONPAGES); if (!PageAnon(old_page)) { dec_mm_counter(mm, mm_counter_file(old_page)); @@ -194,8 +202,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); ptep_clear_flush_notify(vma, addr, pvmw.pte); - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + if (new_page) + set_pte_at_notify(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, false); if (!page_mapped(old_page)) @@ -464,6 +473,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct page *old_page, *new_page; struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; + bool orig_page_huge = false; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); @@ -471,7 +481,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); + FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -488,6 +498,10 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, ref_ctr_updated = 1; } + ret = 0; + if (!is_register && !PageAnon(old_page)) + goto put_old; + ret = anon_vma_prepare(vma); if (ret) goto put_old; @@ -501,8 +515,33 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + if (!is_register) { + struct page *orig_page; + pgoff_t index; + + VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); + + index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; + orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, + index); + + if 
(orig_page) { + if (PageUptodate(orig_page) && + pages_identical(new_page, orig_page)) { + /* let go new_page */ + put_page(new_page); + new_page = NULL; + + if (PageCompound(orig_page)) + orig_page_huge = true; + } + put_page(orig_page); + } + } + ret = __replace_page(vma, vaddr, old_page, new_page); - put_page(new_page); + if (new_page) + put_page(new_page); put_old: put_page(old_page); @@ -513,6 +552,10 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret && is_register && ref_ctr_updated) update_ref_ctr(uprobe, mm, -1); + /* try collapse pmd for compound page */ + if (!ret && orig_page_huge) + collapse_pte_mapped_thp(mm, vaddr); + return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index 097cd1eb80fb..011f99d30d54 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -126,6 +126,15 @@ int nr_threads; /* The idle threads do not count.. */ static int max_threads; /* tunable limit on nr_threads */ +#define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) + +static const char * const resident_page_types[] = { + NAMED_ARRAY_INDEX(MM_FILEPAGES), + NAMED_ARRAY_INDEX(MM_ANONPAGES), + NAMED_ARRAY_INDEX(MM_SWAPENTS), + NAMED_ARRAY_INDEX(MM_SHMEMPAGES), +}; + DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ @@ -648,12 +657,15 @@ static void check_mm(struct mm_struct *mm) { int i; + BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, + "Please make sure 'struct resident_page_types[]' is updated as well"); + for (i = 0; i < NR_MM_COUNTERS; i++) { long x = atomic_long_read(&mm->rss_stat.count[i]); if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", + mm, resident_page_types[i], x); } if (mm_pgtables_bytes(mm)) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d5870723b8ad..15d70a90b50d 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -300,6 +300,8 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; + if (fatal_signal_pending(current)) + return NULL; pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index c4ce08f43bd6..ab4a4606d19b 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1175,6 +1175,7 @@ int klp_module_coming(struct module *mod) pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", patch->mod->name, obj->mod->name, obj->mod->name); mod->klp_alive = false; + obj->mod = NULL; klp_cleanup_module_patches_limited(mod, patch); mutex_unlock(&klp_mutex); diff --git a/kernel/panic.c b/kernel/panic.c index 057540b6eee9..47e8ebccc22b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -219,6 +220,13 @@ void panic(const char *fmt, ...) dump_stack(); #endif + /* + * If kgdb is enabled, give it a chance to run before we stop all + * the other CPUs or else we won't be able to debug processes left + * running on them. + */ + kgdb_panic(buf); + /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. 
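A note on the direct call above: kgdb_panic() bails out early unless kgdb_io_module_registered is set, so panic() may call it unconditionally. For this to build with CONFIG_KGDB=n, the header side must provide a no-op fallback; below is a minimal sketch of that stub, assuming the usual CONFIG-gated pattern (the include/linux/kgdb.h hunk is not shown in this excerpt):

    #ifdef CONFIG_KGDB
    extern void kgdb_panic(const char *msg);
    #else
    /* no-op when the debugger is not built in */
    static inline void kgdb_panic(const char *msg) { }
    #endif

Unlike the removed panic-notifier registration, the explicit call site lets panic() run the debugger at a precise point, before the other CPUs are stopped, rather than at whatever position the callback held in panic_notifier_list.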
@@ -551,9 +559,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - if (args) - pr_warn(CUT_HERE); - if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", raw_smp_processor_id(), current->pid, file, line, @@ -591,37 +596,26 @@ void __warn(const char *file, int line, void *caller, unsigned taint, add_taint(taint, LOCKDEP_STILL_OK); } -#ifdef WANT_WARN_ON_SLOWPATH -void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) +#ifndef __WARN_FLAGS +void warn_slowpath_fmt(const char *file, int line, unsigned taint, + const char *fmt, ...) { struct warn_args args; - args.fmt = fmt; - va_start(args.args, fmt); - __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, - &args); - va_end(args.args); -} -EXPORT_SYMBOL(warn_slowpath_fmt); + pr_warn(CUT_HERE); -void warn_slowpath_fmt_taint(const char *file, int line, - unsigned taint, const char *fmt, ...) -{ - struct warn_args args; + if (!fmt) { + __warn(file, line, __builtin_return_address(0), taint, + NULL, NULL); + return; + } args.fmt = fmt; va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } -EXPORT_SYMBOL(warn_slowpath_fmt_taint); - -void warn_slowpath_null(const char *file, int line) -{ - pr_warn(CUT_HERE); - __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); -} -EXPORT_SYMBOL(warn_slowpath_null); +EXPORT_SYMBOL(warn_slowpath_fmt); #else void __warn_printk(const char *fmt, ...) { diff --git a/kernel/resource.c b/kernel/resource.c index 74877e9d90ca..76036a41143b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -487,8 +487,8 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, while (start < end && !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, false, &res)) { - pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; - end_pfn = (res.end + 1) >> PAGE_SHIFT; + pfn = PFN_UP(res.start); + end_pfn = PFN_DOWN(res.end + 1); if (end_pfn > pfn) ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c892c6280c9f..8dad5aa600ea 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -238,7 +238,6 @@ static void do_idle(void) tick_nohz_idle_enter(); while (!need_resched()) { - check_pgt_cache(); rmb(); local_irq_disable(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 94cf7c5c1a3f..570cf6083712 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -265,7 +265,8 @@ extern struct ctl_table epoll_table[]; extern struct ctl_table firmware_config_table[]; #endif -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) int sysctl_legacy_va_layout; #endif @@ -1582,7 +1583,8 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec, .extra1 = SYSCTL_ZERO, }, -#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) { .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e0e14780a13d..93d97f9b0157 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -311,7 +311,7 @@ config HEADERS_CHECK relevant for userspace, say 'Y'. config OPTIMIZE_INLINING - bool "Allow compiler to uninline functions marked 'inline'" + def_bool y help This option determines if the kernel forces gcc to inline the functions developers have marked 'inline'. 
Doing so takes away freedom from gcc to @@ -322,8 +322,6 @@ config OPTIMIZE_INLINING decision will become the default in the future. Until then this option is there to test gcc for this. - If unsure, say N. - config DEBUG_SECTION_MISMATCH bool "Enable full Section mismatch analysis" help @@ -576,17 +574,18 @@ config DEBUG_KMEMLEAK In order to access the kmemleak file, debugfs needs to be mounted (usually at /sys/kernel/debug). -config DEBUG_KMEMLEAK_EARLY_LOG_SIZE - int "Maximum kmemleak early log entries" +config DEBUG_KMEMLEAK_MEM_POOL_SIZE + int "Kmemleak memory pool size" depends on DEBUG_KMEMLEAK - range 200 40000 - default 400 + range 200 1000000 + default 16000 help Kmemleak must track all the memory allocations to avoid reporting false positives. Since memory may be allocated or - freed before kmemleak is initialised, an early log buffer is - used to store these actions. If kmemleak reports "early log - buffer exceeded", please increase this value. + freed before kmemleak is fully initialised, use a static pool + of metadata objects to track such callbacks. After kmemleak is + fully initialised, this memory pool acts as an emergency one + if slab allocations fail. config DEBUG_KMEMLEAK_TEST tristate "Simple test for the kernel memory leak detector" diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 7fa97a8b5717..6c9682ce0254 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -134,6 +134,14 @@ config KASAN_S390_4_LEVEL_PAGING to 3TB of RAM with KASan enabled). This options allows to force 4-level paging instead. +config KASAN_SW_TAGS_IDENTIFY + bool "Enable memory corruption identification" + depends on KASAN_SW_TAGS + help + This option enables best-effort identification of bug type + (use-after-free or out-of-bounds) at the cost of increased + memory consumption. + config TEST_KASAN tristate "Module for testing KASAN for bug detection" depends on m && KASAN diff --git a/lib/bug.c b/lib/bug.c index 1077366f496b..8c98af0bf585 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -181,6 +181,15 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) } } + /* + * BUG() and WARN_ON() families don't print a custom debug message + * before triggering the exception handler, so we must add the + * "cut here" line now. WARN() issues its own "cut here" before the + * extra debugging message it writes before triggering the handler. 
+ */ + if ((bug->flags & BUGFLAG_NO_CUT_HERE) == 0) + printk(KERN_DEFAULT CUT_HERE); + if (warning) { /* this is a WARN_ON rather than BUG/BUG_ON */ __warn(file, line, (void *)bugaddr, BUG_GET_TAINT(bug), regs, @@ -188,8 +197,6 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_WARN; } - printk(KERN_DEFAULT CUT_HERE); - if (file) pr_crit("kernel BUG at %s:%u!\n", file, line); else diff --git a/lib/extable.c b/lib/extable.c index 25da4071122a..c3e59caf7ffa 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifndef ARCH_HAS_RELATIVE_EXTABLE #define ex_to_insn(x) ((x)->insn) diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index a7bafc413730..ae25e2fa2187 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -36,12 +36,12 @@ static inline size_t genradix_depth_size(unsigned depth) #define GENRADIX_DEPTH_MASK \ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) -unsigned genradix_root_to_depth(struct genradix_root *r) +static inline unsigned genradix_root_to_depth(struct genradix_root *r) { return (unsigned long) r & GENRADIX_DEPTH_MASK; } -struct genradix_node *genradix_root_to_node(struct genradix_root *r) +static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) { return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); } diff --git a/lib/hexdump.c b/lib/hexdump.c index b1d55b669ae2..147133f8eb2f 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -270,25 +270,4 @@ void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, } EXPORT_SYMBOL(print_hex_dump); -#if !defined(CONFIG_DYNAMIC_DEBUG) -/** - * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params - * @prefix_str: string to prefix each line with; - * caller supplies trailing spaces for alignment if desired - * @prefix_type: controls whether prefix of an offset, address, or none - * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) - * @buf: data blob to dump - * @len: number of bytes in the @buf - * - * Calls print_hex_dump(), with log level of KERN_DEBUG, - * rowsize of 16, groupsize of 1, and ASCII output included. 
- */ -void print_hex_dump_bytes(const char *prefix_str, int prefix_type, - const void *buf, size_t len) -{ - print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, 16, 1, - buf, len, true); -} -EXPORT_SYMBOL(print_hex_dump_bytes); -#endif /* !defined(CONFIG_DYNAMIC_DEBUG) */ #endif /* defined(CONFIG_PRINTK) */ diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f1e0569b4539..639d5e7014c1 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -878,7 +878,7 @@ static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) head = compound_head(page); v += (page - head) << PAGE_SHIFT; - if (likely(n <= v && v <= (PAGE_SIZE << compound_order(head)))) + if (likely(n <= v && v <= (page_size(head)))) return true; WARN_ON(1); return false; diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c index ba16c08e8cb9..717c940112f9 100644 --- a/lib/lzo/lzo1x_compress.c +++ b/lib/lzo/lzo1x_compress.c @@ -83,17 +83,19 @@ lzo1x_1_do_compress(const unsigned char *in, size_t in_len, ALIGN((uintptr_t)ir, 4)) && (ir < limit) && (*ir == 0)) ir++; - for (; (ir + 4) <= limit; ir += 4) { - dv = *((u32 *)ir); - if (dv) { + if (IS_ALIGNED((uintptr_t)ir, 4)) { + for (; (ir + 4) <= limit; ir += 4) { + dv = *((u32 *)ir); + if (dv) { # if defined(__LITTLE_ENDIAN) - ir += __builtin_ctz(dv) >> 3; + ir += __builtin_ctz(dv) >> 3; # elif defined(__BIG_ENDIAN) - ir += __builtin_clz(dv) >> 3; + ir += __builtin_clz(dv) >> 3; # else # error "missing endian definition" # endif - break; + break; + } } } #endif diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 62b8ee92643d..41ae3c7570d3 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -77,26 +77,10 @@ static inline void erase_cached(struct test_node *node, struct rb_root_cached *r } -static inline u32 augment_recompute(struct test_node *node) -{ - u32 max = node->val, child_augmented; - if (node->rb.rb_left) { - child_augmented = rb_entry(node->rb.rb_left, struct test_node, - rb)->augmented; - if (max < child_augmented) - max = child_augmented; - } - if (node->rb.rb_right) { - child_augmented = rb_entry(node->rb.rb_right, struct test_node, - rb)->augmented; - if (max < child_augmented) - max = child_augmented; - } - return max; -} +#define NODE_VAL(node) ((node)->val) -RB_DECLARE_CALLBACKS(static, augment_callbacks, struct test_node, rb, - u32, augmented, augment_recompute) +RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks, + struct test_node, rb, u32, augmented, NODE_VAL) static void insert_augmented(struct test_node *node, struct rb_root_cached *root) @@ -238,7 +222,20 @@ static void check_augmented(int nr_nodes) check(nr_nodes); for (rb = rb_first(&root.rb_root); rb; rb = rb_next(rb)) { struct test_node *node = rb_entry(rb, struct test_node, rb); - WARN_ON_ONCE(node->augmented != augment_recompute(node)); + u32 subtree, max = node->val; + if (node->rb.rb_left) { + subtree = rb_entry(node->rb.rb_left, struct test_node, + rb)->augmented; + if (max < subtree) + max = subtree; + } + if (node->rb.rb_right) { + subtree = rb_entry(node->rb.rb_right, struct test_node, + rb)->augmented; + if (max < subtree) + max = subtree; + } + WARN_ON_ONCE(node->augmented != max); } } diff --git a/lib/show_mem.c b/lib/show_mem.c index 5c86ef4c899f..1c26c14ffbb9 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -6,7 +6,6 @@ */ #include -#include #include void show_mem(unsigned int filter, nodemask_t *nodemask) @@ -39,10 +38,6 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_CMA printk("%lu pages cma reserved\n", totalcma_pages); #endif 
-#ifdef CONFIG_QUICKLIST - printk("%lu pages in pagetable cache\n", - quicklist_total_size()); -#endif #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif diff --git a/lib/string.c b/lib/string.c index 461fb620f85f..cd7a10c19210 100644 --- a/lib/string.c +++ b/lib/string.c @@ -173,8 +173,9 @@ EXPORT_SYMBOL(strlcpy); * doesn't unnecessarily force the tail of the destination buffer to be * zeroed. If zeroing is desired please use strscpy_pad(). * - * Return: The number of characters copied (not including the trailing - * %NUL) or -E2BIG if the destination buffer wasn't big enough. + * Returns: + * * The number of characters copied (not including the trailing %NUL) + * * -E2BIG if count is 0 or @src was truncated. */ ssize_t strscpy(char *dest, const char *src, size_t count) { @@ -182,7 +183,7 @@ ssize_t strscpy(char *dest, const char *src, size_t count) size_t max = count; long res = 0; - if (count == 0) + if (count == 0 || WARN_ON_ONCE(count > INT_MAX)) return -E2BIG; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS @@ -253,8 +254,9 @@ EXPORT_SYMBOL(strscpy); * For full explanation of why you may want to consider using the * 'strscpy' functions please see the function docstring for strscpy(). * - * Return: The number of characters copied (not including the trailing - * %NUL) or -E2BIG if the destination buffer wasn't big enough. + * Returns: + * * The number of characters copied (not including the trailing %NUL) + * * -E2BIG if count is 0 or @src was truncated. */ ssize_t strscpy_pad(char *dest, const char *src, size_t count) { diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 023ba9f3b99f..dccb95af6003 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -108,7 +109,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) return 0; max_addr = user_addr_max(); - src_addr = (unsigned long)src; + src_addr = (unsigned long)untagged_addr(src); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 7f2db3fe311f..28ff554a1be8 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -2,6 +2,7 @@ #include #include #include +#include #include @@ -109,7 +110,7 @@ long strnlen_user(const char __user *str, long count) return 0; max_addr = user_addr_max(); - src_addr = (unsigned long)str; + src_addr = (unsigned long)untagged_addr(str); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; diff --git a/lib/test_kasan.c b/lib/test_kasan.c index b63b367a94e8..49cc4d570a40 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -18,6 +18,9 @@ #include #include #include +#include + +#include /* * Note: test functions are marked noinline so that their names appear in @@ -337,6 +340,42 @@ static noinline void __init kmalloc_uaf2(void) kfree(ptr2); } +static noinline void __init kfree_via_page(void) +{ + char *ptr; + size_t size = 8; + struct page *page; + unsigned long offset; + + pr_info("invalid-free false positive (via page)\n"); + ptr = kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + page = virt_to_page(ptr); + offset = offset_in_page(ptr); + kfree(page_address(page) + offset); +} + +static noinline void __init kfree_via_phys(void) +{ + char *ptr; + size_t size = 8; + phys_addr_t phys; + + pr_info("invalid-free false positive (via phys)\n"); + ptr = 
kmalloc(size, GFP_KERNEL); + if (!ptr) { + pr_err("Allocation failed\n"); + return; + } + + phys = virt_to_phys(ptr); + kfree(phys_to_virt(phys)); +} + static noinline void __init kmem_cache_oob(void) { char *p; @@ -737,6 +776,8 @@ static int __init kmalloc_tests_init(void) kmalloc_uaf(); kmalloc_uaf_memset(); kmalloc_uaf2(); + kfree_via_page(); + kfree_via_phys(); kmem_cache_oob(); memcg_accounted_kmem_cache(); kasan_stack_oob(); diff --git a/mm/Kconfig b/mm/Kconfig index 2fe4902ad755..a5dae9a7eb51 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -273,11 +273,6 @@ config BOUNCE by default when ZONE_DMA or HIGHMEM is selected, but you may say n to override this. -config NR_QUICK - int - depends on QUICKLIST - default "1" - config VIRT_TO_BUS bool help @@ -717,6 +712,17 @@ config GUP_BENCHMARK config GUP_GET_PTE_LOW_HIGH bool +config READ_ONLY_THP_FOR_FS + bool "Read-only THP for filesystems (EXPERIMENTAL)" + depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM + + help + Allow khugepaged to put read-only file-backed pages in THP. + + This is marked experimental because it is a new feature. Write + support of file THPs will be developed in the next few release + cycles. + config ARCH_HAS_PTE_SPECIAL bool diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 82b6a20898bd..327b3ebf23bf 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -21,7 +21,9 @@ config DEBUG_PAGEALLOC Also, the state of page tracking structures is checked more often as pages are being allocated and freed, as unexpected state changes often happen for same reasons as memory corruption (e.g. double free, - use-after-free). + use-after-free). The error reports for these checks can be augmented + with stack traces of last allocation and freeing of the page, when + PAGE_OWNER is also selected and enabled on boot. For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, fill the pages with poison patterns after free_pages() and verify diff --git a/mm/Makefile b/mm/Makefile index d0b295c3b764..d996846697ef 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n +CFLAGS_init-mm.o += $(call cc-disable-warning, override-init) +CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides) + mmu-y := nommu.o mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ @@ -72,7 +75,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o -obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o diff --git a/mm/compaction.c b/mm/compaction.c index 952dc2fb24e5..ce08b39d85d4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -969,7 +969,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * is safe to read and it's 0 for tail pages. */ if (unlikely(PageCompound(page))) { - low_pfn += (1UL << compound_order(page)) - 1; + low_pfn += compound_nr(page) - 1; goto isolate_fail; } } @@ -1737,8 +1737,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) * starting at the block pointed to by the migrate scanner pfn within * compact_control. 
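Several hunks in this series (iov_iter earlier, compaction here, and more below) replace the open-coded 1UL << compound_order(page) and PAGE_SIZE << compound_order(page) with the new compound_nr() and page_size() helpers. A toy userspace model of the equivalence, with order standing in for compound_order(head) and MODEL_PAGE_SIZE standing in for PAGE_SIZE:

#include <stdio.h>

#define MODEL_PAGE_SIZE 4096UL		/* stand-in for PAGE_SIZE */

/* pages spanned by a compound page of the given order */
static unsigned long model_compound_nr(unsigned int order)
{
	return 1UL << order;
}

/* bytes spanned by the same compound page */
static unsigned long model_page_size(unsigned int order)
{
	return MODEL_PAGE_SIZE << order;
}

int main(void)
{
	unsigned int order = 9;		/* PMD-sized THP on x86-64 */

	printf("%lu pages, %lu bytes\n",
	       model_compound_nr(order), model_page_size(order));
	/* as in the compaction hunk: skip the whole compound page */
	printf("low_pfn += %lu\n", model_compound_nr(order) - 1);
	return 0;
}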
*/ -static isolate_migrate_t isolate_migratepages(struct zone *zone, - struct compact_control *cc) +static isolate_migrate_t isolate_migratepages(struct compact_control *cc) { unsigned long block_start_pfn; unsigned long block_end_pfn; @@ -1756,8 +1755,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, */ low_pfn = fast_find_migrateblock(cc); block_start_pfn = pageblock_start_pfn(low_pfn); - if (block_start_pfn < zone->zone_start_pfn) - block_start_pfn = zone->zone_start_pfn; + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; /* * fast_find_migrateblock marks a pageblock skipped so to avoid @@ -1787,8 +1786,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) cond_resched(); - page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, - zone); + page = pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone); if (!page) continue; @@ -2078,6 +2077,17 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; + /* + * These counters track activities during zone compaction. Initialize + * them before compacting a new zone. + */ + cc->total_migrate_scanned = 0; + cc->total_free_scanned = 0; + cc->nr_migratepages = 0; + cc->nr_freepages = 0; + INIT_LIST_HEAD(&cc->freepages); + INIT_LIST_HEAD(&cc->migratepages); + cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, cc->classzone_idx); @@ -2158,7 +2168,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->rescan = true; } - switch (isolate_migratepages(cc->zone, cc)) { + switch (isolate_migratepages(cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; putback_movable_pages(&cc->migratepages); @@ -2281,10 +2291,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, { enum compact_result ret; struct compact_control cc = { - .nr_freepages = 0, - .nr_migratepages = 0, - .total_migrate_scanned = 0, - .total_free_scanned = 0, .order = order, .search_order = order, .gfp_mask = gfp_mask, @@ -2305,8 +2311,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, if (capture) current->capture_control = &capc; - INIT_LIST_HEAD(&cc.freepages); - INIT_LIST_HEAD(&cc.migratepages); ret = compact_zone(&cc, &capc); @@ -2408,8 +2412,6 @@ static void compact_node(int nid) struct zone *zone; struct compact_control cc = { .order = -1, - .total_migrate_scanned = 0, - .total_free_scanned = 0, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, @@ -2423,11 +2425,7 @@ static void compact_node(int nid) if (!populated_zone(zone)) continue; - cc.nr_freepages = 0; - cc.nr_migratepages = 0; cc.zone = zone; - INIT_LIST_HEAD(&cc.freepages); - INIT_LIST_HEAD(&cc.migratepages); compact_zone(&cc, NULL); @@ -2529,8 +2527,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct compact_control cc = { .order = pgdat->kcompactd_max_order, .search_order = pgdat->kcompactd_max_order, - .total_migrate_scanned = 0, - .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = false, @@ -2554,16 +2550,10 @@ static void kcompactd_do_work(pg_data_t *pgdat) COMPACT_CONTINUE) continue; - cc.nr_freepages = 0; - cc.nr_migratepages = 0; - cc.total_migrate_scanned = 0; - cc.total_free_scanned = 0; - cc.zone = zone; - INIT_LIST_HEAD(&cc.freepages); - 
INIT_LIST_HEAD(&cc.migratepages); - if (kthread_should_stop()) return; + + cc.zone = zone; status = compact_zone(&cc, NULL); if (status == COMPACT_SUCCESS) { diff --git a/mm/filemap.c b/mm/filemap.c index 40667c2f3383..1146fcfa3215 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -126,7 +126,7 @@ static void page_cache_delete(struct address_space *mapping, /* hugetlb pages are represented by a single entry in the xarray */ if (!PageHuge(page)) { xas_set_order(&xas, page->index, compound_order(page)); - nr = 1U << compound_order(page); + nr = compound_nr(page); } VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -203,8 +203,9 @@ static void unaccount_page_cache_page(struct address_space *mapping, __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); - } else { - VM_BUG_ON_PAGE(PageTransHuge(page), page); + } else if (PageTransHuge(page)) { + __dec_node_page_state(page, NR_FILE_THPS); + filemap_nr_thps_dec(mapping); } /* @@ -281,11 +282,11 @@ EXPORT_SYMBOL(delete_from_page_cache); * @pvec: pagevec with pages to delete * * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index. + * from the mapping. The function expects @pvec to be sorted by page index + * and is optimised for it to be dense. * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec and takes care to delete all corresponding tail pages from the - * mapping as well. + * @pvec. * * The function expects the i_pages lock to be held. */ @@ -294,40 +295,43 @@ static void page_cache_delete_batch(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; - int i = 0, tail_pages = 0; + int i = 0; struct page *page; mapping_set_update(&xas, mapping); xas_for_each(&xas, page, ULONG_MAX) { - if (i >= pagevec_count(pvec) && !tail_pages) + if (i >= pagevec_count(pvec)) break; + + /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(page)) continue; - if (!tail_pages) { - /* - * Some page got inserted in our range? Skip it. We - * have our pages locked so they are protected from - * being removed. - */ - if (page != pvec->pages[i]) { - VM_BUG_ON_PAGE(page->index > - pvec->pages[i]->index, page); - continue; - } - WARN_ON_ONCE(!PageLocked(page)); - if (PageTransHuge(page) && !PageHuge(page)) - tail_pages = HPAGE_PMD_NR - 1; - page->mapping = NULL; - /* - * Leave page->index set: truncation lookup relies - * upon it - */ - i++; - } else { - VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages - != pvec->pages[i]->index, page); - tail_pages--; + /* + * A page got inserted in our range? Skip it. We have our + * pages locked so they are protected from being removed. + * If we see a page whose index is higher than ours, it + * means our page has been removed, which shouldn't be + * possible because we're holding the PageLock. + */ + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, + page); + continue; } + + WARN_ON_ONCE(!PageLocked(page)); + + if (page->index == xas.xa_index) + page->mapping = NULL; + /* Leave page->index set: truncation lookup relies on it */ + + /* + * Move to the next page in the vector if this is a regular + * page or the index is of the last sub-page of this compound + * page. 
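The rewritten page_cache_delete_batch() above no longer counts tail pages; it advances the pagevec cursor only when the xarray walk reaches the last sub-page index of the current entry. A compilable userspace model of that cursor logic, assuming a sorted vector with no holes for brevity (struct fake_page and the sizes are illustrative):

#include <stdio.h>

/* each entry covers [index, index + nr - 1] page-cache slots */
struct fake_page {
	unsigned long index;
	unsigned long nr;
};

int main(void)
{
	/* sorted and dense, as the updated kernel-doc comment expects */
	struct fake_page pvec[] = { { 0, 1 }, { 1, 4 }, { 5, 1 } };
	unsigned long xa_index;
	unsigned int i = 0;

	for (xa_index = 0; xa_index <= 5; xa_index++) {
		struct fake_page *page = &pvec[i];

		/*
		 * The head page is visible at every slot it spans; only
		 * move the cursor once the walk passes its last sub-page.
		 */
		if (page->index + page->nr - 1 == xa_index)
			i++;
	}
	printf("deleted %u entries\n", i);
	return 0;
}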
+ */ + if (page->index + compound_nr(page) - 1 == xas.xa_index) + i++; xas_store(&xas, NULL); total_pages++; } @@ -408,7 +412,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, .range_end = end, }; - if (!mapping_cap_writeback_dirty(mapping)) + if (!mapping_cap_writeback_dirty(mapping) || + !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; wbc_attach_fdatawrite_inode(&wbc, mapping->host); @@ -617,10 +622,13 @@ int filemap_fdatawait_keep_errors(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); +/* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { - return (!dax_mapping(mapping) && mapping->nrpages) || - (dax_mapping(mapping) && mapping->nrexceptional); + if (dax_mapping(mapping)) + return mapping->nrexceptional; + + return mapping->nrpages; } int filemap_write_and_wait(struct address_space *mapping) @@ -1516,7 +1524,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { XA_STATE(xas, &mapping->i_pages, offset); - struct page *head, *page; + struct page *page; rcu_read_lock(); repeat: @@ -1531,25 +1539,19 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) if (!page || xa_is_value(page)) goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } - /* - * Has the page moved? + * Has the page moved or been split? * This is part of the lockless pagecache protocol. See * include/linux/pagemap.h for details. */ if (unlikely(page != xas_reload(&xas))) { - put_page(head); + put_page(page); goto repeat; } + page = find_subpage(page, offset); out: rcu_read_unlock(); @@ -1646,7 +1648,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, } /* Has the page been truncated? */ - if (unlikely(page->mapping != mapping)) { + if (unlikely(compound_head(page)->mapping != mapping)) { unlock_page(page); put_page(page); goto repeat; @@ -1731,7 +1733,6 @@ unsigned find_get_entries(struct address_space *mapping, rcu_read_lock(); xas_for_each(&xas, page, ULONG_MAX) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1742,17 +1743,13 @@ unsigned find_get_entries(struct address_space *mapping, if (xa_is_value(page)) goto export; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; + page = find_subpage(page, xas.xa_index); export: indices[ret] = xas.xa_index; @@ -1761,7 +1758,7 @@ unsigned find_get_entries(struct address_space *mapping, break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1803,33 +1800,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); xas_for_each(&xas, page, end) { - struct page *head; if (xas_retry(&xas, page)) continue; /* Skip over shadow, swap and DAX entries */ if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? 
*/ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1874,7 +1865,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1884,24 +1874,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, if (xa_is_value(page)) break; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1937,7 +1922,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, rcu_read_lock(); xas_for_each_marked(&xas, page, end, tag) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1948,26 +1932,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *index = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -2562,12 +2541,12 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) goto out_retry; /* Did it get truncated? */ - if (unlikely(page->mapping != mapping)) { + if (unlikely(compound_head(page)->mapping != mapping)) { unlock_page(page); put_page(page); goto retry_find; } - VM_BUG_ON_PAGE(page->index != offset, page); + VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); /* * We have a locked page in the page cache, now we need to check @@ -2648,7 +2627,7 @@ void filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *head, *page; + struct page *page; rcu_read_lock(); xas_for_each(&xas, page, end_pgoff) { @@ -2657,24 +2636,19 @@ void filemap_map_pages(struct vm_fault *vmf, if (xa_is_value(page)) goto next; - head = compound_head(page); - /* * Check for a locked page first, as a speculative * reference may adversely influence page migration. */ - if (PageLocked(head)) + if (PageLocked(page)) goto next; - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto next; - /* The page was split under us? */ - if (compound_head(page) != head) - goto skip; - - /* Has the page moved? */ + /* Has the page moved or been split? 
*/ if (unlikely(page != xas_reload(&xas))) goto skip; + page = find_subpage(page, xas.xa_index); if (!PageUptodate(page) || PageReadahead(page) || diff --git a/mm/frame_vector.c b/mm/frame_vector.c index c64dca6e27c2..c431ca81dad5 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -46,6 +46,8 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) nr_frames = vec->nr_allocated; + start = untagged_addr(start); + down_read(&mm->mmap_sem); locked = 1; vma = find_vma_intersection(mm, start, start + 1); diff --git a/mm/gup.c b/mm/gup.c index 98f13ab37bac..23a9f9c9d377 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -29,85 +29,70 @@ struct follow_page_context { unsigned int page_mask; }; -typedef int (*set_dirty_func_t)(struct page *page); - -static void __put_user_pages_dirty(struct page **pages, - unsigned long npages, - set_dirty_func_t sdf) -{ - unsigned long index; - - for (index = 0; index < npages; index++) { - struct page *page = compound_head(pages[index]); - - /* - * Checking PageDirty at this point may race with - * clear_page_dirty_for_io(), but that's OK. Two key cases: - * - * 1) This code sees the page as already dirty, so it skips - * the call to sdf(). That could happen because - * clear_page_dirty_for_io() called page_mkclean(), - * followed by set_page_dirty(). However, now the page is - * going to get written back, which meets the original - * intention of setting it dirty, so all is well: - * clear_page_dirty_for_io() goes on to call - * TestClearPageDirty(), and write the page back. - * - * 2) This code sees the page as clean, so it calls sdf(). - * The page stays dirty, despite being written back, so it - * gets written back again in the next writeback cycle. - * This is harmless. - */ - if (!PageDirty(page)) - sdf(page); - - put_user_page(page); - } -} - /** - * put_user_pages_dirty() - release and dirty an array of gup-pinned pages - * @pages: array of pages to be marked dirty and released. + * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages + * @pages: array of pages to be maybe marked dirty, and definitely released. * @npages: number of pages in the @pages array. + * @make_dirty: whether to mark the pages dirty * * "gup-pinned page" refers to a page that has had one of the get_user_pages() * variants called on that page. * * For each page in the @pages array, make that page (or its head page, if a - * compound page) dirty, if it was previously listed as clean. Then, release - * the page using put_user_page(). + * compound page) dirty, if @make_dirty is true, and if the page was previously + * listed as clean. In any case, releases all pages using put_user_page(), + * possibly via put_user_pages(), for the non-dirty case. * * Please see the put_user_page() documentation for details. * - * set_page_dirty(), which does not lock the page, is used here. - * Therefore, it is the caller's responsibility to ensure that this is - * safe. If not, then put_user_pages_dirty_lock() should be called instead. + * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is + * required, then the caller should a) verify that this is really correct, + * because _lock() is usually required, and b) hand code it: + * set_page_dirty_lock(), put_user_page(). 
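The gup.c hunk above folds the two dirty-release variants into one put_user_pages_dirty_lock(pages, npages, make_dirty) entry point. A compilable userspace model of the resulting control flow; set_dirty() and release() merely stand in for set_page_dirty_lock() and put_user_page(), and struct fake_page is invented for illustration:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct fake_page {
	bool dirty;
	int refcount;
};

static void set_dirty(struct fake_page *page) { page->dirty = true; }
static void release(struct fake_page *page)   { page->refcount--; }

static void release_pages_dirty_lock(struct fake_page **pages, size_t npages,
				     bool make_dirty)
{
	for (size_t i = 0; i < npages; i++) {
		/*
		 * Racy PageDirty-style check: skipping an already-dirty
		 * page is safe for the two reasons spelled out in the
		 * comment in the hunk above.
		 */
		if (make_dirty && !pages[i]->dirty)
			set_dirty(pages[i]);
		release(pages[i]);
	}
}

int main(void)
{
	struct fake_page a = { false, 1 }, b = { true, 1 };
	struct fake_page *pages[] = { &a, &b };

	release_pages_dirty_lock(pages, 2, true);
	printf("a: dirty=%d ref=%d, b: dirty=%d ref=%d\n",
	       a.dirty, a.refcount, b.dirty, b.refcount);
	return 0;
}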
* */ -void put_user_pages_dirty(struct page **pages, unsigned long npages) +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty) { - __put_user_pages_dirty(pages, npages, set_page_dirty); -} -EXPORT_SYMBOL(put_user_pages_dirty); + unsigned long index; -/** - * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages - * @pages: array of pages to be marked dirty and released. - * @npages: number of pages in the @pages array. - * - * For each page in the @pages array, make that page (or its head page, if a - * compound page) dirty, if it was previously listed as clean. Then, release - * the page using put_user_page(). - * - * Please see the put_user_page() documentation for details. - * - * This is just like put_user_pages_dirty(), except that it invokes - * set_page_dirty_lock(), instead of set_page_dirty(). - * - */ -void put_user_pages_dirty_lock(struct page **pages, unsigned long npages) -{ - __put_user_pages_dirty(pages, npages, set_page_dirty_lock); + /* + * TODO: this can be optimized for huge pages: if a series of pages is + * physically contiguous and part of the same compound page, then a + * single operation to the head page should suffice. + */ + + if (!make_dirty) { + put_user_pages(pages, npages); + return; + } + + for (index = 0; index < npages; index++) { + struct page *page = compound_head(pages[index]); + /* + * Checking PageDirty at this point may race with + * clear_page_dirty_for_io(), but that's OK. Two key + * cases: + * + * 1) This code sees the page as already dirty, so it + * skips the call to set_page_dirty(). That could happen + * because clear_page_dirty_for_io() called + * page_mkclean(), followed by set_page_dirty(). + * However, now the page is going to get written back, + * which meets the original intention of setting it + * dirty, so all is well: clear_page_dirty_for_io() goes + * on to call TestClearPageDirty(), and write the page + * back. + * + * 2) This code sees the page as clean, so it calls + * set_page_dirty(). The page stays dirty, despite being + * written back, so it gets written back again in the + * next writeback cycle. This is harmless. + */ + if (!PageDirty(page)) + set_page_dirty_lock(page); + put_user_page(page); + } } EXPORT_SYMBOL(put_user_pages_dirty_lock); @@ -399,7 +384,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } - if (flags & FOLL_SPLIT) { + if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) { int ret; page = pmd_page(*pmd); if (is_huge_zero_page(page)) { @@ -408,7 +393,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, split_huge_pmd(vma, pmd, address); if (pmd_trans_unstable(pmd)) ret = -EBUSY; - } else { + } else if (flags & FOLL_SPLIT) { if (unlikely(!try_get_page(page))) { spin_unlock(ptl); return ERR_PTR(-ENOMEM); @@ -420,6 +405,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, put_page(page); if (pmd_none(*pmd)) return no_page_table(vma, flags); + } else { /* flags & FOLL_SPLIT_PMD */ + spin_unlock(ptl); + split_huge_pmd(vma, pmd, address); + ret = pte_alloc(mm, pmd) ? -ENOMEM : 0; } return ret ? 
ERR_PTR(ret) : @@ -799,6 +788,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (!nr_pages) return 0; + start = untagged_addr(start); + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); /* @@ -961,6 +952,8 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; vm_fault_t ret, major = 0; + address = untagged_addr(address); + if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY; @@ -1460,7 +1453,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, * gup may start from a tail page. Advance step by the left * part. */ - step = (1 << compound_order(head)) - (pages[i] - head); + step = compound_nr(head) - (pages[i] - head); /* * If we get a page from the CMA zone, since we are going to * be pinning these entries, we might as well move them out diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de1f15969e27..73fc517c08d2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -496,11 +496,25 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } -static inline struct list_head *page_deferred_list(struct page *page) +#ifdef CONFIG_MEMCG +static inline struct deferred_split *get_deferred_split_queue(struct page *page) { - /* ->lru in the tail pages is occupied by compound_head. */ - return &page[2].deferred_list; + struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + + if (memcg) + return &memcg->deferred_split_queue; + else + return &pgdat->deferred_split_queue; } +#else +static inline struct deferred_split *get_deferred_split_queue(struct page *page) +{ + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + + return &pgdat->deferred_split_queue; +} +#endif void prep_transhuge_page(struct page *page) { @@ -2497,6 +2511,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct page *head = compound_head(page); pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; + struct address_space *swap_cache = NULL; + unsigned long offset = 0; int i; lruvec = mem_cgroup_page_lruvec(head, pgdat); @@ -2504,6 +2520,14 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); + if (PageAnon(head) && PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + offset = swp_offset(entry); + swap_cache = swap_address_space(entry); + xa_lock(&swap_cache->i_pages); + } + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ @@ -2513,6 +2537,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); + } else if (!PageAnon(page)) { + __xa_store(&head->mapping->i_pages, head[i].index, + head + i, 0); + } else if (swap_cache) { + __xa_store(&swap_cache->i_pages, offset + i, + head + i, 0); } } @@ -2523,10 +2553,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { /* Additional pin to swap cache */ - if (PageSwapCache(head)) + if (PageSwapCache(head)) { page_ref_add(head, 2); - else + xa_unlock(&swap_cache->i_pages); + } else { page_ref_inc(head); + } } else { /* Additional pin to page cache */ page_ref_add(head, 2); @@ -2673,6 +2705,7 @@ int 
split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); + struct deferred_split *ds_queue = get_deferred_split_queue(page); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int count, mapcount, extra_pins, ret; @@ -2759,17 +2792,17 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } /* Prevent deferred_split_scan() touching ->_refcount */ - spin_lock(&pgdata->split_queue_lock); + spin_lock(&ds_queue->split_queue_lock); count = page_count(head); mapcount = total_mapcount(head); if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { - pgdata->split_queue_len--; + ds_queue->split_queue_len--; list_del(page_deferred_list(head)); } if (mapping) __dec_node_page_state(page, NR_SHMEM_THPS); - spin_unlock(&pgdata->split_queue_lock); + spin_unlock(&ds_queue->split_queue_lock); __split_huge_page(page, list, end, flags); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; @@ -2786,7 +2819,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) dump_page(page, "total_mapcount(head) > 0"); BUG(); } - spin_unlock(&pgdata->split_queue_lock); + spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(&pgdata->lru_lock, flags); @@ -2808,53 +2841,86 @@ fail: if (mapping) void free_transhuge_page(struct page *page) { - struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); + struct deferred_split *ds_queue = get_deferred_split_queue(page); unsigned long flags; - spin_lock_irqsave(&pgdata->split_queue_lock, flags); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(page_deferred_list(page))) { - pgdata->split_queue_len--; + ds_queue->split_queue_len--; list_del(page_deferred_list(page)); } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); free_compound_page(page); } void deferred_split_huge_page(struct page *page) { - struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); + struct deferred_split *ds_queue = get_deferred_split_queue(page); +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; +#endif unsigned long flags; VM_BUG_ON_PAGE(!PageTransHuge(page), page); - spin_lock_irqsave(&pgdata->split_queue_lock, flags); + /* + * The try_to_unmap() in the page reclaim path might reach here too; + * this may cause a race condition that corrupts the deferred split + * queue. And if page reclaim is already handling the same page, it + * is unnecessary to handle it again in the shrinker. + * + * Check PageSwapCache to determine if the page is being handled by + * page reclaim, since THP swap would add the page into the swap + * cache before calling try_to_unmap(). 
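The huge_memory.c changes above route all deferred-split bookkeeping through get_deferred_split_queue(), which prefers the memcg's queue when the page is charged and falls back to the NUMA node's queue otherwise. A userspace model of that selection (all types and names here are stand-ins):

#include <stddef.h>
#include <stdio.h>

struct deferred_split { unsigned long split_queue_len; };
struct fake_memcg     { struct deferred_split deferred_split_queue; };
struct fake_node      { struct deferred_split deferred_split_queue; };

struct fake_page {
	struct fake_memcg *memcg;	/* NULL when not charged */
	struct fake_node *node;
};

static struct deferred_split *get_queue(struct fake_page *page)
{
	if (page->memcg)
		return &page->memcg->deferred_split_queue;
	return &page->node->deferred_split_queue;
}

int main(void)
{
	struct fake_memcg m = { { 0 } };
	struct fake_node n = { { 0 } };
	struct fake_page charged = { &m, &n }, root = { NULL, &n };

	get_queue(&charged)->split_queue_len++;
	get_queue(&root)->split_queue_len++;
	printf("memcg queue: %lu, node queue: %lu\n",
	       m.deferred_split_queue.split_queue_len,
	       n.deferred_split_queue.split_queue_len);
	return 0;
}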
+ */ + if (PageSwapCache(page)) + return; + + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); - list_add_tail(page_deferred_list(page), &pgdata->split_queue); - pgdata->split_queue_len++; + list_add_tail(page_deferred_list(page), &ds_queue->split_queue); + ds_queue->split_queue_len++; +#ifdef CONFIG_MEMCG + if (memcg) + memcg_set_shrinker_bit(memcg, page_to_nid(page), + deferred_split_shrinker.id); +#endif } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); - return READ_ONCE(pgdata->split_queue_len); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + +#ifdef CONFIG_MEMCG + if (sc->memcg) + ds_queue = &sc->memcg->deferred_split_queue; +#endif + return READ_ONCE(ds_queue->split_queue_len); } static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; LIST_HEAD(list), *pos, *next; struct page *page; int split = 0; - spin_lock_irqsave(&pgdata->split_queue_lock, flags); +#ifdef CONFIG_MEMCG + if (sc->memcg) + ds_queue = &sc->memcg->deferred_split_queue; +#endif + + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_safe(pos, next, &pgdata->split_queue) { + list_for_each_safe(pos, next, &ds_queue->split_queue) { page = list_entry((void *)pos, struct page, mapping); page = compound_head(page); if (get_page_unless_zero(page)) { @@ -2862,12 +2928,12 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } else { /* We lost race with put_compound_page() */ list_del_init(page_deferred_list(page)); - pgdata->split_queue_len--; + ds_queue->split_queue_len--; } if (!--sc->nr_to_scan) break; } - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); @@ -2881,15 +2947,15 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, put_page(page); } - spin_lock_irqsave(&pgdata->split_queue_lock, flags); - list_splice_tail(&list, &pgdata->split_queue); - spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + list_splice_tail(&list, &ds_queue->split_queue); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); /* * Stop shrinker if we didn't split any page, but the queue is empty. * This can happen if pages were freed under us. 
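deferred_split_scan() above pins entries, detaches them to a private list under split_queue_lock, splits them with the lock dropped, and splices whatever could not be split back onto the queue. A self-contained userspace model of that detach/process/splice-back shape (struct item is a stand-in, and a singly-linked list replaces the kernel's list_head):

#include <stdio.h>
#include <stdlib.h>

struct item {
	int splittable;
	struct item *next;
};

static unsigned long scan(struct item **queue, int nr_to_scan)
{
	struct item *keep = NULL, **keep_tail = &keep;
	unsigned long split = 0;

	while (*queue && nr_to_scan-- > 0) {
		struct item *it = *queue;

		*queue = it->next;	/* detach, normally under the lock */
		it->next = NULL;

		if (it->splittable) {	/* split_huge_page() succeeded */
			split++;
			free(it);
		} else {		/* could not split: keep it around */
			*keep_tail = it;
			keep_tail = &it->next;
		}
	}
	/* splice survivors back (the kernel adds them at the tail) */
	*keep_tail = *queue;
	*queue = keep;
	return split;
}

int main(void)
{
	struct item *q = NULL;

	for (int i = 0; i < 4; i++) {
		struct item *it = malloc(sizeof(*it));

		if (!it)
			return 1;
		it->splittable = i & 1;
		it->next = q;
		q = it;
	}
	printf("split %lu pages\n", scan(&q, 8));
	while (q) {
		struct item *next = q->next;

		free(q);
		q = next;
	}
	return 0;
}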
*/ - if (!split && list_empty(&pgdata->split_queue)) + if (!split && list_empty(&ds_queue->split_queue)) return SHRINK_STOP; return split; } @@ -2898,7 +2964,8 @@ static struct shrinker deferred_split_shrinker = { .count_objects = deferred_split_count, .scan_objects = deferred_split_scan, .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | + SHRINKER_NONSLAB, }; #ifdef CONFIG_DEBUG_FS diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6d7296dd11b8..ef37c85423a5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1405,12 +1405,25 @@ pgoff_t __basepage_index(struct page *page) } static struct page *alloc_buddy_huge_page(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask) + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { int order = huge_page_order(h); struct page *page; + bool alloc_try_hard = true; - gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + /* + * By default we always try hard to allocate the page with + * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in + * a loop (to adjust global huge page counts) and previous allocation + * failed, do not continue to try hard on the same node. Use the + * node_alloc_noretry bitmap to manage this state information. + */ + if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) + alloc_try_hard = false; + gfp_mask |= __GFP_COMP|__GFP_NOWARN; + if (alloc_try_hard) + gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); @@ -1419,6 +1432,22 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, else __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + /* + * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this + * indicates an overall state change. Clear bit so that we resume + * normal 'try hard' allocations. + */ + if (node_alloc_noretry && page && !alloc_try_hard) + node_clear(nid, *node_alloc_noretry); + + /* + * If we tried hard to get a page but failed, set bit so that + * subsequent attempts will not try as hard until there is an + * overall state change. + */ + if (node_alloc_noretry && !page && alloc_try_hard) + node_set(nid, *node_alloc_noretry); + return page; } @@ -1427,7 +1456,8 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, * should use this function to get new hugetlb pages */ static struct page *alloc_fresh_huge_page(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask) + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { struct page *page; @@ -1435,7 +1465,7 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, page = alloc_gigantic_page(h, gfp_mask, nid, nmask); else page = alloc_buddy_huge_page(h, gfp_mask, - nid, nmask); + nid, nmask, node_alloc_noretry); if (!page) return NULL; @@ -1450,14 +1480,16 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, * Allocates a fresh page to the hugetlb allocator pool in the node interleaved * manner. 
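The hugetlb changes here thread a nodemask_t through the allocation loop so that, once a hard __GFP_RETRY_MAYFAIL attempt fails on a node, later iterations stop retrying hard there until a success indicates the state changed. A compilable userspace model using a plain bitmask; try_alloc() is a stand-in for __alloc_pages_nodemask() that pretends node 2 is exhausted:

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

/* stand-in allocator: node 2 always fails, others always succeed */
static bool try_alloc(int nid, bool try_hard)
{
	(void)try_hard;
	return nid != 2;
}

static bool alloc_pool_page(int nid, unsigned long *noretry_mask)
{
	bool try_hard = !(*noretry_mask & (1UL << nid));
	bool ok = try_alloc(nid, try_hard);

	if (ok && !try_hard)			/* state changed: retry hard again */
		*noretry_mask &= ~(1UL << nid);
	if (!ok && try_hard)			/* remember the expensive failure */
		*noretry_mask |= 1UL << nid;
	return ok;
}

int main(void)
{
	unsigned long mask = 0;

	for (int pass = 0; pass < 2; pass++)
		for (int nid = 0; nid < NR_NODES; nid++)
			printf("pass %d node %d: %s (mask %#lx)\n", pass, nid,
			       alloc_pool_page(nid, &mask) ? "ok" : "fail",
			       mask);
	return 0;
}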
*/ -static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + nodemask_t *node_alloc_noretry) { struct page *page; int nr_nodes, node; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed); + page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, + node_alloc_noretry); if (page) break; } @@ -1601,7 +1633,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock(&hugetlb_lock); - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; @@ -1637,7 +1669,7 @@ struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, if (hstate_is_gigantic(h)) return NULL; - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; @@ -2207,13 +2239,33 @@ static void __init gather_bootmem_prealloc(void) static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; + nodemask_t *node_alloc_noretry; + + if (!hstate_is_gigantic(h)) { + /* + * Bit mask controlling how hard we retry per-node allocations. + * Ignore errors as lower level routines can deal with + * node_alloc_noretry == NULL. If this kmalloc fails at boot + * time, we are likely in bigger trouble. + */ + node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), + GFP_KERNEL); + } else { + /* allocations done at boot time */ + node_alloc_noretry = NULL; + } + + /* bit mask controlling how hard we retry per-node allocations */ + if (node_alloc_noretry) + nodes_clear(*node_alloc_noretry); for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h)) break; } else if (!alloc_pool_huge_page(h, - &node_states[N_MEMORY])) + &node_states[N_MEMORY], + node_alloc_noretry)) break; cond_resched(); } @@ -2225,6 +2277,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) h->max_huge_pages, buf, i); h->max_huge_pages = i; } + + kfree(node_alloc_noretry); } static void __init hugetlb_init_hstates(void) @@ -2323,6 +2377,17 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; + NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); + + /* + * Bit mask controlling how hard we retry per-node allocations. + * If we can not allocate the bit mask, do not attempt to allocate + * the requested huge pages. 
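Here and in the following hunk, set_max_huge_pages() allocates the noretry mask up front, bails out with -ENOMEM if that fails, and must free the mask on every exit path, including the early -EINVAL return for gigantic pages. A minimal model of that ownership pattern; set_max_pages() and its arguments are invented for illustration:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int set_max_pages(unsigned long count, int gigantic_unsupported)
{
	unsigned long *noretry = calloc(1, sizeof(*noretry));

	if (!noretry)		/* no mask: do not attempt the allocations */
		return -ENOMEM;

	if (gigantic_unsupported && count > 0) {
		free(noretry);	/* the early return must not leak the mask */
		return -EINVAL;
	}

	/* ... grow or shrink the pool, passing noretry down ... */

	free(noretry);
	return 0;
}

int main(void)
{
	printf("%d %d\n", set_max_pages(8, 0), set_max_pages(8, 1));
	return 0;
}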
+ */ + if (node_alloc_noretry) + nodes_clear(*node_alloc_noretry); + else + return -ENOMEM; spin_lock(&hugetlb_lock); @@ -2356,6 +2421,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { spin_unlock(&hugetlb_lock); + NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } /* Fall through to decrease pool */ @@ -2388,7 +2454,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, /* yield cpu to avoid soft lockup */ cond_resched(); - ret = alloc_pool_huge_page(h, nodes_allowed); + ret = alloc_pool_huge_page(h, nodes_allowed, + node_alloc_noretry); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -2429,6 +2496,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, h->max_huge_pages = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); + NODEMASK_FREE(node_alloc_noretry); + return 0; } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 68c2f2f3c05b..f1930fa0b445 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -139,7 +139,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, if (!page_hcg || page_hcg != h_cg) goto out; - nr_pages = 1 << compound_order(page); + nr_pages = compound_nr(page); if (!parent) { parent = root_h_cgroup; /* root has no limit */ diff --git a/mm/init-mm.c b/mm/init-mm.c index a787a319211e..fb1e15028ef0 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -35,6 +35,6 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, - .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, + .cpu_bitmap = CPU_BITS_NONE, INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/internal.h b/mm/internal.h index e32390802fd3..0d5f720c75ab 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -39,7 +39,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma) +static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); } diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 95d16a42db6b..6814d6d6a023 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -304,7 +304,6 @@ size_t kasan_metadata_size(struct kmem_cache *cache) struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, const void *object) { - BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); return (void *)object + cache->kasan_info.alloc_meta_offset; } @@ -315,14 +314,31 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache, return (void *)object + cache->kasan_info.free_meta_offset; } + +static void kasan_set_free_info(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + u8 idx = 0; + + alloc_meta = get_alloc_info(cache, object); + +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY + idx = alloc_meta->free_track_idx; + alloc_meta->free_pointer_tag[idx] = tag; + alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; +#endif + + set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); +} + void kasan_poison_slab(struct page *page) { unsigned long i; - for (i = 0; i < (1 << compound_order(page)); i++) + for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); - kasan_poison_shadow(page_address(page), - 
PAGE_SIZE << compound_order(page), + kasan_poison_shadow(page_address(page), page_size(page), KASAN_KMALLOC_REDZONE); } @@ -452,7 +468,8 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object, unlikely(!(cache->flags & SLAB_KASAN))) return false; - set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); + kasan_set_free_info(cache, object, tag); + quarantine_put(get_free_info(cache, object), cache); return IS_ENABLED(CONFIG_KASAN_GENERIC); @@ -524,7 +541,7 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, page = virt_to_page(ptr); redzone_start = round_up((unsigned long)(ptr + size), KASAN_SHADOW_SCALE_SIZE); - redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page)); + redzone_end = (unsigned long)ptr + page_size(page); kasan_unpoison_shadow(ptr, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, @@ -560,8 +577,7 @@ void kasan_poison_kfree(void *ptr, unsigned long ip) kasan_report_invalid_free(ptr, ip); return; } - kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), - KASAN_FREE_PAGE); + kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE); } else { __kasan_slab_free(page->slab_cache, ptr, ip, false); } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 014f19e76247..35cff6bbb716 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -95,9 +95,19 @@ struct kasan_track { depot_stack_handle_t stack; }; +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY +#define KASAN_NR_FREE_STACKS 5 +#else +#define KASAN_NR_FREE_STACKS 1 +#endif + struct kasan_alloc_meta { struct kasan_track alloc_track; - struct kasan_track free_track; + struct kasan_track free_track[KASAN_NR_FREE_STACKS]; +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY + u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; + u8 free_track_idx; +#endif }; struct qlist_node { @@ -146,6 +156,8 @@ void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip); +struct page *kasan_addr_to_page(const void *addr); + #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 0e5f965f1882..621782100eaa 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -111,7 +111,7 @@ static void print_track(struct kasan_track *track, const char *prefix) } } -static struct page *addr_to_page(const void *addr) +struct page *kasan_addr_to_page(const void *addr) { if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) @@ -151,15 +151,38 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, (void *)(object_addr + cache->object_size)); } +static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + int i = 0; + + alloc_meta = get_alloc_info(cache, object); + +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY + for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { + if (alloc_meta->free_pointer_tag[i] == tag) + break; + } + if (i == KASAN_NR_FREE_STACKS) + i = alloc_meta->free_track_idx; +#endif + + return &alloc_meta->free_track[i]; +} + static void describe_object(struct kmem_cache *cache, void *object, - const void *addr) + const void *addr, u8 tag) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); if (cache->flags & SLAB_KASAN) { + struct kasan_track *free_track; + print_track(&alloc_info->alloc_track, "Allocated"); pr_err("\n"); - 
print_track(&alloc_info->free_track, "Freed"); + free_track = kasan_get_free_track(cache, object, tag); + print_track(free_track, "Freed"); pr_err("\n"); } @@ -344,9 +367,9 @@ static void print_address_stack_frame(const void *addr) print_decoded_frame_descr(frame_descr); } -static void print_address_description(void *addr) +static void print_address_description(void *addr, u8 tag) { - struct page *page = addr_to_page(addr); + struct page *page = kasan_addr_to_page(addr); dump_stack(); pr_err("\n"); @@ -355,7 +378,7 @@ static void print_address_description(void *addr) struct kmem_cache *cache = page->slab_cache; void *object = nearest_obj(cache, page, addr); - describe_object(cache, object, addr); + describe_object(cache, object, addr, tag); } if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { @@ -435,13 +458,14 @@ static bool report_enabled(void) void kasan_report_invalid_free(void *object, unsigned long ip) { unsigned long flags; + u8 tag = get_tag(object); + object = reset_tag(object); start_report(&flags); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); - print_tags(get_tag(object), reset_tag(object)); - object = reset_tag(object); + print_tags(tag, object); pr_err("\n"); - print_address_description(object); + print_address_description(object, tag); pr_err("\n"); print_shadow_for_address(object); end_report(&flags); @@ -479,7 +503,7 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon pr_err("\n"); if (addr_has_shadow(untagged_addr)) { - print_address_description(untagged_addr); + print_address_description(untagged_addr, get_tag(tagged_addr)); pr_err("\n"); print_shadow_for_address(info.first_bad_addr); } else { diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c index 8eaf5f722271..969ae08f59d7 100644 --- a/mm/kasan/tags_report.c +++ b/mm/kasan/tags_report.c @@ -36,6 +36,30 @@ const char *get_bug_type(struct kasan_access_info *info) { +#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY + struct kasan_alloc_meta *alloc_meta; + struct kmem_cache *cache; + struct page *page; + const void *addr; + void *object; + u8 tag; + int i; + + tag = get_tag(info->access_addr); + addr = reset_tag(info->access_addr); + page = kasan_addr_to_page(addr); + if (page && PageSlab(page)) { + cache = page->slab_cache; + object = nearest_obj(cache, page, (void *)addr); + alloc_meta = get_alloc_info(cache, object); + + for (i = 0; i < KASAN_NR_FREE_STACKS; i++) + if (alloc_meta->free_pointer_tag[i] == tag) + return "use-after-free"; + return "out-of-bounds"; + } + +#endif return "invalid-access"; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ccede2425c3f..0a1b4b484ac5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -48,6 +48,7 @@ enum scan_result { SCAN_CGROUP_CHARGE_FAIL, SCAN_EXCEED_SWAP_PTE, SCAN_TRUNCATED, + SCAN_PAGE_HAS_PRIVATE, }; #define CREATE_TRACE_POINTS @@ -76,6 +77,8 @@ static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __read_mostly; +#define MAX_PTE_MAPPED_THP 8 + /** * struct mm_slot - hash lookup from mm to mm_slot * @hash: hash collision list @@ -86,6 +89,10 @@ struct mm_slot { struct hlist_node hash; struct list_head mm_node; struct mm_struct *mm; + + /* pte-mapped THP in this mm */ + int nr_pte_mapped_thp; + unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP]; }; /** @@ -404,7 +411,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, (vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; - if 
(shmem_file(vma->vm_file)) { + + if (shmem_file(vma->vm_file) || + (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + vma->vm_file && + (vm_flags & VM_DENYWRITE))) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) return false; return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, @@ -456,8 +467,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long hstart, hend; /* - * khugepaged does not yet work on non-shmem files or special - * mappings. And file-private shmem THP is not supported. + * khugepaged only supports read-only files for non-shmem files. + * khugepaged does not yet work on special mappings. And + * file-private shmem THP is not supported. */ if (!hugepage_vma_check(vma, vm_flags)) return 0; @@ -1248,6 +1260,159 @@ static void collect_mm_slot(struct mm_slot *mm_slot) } #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +/* + * Notify khugepaged that given addr of the mm is pte-mapped THP. Then + * khugepaged should try to collapse the page table. + */ +static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ + struct mm_slot *mm_slot; + + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) + mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; + spin_unlock(&khugepaged_mm_lock); + return 0; +} + +/** + * Try to collapse a pte-mapped THP for mm at address haddr. + * + * This function checks whether all the PTEs in the PMD are pointing to the + * right THP. If so, retract the page table so the THP can refault in with + * as pmd-mapped. + */ +void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) +{ + unsigned long haddr = addr & HPAGE_PMD_MASK; + struct vm_area_struct *vma = find_vma(mm, haddr); + struct page *hpage = NULL; + pte_t *start_pte, *pte; + pmd_t *pmd, _pmd; + spinlock_t *ptl; + int count = 0; + int i; + + if (!vma || !vma->vm_file || + vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) + return; + + /* + * This vm_flags may not have VM_HUGEPAGE if the page was not + * collapsed by this mm. But we can still collapse if the page is + * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() + * will not fail the vma for missing VM_HUGEPAGE + */ + if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE)) + return; + + pmd = mm_find_pmd(mm, haddr); + if (!pmd) + return; + + start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + + /* step 1: check all mapped PTEs are to the right huge page */ + for (i = 0, addr = haddr, pte = start_pte; + i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + struct page *page; + + /* empty pte, skip */ + if (pte_none(*pte)) + continue; + + /* page swapped out, abort */ + if (!pte_present(*pte)) + goto abort; + + page = vm_normal_page(vma, addr, *pte); + + if (!page || !PageCompound(page)) + goto abort; + + if (!hpage) { + hpage = compound_head(page); + /* + * The mapping of the THP should not change. + * + * Note that uprobe, debugger, or MAP_PRIVATE may + * change the page table, but the new page will + * not pass PageCompound() check. + */ + if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping)) + goto abort; + } + + /* + * Confirm the page maps to the correct subpage. + * + * Note that uprobe, debugger, or MAP_PRIVATE may change + * the page table, but the new page will not pass + * PageCompound() check. 
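Step 1 of collapse_pte_mapped_thp() above verifies that every present PTE in the PMD range maps the matching sub-page of a single THP, tolerating pte_none() gaps and aborting on any mismatch. A simplified userspace model of that check: PTEs are modeled as plain sub-page numbers, NR_PTES is a tiny stand-in for HPAGE_PMD_NR (512 on x86-64), and the real code additionally pins the head page and compares against compound_head():

#include <stdbool.h>
#include <stdio.h>

#define NR_PTES 8	/* tiny stand-in for HPAGE_PMD_NR */

/* every present entry must map head + i for one inferred head */
static bool ptes_map_one_thp(const long *pte)
{
	long head = 0;

	for (int i = 0; i < NR_PTES; i++) {
		if (pte[i] == 0)
			continue;		/* empty pte: gaps are fine */
		if (head == 0)
			head = pte[i] - i;	/* infer the head sub-page */
		if (pte[i] != head + i)
			return false;		/* wrong sub-page: abort */
	}
	return true;
}

int main(void)
{
	long ok[NR_PTES]  = { 100, 101, 0, 103, 104, 105, 106, 107 };
	long bad[NR_PTES] = { 100, 101, 999, 103, 104, 105, 106, 107 };

	printf("ok: %d, bad: %d\n", ptes_map_one_thp(ok), ptes_map_one_thp(bad));
	return 0;
}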
+ */ + if (WARN_ON(hpage + i != page)) + goto abort; + count++; + } + + /* step 2: adjust rmap */ + for (i = 0, addr = haddr, pte = start_pte; + i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + struct page *page; + + if (pte_none(*pte)) + continue; + page = vm_normal_page(vma, addr, *pte); + page_remove_rmap(page, false); + } + + pte_unmap_unlock(start_pte, ptl); + + /* step 3: set proper refcount and mm_counters. */ + if (hpage) { + page_ref_sub(hpage, count); + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); + } + + /* step 4: collapse pmd */ + ptl = pmd_lock(vma->vm_mm, pmd); + _pmd = pmdp_collapse_flush(vma, addr, pmd); + spin_unlock(ptl); + mm_dec_nr_ptes(mm); + pte_free(mm, pmd_pgtable(_pmd)); + return; + +abort: + pte_unmap_unlock(start_pte, ptl); +} + +static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + int i; + + if (likely(mm_slot->nr_pte_mapped_thp == 0)) + return 0; + + if (!down_write_trylock(&mm->mmap_sem)) + return -EBUSY; + + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) + collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]); + +out: + mm_slot->nr_pte_mapped_thp = 0; + up_write(&mm->mmap_sem); + return 0; +} + static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; @@ -1256,7 +1421,22 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - /* probably overkill */ + /* + * Check vma->anon_vma to exclude MAP_PRIVATE mappings that + * got written to. These VMAs are likely not worth taking + * down_write(mmap_sem) for, as the PMD mapping is likely to + * be split later anyway. + * + * Note that the vma->anon_vma check is racy: it can be set + * up after the check but before we took mmap_sem by the + * fault path. But the page lock would prevent establishing + * any new ptes of the page, so we are safe. + * + * An alternative would be to drop the check, but to check + * that the page table is clear before calling + * pmdp_collapse_flush() under ptl. It has a higher chance + * of recovering the THP for the VMA, but also a higher cost. + */ if (vma->anon_vma) continue; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); @@ -1269,9 +1449,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) continue; /* * We need exclusive mmap_sem to retract page table. - * If trylock fails we would end up with pte-mapped THP after - * re-fault. Not ideal, but it's more important to not disturb - * the system too much. + * + * We use trylock due to lock inversion: we need to acquire + * mmap_sem while holding the page lock. The fault path does + * it in the reverse order. Trylock is a way to avoid the + * deadlock. */ if (down_write_trylock(&vma->vm_mm->mmap_sem)) { spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); @@ -1281,18 +1462,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) up_write(&vma->vm_mm->mmap_sem); mm_dec_nr_ptes(vma->vm_mm); pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + } else { + /* Try again later */ + khugepaged_add_pte_mapped_thp(vma->vm_mm, addr); } } i_mmap_unlock_write(mapping); } /** - * collapse_shmem - collapse small tmpfs/shmem pages into huge one. + * collapse_file - collapse filemap/tmpfs/shmem pages into a huge one. 
* * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; * - scan page cache replacing old pages with the new one - * + swap in pages if necessary; + * + swap/gup in pages if necessary; * + fill in gaps; * + keep old pages around in case rollback is required; * - if replacing succeeds: @@ -1304,10 +1488,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static void collapse_shmem(struct mm_struct *mm, - struct address_space *mapping, pgoff_t start, +static void collapse_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page **hpage, int node) { + struct address_space *mapping = file->f_mapping; gfp_t gfp; struct page *new_page; struct mem_cgroup *memcg; @@ -1315,7 +1500,9 @@ static void collapse_shmem(struct mm_struct *mm, LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; + bool is_shmem = shmem_file(file); + VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); /* Only allocate from the target node */ @@ -1347,7 +1534,8 @@ static void collapse_shmem(struct mm_struct *mm, } while (1); __SetPageLocked(new_page); - __SetPageSwapBacked(new_page); + if (is_shmem) + __SetPageSwapBacked(new_page); new_page->index = start; new_page->mapping = mapping; @@ -1362,41 +1550,75 @@ static void collapse_shmem(struct mm_struct *mm, struct page *page = xas_next(&xas); VM_BUG_ON(index != xas.xa_index); - if (!page) { - /* - * Stop if extent has been truncated or hole-punched, - * and is now completely empty. - */ - if (index == start) { - if (!xas_next_entry(&xas, end - 1)) { - result = SCAN_TRUNCATED; + if (is_shmem) { + if (!page) { + /* + * Stop if extent has been truncated or + * hole-punched, and is now completely + * empty. 
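With the multi-order XA_STATE_ORDER cursor declared above, the stores in the following hunks put the head page into the xarray once for the whole range, where the old code stored new_page + (index % HPAGE_PMD_NR) into each slot. A toy illustration of multi-order entry semantics, with a plain array playing the xarray and ORDER shrunk from HPAGE_PMD_ORDER to 3:

#include <stdio.h>

#define ORDER 3			/* shrunk from HPAGE_PMD_ORDER */
#define NR (1UL << ORDER)

/* toy "xarray": NR slots backing one aligned multi-order range */
static const char *slots[NR];

static void store_multiorder(unsigned long index, const char *head)
{
	unsigned long first = index & ~(NR - 1);

	/* conceptually one entry covering the range, not NR copies */
	for (unsigned long i = first; i < first + NR; i++)
		slots[i] = head;
}

int main(void)
{
	store_multiorder(0, "huge-page-head");
	/* a lookup at any sub-index resolves to the head page */
	printf("lookup(5) -> %s\n", slots[5]);
	return 0;
}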
+ */ + if (index == start) { + if (!xas_next_entry(&xas, end - 1)) { + result = SCAN_TRUNCATED; + goto xa_locked; + } + xas_set(&xas, index); + } + if (!shmem_charge(mapping->host, 1)) { + result = SCAN_FAIL; goto xa_locked; } - xas_set(&xas, index); + xas_store(&xas, new_page); + nr_none++; + continue; } - if (!shmem_charge(mapping->host, 1)) { - result = SCAN_FAIL; + + if (xa_is_value(page) || !PageUptodate(page)) { + xas_unlock_irq(&xas); + /* swap in or instantiate fallocated page */ + if (shmem_getpage(mapping->host, index, &page, + SGP_NOHUGE)) { + result = SCAN_FAIL; + goto xa_unlocked; + } + } else if (trylock_page(page)) { + get_page(page); + xas_unlock_irq(&xas); + } else { + result = SCAN_PAGE_LOCK; goto xa_locked; } - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); - nr_none++; - continue; - } - - if (xa_is_value(page) || !PageUptodate(page)) { - xas_unlock_irq(&xas); - /* swap in or instantiate fallocated page */ - if (shmem_getpage(mapping->host, index, &page, - SGP_NOHUGE)) { + } else { /* !is_shmem */ + if (!page || xa_is_value(page)) { + xas_unlock_irq(&xas); + page_cache_sync_readahead(mapping, &file->f_ra, + file, index, + PAGE_SIZE); + /* drain pagevecs to help isolate_lru_page() */ + lru_add_drain(); + page = find_lock_page(mapping, index); + if (unlikely(page == NULL)) { + result = SCAN_FAIL; + goto xa_unlocked; + } + } else if (!PageUptodate(page)) { + xas_unlock_irq(&xas); + wait_on_page_locked(page); + if (!trylock_page(page)) { + result = SCAN_PAGE_LOCK; + goto xa_unlocked; + } + get_page(page); + } else if (PageDirty(page)) { result = SCAN_FAIL; - goto xa_unlocked; + goto xa_locked; + } else if (trylock_page(page)) { + get_page(page); + xas_unlock_irq(&xas); + } else { + result = SCAN_PAGE_LOCK; + goto xa_locked; } - } else if (trylock_page(page)) { - get_page(page); - xas_unlock_irq(&xas); - } else { - result = SCAN_PAGE_LOCK; - goto xa_locked; } /* @@ -1425,6 +1647,12 @@ static void collapse_shmem(struct mm_struct *mm, goto out_unlock; } + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) { + result = SCAN_PAGE_HAS_PRIVATE; + goto out_unlock; + } + if (page_mapped(page)) unmap_mapping_pages(mapping, index, 1, false); @@ -1454,7 +1682,7 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. 
*/ - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); continue; out_unlock: unlock_page(page); @@ -1462,12 +1690,20 @@ static void collapse_shmem(struct mm_struct *mm, goto xa_unlocked; } - __inc_node_page_state(new_page, NR_SHMEM_THPS); + if (is_shmem) + __inc_node_page_state(new_page, NR_SHMEM_THPS); + else { + __inc_node_page_state(new_page, NR_FILE_THPS); + filemap_nr_thps_inc(mapping); + } + if (nr_none) { struct zone *zone = page_zone(new_page); __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); - __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); + if (is_shmem) + __mod_node_page_state(zone->zone_pgdat, + NR_SHMEM, nr_none); } xa_locked: @@ -1505,10 +1741,15 @@ static void collapse_shmem(struct mm_struct *mm, SetPageUptodate(new_page); page_ref_add(new_page, HPAGE_PMD_NR - 1); - set_page_dirty(new_page); mem_cgroup_commit_charge(new_page, memcg, false, true); + + if (is_shmem) { + set_page_dirty(new_page); + lru_cache_add_anon(new_page); + } else { + lru_cache_add_file(new_page); + } count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); - lru_cache_add_anon(new_page); /* * Remove pte page tables, so we can re-fault the page as huge. @@ -1523,7 +1764,9 @@ static void collapse_shmem(struct mm_struct *mm, /* Something went wrong: roll back page cache changes */ xas_lock_irq(&xas); mapping->nrpages -= nr_none; - shmem_uncharge(mapping->host, nr_none); + + if (is_shmem) + shmem_uncharge(mapping->host, nr_none); xas_set(&xas, start); xas_for_each(&xas, page, end - 1) { @@ -1563,11 +1806,11 @@ static void collapse_shmem(struct mm_struct *mm, /* TODO: tracepoints */ } -static void khugepaged_scan_shmem(struct mm_struct *mm, - struct address_space *mapping, - pgoff_t start, struct page **hpage) +static void khugepaged_scan_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page **hpage) { struct page *page = NULL; + struct address_space *mapping = file->f_mapping; XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; @@ -1606,7 +1849,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, break; } - if (page_count(page) != 1 + page_mapcount(page)) { + if (page_count(page) != + 1 + page_mapcount(page) + page_has_private(page)) { result = SCAN_PAGE_COUNT; break; } @@ -1631,19 +1875,23 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, result = SCAN_EXCEED_NONE_PTE; } else { node = khugepaged_find_target_node(); - collapse_shmem(mm, mapping, start, hpage, node); + collapse_file(mm, file, start, hpage, node); } } /* TODO: tracepoints */ } #else -static void khugepaged_scan_shmem(struct mm_struct *mm, - struct address_space *mapping, - pgoff_t start, struct page **hpage) +static void khugepaged_scan_file(struct mm_struct *mm, + struct file *file, pgoff_t start, struct page **hpage) { BUILD_BUG(); } + +static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +{ + return 0; +} #endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, @@ -1668,6 +1916,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, khugepaged_scan.mm_slot = mm_slot; } spin_unlock(&khugepaged_mm_lock); + khugepaged_collapse_pte_mapped_thps(mm_slot); mm = mm_slot->mm; /* @@ -1713,17 +1962,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (shmem_file(vma->vm_file)) { + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { struct file *file; pgoff_t pgoff = 
linear_page_index(vma, khugepaged_scan.address); - if (!shmem_huge_enabled(vma)) + + if (shmem_file(vma->vm_file) + && !shmem_huge_enabled(vma)) goto skip; file = get_file(vma->vm_file); up_read(&mm->mmap_sem); ret = 1; - khugepaged_scan_shmem(mm, file->f_mapping, - pgoff, hpage); + khugepaged_scan_file(mm, file, pgoff, hpage); fput(file); } else { ret = khugepaged_scan_pmd(mm, vma, diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f6e602918dac..03a8d84badad 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -168,6 +168,8 @@ struct kmemleak_object { #define OBJECT_REPORTED (1 << 1) /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) +/* flag set to fully scan the object when scan_area allocation failed */ +#define OBJECT_FULL_SCAN (1 << 3) #define HEX_PREFIX " " /* number of bytes to print per line; must be 16 or 32 */ @@ -183,6 +185,10 @@ struct kmemleak_object { static LIST_HEAD(object_list); /* the list of gray-colored objects (see color_gray comment below) */ static LIST_HEAD(gray_list); +/* memory pool allocation */ +static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE]; +static int mem_pool_free_count = ARRAY_SIZE(mem_pool); +static LIST_HEAD(mem_pool_free_list); /* search tree for object boundaries */ static struct rb_root object_tree_root = RB_ROOT; /* rw_lock protecting the access to object_list and object_tree_root */ @@ -193,13 +199,11 @@ static struct kmem_cache *object_cache; static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ -static int kmemleak_enabled; +static int kmemleak_enabled = 1; /* same as above but only for the kmemleak_free() callback */ -static int kmemleak_free_enabled; +static int kmemleak_free_enabled = 1; /* set in the late_initcall if there were no errors */ static int kmemleak_initialized; -/* enables or disables early logging of the memory operations */ -static int kmemleak_early_log = 1; /* set if a kmemleak warning was issued */ static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ @@ -227,49 +231,6 @@ static bool kmemleak_found_leaks; static bool kmemleak_verbose; module_param_named(verbose, kmemleak_verbose, bool, 0600); -/* - * Early object allocation/freeing logging. Kmemleak is initialized after the - * kernel allocator. However, both the kernel allocator and kmemleak may - * allocate memory blocks which need to be tracked. Kmemleak defines an - * arbitrary buffer to hold the allocation/freeing information before it is - * fully initialized. - */ - -/* kmemleak operation type for early logging */ -enum { - KMEMLEAK_ALLOC, - KMEMLEAK_ALLOC_PERCPU, - KMEMLEAK_FREE, - KMEMLEAK_FREE_PART, - KMEMLEAK_FREE_PERCPU, - KMEMLEAK_NOT_LEAK, - KMEMLEAK_IGNORE, - KMEMLEAK_SCAN_AREA, - KMEMLEAK_NO_SCAN, - KMEMLEAK_SET_EXCESS_REF -}; - -/* - * Structure holding the information passed to kmemleak callbacks during the - * early logging. 
- */ -struct early_log { - int op_type; /* kmemleak operation type */ - int min_count; /* minimum reference count */ - const void *ptr; /* allocated/freed memory block */ - union { - size_t size; /* memory block size */ - unsigned long excess_ref; /* surplus reference passing */ - }; - unsigned long trace[MAX_TRACE]; /* stack trace */ - unsigned int trace_len; /* stack trace length */ -}; - -/* early logging buffer and current position */ -static struct early_log - early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata; -static int crt_early_log __initdata; - static void kmemleak_disable(void); /* @@ -449,6 +410,54 @@ static int get_object(struct kmemleak_object *object) return atomic_inc_not_zero(&object->use_count); } +/* + * Memory pool allocation and freeing. kmemleak_lock must not be held. + */ +static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) +{ + unsigned long flags; + struct kmemleak_object *object; + + /* try the slab allocator first */ + if (object_cache) { + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); + if (object) + return object; + } + + /* slab allocation failed, try the memory pool */ + write_lock_irqsave(&kmemleak_lock, flags); + object = list_first_entry_or_null(&mem_pool_free_list, + typeof(*object), object_list); + if (object) + list_del(&object->object_list); + else if (mem_pool_free_count) + object = &mem_pool[--mem_pool_free_count]; + else + pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n"); + write_unlock_irqrestore(&kmemleak_lock, flags); + + return object; +} + +/* + * Return the object to either the slab allocator or the memory pool. + */ +static void mem_pool_free(struct kmemleak_object *object) +{ + unsigned long flags; + + if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) { + kmem_cache_free(object_cache, object); + return; + } + + /* add the object to the memory pool free list */ + write_lock_irqsave(&kmemleak_lock, flags); + list_add(&object->object_list, &mem_pool_free_list); + write_unlock_irqrestore(&kmemleak_lock, flags); +} + /* * RCU callback to free a kmemleak_object. */ @@ -467,7 +476,7 @@ static void free_object_rcu(struct rcu_head *rcu) hlist_del(&area->node); kmem_cache_free(scan_area_cache, area); } - kmem_cache_free(object_cache, object); + mem_pool_free(object); } /* @@ -485,7 +494,15 @@ static void put_object(struct kmemleak_object *object) /* should only get here after delete_object was called */ WARN_ON(object->flags & OBJECT_ALLOCATED); - call_rcu(&object->rcu, free_object_rcu); + /* + * It may be too early for the RCU callbacks, however, there is no + * concurrent object_list traversal when !object_cache and all objects + * came from the memory pool. Free the object directly. + */ + if (object_cache) + call_rcu(&object->rcu, free_object_rcu); + else + free_object_rcu(&object->rcu); } /* @@ -550,7 +567,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct rb_node **link, *rb_parent; unsigned long untagged_ptr; - object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); + object = mem_pool_alloc(gfp); if (!object) { pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); @@ -689,9 +706,7 @@ static void delete_object_part(unsigned long ptr, size_t size) /* * Create one or two objects that may result from the memory block * split. Note that partial freeing is only done by free_bootmem() and - * this happens before kmemleak_init() is called. 
The path below is - * only executed during early log recording in kmemleak_init(), so - * GFP_KERNEL is enough. + * this happens before kmemleak_init() is called. */ start = object->pointer; end = object->pointer + object->size; @@ -763,7 +778,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) { unsigned long flags; struct kmemleak_object *object; - struct kmemleak_scan_area *area; + struct kmemleak_scan_area *area = NULL; object = find_and_get_object(ptr, 1); if (!object) { @@ -772,13 +787,16 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) return; } - area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); - if (!area) { - pr_warn("Cannot allocate a scan area\n"); - goto out; - } + if (scan_area_cache) + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); spin_lock_irqsave(&object->lock, flags); + if (!area) { + pr_warn_once("Cannot allocate a scan area, scanning the full object\n"); + /* mark the object for full scan to avoid false positives */ + object->flags |= OBJECT_FULL_SCAN; + goto out_unlock; + } if (size == SIZE_MAX) { size = object->pointer + object->size - ptr; } else if (ptr + size > object->pointer + object->size) { @@ -795,7 +813,6 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) hlist_add_head(&area->node, &object->area_list); out_unlock: spin_unlock_irqrestore(&object->lock, flags); -out: put_object(object); } @@ -845,86 +862,6 @@ static void object_no_scan(unsigned long ptr) put_object(object); } -/* - * Log an early kmemleak_* call to the early_log buffer. These calls will be - * processed later once kmemleak is fully initialized. - */ -static void __init log_early(int op_type, const void *ptr, size_t size, - int min_count) -{ - unsigned long flags; - struct early_log *log; - - if (kmemleak_error) { - /* kmemleak stopped recording, just count the requests */ - crt_early_log++; - return; - } - - if (crt_early_log >= ARRAY_SIZE(early_log)) { - crt_early_log++; - kmemleak_disable(); - return; - } - - /* - * There is no need for locking since the kernel is still in UP mode - * at this stage. Disabling the IRQs is enough. - */ - local_irq_save(flags); - log = &early_log[crt_early_log]; - log->op_type = op_type; - log->ptr = ptr; - log->size = size; - log->min_count = min_count; - log->trace_len = __save_stack_trace(log->trace); - crt_early_log++; - local_irq_restore(flags); -} - -/* - * Log an early allocated block and populate the stack trace. - */ -static void early_alloc(struct early_log *log) -{ - struct kmemleak_object *object; - unsigned long flags; - int i; - - if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr)) - return; - - /* - * RCU locking needed to ensure object is not freed via put_object(). - */ - rcu_read_lock(); - object = create_object((unsigned long)log->ptr, log->size, - log->min_count, GFP_ATOMIC); - if (!object) - goto out; - spin_lock_irqsave(&object->lock, flags); - for (i = 0; i < log->trace_len; i++) - object->trace[i] = log->trace[i]; - object->trace_len = log->trace_len; - spin_unlock_irqrestore(&object->lock, flags); -out: - rcu_read_unlock(); -} - -/* - * Log an early allocated block and populate the stack trace. 
- */ -static void early_alloc_percpu(struct early_log *log) -{ - unsigned int cpu; - const void __percpu *ptr = log->ptr; - - for_each_possible_cpu(cpu) { - log->ptr = per_cpu_ptr(ptr, cpu); - early_alloc(log); - } -} - /** * kmemleak_alloc - register a newly allocated object * @ptr: pointer to beginning of the object @@ -946,8 +883,6 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, if (kmemleak_enabled && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); - else if (kmemleak_early_log) - log_early(KMEMLEAK_ALLOC, ptr, size, min_count); } EXPORT_SYMBOL_GPL(kmemleak_alloc); @@ -975,8 +910,6 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, for_each_possible_cpu(cpu) create_object((unsigned long)per_cpu_ptr(ptr, cpu), size, 0, gfp); - else if (kmemleak_early_log) - log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); @@ -1001,11 +934,6 @@ void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp create_object((unsigned long)area->addr, size, 2, gfp); object_set_excess_ref((unsigned long)area, (unsigned long)area->addr); - } else if (kmemleak_early_log) { - log_early(KMEMLEAK_ALLOC, area->addr, size, 2); - /* reusing early_log.size for storing area->addr */ - log_early(KMEMLEAK_SET_EXCESS_REF, - area, (unsigned long)area->addr, 0); } } EXPORT_SYMBOL_GPL(kmemleak_vmalloc); @@ -1023,8 +951,6 @@ void __ref kmemleak_free(const void *ptr) if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); - else if (kmemleak_early_log) - log_early(KMEMLEAK_FREE, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); @@ -1043,8 +969,6 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size); - else if (kmemleak_early_log) - log_early(KMEMLEAK_FREE_PART, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -1065,8 +989,6 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) for_each_possible_cpu(cpu) delete_object_full((unsigned long)per_cpu_ptr(ptr, cpu)); - else if (kmemleak_early_log) - log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_percpu); @@ -1117,8 +1039,6 @@ void __ref kmemleak_not_leak(const void *ptr) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); - else if (kmemleak_early_log) - log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_not_leak); @@ -1137,8 +1057,6 @@ void __ref kmemleak_ignore(const void *ptr) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr); - else if (kmemleak_early_log) - log_early(KMEMLEAK_IGNORE, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_ignore); @@ -1159,8 +1077,6 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); - else if (kmemleak_early_log) - log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); } EXPORT_SYMBOL(kmemleak_scan_area); @@ -1179,8 +1095,6 @@ void __ref kmemleak_no_scan(const void *ptr) if (kmemleak_enabled && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); - else if (kmemleak_early_log) - log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_no_scan); @@ -1408,7 +1322,8 @@ static void scan_object(struct kmemleak_object *object) if (!(object->flags & OBJECT_ALLOCATED)) /* already freed object */ goto out; - if (hlist_empty(&object->area_list)) { + if 
(hlist_empty(&object->area_list) || + object->flags & OBJECT_FULL_SCAN) { void *start = (void *)object->pointer; void *end = (void *)(object->pointer + object->size); void *next; @@ -1966,7 +1881,6 @@ static void kmemleak_disable(void) /* stop any memory operation tracing */ kmemleak_enabled = 0; - kmemleak_early_log = 0; /* check whether it is too early for a kernel thread */ if (kmemleak_initialized) @@ -1994,20 +1908,11 @@ static int __init kmemleak_boot_config(char *str) } early_param("kmemleak", kmemleak_boot_config); -static void __init print_log_trace(struct early_log *log) -{ - pr_notice("Early log backtrace:\n"); - stack_trace_print(log->trace, log->trace_len, 2); -} - /* * Kmemleak initialization. */ void __init kmemleak_init(void) { - int i; - unsigned long flags; - #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF if (!kmemleak_skip_disable) { kmemleak_disable(); @@ -2015,28 +1920,15 @@ void __init kmemleak_init(void) } #endif + if (kmemleak_error) + return; + jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); - if (crt_early_log > ARRAY_SIZE(early_log)) - pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", - crt_early_log); - - /* the kernel is still in UP mode, so disabling the IRQs is enough */ - local_irq_save(flags); - kmemleak_early_log = 0; - if (kmemleak_error) { - local_irq_restore(flags); - return; - } else { - kmemleak_enabled = 1; - kmemleak_free_enabled = 1; - } - local_irq_restore(flags); - /* register the data/bss sections */ create_object((unsigned long)_sdata, _edata - _sdata, KMEMLEAK_GREY, GFP_ATOMIC); @@ -2047,57 +1939,6 @@ void __init kmemleak_init(void) create_object((unsigned long)__start_ro_after_init, __end_ro_after_init - __start_ro_after_init, KMEMLEAK_GREY, GFP_ATOMIC); - - /* - * This is the point where tracking allocations is safe. Automatic - * scanning is started during the late initcall. Add the early logged - * callbacks to the kmemleak infrastructure. 
- */
- for (i = 0; i < crt_early_log; i++) {
- struct early_log *log = &early_log[i];
-
- switch (log->op_type) {
- case KMEMLEAK_ALLOC:
- early_alloc(log);
- break;
- case KMEMLEAK_ALLOC_PERCPU:
- early_alloc_percpu(log);
- break;
- case KMEMLEAK_FREE:
- kmemleak_free(log->ptr);
- break;
- case KMEMLEAK_FREE_PART:
- kmemleak_free_part(log->ptr, log->size);
- break;
- case KMEMLEAK_FREE_PERCPU:
- kmemleak_free_percpu(log->ptr);
- break;
- case KMEMLEAK_NOT_LEAK:
- kmemleak_not_leak(log->ptr);
- break;
- case KMEMLEAK_IGNORE:
- kmemleak_ignore(log->ptr);
- break;
- case KMEMLEAK_SCAN_AREA:
- kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
- break;
- case KMEMLEAK_NO_SCAN:
- kmemleak_no_scan(log->ptr);
- break;
- case KMEMLEAK_SET_EXCESS_REF:
- object_set_excess_ref((unsigned long)log->ptr,
- log->excess_ref);
- break;
- default:
- kmemleak_warn("Unknown early log operation: %d\n",
- log->op_type);
- }
-
- if (kmemleak_warning) {
- print_log_trace(log);
- kmemleak_warning = 0;
- }
- }
 }
 /*
@@ -2126,7 +1967,8 @@ static int __init kmemleak_late_init(void)
 mutex_unlock(&scan_mutex);
 }
- pr_info("Kernel memory leak detector initialized\n");
+ pr_info("Kernel memory leak detector initialized (mem pool available: %d)\n",
+ mem_pool_free_count);
 return 0;
 }
diff --git a/mm/ksm.c b/mm/ksm.c
index 3dc4346411e4..dbee2eb4dd05 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1029,24 +1029,6 @@ static u32 calc_checksum(struct page *page)
 return checksum;
 }
-static int memcmp_pages(struct page *page1, struct page *page2)
-{
- char *addr1, *addr2;
- int ret;
-
- addr1 = kmap_atomic(page1);
- addr2 = kmap_atomic(page2);
- ret = memcmp(addr1, addr2, PAGE_SIZE);
- kunmap_atomic(addr2);
- kunmap_atomic(addr1);
- return ret;
-}
-
-static inline int pages_identical(struct page *page1, struct page *page2)
-{
- return !memcmp_pages(page1, page2);
-}
-
 static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 pte_t *orig_pte)
 {
diff --git a/mm/madvise.c b/mm/madvise.c
index 569ba11f048f..1cfc273b7e25 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
 #include <linux/syscalls.h>
 #include <linux/mempolicy.h>
 #include <linux/page-isolation.h>
+#include <linux/page_idle.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/hugetlb.h>
 #include <linux/falloc.h>
@@ -31,6 +32,11 @@
 #include "internal.h"
+struct madvise_walk_private {
+ struct mmu_gather *tlb;
+ bool pageout;
+};
+
 /*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
@@ -42,6 +48,8 @@ static int madvise_need_mmap_write(int behavior)
 case MADV_REMOVE:
 case MADV_WILLNEED:
 case MADV_DONTNEED:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
 case MADV_FREE:
 return 0;
 default:
@@ -107,28 +115,14 @@ static long madvise_behavior(struct vm_area_struct *vma,
 case MADV_MERGEABLE:
 case MADV_UNMERGEABLE:
 error = ksm_madvise(vma, start, end, behavior, &new_flags);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
 break;
 case MADV_HUGEPAGE:
 case MADV_NOHUGEPAGE:
 error = hugepage_madvise(vma, &new_flags, behavior);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */ - if (error == -ENOMEM) - error = -EAGAIN; - goto out; - } + if (error) + goto out_convert_errno; break; } @@ -154,15 +148,8 @@ static long madvise_behavior(struct vm_area_struct *vma, goto out; } error = __split_vma(mm, vma, start, 1); - if (error) { - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; - goto out; - } + if (error) + goto out_convert_errno; } if (end != vma->vm_end) { @@ -171,15 +158,8 @@ static long madvise_behavior(struct vm_area_struct *vma, goto out; } error = __split_vma(mm, vma, end, 0); - if (error) { - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; - goto out; - } + if (error) + goto out_convert_errno; } success: @@ -187,6 +167,14 @@ static long madvise_behavior(struct vm_area_struct *vma, * vm_flags is protected by the mmap_sem held in write mode. */ vma->vm_flags = new_flags; + +out_convert_errno: + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; out: return error; } @@ -309,6 +297,254 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct madvise_walk_private *private = walk->private; + struct mmu_gather *tlb = private->tlb; + bool pageout = private->pageout; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + pte_t *orig_pte, *pte, ptent; + spinlock_t *ptl; + struct page *page = NULL; + LIST_HEAD(page_list); + + if (fatal_signal_pending(current)) + return -EINTR; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + pmd_t orig_pmd; + unsigned long next = pmd_addr_end(addr, end); + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) + goto huge_unlock; + + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto huge_unlock; + } + + page = pmd_page(orig_pmd); + if (next - addr != HPAGE_PMD_SIZE) { + int err; + + if (page_mapcount(page) != 1) + goto huge_unlock; + + get_page(page); + spin_unlock(ptl); + lock_page(page); + err = split_huge_page(page); + unlock_page(page); + put_page(page); + if (!err) + goto regular_page; + return 0; + } + + if (pmd_young(orig_pmd)) { + pmdp_invalidate(vma, addr, pmd); + orig_pmd = pmd_mkold(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + } + + ClearPageReferenced(page); + test_and_clear_page_young(page); + if (pageout) { + if (!isolate_lru_page(page)) + list_add(&page->lru, &page_list); + } else + deactivate_page(page); +huge_unlock: + spin_unlock(ptl); + if (pageout) + reclaim_pages(&page_list); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; +regular_page: +#endif + tlb_change_page_size(tlb, PAGE_SIZE); + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + for (; addr < end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (pte_none(ptent)) + continue; + + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* + * Creating a THP page is expensive so split it only 
if we
+ * are sure it's worth it. Split it if we are the only owner.
+ */
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ break;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ break;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (pte_young(ptent)) {
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+
+ /*
+ * We are deactivating a page to accelerate its reclaim.
+ * The VM cannot reclaim the page unless we clear PG_young.
+ * As a side effect, this confuses idle-page tracking, which
+ * will miss the page's recent reference history.
+ */
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ if (pageout) {
+ if (!isolate_lru_page(page))
+ list_add(&page->lru, &page_list);
+ } else
+ deactivate_page(page);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(orig_pte, ptl);
+ if (pageout)
+ reclaim_pages(&page_list);
+ cond_resched();
+
+ return 0;
+}
+
+static const struct mm_walk_ops cold_walk_ops = {
+ .pmd_entry = madvise_cold_or_pageout_pte_range,
+};
+
+static void madvise_cold_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_walk_private walk_private = {
+ .pageout = false,
+ .tlb = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ tlb_end_vma(tlb, vma);
+}
+
+static long madvise_cold(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+ return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_walk_private walk_private = {
+ .pageout = true,
+ .tlb = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return true;
+ if (!vma->vm_file)
+ return false;
+ /*
+ * paging out pagecache only for non-anonymous mappings that correspond
+ * to the files the calling process could (if it tried) open for
+ * writing; otherwise we'd be including shared non-exclusive mappings,
+ * which opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+ inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ if (!can_do_pageout(vma))
+ return 0;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+ return 0;
+}
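From userspace, the two new hints are ordinary madvise(2) calls: MADV_COLD deactivates a range, MADV_PAGEOUT reclaims it immediately. A hedged usage sketch; MADV_COLD (20) and MADV_PAGEOUT (21) are the uapi values this series introduces, and the fallback defines below only cover older header installs:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLD
#define MADV_COLD 20
#endif
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21
#endif

int main(void)
{
	size_t len = 64UL << 20;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 1, len);	/* fault the pages in */

	/* deactivate only: pages move to the inactive LRU */
	if (madvise(buf, len, MADV_COLD))
		perror("MADV_COLD");
	/* reclaim now: anon pages swapped out, clean file pages dropped */
	if (madvise(buf, len, MADV_PAGEOUT))
		perror("MADV_PAGEOUT");

	munmap(buf, len);
	return 0;
}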
+
 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 unsigned long end, struct mm_walk *walk)
@@ -513,7 +749,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 int behavior)
 {
 *prev = vma;
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
 return -EINVAL;
 if (!userfaultfd_remove(vma, start, end)) {
@@ -535,7 +771,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 */
 return -ENOMEM;
 }
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
 return -EINVAL;
 if (end > vma->vm_end) {
 /*
@@ -689,6 +925,10 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 return madvise_remove(vma, prev, start, end);
 case MADV_WILLNEED:
 return madvise_willneed(vma, prev, start, end);
+ case MADV_COLD:
+ return madvise_cold(vma, prev, start, end);
+ case MADV_PAGEOUT:
+ return madvise_pageout(vma, prev, start, end);
 case MADV_FREE:
 case MADV_DONTNEED:
 return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -710,6 +950,8 @@ madvise_behavior_valid(int behavior)
 case MADV_WILLNEED:
 case MADV_DONTNEED:
 case MADV_FREE:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
 #ifdef CONFIG_KSM
 case MADV_MERGEABLE:
 case MADV_UNMERGEABLE:
@@ -804,6 +1046,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 size_t len;
 struct blk_plug plug;
+ start = untagged_addr(start);
+
 if (!madvise_behavior_valid(behavior))
 return error;
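madvise() above is one of several syscall entry points in this series that now canonicalize the user pointer with untagged_addr() before validating it. On arm64 this amounts to a sign extension from bit 55, which strips a top-byte pointer tag. The snippet below is a userspace approximation of that semantic, stated as an assumption for illustration rather than the kernel macro itself:

#include <stdint.h>
#include <stdio.h>

static inline uint64_t untag(uint64_t addr)
{
	/*
	 * Sign-extend from bit 55: clears the tag in bits 63..56 for
	 * userspace addresses while leaving kernel addresses intact.
	 */
	return (uint64_t)((int64_t)(addr << 8) >> 8);
}

int main(void)
{
	uint64_t tagged = 0x5a007fff12345000ULL;	/* tag 0x5a in the top byte */

	printf("%#llx -> %#llx\n",
	       (unsigned long long)tagged, (unsigned long long)untag(tagged));
	return 0;
}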
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f3c15bb07cce..c313c49074ca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -57,6 +57,7 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/psi.h>
 #include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
@@ -317,6 +318,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
 static int memcg_shrinker_map_size;
 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -440,14 +442,6 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 }
 }
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
- return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
 /**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
@@ -2270,21 +2264,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 for_each_online_cpu(cpu) {
 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
 struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
 memcg = stock->cached;
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
- continue;
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
- css_put(&memcg->css);
- continue;
- }
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (memcg && stock->nr_pages &&
+ mem_cgroup_is_descendant(memcg, root_memcg))
+ flush = true;
+ rcu_read_unlock();
+
+ if (flush &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
 if (cpu == curcpu)
 drain_local_stock(&stock->work);
 else
 schedule_work_on(cpu, &stock->work);
 }
- css_put(&memcg->css);
 }
 put_cpu();
 mutex_unlock(&percpu_charge_mutex);
@@ -2358,12 +2353,68 @@ static void high_work_func(struct work_struct *work)
 reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
 }
+/*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * enough to cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these on either side of the
+ * exponentiation to maintain precision and scale to a reasonable number of
+ * jiffies (see the table below).
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ * overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
+ * proposed penalty in order to reduce it to a reasonable number of jiffies,
+ * and to produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ * +-------+------------------------+
+ * | usage | time to allocate in ms |
+ * +-------+------------------------+
+ * | 100M | 0 |
+ * | 101M | 6 |
+ * | 102M | 25 |
+ * | 103M | 57 |
+ * | 104M | 102 |
+ * | 105M | 159 |
+ * | 106M | 230 |
+ * | 107M | 313 |
+ * | 108M | 409 |
+ * | 109M | 518 |
+ * | 110M | 639 |
+ * | 111M | 774 |
+ * | 112M | 921 |
+ * | 113M | 1081 |
+ * | 114M | 1254 |
+ * | 115M | 1439 |
+ * | 116M | 1638 |
+ * | 117M | 1849 |
+ * | 118M | 2000 |
+ * | 119M | 2000 |
+ * | 120M | 2000 |
+ * +-------+------------------------+
+ */
+#define MEMCG_DELAY_PRECISION_SHIFT 20
+#define MEMCG_DELAY_SCALING_SHIFT 14
+
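The delay table above follows directly from the two shifts: overage is the over-high fraction scaled up by 2^MEMCG_DELAY_PRECISION_SHIFT, and the penalty is overage squared, times HZ, scaled back down by both shifts. A self-contained sketch that reproduces the 104M row, assuming HZ == 1000 and 4 KiB pages (both assumptions for illustration, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14
#define HZ 1000UL

int main(void)
{
	uint64_t high = 100 << (20 - 12);	/* 100M in 4 KiB pages = 25600 */
	uint64_t usage = 104 << (20 - 12);	/* 104M in 4 KiB pages */
	uint64_t overage, penalty_jiffies;

	/* over-high fraction with extra precision bits */
	overage = ((usage - high) << MEMCG_DELAY_PRECISION_SHIFT) / high;

	/* square the overage, then scale back to jiffies */
	penalty_jiffies = overage * overage * HZ
		>> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);

	/* prints "102 ms", matching the 104M row of the table */
	printf("%llu ms\n", (unsigned long long)(penalty_jiffies * 1000 / HZ));
	return 0;
}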
 /*
 * Scheduled by try_charge() to be executed from the userland return path
 * and reclaims memory over the high limit.
 */
 void mem_cgroup_handle_over_high(void)
 {
+ unsigned long usage, high, clamped_high;
+ unsigned long pflags;
+ unsigned long penalty_jiffies, overage;
 unsigned int nr_pages = current->memcg_nr_pages_over_high;
 struct mem_cgroup *memcg;
@@ -2372,8 +2423,75 @@ void mem_cgroup_handle_over_high(void)
 memcg = get_mem_cgroup_from_mm(current->mm);
 reclaim_high(memcg, nr_pages, GFP_KERNEL);
- css_put(&memcg->css);
 current->memcg_nr_pages_over_high = 0;
+
+ /*
+ * memory.high is breached and reclaim is unable to keep up. Throttle
+ * allocators proactively to slow down excessive growth.
+ *
+ * We use overage compared to memory.high to calculate the number of
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
+ * fairly lenient on small overages, and increasingly harsh when the
+ * memcg in question makes it clear that it has no intention of stopping
+ * its crazy behaviour, so we exponentially increase the delay based on
+ * overage amount.
+ */
+
+ usage = page_counter_read(&memcg->memory);
+ high = READ_ONCE(memcg->high);
+
+ if (usage <= high)
+ goto out;
+
+ /*
+ * Prevent division by 0 in overage calculation by acting as if it were
+ * a threshold of 1 page
+ */
+ clamped_high = max(high, 1UL);
+
+ overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
+ clamped_high);
+
+ penalty_jiffies = ((u64)overage * overage * HZ)
+ >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+
+ /*
+ * Factor in the task's own contribution to the overage, such that four
+ * N-sized allocations are throttled approximately the same as one
+ * 4N-sized allocation.
+ *
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+ * larger the current charge batch is than that.
+ */
+ penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+
+ /*
+ * Clamp the max delay per usermode return so as to still keep the
+ * application moving forwards and also permit diagnostics, albeit
+ * extremely slowly.
+ */
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+ /*
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
+ * that it's not even worth doing, in an attempt to be nice to those who
+ * go only a small amount over their memory.high value and maybe haven't
+ * been aggressively reclaimed enough yet.
+ */
+ if (penalty_jiffies <= HZ / 100)
+ goto out;
+
+ /*
+ * If we exit early, we're guaranteed to die (since
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+ * need to account for any ill-begotten jiffies to pay them off later.
+ */
+ psi_memstall_enter(&pflags);
+ schedule_timeout_killable(penalty_jiffies);
+ psi_memstall_leave(&pflags);
+
+out:
+ css_put(&memcg->css);
 }
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -2825,6 +2943,16 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
 !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+
+ /*
+ * Enforce __GFP_NOFAIL allocation because callers are not
+ * prepared to see failures and likely do not have any failure
+ * handling code.
+ */
+ if (gfp & __GFP_NOFAIL) {
+ page_counter_charge(&memcg->kmem, nr_pages);
+ return 0;
+ }
 cancel_charge(memcg, nr_pages);
 return -ENOMEM;
 }
@@ -3512,6 +3640,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
 break;
 case _KMEM:
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. 
" + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); ret = memcg_update_kmem_max(memcg, nr_pages); break; case _TCP: @@ -4805,11 +4936,6 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) } } -static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) -{ - mem_cgroup_id_get_many(memcg, 1); -} - static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) { mem_cgroup_id_put_many(memcg, 1); @@ -4954,6 +5080,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void) for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) memcg->cgwb_frn[i].done = __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); + INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); + memcg->deferred_split_queue.split_queue_len = 0; #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; @@ -5333,6 +5464,14 @@ static int mem_cgroup_move_account(struct page *page, __mod_memcg_state(to, NR_WRITEBACK, nr_pages); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (compound && !list_empty(page_deferred_list(page))) { + spin_lock(&from->deferred_split_queue.split_queue_lock); + list_del_init(page_deferred_list(page)); + from->deferred_split_queue.split_queue_len--; + spin_unlock(&from->deferred_split_queue.split_queue_lock); + } +#endif /* * It is safe to change page->mem_cgroup here because the page * is referenced, charged, and isolated - we can't race with @@ -5341,6 +5480,17 @@ static int mem_cgroup_move_account(struct page *page, /* caller should have done css_get */ page->mem_cgroup = to; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (compound && list_empty(page_deferred_list(page))) { + spin_lock(&to->deferred_split_queue.split_queue_lock); + list_add_tail(page_deferred_list(page), + &to->deferred_split_queue.split_queue); + to->deferred_split_queue.split_queue_len++; + spin_unlock(&to->deferred_split_queue.split_queue_lock); + } +#endif + spin_unlock_irqrestore(&from->move_lock, flags); ret = 0; @@ -6511,7 +6661,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) unsigned int nr_pages = 1; if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); + nr_pages = compound_nr(page); ug->nr_huge += nr_pages; } if (PageAnon(page)) @@ -6523,7 +6673,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } ug->pgpgout++; } else { - ug->nr_kmem += 1 << compound_order(page); + ug->nr_kmem += compound_nr(page); __ClearPageKmemcg(page); } diff --git a/mm/memfd.c b/mm/memfd.c index 650e65a46b9c..2647c898990c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas) xas_for_each(xas, page, ULONG_MAX) { if (xa_is_value(page)) continue; + page = find_subpage(page, xas->xa_index); if (page_count(page) - page_mapcount(page) > 1) xas_set_mark(xas, MEMFD_TAG_PINNED); @@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) bool clear = true; if (xa_is_value(page)) continue; + page = find_subpage(page, xas.xa_index); if (page_count(page) - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags diff --git a/mm/memory.c b/mm/memory.c index b1dff75640b7..b1ca51a079f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -518,7 +518,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); - pr_alert("addr:%p vm_flags:%08lx 
anon_vma:%p mapping:%p index:%lx\n", + pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", vma->vm_file, @@ -1026,6 +1026,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_none(ptent)) continue; + if (need_resched()) + break; + if (pte_present(ptent)) { struct page *page; @@ -1093,7 +1096,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (unlikely(details)) continue; - entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) rss[MM_SWAPENTS]--; else if (is_migration_entry(entry)) { @@ -1124,8 +1126,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (force_flush) { force_flush = 0; tlb_flush_mmu(tlb); - if (addr != end) - goto again; + } + + if (addr != end) { + cond_resched(); + goto again; } return addr; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c73f09913165..b1be791f772d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -632,33 +632,30 @@ static void generic_online_page(struct page *page, unsigned int order) #endif } -static int online_pages_blocks(unsigned long start, unsigned long nr_pages) -{ - unsigned long end = start + nr_pages; - int order, onlined_pages = 0; - - while (start < end) { - order = min(MAX_ORDER - 1, - get_order(PFN_PHYS(end) - PFN_PHYS(start))); - (*online_page_callback)(pfn_to_page(start), order); - - onlined_pages += (1UL << order); - start += (1UL << order); - } - return onlined_pages; -} - static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { - unsigned long onlined_pages = *(unsigned long *)arg; + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + int order; - if (PageReserved(pfn_to_page(start_pfn))) - onlined_pages += online_pages_blocks(start_pfn, nr_pages); + /* + * Online the pages. The callback might decide to keep some pages + * PG_reserved (to add them to the buddy later), but we still account + * them as being online/belonging to this zone ("present"). + */ + for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) { + order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn))); + /* __free_pages_core() wants pfns to be aligned to the order */ + if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order))) + order = 0; + (*online_page_callback)(pfn_to_page(pfn), order); + } - online_mem_sections(start_pfn, start_pfn + nr_pages); + /* mark all involved sections as online */ + online_mem_sections(start_pfn, end_pfn); - *(unsigned long *)arg = onlined_pages; + *(unsigned long *)arg += nr_pages; return 0; } @@ -714,8 +711,13 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon pgdat->node_start_pfn = start_pfn; pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; -} +} +/* + * Associate the pfn range with the given zone, initializing the memmaps + * and resizing the pgdat/zone data to span the added pages. After this + * call, all affected pages are PG_reserved. + */ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { @@ -804,20 +806,6 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, return default_zone_for_pfn(nid, start_pfn, nr_pages); } -/* - * Associates the given pfn range with the given node and the zone appropriate - * for the given online type. 
- */ -static struct zone * __meminit move_pfn_range(int online_type, int nid, - unsigned long start_pfn, unsigned long nr_pages) -{ - struct zone *zone; - - zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); - move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL); - return zone; -} - int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -840,7 +828,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ put_device(&mem->dev); /* associate pfn range with the zone */ - zone = move_pfn_range(online_type, nid, pfn, nr_pages); + zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL); arg.start_pfn = pfn; arg.nr_pages = nr_pages; @@ -864,6 +853,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { + /* not a single memory resource was applicable */ if (need_zonelists_rebuild) zone_pcp_reset(zone); goto failed_addition; @@ -877,27 +867,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ shuffle_zone(zone); - if (onlined_pages) { - node_states_set_node(nid, &arg); - if (need_zonelists_rebuild) - build_all_zonelists(NULL); - else - zone_pcp_update(zone); - } + node_states_set_node(nid, &arg); + if (need_zonelists_rebuild) + build_all_zonelists(NULL); + else + zone_pcp_update(zone); init_per_zone_wmark_min(); - if (onlined_pages) { - kswapd_run(nid); - kcompactd_run(nid); - } + kswapd_run(nid); + kcompactd_run(nid); vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); - if (onlined_pages) - memory_notify(MEM_ONLINE, &arg); + memory_notify(MEM_ONLINE, &arg); mem_hotplug_done(); return 0; @@ -933,8 +918,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) if (!pgdat) return NULL; + pgdat->per_cpu_nodestats = + alloc_percpu(struct per_cpu_nodestat); arch_refresh_nodedata(nid, pgdat); } else { + int cpu; /* * Reset the nr_zones, order and classzone_idx before reuse. * Note that kswapd will init kswapd_classzone_idx properly @@ -943,6 +931,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) pgdat->nr_zones = 0; pgdat->kswapd_order = 0; pgdat->kswapd_classzone_idx = 0; + for_each_online_cpu(cpu) { + struct per_cpu_nodestat *p; + + p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); + memset(p, 0, sizeof(*p)); + } } /* we can use NODE_DATA(nid) from here */ @@ -952,7 +946,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_core_hotplug(nid); - pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* * The node we allocated has no zone fallback lists. 
For avoiding
@@ -1309,7 +1302,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
 head = compound_head(page);
 if (page_huge_active(head))
 return pfn;
- skip = (1 << compound_order(head)) - (page - head);
+ skip = compound_nr(head) - (page - head);
 pfn += skip - 1;
 }
 return 0;
@@ -1347,7 +1340,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 if (PageHuge(page)) {
 struct page *head = compound_head(page);
- pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
+ pfn = page_to_pfn(head) + compound_nr(head) - 1;
 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
- endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
+ endpa = beginpa + memory_block_size_bytes() - 1;
 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
 &beginpa, &endpa);
@@ -1800,7 +1793,7 @@ void __remove_memory(int nid, u64 start, u64 size)
 {
 /*
- * trigger BUG() is some memory is not offlined prior to calling this
+ * trigger BUG() if some memory is not offlined prior to calling this
 * function
 */
 if (try_remove_memory(nid, start, size))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 33166964bdac..e8861594efc6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1406,6 +1406,7 @@ static long kernel_mbind(unsigned long start, unsigned long len,
 int err;
 unsigned short mode_flags;
+ start = untagged_addr(start);
 mode_flags = mode & MPOL_MODE_FLAGS;
 mode &= ~MPOL_MODE_FLAGS;
 if (mode >= MPOL_MAX)
@@ -1513,10 +1514,6 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
 if (nodes_empty(*new))
 goto out_put;
- nodes_and(*new, *new, node_states[N_MEMORY]);
- if (nodes_empty(*new))
- goto out_put;
-
 err = security_task_movememory(task);
 if (err)
 goto out_put;
@@ -1563,6 +1560,8 @@ static int kernel_get_mempolicy(int __user *policy,
 int uninitialized_var(pval);
 nodemask_t nodes;
+ addr = untagged_addr(addr);
+
 if (nmask != NULL && maxnode < nr_node_ids)
 return -EINVAL;
diff --git a/mm/migrate.c b/mm/migrate.c
index 9f4ed4e985c1..4fe45d1428c8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -460,7 +460,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 for (i = 1; i < HPAGE_PMD_NR; i++) {
 xas_next(&xas);
- xas_store(&xas, newpage + i);
+ xas_store(&xas, newpage);
 }
 }
@@ -1612,7 +1612,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 goto out_flush;
 if (get_user(node, nodes + i))
 goto out_flush;
- addr = (unsigned long)p;
+ addr = (unsigned long)untagged_addr(p);
 err = -ENODEV;
 if (node < 0 || node >= MAX_NUMNODES)
@@ -1892,7 +1892,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
 /* Avoid migrating to a node that is nearly full */
- if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+ if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
 return 0;
 if (isolate_lru_page(page))
@@ -2218,17 +2218,15 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 pte_t pte;
 pte = *ptep;
- pfn = pte_pfn(pte);
 if (pte_none(pte)) {
 mpfn = MIGRATE_PFN_MIGRATE;
 migrate->cpages++;
- pfn = 0;
 goto next;
 }
 if (!pte_present(pte)) {
- mpfn = pfn = 0;
+ mpfn = 0;
 /*
 * Only care about unaddressable device page special
@@ -2245,10 +2243,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 if (is_write_device_private_entry(entry))
 mpfn |= MIGRATE_PFN_WRITE;
 } else {
+ pfn = pte_pfn(pte);
 if (is_zero_pfn(pfn)) {
 mpfn = MIGRATE_PFN_MIGRATE;
 migrate->cpages++;
- pfn = 0;
 goto next;
 }
 page = vm_normal_page(migrate->vma, addr, pte);
@@ -2258,10 +2256,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 /* FIXME support THP */
if (!page || !page->mapping || PageTransCompound(page)) { - mpfn = pfn = 0; + mpfn = 0; goto next; } - pfn = page_to_pfn(page); /* * By getting a reference on the page we pin it and that blocks diff --git a/mm/mincore.c b/mm/mincore.c index f9a9dbe8cd33..49b6fa2f6aa1 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -256,6 +256,8 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, unsigned long pages; unsigned char *tmp; + start = untagged_addr(start); + /* Check the start address: needs to be page-aligned.. */ if (start & ~PAGE_MASK) return -EINVAL; diff --git a/mm/mlock.c b/mm/mlock.c index 25d326f1009d..646acba3045b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -674,6 +674,8 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla unsigned long lock_limit; int error = -ENOMEM; + start = untagged_addr(start); + if (!can_do_mlock()) return -EPERM; @@ -735,6 +737,8 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; + start = untagged_addr(start); + len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; diff --git a/mm/mmap.c b/mm/mmap.c index 1ee4d16add7a..fa75b568d481 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -201,6 +201,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) bool downgraded = false; LIST_HEAD(uf); + brk = untagged_addr(brk); + if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -289,9 +291,9 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) return retval; } -static long vma_compute_subtree_gap(struct vm_area_struct *vma) +static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) { - unsigned long max, prev_end, subtree_gap; + unsigned long gap, prev_end; /* * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we @@ -299,14 +301,21 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) * an unmapped area; whereas when expanding we only require one. * That's a little inconsistent, but keeps the code here simpler. */ - max = vm_start_gap(vma); + gap = vm_start_gap(vma); if (vma->vm_prev) { prev_end = vm_end_gap(vma->vm_prev); - if (max > prev_end) - max -= prev_end; + if (gap > prev_end) + gap -= prev_end; else - max = 0; + gap = 0; } + return gap; +} + +#ifdef CONFIG_DEBUG_VM_RB +static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma) +{ + unsigned long max = vma_compute_gap(vma), subtree_gap; if (vma->vm_rb.rb_left) { subtree_gap = rb_entry(vma->vm_rb.rb_left, struct vm_area_struct, vm_rb)->rb_subtree_gap; @@ -322,7 +331,6 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) return max; } -#ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct mm_struct *mm) { struct rb_root *root = &mm->mm_rb; @@ -428,8 +436,9 @@ static void validate_mm(struct mm_struct *mm) #define validate_mm(mm) do { } while (0) #endif -RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, - unsigned long, rb_subtree_gap, vma_compute_subtree_gap) +RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks, + struct vm_area_struct, vm_rb, + unsigned long, rb_subtree_gap, vma_compute_gap) /* * Update augmented rbtree rb_subtree_gap values after vma->vm_start or @@ -439,8 +448,8 @@ RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, static void vma_gap_update(struct vm_area_struct *vma) { /* - * As it turns out, RB_DECLARE_CALLBACKS() already created a callback - * function that does exactly what we want. + * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created + * a callback function that does exactly what we want. 
*/ vma_gap_callbacks_propagate(&vma->vm_rb, NULL); } @@ -1367,6 +1376,9 @@ static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) if (S_ISBLK(inode->i_mode)) return MAX_LFS_FILESIZE; + if (S_ISSOCK(inode->i_mode)) + return MAX_LFS_FILESIZE; + /* Special "we do even unsigned file positions" case */ if (file->f_mode & FMODE_UNSIGNED_OFFSET) return 0; @@ -1586,6 +1598,8 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, struct file *file = NULL; unsigned long retval; + addr = untagged_addr(addr); + if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); @@ -2283,12 +2297,9 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, if (vma) { *pprev = vma->vm_prev; } else { - struct rb_node *rb_node = mm->mm_rb.rb_node; - *pprev = NULL; - while (rb_node) { - *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); - rb_node = rb_node->rb_right; - } + struct rb_node *rb_node = rb_last(&mm->mm_rb); + + *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL; } return vma; } @@ -2887,6 +2898,7 @@ EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { + addr = untagged_addr(addr); profile_munmap(addr); return __vm_munmap(addr, len, true); } diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 8c943a6e1696..7d70e5c78f97 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -271,8 +271,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb, tlb_flush_mmu(tlb); - /* keep the page table cache within bounds */ - check_pgt_cache(); #ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER tlb_batch_list_free(tlb); #endif diff --git a/mm/mprotect.c b/mm/mprotect.c index fbceccd2502f..3cad0a44df91 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -459,6 +459,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); + start = untagged_addr(start); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ return -EINVAL; diff --git a/mm/mremap.c b/mm/mremap.c index fc241d23cd97..1fc8a29fbe3f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -606,6 +606,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); + addr = untagged_addr(addr); + new_addr = untagged_addr(new_addr); + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; diff --git a/mm/msync.c b/mm/msync.c index ef30a429623a..c3bd3e75f687 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -37,6 +37,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) int unmapped_error = 0; int error = -EINVAL; + start = untagged_addr(start); + if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; if (offset_in_page(start)) diff --git a/mm/nommu.c b/mm/nommu.c index fed1b6e9c89b..99b7ec318824 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -108,7 +108,7 @@ unsigned int kobjsize(const void *objp) * The ksize() function is only guaranteed to work for pointers * returned by kmalloc(). So handle arbitrary pointers here. 
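The syscall hunks above all share one idiom: strip any architecture-defined tag bits (for example arm64 top-byte-ignore) from user-supplied pointers before doing page-mask arithmetic or VMA lookups. A hedged sketch of the pattern; sys_example() is hypothetical, and untagged_addr() collapses to a no-op on architectures without address tagging:

SYSCALL_DEFINE2(example, unsigned long, start, size_t, len)
{
	start = untagged_addr(start);	/* must come before alignment checks */

	if (offset_in_page(start))
		return -EINVAL;
	/* ... operate on the canonical, untagged address ... */
	return 0;
}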
*/ - return PAGE_SIZE << compound_order(page); + return page_size(page); } /** diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eda2e2a0bdc6..71e3acea7817 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -73,7 +73,7 @@ static inline bool is_memcg_oom(struct oom_control *oc) /** * oom_cpuset_eligible() - check task eligibility for kill * @start: task struct of which task to consider - * @mask: nodemask passed to page allocator for mempolicy ooms + * @oc: pointer to struct oom_control * * Task eligibility is determined by whether or not a candidate task, @tsk, * shares the same mempolicy nodes as current if it is bound by such a policy @@ -287,7 +287,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { oc->totalpages = total_swap_pages; for_each_node_mask(nid, *oc->nodemask) - oc->totalpages += node_spanned_pages(nid); + oc->totalpages += node_present_pages(nid); return CONSTRAINT_MEMORY_POLICY; } @@ -300,7 +300,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) if (cpuset_limited) { oc->totalpages = total_swap_pages; for_each_node_mask(nid, cpuset_current_mems_allowed) - oc->totalpages += node_spanned_pages(nid); + oc->totalpages += node_present_pages(nid); return CONSTRAINT_CPUSET; } return CONSTRAINT_NONE; @@ -523,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) set_bit(MMF_UNSTABLE, &mm->flags); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (!can_madv_dontneed_vma(vma)) + if (!can_madv_lru_vma(vma)) continue; /* @@ -884,12 +884,13 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) */ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); mark_oom_victim(victim); - pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", - message, task_pid_nr(victim), victim->comm, - K(victim->mm->total_vm), - K(get_mm_counter(victim->mm, MM_ANONPAGES)), - K(get_mm_counter(victim->mm, MM_FILEPAGES)), - K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n", + message, task_pid_nr(victim), victim->comm, K(mm->total_vm), + K(get_mm_counter(mm, MM_ANONPAGES)), + K(get_mm_counter(mm, MM_FILEPAGES)), + K(get_mm_counter(mm, MM_SHMEMPAGES)), + from_kuid(&init_user_ns, task_uid(victim)), + mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj); task_unlock(victim); /* @@ -1068,9 +1069,10 @@ bool out_of_memory(struct oom_control *oc) * The OOM killer does not compensate for IO-less reclaim. * pagefault_out_of_memory lost its gfp context so we have to * make sure exclude 0 mask - all other users should have at least - * ___GFP_DIRECT_RECLAIM to get here. + * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to + * invoke the OOM killer even if it is a GFP_NOFS allocation.
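Purely for illustration, a kill report rendered by the extended pr_err() format above might look like the following line; every value here is made up:

Out of memory: Killed process 4321 (postgres) total-vm:2097152kB, anon-rss:1048576kB, file-rss:2048kB, shmem-rss:0kB, UID:26 pgtables:4096kB oom_score_adj:0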
*/ - if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) + if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) return true; /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 253f11f3e72c..a5f0994ff099 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -682,6 +682,7 @@ static void bad_page(struct page *page, const char *reason, void free_compound_page(struct page *page) { + mem_cgroup_uncharge(page); __free_pages_ok(page, compound_order(page)); } @@ -3967,16 +3968,24 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, goto check_priority; /* - * make sure the compaction wasn't deferred or didn't bail out early - * due to locks contention before we declare that we should give up. - * But do not retry if the given zonelist is not suitable for - * compaction. + * compaction was skipped because there are not enough order-0 pages + * to work with, so we retry only if it looks like reclaim can help. */ - if (compaction_withdrawn(compact_result)) { + if (compaction_needs_reclaim(compact_result)) { ret = compaction_zonelist_suitable(ac, order, alloc_flags); goto out; } + /* + * make sure the compaction wasn't deferred or didn't bail out early + * due to locks contention before we declare that we should give up. + * But the next retry should use a higher priority if allowed, so + * we don't just keep bailing out endlessly. + */ + if (compaction_withdrawn(compact_result)) { + goto check_priority; + } + /* * !costly requests are much more important than __GFP_RETRY_MAYFAIL * costly ones because they are de facto nofail and invoke OOM @@ -6650,9 +6659,11 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages, #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void pgdat_init_split_queue(struct pglist_data *pgdat) { - spin_lock_init(&pgdat->split_queue_lock); - INIT_LIST_HEAD(&pgdat->split_queue); - pgdat->split_queue_len = 0; + struct deferred_split *ds_queue = &pgdat->deferred_split_queue; + + spin_lock_init(&ds_queue->split_queue_lock); + INIT_LIST_HEAD(&ds_queue->split_queue); + ds_queue->split_queue_len = 0; } #else static void pgdat_init_split_queue(struct pglist_data *pgdat) {} @@ -8213,7 +8224,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (!hugepage_migration_supported(page_hstate(head))) goto unmovable; - skip_pages = (1 << compound_order(head)) - (page - head); + skip_pages = compound_nr(head) - (page - head); iter += skip_pages - 1; continue; } diff --git a/mm/page_owner.c b/mm/page_owner.c index addcbb2ae4e4..dee931184788 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -24,6 +24,9 @@ struct page_owner { short last_migrate_reason; gfp_t gfp_mask; depot_stack_handle_t handle; +#ifdef CONFIG_DEBUG_PAGEALLOC + depot_stack_handle_t free_handle; +#endif }; static bool page_owner_disabled = true; @@ -102,19 +105,6 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext) return (void *)page_ext + page_owner_ops.offset; } -void __reset_page_owner(struct page *page, unsigned int order) -{ - int i; - struct page_ext *page_ext; - - for (i = 0; i < (1 << order); i++) { - page_ext = lookup_page_ext(page + i); - if (unlikely(!page_ext)) - continue; - __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); - } -} - static inline bool check_recursive_alloc(unsigned long *entries, unsigned int nr_entries, unsigned long ip) @@ -154,18 +144,50 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) return handle; } -static inline void __set_page_owner_handle(struct page_ext *page_ext, - 
depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask) +void __reset_page_owner(struct page *page, unsigned int order) { + int i; + struct page_ext *page_ext; +#ifdef CONFIG_DEBUG_PAGEALLOC + depot_stack_handle_t handle = 0; struct page_owner *page_owner; - page_owner = get_page_owner(page_ext); - page_owner->handle = handle; - page_owner->order = order; - page_owner->gfp_mask = gfp_mask; - page_owner->last_migrate_reason = -1; + if (debug_pagealloc_enabled()) + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); +#endif - __set_bit(PAGE_EXT_OWNER, &page_ext->flags); + for (i = 0; i < (1 << order); i++) { + page_ext = lookup_page_ext(page + i); + if (unlikely(!page_ext)) + continue; + __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); +#ifdef CONFIG_DEBUG_PAGEALLOC + if (debug_pagealloc_enabled()) { + page_owner = get_page_owner(page_ext); + page_owner->free_handle = handle; + } +#endif + } +} + +static inline void __set_page_owner_handle(struct page *page, + struct page_ext *page_ext, depot_stack_handle_t handle, + unsigned int order, gfp_t gfp_mask) +{ + struct page_owner *page_owner; + int i; + + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + page_owner->handle = handle; + page_owner->order = order; + page_owner->gfp_mask = gfp_mask; + page_owner->last_migrate_reason = -1; + __set_bit(PAGE_EXT_OWNER, &page_ext->flags); + __set_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); + + page_ext = lookup_page_ext(page + i); + } } noinline void __set_page_owner(struct page *page, unsigned int order, @@ -178,7 +200,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order, return; handle = save_stack(gfp_mask); - __set_page_owner_handle(page_ext, handle, order, gfp_mask); + __set_page_owner_handle(page, page_ext, handle, order, gfp_mask); } void __set_page_owner_migrate_reason(struct page *page, int reason) @@ -204,8 +226,11 @@ void __split_page_owner(struct page *page, unsigned int order) page_owner = get_page_owner(page_ext); page_owner->order = 0; - for (i = 1; i < (1 << order); i++) - __copy_page_owner(page, page + i); + for (i = 1; i < (1 << order); i++) { + page_ext = lookup_page_ext(page + i); + page_owner = get_page_owner(page_ext); + page_owner->order = 0; + } } void __copy_page_owner(struct page *oldpage, struct page *newpage) @@ -235,6 +260,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) * the new page, which will be freed. 
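The page_owner rework above hinges on splitting one bit into two; summarizing the semantics implied by the hunks (a sketch, not authoritative): PAGE_EXT_OWNER says owner information was recorded at some point, while PAGE_EXT_OWNER_ACTIVE tracks whether the page is currently allocated, so a freed page keeps its last allocation stack. The resulting checks look roughly like:

bool tracked = test_bit(PAGE_EXT_OWNER, &page_ext->flags);
bool allocated = test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags);

if (!tracked)
	return;		/* no owner info was ever recorded */
if (allocated)
	pr_alert("page_owner tracks the page as allocated\n");
else
	pr_alert("page_owner tracks the page as freed\n");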
*/ __set_bit(PAGE_EXT_OWNER, &new_ext->flags); + __set_bit(PAGE_EXT_OWNER_ACTIVE, &new_ext->flags); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -294,7 +320,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (unlikely(!page_ext)) continue; - if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) continue; page_owner = get_page_owner(page_ext); @@ -405,20 +431,36 @@ void __dump_page_owner(struct page *page) mt = gfpflags_to_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { - pr_alert("page_owner info is not active (free page?)\n"); + pr_alert("page_owner info is not present (never set?)\n"); return; } + if (test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) + pr_alert("page_owner tracks the page as allocated\n"); + else + pr_alert("page_owner tracks the page as freed\n"); + + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", + page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); + handle = READ_ONCE(page_owner->handle); if (!handle) { - pr_alert("page_owner info is not active (free page?)\n"); - return; + pr_alert("page_owner allocation stack trace missing\n"); + } else { + nr_entries = stack_depot_fetch(handle, &entries); + stack_trace_print(entries, nr_entries, 0); } - nr_entries = stack_depot_fetch(handle, &entries); - pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", - page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); - stack_trace_print(entries, nr_entries, 0); +#ifdef CONFIG_DEBUG_PAGEALLOC + handle = READ_ONCE(page_owner->free_handle); + if (!handle) { + pr_alert("page_owner free stack trace missing\n"); + } else { + nr_entries = stack_depot_fetch(handle, &entries); + pr_alert("page last free stack trace:\n"); + stack_trace_print(entries, nr_entries, 0); + } +#endif if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", @@ -481,8 +523,22 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; + /* + * Although we do have the info about past allocation of free + * pages, it's not relevant for current memory usage. + */ + if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) + continue; + page_owner = get_page_owner(page_ext); + /* + * Don't print "tail" pages of high-order allocations as that + * would inflate the stats. + */ + if (!IS_ALIGNED(pfn, 1 << page_owner->order)) + continue; + /* * Access to page_ext->handle isn't synchronous so we should * be careful to access it. @@ -562,7 +618,8 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; /* Found early allocated page */ - __set_page_owner_handle(page_ext, early_handle, 0, 0); + __set_page_owner_handle(page, page_ext, early_handle, + 0, 0); count++; } cond_resched(); diff --git a/mm/page_poison.c b/mm/page_poison.c index 21d4f97cb49b..34b9181ee5d1 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -101,7 +101,7 @@ static void unpoison_page(struct page *page) /* * Page poisoning when enabled poisons each and every page * that is freed to buddy. Thus no extra check is done to - * see if a page was posioned. + * see if a page was poisoned. 
*/ check_poison_mem(addr, PAGE_SIZE); kunmap_atomic(addr); diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 11df03e71288..eff4b4520c8d 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -153,8 +153,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address, - PAGE_SIZE << compound_order(page)); + pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); if (!pvmw->pte) return false; diff --git a/mm/quicklist.c b/mm/quicklist.c deleted file mode 100644 index 5e98ac78e410..000000000000 --- a/mm/quicklist.c +++ /dev/null @@ -1,103 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Quicklist support. - * - * Quicklists are light weight lists of pages that have a defined state - * on alloc and free. Pages must be in the quicklist specific defined state - * (zero by default) when the page is freed. It seems that the initial idea - * for such lists first came from Dave Miller and then various other people - * improved on it. - * - * Copyright (C) 2007 SGI, - * Christoph Lameter <cl@linux.com> - * Generalized, added support for multiple lists and - * constructors / destructors. - */ -#include <linux/kernel.h> - -#include <linux/gfp.h> -#include <linux/mm.h> -#include <linux/mmzone.h> -#include <linux/quicklist.h> - -DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); - -#define FRACTION_OF_NODE_MEM 16 - -static unsigned long max_pages(unsigned long min_pages) -{ - unsigned long node_free_pages, max; - int node = numa_node_id(); - struct zone *zones = NODE_DATA(node)->node_zones; - int num_cpus_on_node; - - node_free_pages = -#ifdef CONFIG_ZONE_DMA - zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) + -#endif -#ifdef CONFIG_ZONE_DMA32 - zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) + -#endif - zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); - - max = node_free_pages / FRACTION_OF_NODE_MEM; - - num_cpus_on_node = cpumask_weight(cpumask_of_node(node)); - max /= num_cpus_on_node; - - return max(max, min_pages); - } - -static long min_pages_to_free(struct quicklist *q, - unsigned long min_pages, long max_free) -{ - long pages_to_free; - - pages_to_free = q->nr_pages - max_pages(min_pages); - - return min(pages_to_free, max_free); -} - -/* - * Trim down the number of pages in the quicklist - */ -void quicklist_trim(int nr, void (*dtor)(void *), - unsigned long min_pages, unsigned long max_free) -{ - long pages_to_free; - struct quicklist *q; - - q = &get_cpu_var(quicklist)[nr]; - if (q->nr_pages > min_pages) { - pages_to_free = min_pages_to_free(q, min_pages, max_free); - - while (pages_to_free > 0) { - /* - * We pass a gfp_t of 0 to quicklist_alloc here - * because we will never call into the page allocator.
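page_size() is the companion helper behind the `PAGE_SIZE << compound_order(page)` conversions seen here and in the rmap, slob, and slub hunks further down; a minimal sketch of its definition as this series adds it to include/linux/mm.h (again from memory, so treat as illustrative):

static inline unsigned long page_size(struct page *page)
{
	/* Size in bytes of a (possibly compound) page. */
	return PAGE_SIZE << compound_order(page);
}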
- */ - void *p = quicklist_alloc(nr, 0, NULL); - - if (dtor) - dtor(p); - free_page((unsigned long)p); - pages_to_free--; - } - } - put_cpu_var(quicklist); -} - -unsigned long quicklist_total_size(void) -{ - unsigned long count = 0; - int cpu; - struct quicklist *ql, *q; - - for_each_online_cpu(cpu) { - ql = per_cpu(quicklist, cpu); - for (q = ql; q < ql + CONFIG_NR_QUICK; q++) - count += q->nr_pages; - } - return count; -} - diff --git a/mm/rmap.c b/mm/rmap.c index 003377e24232..d9a23bb773bf 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -898,15 +898,13 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma, vma->vm_mm, address, - min(vma->vm_end, address + - (PAGE_SIZE << compound_order(page)))); + min(vma->vm_end, address + page_size(page))); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { - unsigned long cstart; int ret = 0; - cstart = address = pvmw.address; + address = pvmw.address; if (pvmw.pte) { pte_t entry; pte_t *pte = pvmw.pte; @@ -933,7 +931,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); - cstart &= PMD_MASK; ret = 1; #else /* unexpected pmd-mapped page? */ @@ -1192,8 +1189,10 @@ void page_add_file_rmap(struct page *page, bool compound) } if (!atomic_inc_and_test(compound_mapcount_ptr(page))) goto out; - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); + if (PageSwapBacked(page)) + __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); + else + __inc_node_page_state(page, NR_FILE_PMDMAPPED); } else { if (PageTransCompound(page) && page_mapping(page)) { VM_WARN_ON_ONCE(!PageLocked(page)); @@ -1232,8 +1231,10 @@ static void page_remove_file_rmap(struct page *page, bool compound) } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) goto out; - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); + if (PageSwapBacked(page)) + __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); + else + __dec_node_page_state(page, NR_FILE_PMDMAPPED); } else { if (!atomic_add_negative(-1, &page->_mapcount)) goto out; @@ -1374,8 +1375,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, - min(vma->vm_end, address + - (PAGE_SIZE << compound_order(page)))); + min(vma->vm_end, address + page_size(page))); if (PageHuge(page)) { /* * If sharing is possible, start and end will be adjusted @@ -1524,8 +1524,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (PageHuge(page)) { - int nr = 1 << compound_order(page); - hugetlb_count_sub(nr, mm); + hugetlb_count_sub(compound_nr(page), mm); set_huge_swap_pte_at(mm, address, pvmw.pte, pteval, vma_mmu_pagesize(vma)); diff --git a/mm/shmem.c b/mm/shmem.c index 0f7fd4a85db6..30ce722c23fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -609,7 +609,7 @@ static int shmem_add_to_page_cache(struct page *page, { XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); unsigned long i = 0; - unsigned long nr = 1UL << compound_order(page); + unsigned long nr = compound_nr(page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); 
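The xas_store() changes in this series (migrate.c earlier, shmem.c just below, swap_state.c later) all make the same switch: every slot covered by a compound page now stores the head page rather than page + i. A hedged sketch of the resulting loop shape, with locking and error handling omitted:

XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
unsigned long i;

for (i = 0; i < compound_nr(page); i++) {
	xas_store(&xas, page);	/* head page at every index, not page + i */
	xas_next(&xas);
}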
@@ -631,7 +631,7 @@ static int shmem_add_to_page_cache(struct page *page, if (xas_error(&xas)) goto unlock; next: - xas_store(&xas, page + i); + xas_store(&xas, page); if (++i < nr) { xas_next(&xas); goto next; } @@ -1734,7 +1734,7 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * fault_mm and fault_type are only supplied by shmem_fault: + * vmf and fault_type are only supplied by shmem_fault: * otherwise they are NULL. */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, @@ -1884,7 +1884,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, lru_cache_add_anon(page); spin_lock_irq(&info->lock); - info->alloced += 1 << compound_order(page); + info->alloced += compound_nr(page); inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); @@ -1925,7 +1925,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page *head = compound_head(page); int i; - for (i = 0; i < (1 << compound_order(head)); i++) { + for (i = 0; i < compound_nr(head); i++) { clear_highpage(head + i); flush_dcache_page(head + i); } @@ -1952,7 +1952,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, * Error recovery. */ unacct: - shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); + shmem_inode_unacct_blocks(inode, compound_nr(page)); if (PageTransHuge(page)) { unlock_page(page); diff --git a/mm/slab.h b/mm/slab.h index 9057b8056b07..68e455f2b698 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -30,6 +30,69 @@ struct kmem_cache { struct list_head list; /* List of all slab caches on the system */ }; +#else /* !CONFIG_SLOB */ + +struct memcg_cache_array { + struct rcu_head rcu; + struct kmem_cache *entries[0]; +}; + +/* + * This is the main placeholder for memcg-related information in kmem caches. + * Both the root cache and the child caches will have it. For the root cache, + * this will hold a dynamically allocated array large enough to hold + * information about the currently limited memcgs in the system. To allow the + * array to be accessed without taking any locks, on relocation we free the old + * version only after a grace period. + * + * Root and child caches hold different metadata. + * + * @root_cache: Common to root and child caches. NULL for root, pointer to + * the root cache for children. + * + * The following fields are specific to root caches. + * + * @memcg_caches: kmemcg ID indexed table of child caches. This table is + * used to index child caches during allocation and cleared + * early during shutdown. + * + * @root_caches_node: List node for slab_root_caches list. + * + * @children: List of all child caches. While the child caches are also + * reachable through @memcg_caches, a child cache remains on + * this list until it is actually destroyed. + * + * The following fields are specific to child caches. + * + * @memcg: Pointer to the memcg this cache belongs to. + * + * @children_node: List node for @root_cache->children list. + * + * @kmem_caches_node: List node for @memcg->kmem_caches list.
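For orientation, a hedged sketch of how the @memcg_caches table documented above is consulted; this mirrors the cache_from_memcg_idx() helper in mm/slab.h, but is simplified, so treat the details as assumptions:

static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
	struct memcg_cache_array *arr;
	struct kmem_cache *cachep;

	rcu_read_lock();	/* the array is replaced only after a grace period */
	arr = rcu_dereference(s->memcg_params.memcg_caches);
	cachep = READ_ONCE(arr->entries[idx]);	/* may be NULL */
	rcu_read_unlock();
	return cachep;
}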
+ */ +struct memcg_cache_params { + struct kmem_cache *root_cache; + union { + struct { + struct memcg_cache_array __rcu *memcg_caches; + struct list_head __root_caches_node; + struct list_head children; + bool dying; + }; + struct { + struct mem_cgroup *memcg; + struct list_head children_node; + struct list_head kmem_caches_node; + struct percpu_ref refcnt; + + void (*work_fn)(struct kmem_cache *); + union { + struct rcu_head rcu_head; + struct work_struct work; + }; + }; + }; +}; #endif /* CONFIG_SLOB */ #ifdef CONFIG_SLAB @@ -174,6 +237,7 @@ int __kmem_cache_shrink(struct kmem_cache *); void __kmemcg_cache_deactivate(struct kmem_cache *s); void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s); void slab_kmem_cache_release(struct kmem_cache *); +void kmem_cache_shrink_all(struct kmem_cache *s); struct seq_file; struct file; diff --git a/mm/slab_common.c b/mm/slab_common.c index 807490fe217a..6491c3a41805 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -981,6 +981,43 @@ int kmem_cache_shrink(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_shrink); +/** + * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache + * @s: The cache pointer + */ +void kmem_cache_shrink_all(struct kmem_cache *s) +{ + struct kmem_cache *c; + + if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) { + kmem_cache_shrink(s); + return; + } + + get_online_cpus(); + get_online_mems(); + kasan_cache_shrink(s); + __kmem_cache_shrink(s); + + /* + * We have to take the slab_mutex to protect from the memcg list + * modification. + */ + mutex_lock(&slab_mutex); + for_each_memcg_cache(c, s) { + /* + * Don't need to shrink deactivated memcg caches. + */ + if (c->flags & SLAB_DEACTIVATED) + continue; + kasan_cache_shrink(c); + __kmem_cache_shrink(c); + } + mutex_unlock(&slab_mutex); + put_online_mems(); + put_online_cpus(); +} + bool slab_is_available(void) { return slab_state >= UP; } diff --git a/mm/slob.c b/mm/slob.c index 7f421d0ca9ab..cf377beab962 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -539,7 +539,7 @@ size_t __ksize(const void *block) sp = virt_to_page(block); if (unlikely(!PageSlab(sp))) - return PAGE_SIZE << compound_order(sp); + return page_size(sp); align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); m = (unsigned int *)(block - align); diff --git a/mm/slub.c b/mm/slub.c index 8834563cdb4b..42c1b3af3c98 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -829,7 +829,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = PAGE_SIZE << compound_order(page); + length = page_size(page); end = start + length; remainder = length % s->size; if (!remainder) @@ -1074,13 +1074,14 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } -static void setup_page_debug(struct kmem_cache *s, void *addr, int order) +static +void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) { if (!(s->flags & SLAB_POISON)) return; metadata_access_enable(); - memset(addr, POISON_INUSE, PAGE_SIZE << order); + memset(addr, POISON_INUSE, page_size(page)); metadata_access_disable(); } @@ -1340,8 +1341,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} -static inline void setup_page_debug(struct kmem_cache *s, - void *addr, int order) {} +static inline +void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {} static
inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } @@ -1639,7 +1640,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; void *start, *p, *next; - int idx, order; + int idx; bool shuffle; flags &= gfp_allowed_mask; @@ -1673,7 +1674,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->objects = oo_objects(oo); - order = compound_order(page); page->slab_cache = s; __SetPageSlab(page); if (page_is_pfmemalloc(page)) @@ -1683,7 +1683,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) start = page_address(page); - setup_page_debug(s, start, order); + setup_page_debug(s, page, start); shuffle = shuffle_freelist(s, page); @@ -2004,6 +2004,7 @@ static inline unsigned long next_tid(unsigned long tid) return tid + TID_STEP; } +#ifdef SLUB_DEBUG_CMPXCHG static inline unsigned int tid_to_cpu(unsigned long tid) { return tid % TID_STEP; } @@ -2013,6 +2014,7 @@ static inline unsigned long tid_to_event(unsigned long tid) { return tid / TID_STEP; } +#endif static inline unsigned int init_tid(int cpu) { @@ -3930,7 +3932,7 @@ size_t __ksize(const void *object) if (unlikely(!PageSlab(page))) { WARN_ON(!PageCompound(page)); - return PAGE_SIZE << compound_order(page); + return page_size(page); } return slab_ksize(page->slab_cache); @@ -5298,7 +5300,7 @@ static ssize_t shrink_store(struct kmem_cache *s, const char *buf, size_t length) { if (buf[0] == '1') - kmem_cache_shrink(s); + kmem_cache_shrink_all(s); else return -EINVAL; return length; diff --git a/mm/sparse.c b/mm/sparse.c index 72f010d9bff5..bf32de9e666b 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -11,6 +11,8 @@ #include <linux/export.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include "internal.h" #include <asm/dma.h> @@ -470,6 +472,12 @@ struct page __init *__populate_section_memmap(unsigned long pfn, static void *sparsemap_buf __meminitdata; static void *sparsemap_buf_end __meminitdata; +static inline void __meminit sparse_buffer_free(unsigned long size) +{ + WARN_ON(!sparsemap_buf || size == 0); + memblock_free_early(__pa(sparsemap_buf), size); +} + static void __init sparse_buffer_init(unsigned long size, int nid) { phys_addr_t addr = __pa(MAX_DMA_ADDRESS); @@ -486,7 +494,7 @@ static void __init sparse_buffer_fini(void) unsigned long size = sparsemap_buf_end - sparsemap_buf; if (sparsemap_buf && size > 0) - memblock_free_early(__pa(sparsemap_buf), size); + sparse_buffer_free(size); sparsemap_buf = NULL; } @@ -495,11 +503,15 @@ void * __meminit sparse_buffer_alloc(unsigned long size) void *ptr = NULL; if (sparsemap_buf) { - ptr = PTR_ALIGN(sparsemap_buf, size); + ptr = (void *) roundup((unsigned long)sparsemap_buf, size); if (ptr + size > sparsemap_buf_end) ptr = NULL; - else + else { + /* Free redundant aligned space */ + if ((unsigned long)(ptr - sparsemap_buf) > 0) + sparse_buffer_free((unsigned long)(ptr - sparsemap_buf)); sparsemap_buf = ptr + size; + } } return ptr; } @@ -867,7 +879,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, */ page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); - ms = __pfn_to_section(start_pfn); + ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); section_mark_present(ms); @@ -884,9 +896,6 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) { int i; - if (!memmap) - return; - /* * A further optimization is to have per section refcounted
num_poisoned_pages. But that would need more space per memmap, so @@ -898,7 +907,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) for (i = 0; i < nr_pages; i++) { if (PageHWPoison(&memmap[i])) { - atomic_long_sub(1, &num_poisoned_pages); + num_poisoned_pages_dec(); ClearPageHWPoison(&memmap[i]); } } diff --git a/mm/swap.c b/mm/swap.c index ae300397dfda..38c3fa4308e2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -47,6 +47,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); @@ -71,12 +72,12 @@ static void __page_cache_release(struct page *page) spin_unlock_irqrestore(&pgdat->lru_lock, flags); } __ClearPageWaiters(page); - mem_cgroup_uncharge(page); } static void __put_single_page(struct page *page) { __page_cache_release(page); + mem_cgroup_uncharge(page); free_unref_page(page); } @@ -515,7 +516,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, del_page_from_lru_list(page, lruvec, lru + active); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec, lru); if (PageWriteback(page) || PageDirty(page)) { /* @@ -523,13 +523,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, * It can make readahead confusing. But race window * is _really_ small and it's non-critical problem. */ + add_page_to_lru_list(page, lruvec, lru); SetPageReclaim(page); } else { /* * The page's writeback ended while it was in the pagevec. * We move that page to the tail of the inactive list. */ - list_move_tail(&page->lru, &lruvec->lists[lru]); + add_page_to_lru_list_tail(page, lruvec, lru); __count_vm_event(PGROTATED); } @@ -538,6 +539,22 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, update_page_reclaim_stat(lruvec, file, 0); } +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, + void *arg) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + + del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(page, lruvec, lru); + + __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page)); + update_page_reclaim_stat(lruvec, file, 0); + } +} static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) @@ -590,6 +607,10 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pvec = &per_cpu(lru_lazyfree_pvecs, cpu); if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); @@ -623,6 +644,26 @@ void deactivate_file_page(struct page *page) } } +/* + * deactivate_page - deactivate a page + * @page: page to deactivate + * + * deactivate_page() moves @page to the inactive list if @page was on the active + * list and was not an unevictable page. This is done to accelerate the reclaim + * of @page.
+ */ +void deactivate_page(struct page *page) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + get_page(page); + if (!pagevec_add(pvec, page) || PageCompound(page)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + put_cpu_var(lru_deactivate_pvecs); + } +} + /** * mark_page_lazyfree - make an anon page lazyfree * @page: page to deactivate @@ -687,6 +728,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); @@ -844,17 +886,15 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, get_page(page_tail); list_add_tail(&page_tail->lru, list); } else { - struct list_head *list_head; /* * Head page has not yet been counted, as an hpage, * so we must account for each subpage individually. * - * Use the standard add function to put page_tail on the list, - * but then correct its position so they all end up in order. + * Put page_tail on the list at the correct position + * so they all end up in order. */ - add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); - list_head = page_tail->lru.prev; - list_move_tail(&page_tail->lru, list_head); + add_page_to_lru_list_tail(page_tail, lruvec, + page_lru(page_tail)); } if (!PageUnevictable(page)) diff --git a/mm/swap_state.c b/mm/swap_state.c index 8368621a0fc7..8e7ce9a9bc5e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -116,7 +116,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); - unsigned long i, nr = 1UL << compound_order(page); + unsigned long i, nr = compound_nr(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); @@ -133,7 +133,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) for (i = 0; i < nr; i++) { VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); set_page_private(page + i, entry.val + i); - xas_store(&xas, page + i); + xas_store(&xas, page); xas_next(&xas); } address_space->nrpages += nr; @@ -168,7 +168,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry) for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, NULL); - VM_BUG_ON_PAGE(entry != page + i, entry); + VM_BUG_ON_PAGE(entry != page, entry); set_page_private(page + i, 0); xas_next(&xas); } diff --git a/mm/util.c b/mm/util.c index e6351a80f248..3ad6db9a722e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -16,6 +16,13 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include #include @@ -293,7 +300,105 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } -#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) +#ifndef STACK_RND_MASK +#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ +#endif + +unsigned long randomize_stack_top(unsigned long stack_top) +{ + unsigned long random_variable = 0; + + if (current->flags & PF_RANDOMIZE) { + random_variable = get_random_long(); + random_variable &= STACK_RND_MASK; + random_variable <<= PAGE_SHIFT; + } 
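As a worked example of the helper above: with the default STACK_RND_MASK of 0x7ff and a PAGE_SHIFT of 12, the random offset is at most 0x7ff << 12 = 0x7ff000 bytes, i.e. just under 8 MiB. That offset is subtracted from the page-aligned stack top on downward-growing stacks and added on CONFIG_STACK_GROWSUP architectures.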
+#ifdef CONFIG_STACK_GROWSUP + return PAGE_ALIGN(stack_top) + random_variable; +#else + return PAGE_ALIGN(stack_top) - random_variable; +#endif +} + +#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + /* Is the current task 32bit ? */ + if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) + return randomize_page(mm->brk, SZ_32M); + + return randomize_page(mm->brk, SZ_1G); +} + +unsigned long arch_mmap_rnd(void) +{ + unsigned long rnd; + +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + if (is_compat_task()) + rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); + else +#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); + + return rnd << PAGE_SHIFT; +} + +static int mmap_is_legacy(struct rlimit *rlim_stack) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + if (rlim_stack->rlim_cur == RLIM_INFINITY) + return 1; + + return sysctl_legacy_va_layout; +} + +/* + * Leave enough space between the mmap area and the stack to honour ulimit in + * the face of randomisation. + */ +#define MIN_GAP (SZ_128M) +#define MAX_GAP (STACK_TOP / 6 * 5) + +static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) +{ + unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_guard_gap; + + /* Account for stack randomization if necessary */ + if (current->flags & PF_RANDOMIZE) + pad += (STACK_RND_MASK << PAGE_SHIFT); + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + + return PAGE_ALIGN(STACK_TOP - gap - rnd); +} + +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +{ + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = arch_mmap_rnd(); + + if (mmap_is_legacy(rlim_stack)) { + mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; + mm->get_unmapped_area = arch_get_unmapped_area; + } else { + mm->mmap_base = mmap_base(random_factor, rlim_stack); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + } +} +#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; @@ -521,7 +626,7 @@ bool page_mapped(struct page *page) return true; if (PageHuge(page)) return false; - for (i = 0; i < (1 << compound_order(page)); i++) { + for (i = 0; i < compound_nr(page); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; } @@ -783,3 +888,16 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) out: return res; } + +int memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = kmap_atomic(page1); + addr2 = kmap_atomic(page2); + ret = memcmp(addr1, addr2, PAGE_SIZE); + kunmap_atomic(addr2); + kunmap_atomic(addr1); + return ret; +} diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c1246d77cf75..a3c70e275f4e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -329,8 +329,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 -#define VM_LAZY_FREE 0x02 -#define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ @@ -398,9 +396,8 @@ compute_subtree_max_size(struct vmap_area *va) get_subtree_max_size(va->rb_node.rb_right)); } -RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb, - struct 
vmap_area, rb_node, unsigned long, subtree_max_size, - compute_subtree_max_size) +RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, + struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) static void purge_vmap_area_lazy(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); @@ -1116,7 +1113,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, va->va_start = addr; va->va_end = addr + size; - va->flags = 0; + va->vm = NULL; insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); @@ -1282,7 +1279,14 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) llist_for_each_entry_safe(va, n_va, valist, purge_list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; - __free_vmap_area(va); + /* + * Finally insert or merge lazily-freed area. It is + * detached and there is no need to "unlink" it from + * anything. + */ + merge_or_add_vmap_area(va, + &free_vmap_area_root, &free_vmap_area_list); + atomic_long_sub(nr, &vmap_lazy_nr); if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) @@ -1324,6 +1328,10 @@ static void free_vmap_area_noflush(struct vmap_area *va) { unsigned long nr_lazy; + spin_lock(&vmap_area_lock); + unlink_va(va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -1918,7 +1926,6 @@ void __init vmalloc_init(void) if (WARN_ON_ONCE(!va)) continue; - va->flags = VM_VM_AREA; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; @@ -2016,7 +2023,6 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, vm->size = va->va_end - va->va_start; vm->caller = caller; va->vm = vm; - va->flags |= VM_VM_AREA; spin_unlock(&vmap_area_lock); } @@ -2121,10 +2127,10 @@ struct vm_struct *find_vm_area(const void *addr) struct vmap_area *va; va = find_vmap_area((unsigned long)addr); - if (va && va->flags & VM_VM_AREA) - return va->vm; + if (!va) + return NULL; - return NULL; + return va->vm; } /** @@ -2143,14 +2149,12 @@ struct vm_struct *remove_vm_area(const void *addr) might_sleep(); - va = find_vmap_area((unsigned long)addr); - if (va && va->flags & VM_VM_AREA) { + spin_lock(&vmap_area_lock); + va = __find_vmap_area((unsigned long)addr); + if (va && va->vm) { struct vm_struct *vm = va->vm; - spin_lock(&vmap_area_lock); va->vm = NULL; - va->flags &= ~VM_VM_AREA; - va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); kasan_free_shadow(vm); @@ -2158,6 +2162,8 @@ struct vm_struct *remove_vm_area(const void *addr) return vm; } + + spin_unlock(&vmap_area_lock); return NULL; } @@ -2402,7 +2408,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); - area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. 
*/ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, @@ -2410,13 +2415,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } else { pages = kmalloc_node(array_size, nested_gfp, node); } - area->pages = pages; - if (!area->pages) { + + if (!pages) { remove_vm_area(area->addr); kfree(area); return NULL; } + + area->pages = pages; + area->nr_pages = nr_pages; + for (i = 0; i < area->nr_pages; i++) { struct page *page; @@ -2851,7 +2859,7 @@ long vread(char *buf, char *addr, unsigned long count) if (!count) break; - if (!(va->flags & VM_VM_AREA)) + if (!va->vm) continue; vm = va->vm; @@ -2931,7 +2939,7 @@ long vwrite(char *buf, char *addr, unsigned long count) if (!count) break; - if (!(va->flags & VM_VM_AREA)) + if (!va->vm) continue; vm = va->vm; @@ -3450,6 +3458,22 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) } } +static void show_purge_info(struct seq_file *m) +{ + struct llist_node *head; + struct vmap_area *va; + + head = READ_ONCE(vmap_purge_list.first); + if (head == NULL) + return; + + llist_for_each_entry(va, head, purge_list) { + seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); + } +} + static int s_show(struct seq_file *m, void *p) { struct vmap_area *va; @@ -3458,14 +3482,13 @@ static int s_show(struct seq_file *m, void *p) va = list_entry(p, struct vmap_area, list); /* - * s_show can encounter race with remove_vm_area, !VM_VM_AREA on - * behalf of vmap area is being tear down or vm_map_ram allocation. + * s_show can encounter a race with remove_vm_area(): !vm means + * the vmap area is being torn down, or it is a vm_map_ram allocation. */ - if (!(va->flags & VM_VM_AREA)) { - seq_printf(m, "0x%pK-0x%pK %7ld %s\n", + if (!va->vm) { + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start, - va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram"); + va->va_end - va->va_start); return 0; } @@ -3504,6 +3527,16 @@ static int s_show(struct seq_file *m, void *p) show_numa_info(m, v); seq_putc(m, '\n'); + + /* + * As a final step, dump "unpurged" areas. Note that the + * entire "/proc/vmallocinfo" output will not + * be address sorted, because the purge list is not + * sorted.
+ */ + if (list_is_last(&va->list, &vmap_area_list)) + show_purge_info(m); + return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index a6c5d0b28321..e5d52d6a24af 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -171,11 +171,22 @@ int vm_swappiness = 60; */ unsigned long vm_total_pages; +static void set_task_reclaim_state(struct task_struct *task, + struct reclaim_state *rs) +{ + /* Check for an overwrite */ + WARN_ON_ONCE(rs && task->reclaim_state); + + /* Check for the nulling of an already-nulled member */ + WARN_ON_ONCE(!rs && !task->reclaim_state); + + task->reclaim_state = rs; +} + static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); -#ifdef CONFIG_MEMCG_KMEM - +#ifdef CONFIG_MEMCG /* * We allow subsystems to populate their shrinker-related * LRU lists before register_shrinker_prepared() is called @@ -227,30 +238,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) idr_remove(&shrinker_idr, id); up_write(&shrinker_rwsem); } -#else /* CONFIG_MEMCG_KMEM */ -static int prealloc_memcg_shrinker(struct shrinker *shrinker) -{ - return 0; -} -static void unregister_memcg_shrinker(struct shrinker *shrinker) -{ -} -#endif /* CONFIG_MEMCG_KMEM */ - -static void set_task_reclaim_state(struct task_struct *task, - struct reclaim_state *rs) -{ - /* Check for an overwrite */ - WARN_ON_ONCE(rs && task->reclaim_state); - - /* Check for the nulling of an already-nulled member */ - WARN_ON_ONCE(!rs && !task->reclaim_state); - - task->reclaim_state = rs; -} - -#ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; @@ -305,6 +293,15 @@ static bool memcg_congested(pg_data_t *pgdat, } #else +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + return 0; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ +} + static bool global_reclaim(struct scan_control *sc) { return true; @@ -591,7 +588,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, return freed; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { @@ -599,7 +596,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, unsigned long ret, freed = 0; int i; - if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) + if (!mem_cgroup_online(memcg)) return 0; if (!down_read_trylock(&shrinker_rwsem)) @@ -625,6 +622,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, continue; } + /* Call non-slab shrinkers even though kmem is disabled */ + if (!memcg_kmem_enabled() && + !(shrinker->flags & SHRINKER_NONSLAB)) + continue; + ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { clear_bit(i, map->map); @@ -661,13 +663,13 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, up_read(&shrinker_rwsem); return freed; } -#else /* CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG */ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { return 0; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ /** * shrink_slab - shrink slab caches @@ -1121,7 +1123,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct scan_control *sc, enum ttu_flags ttu_flags, struct reclaim_stat *stat, - bool force_reclaim) + bool ignore_references) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -1135,7 +1137,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct address_space *mapping; 
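The new bailout above implies a contract for memcg-aware shrinkers: with kmem accounting disabled, only those that advertise SHRINKER_NONSLAB keep being called. A hedged sketch of such a registration, modeled on the deferred-split shrinker this series converts (callback names are assumed):

static struct shrinker deferred_split_shrinker = {
	.count_objects	= deferred_split_count,
	.scan_objects	= deferred_split_scan,
	.seeks		= DEFAULT_SEEKS,
	.flags		= SHRINKER_MEMCG_AWARE | SHRINKER_NONSLAB,
};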
struct page *page; int may_enter_fs; - enum page_references references = PAGEREF_RECLAIM_CLEAN; + enum page_references references = PAGEREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; @@ -1149,7 +1151,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, VM_BUG_ON_PAGE(PageActive(page), page); - nr_pages = 1 << compound_order(page); + nr_pages = compound_nr(page); /* Account the number of base pages even though THP */ sc->nr_scanned += nr_pages; @@ -1266,7 +1268,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - if (!force_reclaim) + if (!ignore_references) references = page_check_references(page, sc); switch (references) { @@ -1487,10 +1489,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Is there need to periodically free_page_list? It would * appear not as the counts should be low */ - if (unlikely(PageTransHuge(page))) { - mem_cgroup_uncharge(page); + if (unlikely(PageTransHuge(page))) (*get_compound_page_dtor(page))(page); - } else + else list_add(&page->lru, &free_pages); continue; @@ -1705,7 +1706,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON_PAGE(!PageLRU(page), page); - nr_pages = 1 << compound_order(page); + nr_pages = compound_nr(page); total_scan += nr_pages; if (page_zonenum(page) > sc->reclaim_idx) { @@ -1911,7 +1912,6 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&pgdat->lru_lock); - mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&pgdat->lru_lock); } else @@ -2145,6 +2145,62 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate, nr_rotated, sc->priority, file); } +unsigned long reclaim_pages(struct list_head *page_list) +{ + int nid = -1; + unsigned long nr_reclaimed = 0; + LIST_HEAD(node_page_list); + struct reclaim_stat dummy_stat; + struct page *page; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + }; + + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + if (nid == -1) { + nid = page_to_nid(page); + INIT_LIST_HEAD(&node_page_list); + } + + if (nid == page_to_nid(page)) { + ClearPageActive(page); + list_move(&page->lru, &node_page_list); + continue; + } + + nr_reclaimed += shrink_page_list(&node_page_list, + NODE_DATA(nid), + &sc, 0, + &dummy_stat, false); + while (!list_empty(&node_page_list)) { + page = lru_to_page(&node_page_list); + list_del(&page->lru); + putback_lru_page(page); + } + + nid = -1; + } + + if (!list_empty(&node_page_list)) { + nr_reclaimed += shrink_page_list(&node_page_list, + NODE_DATA(nid), + &sc, 0, + &dummy_stat, false); + while (!list_empty(&node_page_list)) { + page = lru_to_page(&node_page_list); + list_del(&page->lru); + putback_lru_page(page); + } + } + + return nr_reclaimed; +} + /* * The inactive anon list should be small enough that the VM never has * to do too much work. 
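reclaim_pages() above gives callers a synchronous way to reclaim an isolated batch, returning the number of pages actually freed; anything it cannot reclaim is put back on its LRU. A hedged usage sketch (the helper name is hypothetical; MADV_PAGEOUT-style callers would isolate many pages before a single call):

static unsigned long pageout_one(struct page *page)
{
	LIST_HEAD(page_list);

	if (isolate_lru_page(page))	/* returns 0 on success */
		return 0;
	list_add(&page->lru, &page_list);
	return reclaim_pages(&page_list);
}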
@@ -2586,7 +2642,6 @@ static bool in_reclaim_compaction(struct scan_control *sc) */ static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, - unsigned long nr_scanned, struct scan_control *sc) { unsigned long pages_for_compaction; @@ -2597,40 +2652,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, if (!in_reclaim_compaction(sc)) return false; - /* Consider stopping depending on scan and reclaim activity */ - if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) { - /* - * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the - * full LRU list has been scanned and we are still failing - * to reclaim pages. This full LRU scan is potentially - * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed - */ - if (!nr_reclaimed && !nr_scanned) - return false; - } else { - /* - * For non-__GFP_RETRY_MAYFAIL allocations which can presumably - * fail without consequence, stop if we failed to reclaim - * any pages from the last SWAP_CLUSTER_MAX number of - * pages that were scanned. This will return to the - * caller faster at the risk reclaim/compaction and - * the resulting allocation attempt fails - */ - if (!nr_reclaimed) - return false; - } - /* - * If we have not reclaimed enough pages for compaction and the - * inactive lists are large enough, continue reclaiming + * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX + * number of pages that were scanned. This returns to the caller at the + * risk that reclaim/compaction and the resulting allocation attempt + * fail. In the past we tried harder for __GFP_RETRY_MAYFAIL allocations + * by requiring that the full LRU list had been scanned first, assuming + * that a zero delta of sc->nr_scanned meant a full LRU scan, but that + * approximation was wrong: there were corner cases where a non-zero + * number of pages was always scanned.
*/ - pages_for_compaction = compact_gap(sc->order); - inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); - if (get_nr_swap_pages() > 0) - inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); - if (sc->nr_reclaimed < pages_for_compaction && - inactive_lru_pages > pages_for_compaction) - return true; + if (!nr_reclaimed) + return false; /* If compaction would go ahead or the allocation would succeed, stop */ for (z = 0; z <= sc->reclaim_idx; z++) { @@ -2647,7 +2680,17 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, ; } } - return true; + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = compact_gap(sc->order); + inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); + if (get_nr_swap_pages() > 0) + inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); + + return inactive_lru_pages > pages_for_compaction; } static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) @@ -2664,10 +2707,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) do { struct mem_cgroup *root = sc->target_mem_cgroup; - struct mem_cgroup_reclaim_cookie reclaim = { - .pgdat = pgdat, - .priority = sc->priority, - }; unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; @@ -2676,7 +2715,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - memcg = mem_cgroup_iter(root, NULL, &reclaim); + memcg = mem_cgroup_iter(root, NULL, NULL); do { unsigned long lru_pages; unsigned long reclaimed; @@ -2719,21 +2758,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); - /* - * Kswapd have to scan all memory cgroups to fulfill - * the overall scan target for the node. - * - * Limit reclaim, on the other hand, only cares about - * nr_to_reclaim pages to be reclaimed and it will - * retry with decreasing priority if one round over the - * whole hierarchy is not sufficient. 
- */ - if (!current_is_kswapd() && - sc->nr_reclaimed >= sc->nr_to_reclaim) { - mem_cgroup_iter_break(root, memcg); - break; - } - } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); + } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; @@ -2810,7 +2835,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) wait_iff_congested(BLK_RW_ASYNC, HZ/10); } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, - sc->nr_scanned - nr_scanned, sc)); + sc)); /* * Kswapd gives up on balancing particular nodes after too diff --git a/mm/vmstat.c b/mm/vmstat.c index fd7e16ca6996..6afc892a148a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1158,6 +1158,8 @@ const char * const vmstat_text[] = { "nr_shmem", "nr_shmem_hugepages", "nr_shmem_pmdmapped", + "nr_file_hugepages", + "nr_file_pmdmapped", "nr_anon_transparent_hugepages", "nr_unstable", "nr_vmscan_write", diff --git a/mm/z3fold.c b/mm/z3fold.c index 75b7962439ff..05bdf90646e7 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include @@ -146,8 +145,6 @@ struct z3fold_header { * @release_wq: workqueue for safe page release * @work: work_struct for safe page release * @inode: inode for z3fold pseudo filesystem - * @destroying: bool to stop migration once we start destruction - * @isolated: int to count the number of pages currently in isolation * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. @@ -166,11 +163,8 @@ struct z3fold_pool { const struct zpool_ops *zpool_ops; struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; - struct wait_queue_head isolate_wait; struct work_struct work; struct inode *inode; - bool destroying; - int isolated; }; /* @@ -301,14 +295,11 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool) } /* Initializes the z3fold header of a newly allocated z3fold page */ -static struct z3fold_header *init_z3fold_page(struct page *page, +static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, struct z3fold_pool *pool, gfp_t gfp) { struct z3fold_header *zhdr = page_address(page); - struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp); - - if (!slots) - return NULL; + struct z3fold_buddy_slots *slots; INIT_LIST_HEAD(&page->lru); clear_bit(PAGE_HEADLESS, &page->private); @@ -316,6 +307,12 @@ static struct z3fold_header *init_z3fold_page(struct page *page, clear_bit(NEEDS_COMPACTING, &page->private); clear_bit(PAGE_STALE, &page->private); clear_bit(PAGE_CLAIMED, &page->private); + if (headless) + return zhdr; + + slots = alloc_slots(pool, gfp); + if (!slots) + return NULL; spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); @@ -372,9 +369,10 @@ static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) * Encodes the handle of a particular buddy within a z3fold page * Pool lock should be held as this function accesses first_num */ -static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) +static unsigned long __encode_handle(struct z3fold_header *zhdr, + struct z3fold_buddy_slots *slots, + enum buddy bud) { - struct z3fold_buddy_slots *slots; unsigned long h = (unsigned long)zhdr; int idx = 0; @@ -391,11 +389,15 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) if (bud == LAST) h |= (zhdr->last_chunks << BUDDY_SHIFT); - slots = zhdr->slots; slots->slot[idx] = h; return 
(unsigned long)&slots->slot[idx]; } +static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) +{ + return __encode_handle(zhdr, zhdr->slots, bud); +} + /* Returns the z3fold page where a given handle is stored */ static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) { @@ -630,6 +632,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) } if (unlikely(PageIsolated(page) || + test_bit(PAGE_CLAIMED, &page->private) || test_bit(PAGE_STALE, &page->private))) { z3fold_page_unlock(zhdr); return; @@ -775,7 +778,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, goto out_c; spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); - init_waitqueue_head(&pool->isolate_wait); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); if (!pool->unbuddied) goto out_pool; @@ -815,15 +817,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, return NULL; } -static bool pool_isolated_are_drained(struct z3fold_pool *pool) -{ - bool ret; - - spin_lock(&pool->lock); - ret = pool->isolated == 0; - spin_unlock(&pool->lock); - return ret; -} /** * z3fold_destroy_pool() - destroys an existing z3fold pool * @pool: the z3fold pool to be destroyed @@ -833,22 +826,6 @@ static bool pool_isolated_are_drained(struct z3fold_pool *pool) static void z3fold_destroy_pool(struct z3fold_pool *pool) { kmem_cache_destroy(pool->c_handle); - /* - * We set pool-> destroying under lock to ensure that - * z3fold_page_isolate() sees any changes to destroying. This way we - * avoid the need for any memory barriers. - */ - - spin_lock(&pool->lock); - pool->destroying = true; - spin_unlock(&pool->lock); - - /* - * We need to ensure that no pages are being migrated while we destroy - * these workqueues, as migration can queue work on either of the - * workqueues. - */ - wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool)); /* * We need to destroy pool->compact_wq before pool->release_wq, @@ -956,7 +933,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, if (!page) return -ENOMEM; - zhdr = init_z3fold_page(page, pool, gfp); + zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp); if (!zhdr) { __free_page(page); return -ENOMEM; @@ -1132,6 +1109,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) struct z3fold_header *zhdr = NULL; struct page *page = NULL; struct list_head *pos; + struct z3fold_buddy_slots slots; unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; spin_lock(&pool->lock); @@ -1150,16 +1128,22 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) /* this bit could have been set by free, in which case * we pass over to the next page in the pool. 
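The split of encode_handle() into __encode_handle() matters because the slots array a handle points into can now be supplied by the caller. A toy model of the encoding, with an invented 2-bit layout standing in for z3fold's real BUDDY_SHIFT/BUDDY_MASK arithmetic:

#include <stdio.h>

enum buddy { FIRST, MIDDLE, LAST, BUDDIES_MAX };
#define BUDDY_MASK 0x3UL	/* toy: 2 low bits carry the buddy index */

struct slots { unsigned long slot[BUDDIES_MAX]; };

/* Encode into caller-provided slots, as the reclaim path now does, so
 * the handle survives even if the page's own slots are torn down. */
static unsigned long encode_handle(void *zhdr, struct slots *s, enum buddy bud)
{
	s->slot[bud] = (unsigned long)zhdr | ((unsigned long)bud & BUDDY_MASK);
	return (unsigned long)&s->slot[bud];
}

static void *handle_to_zhdr(unsigned long handle)
{
	return (void *)(*(unsigned long *)handle & ~BUDDY_MASK);
}

int main(void)
{
	long hdr;		/* stand-in for a z3fold_header */
	struct slots local;	/* on-stack copy, like reclaim's */
	unsigned long h = encode_handle(&hdr, &local, MIDDLE);

	printf("round trip ok: %d\n", handle_to_zhdr(h) == (void *)&hdr);
	return 0;
}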
*/ - if (test_and_set_bit(PAGE_CLAIMED, &page->private)) + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) { + page = NULL; continue; + } - if (unlikely(PageIsolated(page))) + if (unlikely(PageIsolated(page))) { + clear_bit(PAGE_CLAIMED, &page->private); + page = NULL; continue; + } + zhdr = page_address(page); if (test_bit(PAGE_HEADLESS, &page->private)) break; - zhdr = page_address(page); if (!z3fold_page_trylock(zhdr)) { + clear_bit(PAGE_CLAIMED, &page->private); zhdr = NULL; continue; /* can't evict at this point */ } @@ -1177,26 +1161,30 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) if (!test_bit(PAGE_HEADLESS, &page->private)) { /* - * We need encode the handles before unlocking, since - * we can race with free that will set - * (first|last)_chunks to 0 + * We need to encode the handles before unlocking, and + * use our local slots structure because z3fold_free + * can zero out zhdr->slots and we can't do much + * about that */ first_handle = 0; last_handle = 0; middle_handle = 0; if (zhdr->first_chunks) - first_handle = encode_handle(zhdr, FIRST); + first_handle = __encode_handle(zhdr, &slots, + FIRST); if (zhdr->middle_chunks) - middle_handle = encode_handle(zhdr, MIDDLE); + middle_handle = __encode_handle(zhdr, &slots, + MIDDLE); if (zhdr->last_chunks) - last_handle = encode_handle(zhdr, LAST); + last_handle = __encode_handle(zhdr, &slots, + LAST); /* * it's safe to unlock here because we hold a * reference to this page */ z3fold_page_unlock(zhdr); } else { - first_handle = encode_handle(zhdr, HEADLESS); + first_handle = __encode_handle(zhdr, &slots, HEADLESS); last_handle = middle_handle = 0; } @@ -1226,9 +1214,9 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); + clear_bit(PAGE_CLAIMED, &page->private); } else { z3fold_page_lock(zhdr); - clear_bit(PAGE_CLAIMED, &page->private); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { atomic64_dec(&pool->pages_nr); @@ -1243,6 +1231,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); + clear_bit(PAGE_CLAIMED, &page->private); } /* We started off locked so we need to lock the pool back */ @@ -1339,28 +1328,6 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool) return atomic64_read(&pool->pages_nr); } -/* - * z3fold_dec_isolated() expects to be called while pool->lock is held. - */ -static void z3fold_dec_isolated(struct z3fold_pool *pool) -{ - assert_spin_locked(&pool->lock); - VM_BUG_ON(pool->isolated <= 0); - pool->isolated--; - - /* - * If we have no more isolated pages, we have to see if - * z3fold_destroy_pool() is waiting for a signal.
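Underneath these hunks is a simple try-claim protocol on PAGE_CLAIMED: atomically set the bit, skip the page if it was already set, and clear it on every bail-out path so the page is not claimed forever. A userspace analogue using C11 atomics (all names invented; pages must start with their flag in the ATOMIC_FLAG_INIT state):

#include <stdatomic.h>
#include <stdbool.h>

struct zpage { atomic_flag claimed; };	/* plays the role of PAGE_CLAIMED */

/* test_and_set_bit analogue: true means we now own the page. */
static bool try_claim(struct zpage *p)
{
	return !atomic_flag_test_and_set(&p->claimed);
}

static void release_claim(struct zpage *p)
{
	atomic_flag_clear(&p->claimed);
}

/* Shape of the reclaim loop: every early continue must drop the
 * claim, exactly as the patch adds clear_bit() on each bail-out. */
static bool reclaim_one(struct zpage *pages[], int n)
{
	for (int i = 0; i < n; i++) {
		if (!try_claim(pages[i]))
			continue;	/* raced with free or another reclaimer */
		/* ... eviction work would happen here ... */
		release_claim(pages[i]);
		return true;
	}
	return false;
}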
- */ - if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait)) - wake_up_all(&pool->isolate_wait); -} - -static void z3fold_inc_isolated(struct z3fold_pool *pool) -{ - pool->isolated++; -} - static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) { struct z3fold_header *zhdr; @@ -1369,7 +1336,8 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); - if (test_bit(PAGE_HEADLESS, &page->private)) + if (test_bit(PAGE_HEADLESS, &page->private) || + test_bit(PAGE_CLAIMED, &page->private)) return false; zhdr = page_address(page); @@ -1387,34 +1355,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) spin_lock(&pool->lock); if (!list_empty(&page->lru)) list_del(&page->lru); - /* - * We need to check for destruction while holding pool->lock, as - * otherwise destruction could see 0 isolated pages, and - * proceed. - */ - if (unlikely(pool->destroying)) { - spin_unlock(&pool->lock); - /* - * If this page isn't stale, somebody else holds a - * reference to it. Let't drop our refcount so that they - * can call the release logic. - */ - if (unlikely(kref_put(&zhdr->refcount, - release_z3fold_page_locked))) { - /* - * If we get here we have kref problems, so we - * should freak out. - */ - WARN(1, "Z3fold is experiencing kref problems\n"); - z3fold_page_unlock(zhdr); - return false; - } - z3fold_page_unlock(zhdr); - return false; - } - - - z3fold_inc_isolated(pool); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); return true; @@ -1483,10 +1423,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); - spin_lock(&pool->lock); - z3fold_dec_isolated(pool); - spin_unlock(&pool->lock); - page_mapcount_reset(page); put_page(page); return 0; @@ -1506,14 +1442,10 @@ static void z3fold_page_putback(struct page *page) INIT_LIST_HEAD(&page->lru); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { atomic64_dec(&pool->pages_nr); - spin_lock(&pool->lock); - z3fold_dec_isolated(pool); - spin_unlock(&pool->lock); return; } spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); - z3fold_dec_isolated(pool); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); } diff --git a/mm/zpool.c b/mm/zpool.c index a2dd9107857d..863669212070 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -238,6 +238,22 @@ const char *zpool_get_type(struct zpool *zpool) return zpool->driver->type; } +/** + * zpool_malloc_support_movable() - Check if the zpool supports + * allocating movable memory + * @zpool: The zpool to check + * + * This returns whether the zpool supports allocating movable memory. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: true if the zpool supports allocating movable memory, false if not + */ +bool zpool_malloc_support_movable(struct zpool *zpool) +{ + return zpool->driver->malloc_support_movable; +} + /** * zpool_malloc() - Allocate memory * @zpool: The zpool to allocate from.
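zpool_malloc_support_movable() is a one-line accessor over a new boolean in the driver vtable, so callers can query a capability without reaching into driver internals. A minimal sketch of the pattern with toy type names:

#include <stdbool.h>
#include <stddef.h>

/* Toy vtable in the style of struct zpool_driver. */
struct pool_driver {
	const char *type;
	bool malloc_support_movable;	/* capability flag set by the driver */
	int (*malloc)(void *pool, size_t len, unsigned long *handle);
};

struct pool {
	const struct pool_driver *driver;
	void *priv;
};

/* Accessor in the style of zpool_malloc_support_movable(): callers ask
 * the facade and never dereference the concrete driver themselves. */
static bool pool_malloc_support_movable(const struct pool *p)
{
	return p->driver->malloc_support_movable;
}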
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e98bb6ab4f7e..2b2b9aae8a3c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -443,15 +443,16 @@ static u64 zs_zpool_total_size(void *pool) } static struct zpool_driver zs_zpool_driver = { - .type = "zsmalloc", - .owner = THIS_MODULE, - .create = zs_zpool_create, - .destroy = zs_zpool_destroy, - .malloc = zs_zpool_malloc, - .free = zs_zpool_free, - .map = zs_zpool_map, - .unmap = zs_zpool_unmap, - .total_size = zs_zpool_total_size, + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc_support_movable = true, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, }; MODULE_ALIAS("zpool-zsmalloc"); @@ -476,10 +477,6 @@ static inline int get_zspage_inuse(struct zspage *zspage) return zspage->inuse; } -static inline void set_zspage_inuse(struct zspage *zspage, int val) -{ - zspage->inuse = val; -} static inline void mod_zspage_inuse(struct zspage *zspage, int val) { diff --git a/mm/zswap.c b/mm/zswap.c index 0e22744a76cb..46a322316e52 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -856,7 +856,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) /* extract swpentry from data */ zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); swpentry = zhdr->swpentry; /* here */ - zpool_unmap_handle(pool, handle); tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); @@ -866,6 +865,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) if (!entry) { /* entry was invalidated */ spin_unlock(&tree->lock); + zpool_unmap_handle(pool, handle); return 0; } spin_unlock(&tree->lock); @@ -886,15 +886,13 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, - ZPOOL_MM_RO) + sizeof(struct zswap_header); + src = (u8 *)zhdr + sizeof(struct zswap_header); dst = kmap_atomic(page); tfm = *get_cpu_ptr(entry->pool->tfm); ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); put_cpu_ptr(entry->pool->tfm); kunmap_atomic(dst); - zpool_unmap_handle(entry->pool->zpool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -940,6 +938,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) spin_unlock(&tree->lock); end: + zpool_unmap_handle(pool, handle); return ret; } @@ -997,6 +996,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, char *buf; u8 *src, *dst; struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; + gfp_t gfp; /* THP isn't supported */ if (PageTransHuge(page)) { @@ -1070,9 +1070,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ hlen = zpool_evictable(entry->pool->zpool) ? 
sizeof(zhdr) : 0; - ret = zpool_malloc(entry->pool->zpool, hlen + dlen, - __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, - &handle); + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(entry->pool->zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4eeea4d5c3ef..2d568246803f 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -185,18 +186,34 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); +/* + * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are + * compatible with (a superset of) GFP_KERNEL. This is because while the + * actual pages are allocated with the specified flags, the page table pages + * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take + * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc(). + * + * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. + */ void *ceph_kvmalloc(size_t size, gfp_t flags) { - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *ptr = kmalloc(size, flags | __GFP_NOWARN); - if (ptr) - return ptr; + void *p; + + if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) { + p = kvmalloc(size, flags); + } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) { + unsigned int nofs_flag = memalloc_nofs_save(); + p = kvmalloc(size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); + } else { + unsigned int noio_flag = memalloc_noio_save(); + p = kvmalloc(size, GFP_KERNEL); + memalloc_noio_restore(noio_flag); } - return __vmalloc(size, flags, PAGE_KERNEL); + return p; } - static int parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; @@ -694,6 +711,14 @@ void ceph_destroy_client(struct ceph_client *client) } EXPORT_SYMBOL(ceph_destroy_client); +void ceph_reset_client_addr(struct ceph_client *client) +{ + ceph_messenger_reset_nonce(&client->msgr); + ceph_monc_reopen_session(&client->monc); + ceph_osdc_reopen_osds(&client->osdc); +} +EXPORT_SYMBOL(ceph_reset_client_addr); + /* * true if we have the mon map (and have thus joined the cluster) */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 962f521c863e..e4cb3db2ee77 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3031,6 +3031,12 @@ static void con_fault(struct ceph_connection *con) } +void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) +{ + u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; + msgr->inst.addr.nonce = cpu_to_le32(nonce); + encode_my_addr(msgr); +} /* * initialize a new messenger instance diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 0520bf9825aa..7256c402ebaa 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -213,6 +213,13 @@ static void reopen_session(struct ceph_mon_client *monc) __open_session(monc); } +void ceph_monc_reopen_session(struct ceph_mon_client *monc) +{ + mutex_lock(&monc->mutex); + reopen_session(monc); + mutex_unlock(&monc->mutex); +} + static void un_backoff(struct ceph_mon_client *monc) { monc->hunt_mult /= 2; /* reduce by 50% */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 78ae6e8c953d..ba45b074a362 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -841,6 +841,7 @@ int 
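The rewritten ceph_kvmalloc() relies on the scoped memalloc_nofs_save()/memalloc_noio_save() API: mark the task context, allocate with plain GFP_KERNEL, then restore. The sketch below is a userspace analogue built on a thread-local mask; the function names mirror the kernel API but the implementations here are invented for illustration:

#include <stdlib.h>

enum { ALLOW_FS = 1u << 0, ALLOW_IO = 1u << 1 };

/* Thread-local context mask, like the PF_MEMALLOC_* bits on current. */
static _Thread_local unsigned int alloc_ctx = ALLOW_FS | ALLOW_IO;

static unsigned int memalloc_nofs_save(void)
{
	unsigned int old = alloc_ctx;

	alloc_ctx &= ~ALLOW_FS;	/* everything below behaves as GFP_NOFS */
	return old;
}

static void memalloc_nofs_restore(unsigned int old)
{
	alloc_ctx = old;	/* restore, never just set: scopes can nest */
}

static void *kv_alloc(size_t size)
{
	/* A real allocator would consult alloc_ctx before recursing
	 * into filesystem or I/O paths to reclaim memory. */
	(void)alloc_ctx;
	return malloc(size);
}

void *kvmalloc_like(size_t size, int fs_allowed)
{
	unsigned int old;
	void *p;

	if (fs_allowed)
		return kv_alloc(size);

	old = memalloc_nofs_save();
	p = kv_alloc(size);
	memalloc_nofs_restore(old);
	return p;
}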
osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, struct ceph_pagelist *pagelist; size_t payload_len = 0; size_t size; + int ret; op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); @@ -852,20 +853,27 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, size = strlen(class); BUG_ON(size > (size_t) U8_MAX); op->cls.class_len = size; - ceph_pagelist_append(pagelist, class, size); + ret = ceph_pagelist_append(pagelist, class, size); + if (ret) + goto err_pagelist_free; payload_len += size; op->cls.method_name = method; size = strlen(method); BUG_ON(size > (size_t) U8_MAX); op->cls.method_len = size; - ceph_pagelist_append(pagelist, method, size); + ret = ceph_pagelist_append(pagelist, method, size); + if (ret) + goto err_pagelist_free; payload_len += size; osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); - op->indata_len = payload_len; return 0; + +err_pagelist_free: + ceph_pagelist_release(pagelist); + return ret; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -877,6 +885,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, opcode, 0); struct ceph_pagelist *pagelist; size_t payload_len; + int ret; BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); @@ -886,10 +895,14 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, payload_len = strlen(name); op->xattr.name_len = payload_len; - ceph_pagelist_append(pagelist, name, payload_len); + ret = ceph_pagelist_append(pagelist, name, payload_len); + if (ret) + goto err_pagelist_free; op->xattr.value_len = size; - ceph_pagelist_append(pagelist, value, size); + ret = ceph_pagelist_append(pagelist, value, size); + if (ret) + goto err_pagelist_free; payload_len += size; op->xattr.cmp_op = cmp_op; @@ -898,6 +911,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); op->indata_len = payload_len; return 0; + +err_pagelist_free: + ceph_pagelist_release(pagelist); + return ret; } EXPORT_SYMBOL(osd_req_op_xattr_init); @@ -1488,7 +1505,6 @@ enum calc_target_result { static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_osd_request_target *t, - struct ceph_connection *con, bool any_change) { struct ceph_pg_pool_info *pi; @@ -2272,7 +2288,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: - ct_res = calc_target(osdc, &req->r_t, NULL, false); + ct_res = calc_target(osdc, &req->r_t, false); if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) goto promote; @@ -2476,6 +2492,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err) } EXPORT_SYMBOL(ceph_osdc_abort_requests); +void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc) +{ + down_write(&osdc->lock); + osdc->abort_err = 0; + up_write(&osdc->lock); +} +EXPORT_SYMBOL(ceph_osdc_clear_abort_err); + static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) { if (likely(eb > osdc->epoch_barrier)) { @@ -3087,7 +3111,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; } - calc_target(osdc, &lreq->t, NULL, false); + calc_target(osdc, &lreq->t, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); link_linger(osd, lreq); @@ -3704,7 +3728,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq) struct ceph_osd_client *osdc = lreq->osdc; enum 
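The osd_client changes replace ignored ceph_pagelist_append() return values with the usual goto-unwind idiom: test every append and funnel all failures through one label that releases the pagelist. Reduced to a self-contained sketch (the pagelist type and helpers are stand-ins, not the libceph ones):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct pagelist { char *buf; size_t len, cap; };

static int pagelist_append(struct pagelist *pl, const void *d, size_t n)
{
	if (pl->len + n > pl->cap)
		return -ENOMEM;
	memcpy(pl->buf + pl->len, d, n);
	pl->len += n;
	return 0;
}

static void pagelist_release(struct pagelist *pl)
{
	free(pl->buf);
	pl->buf = NULL;
	pl->len = pl->cap = 0;
}

int op_cls_init_like(struct pagelist *pl, const char *class, const char *method)
{
	int ret;

	ret = pagelist_append(pl, class, strlen(class));
	if (ret)
		goto err_pagelist_free;	/* one unwind point for all failures */

	ret = pagelist_append(pl, method, strlen(method));
	if (ret)
		goto err_pagelist_free;

	return 0;

err_pagelist_free:
	pagelist_release(pl);
	return ret;
}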
calc_target_result ct_res; - ct_res = calc_target(osdc, &lreq->t, NULL, true); + ct_res = calc_target(osdc, &lreq->t, true); if (ct_res == CALC_TARGET_NEED_RESEND) { struct ceph_osd *osd; @@ -3776,8 +3800,7 @@ static void scan_requests(struct ceph_osd *osd, n = rb_next(n); /* unlink_request(), check_pool_dne() */ dout("%s req %p tid %llu\n", __func__, req, req->r_tid); - ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, - false); + ct_res = calc_target(osdc, &req->r_t, false); switch (ct_res) { case CALC_TARGET_NO_ACTION: force_resend_writes = cleared_full || @@ -3886,7 +3909,7 @@ static void kick_requests(struct ceph_osd_client *osdc, n = rb_next(n); if (req->r_t.epoch < osdc->osdmap->epoch) { - ct_res = calc_target(osdc, &req->r_t, NULL, false); + ct_res = calc_target(osdc, &req->r_t, false); if (ct_res == CALC_TARGET_POOL_DNE) { erase_request(need_resend, req); check_pool_dne(req); @@ -5086,6 +5109,24 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_call); +/* + * reset all osd connections + */ +void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc) +{ + struct rb_node *n; + + down_write(&osdc->lock); + for (n = rb_first(&osdc->osds); n; ) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + n = rb_next(n); + if (!reopen_osd(osd)) + kick_osd_requests(osd); + } + up_write(&osdc->lock); +} + /* * init, shutdown */ diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 90437906b7bc..4e0de14f80bb 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) struct ceph_pg_pool_info, node); __remove_pg_pool(&map->pg_pools, pi); } - kfree(map->osd_state); - kfree(map->osd_weight); - kfree(map->osd_addr); - kfree(map->osd_primary_affinity); - kfree(map->crush_workspace); + kvfree(map->osd_state); + kvfree(map->osd_weight); + kvfree(map->osd_addr); + kvfree(map->osd_primary_affinity); + kvfree(map->crush_workspace); kfree(map); } @@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) * * The new elements are properly initialized. 
*/ -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) +static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) { u32 *state; u32 *weight; struct ceph_entity_addr *addr; + u32 to_copy; int i; - state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); - if (!state) + dout("%s old %u new %u\n", __func__, map->max_osd, max); + if (max == map->max_osd) + return 0; + + state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); + weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); + addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); + if (!state || !weight || !addr) { + kvfree(state); + kvfree(weight); + kvfree(addr); return -ENOMEM; + } + + to_copy = min(map->max_osd, max); + if (map->osd_state) { + memcpy(state, map->osd_state, to_copy * sizeof(*state)); + memcpy(weight, map->osd_weight, to_copy * sizeof(*weight)); + memcpy(addr, map->osd_addr, to_copy * sizeof(*addr)); + kvfree(map->osd_state); + kvfree(map->osd_weight); + kvfree(map->osd_addr); + } + map->osd_state = state; - - weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); - if (!weight) - return -ENOMEM; map->osd_weight = weight; - - addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); - if (!addr) - return -ENOMEM; map->osd_addr = addr; - for (i = map->max_osd; i < max; i++) { map->osd_state[i] = 0; map->osd_weight[i] = CEPH_OSD_OUT; @@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) if (map->osd_primary_affinity) { u32 *affinity; - affinity = krealloc(map->osd_primary_affinity, - max*sizeof(*affinity), GFP_NOFS); + affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), + GFP_NOFS); if (!affinity) return -ENOMEM; - map->osd_primary_affinity = affinity; + memcpy(affinity, map->osd_primary_affinity, + to_copy * sizeof(*affinity)); + kvfree(map->osd_primary_affinity); + + map->osd_primary_affinity = affinity; for (i = map->max_osd; i < max; i++) map->osd_primary_affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; @@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); dout("%s work_size %zu bytes\n", __func__, work_size); - workspace = kmalloc(work_size, GFP_NOIO); + workspace = ceph_kvmalloc(work_size, GFP_NOIO); if (!workspace) { crush_destroy(crush); return -ENOMEM; @@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) if (map->crush) crush_destroy(map->crush); - kfree(map->crush_workspace); + kvfree(map->crush_workspace); map->crush = crush; map->crush_workspace = workspace; return 0; @@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) if (!map->osd_primary_affinity) { int i; - map->osd_primary_affinity = kmalloc_array(map->max_osd, - sizeof(u32), - GFP_NOFS); + map->osd_primary_affinity = ceph_kvmalloc( + array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), + GFP_NOFS); if (!map->osd_primary_affinity) return -ENOMEM; @@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end, ceph_decode_32_safe(p, end, len, e_inval); if (len == 0) { - kfree(map->osd_primary_affinity); + kvfree(map->osd_primary_affinity); map->osd_primary_affinity = NULL; return 0; } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 4ce42c62458e..d75fddca44c9 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1960,7 +1960,7 @@ gss_unwrap_resp_integ(struct 
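Once the osdmap arrays may come from vmalloc space, krealloc() is no longer an option, so osdmap_set_max_osd() switches to allocate-new/copy-min/free-old, with all allocations attempted before anything is committed so a failure cannot leave the map half-resized. A generic sketch of that pattern for two parallel arrays:

#include <stdlib.h>
#include <string.h>

/* Grow (or shrink) two parallel arrays together: either both move to
 * the new size or neither does, so a failure leaves the old state. */
static int resize_pair(unsigned int **a, unsigned int **b,
		       unsigned int old_n, unsigned int new_n)
{
	unsigned int *na = malloc(new_n * sizeof(*na));
	unsigned int *nb = malloc(new_n * sizeof(*nb));
	unsigned int to_copy = old_n < new_n ? old_n : new_n;

	if (!na || !nb) {	/* free both halves: no partial commit */
		free(na);
		free(nb);
		return -1;
	}

	if (to_copy) {
		memcpy(na, *a, to_copy * sizeof(*na));
		memcpy(nb, *b, to_copy * sizeof(*nb));
	}
	free(*a);
	free(*b);
	*a = na;
	*b = nb;
	return 0;
}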
rpc_task *task, struct rpc_cred *cred, if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len)) goto unwrap_failed; - if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset)) + if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset)) goto unwrap_failed; maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index a07b516e503a..f7f78566be46 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1837,7 +1837,7 @@ call_allocate(struct rpc_task *task) return; } - rpc_exit(task, -ERESTARTSYS); + rpc_call_rpcerror(task, -ERESTARTSYS); } static int @@ -1862,6 +1862,7 @@ rpc_xdr_encode(struct rpc_task *task) req->rq_rbuffer, req->rq_rcvsize); + req->rq_reply_bytes_recvd = 0; req->rq_snd_buf.head[0].iov_len = 0; xdr_init_encode(&xdr, &req->rq_snd_buf, req->rq_snd_buf.head[0].iov_base, req); @@ -1881,6 +1882,8 @@ call_encode(struct rpc_task *task) if (!rpc_task_need_encode(task)) goto out; dprint_status(task); + /* Dequeue task from the receive queue while we're encoding */ + xprt_request_dequeue_xprt(task); /* Encode here so that rpcsec_gss can use correct sequence number. */ rpc_xdr_encode(task); /* Did the encode result in an error condition? */ @@ -2479,6 +2482,7 @@ call_decode(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct xdr_stream xdr; + int err; dprint_status(task); @@ -2501,6 +2505,15 @@ call_decode(struct rpc_task *task) * before it changed req->rq_reply_bytes_recvd. */ smp_rmb(); + + /* + * Did we ever call xprt_complete_rqst()? If not, we should assume + * the message is incomplete. + */ + err = -EAGAIN; + if (!req->rq_reply_bytes_recvd) + goto out; + req->rq_rcv_buf.len = req->rq_private_buf.len; /* Check that the softirq receive buffer is valid */ @@ -2509,7 +2522,9 @@ call_decode(struct rpc_task *task) xdr_init_decode(&xdr, &req->rq_rcv_buf, req->rq_rcv_buf.head[0].iov_base, req); - switch (rpc_decode_header(task, &xdr)) { + err = rpc_decode_header(task, &xdr); +out: + switch (err) { case 0: task->tk_action = rpc_exit_task; task->tk_status = rpcauth_unwrap_resp(task, &xdr); @@ -2518,9 +2533,6 @@ call_decode(struct rpc_task *task) return; case -EAGAIN: task->tk_status = 0; - xdr_free_bvec(&req->rq_rcv_buf); - req->rq_reply_bytes_recvd = 0; - req->rq_rcv_buf.len = 0; if (task->tk_client->cl_discrtry) xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); @@ -2561,7 +2573,7 @@ rpc_encode_header(struct rpc_task *task, struct xdr_stream *xdr) return 0; out_fail: trace_rpc_bad_callhdr(task); - rpc_exit(task, error); + rpc_call_rpcerror(task, error); return error; } @@ -2628,7 +2640,7 @@ rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr) return -EAGAIN; } out_err: - rpc_exit(task, error); + rpc_call_rpcerror(task, error); return error; out_unparsable: diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 1f275aba786f..360afe153193 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -541,33 +541,14 @@ rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq, return NULL; } -static void -rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, struct rpc_task *task) -{ - rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL); -} - /* * Wake up a queued task while the queue lock is being held */ -static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +static void 
rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, + struct rpc_task *task) { - rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task); -} - -/* - * Wake up a task on a specific queue - */ -void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, - struct rpc_task *task) -{ - if (!RPC_IS_QUEUED(task)) - return; - spin_lock(&queue->lock); - rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); - spin_unlock(&queue->lock); + rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue, + task, NULL, NULL); } /* @@ -930,8 +911,10 @@ static void __rpc_execute(struct rpc_task *task) /* * Signalled tasks should exit rather than sleep. */ - if (RPC_SIGNALLED(task)) + if (RPC_SIGNALLED(task)) { + task->tk_rpc_status = -ERESTARTSYS; rpc_exit(task, -ERESTARTSYS); + } /* * The queue->lock protects against races with @@ -967,6 +950,7 @@ static void __rpc_execute(struct rpc_task *task) */ dprintk("RPC: %5u got signal\n", task->tk_pid); set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); + task->tk_rpc_status = -ERESTARTSYS; rpc_exit(task, -ERESTARTSYS); } dprintk("RPC: %5u sync task resuming\n", task->tk_pid); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 48c93b9e525e..14ba9e72a204 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -560,7 +560,7 @@ EXPORT_SYMBOL_GPL(xdr_init_encode); * required at the end of encoding, or any other time when the xdr_buf * data might be read. */ -void xdr_commit_encode(struct xdr_stream *xdr) +inline void xdr_commit_encode(struct xdr_stream *xdr) { int shift = xdr->scratch.iov_len; void *page; @@ -1236,43 +1236,60 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) } EXPORT_SYMBOL_GPL(xdr_encode_word); -/* If the netobj starting offset bytes from the start of xdr_buf is contained - * entirely in the head or the tail, set object to point to it; otherwise - * try to find space for it at the end of the tail, copy it there, and - * set obj to point to it. */ -int xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, unsigned int offset) +/** + * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf + * @buf: pointer to buffer containing a mic + * @mic: on success, returns the address of the mic + * @offset: the offset in buf where mic may be found + * + * This function may modify the xdr buf if the mic is found to be straddling + * a boundary between head, pages, and tail. On success the mic can be read + * from the address returned. There is no need to free the mic. + * + * Return: 0 on success, otherwise a negative errno. + */ +int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset) { struct xdr_buf subbuf; + unsigned int boundary; - if (xdr_decode_word(buf, offset, &obj->len)) + if (xdr_decode_word(buf, offset, &mic->len)) return -EFAULT; - if (xdr_buf_subsegment(buf, &subbuf, offset + 4, obj->len)) + offset += 4; + + /* Is the mic partially in the head? */ + boundary = buf->head[0].iov_len; + if (offset < boundary && (offset + mic->len) > boundary) + xdr_shift_buf(buf, boundary - offset); + + /* Is the mic partially in the pages? */ + boundary += buf->page_len; + if (offset < boundary && (offset + mic->len) > boundary) + xdr_shrink_pagelen(buf, boundary - offset); + + if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len)) return -EFAULT; - /* Is the obj contained entirely in the head? */ - obj->data = subbuf.head[0].iov_base; - if (subbuf.head[0].iov_len == obj->len) + /* Is the mic contained entirely in the head?
*/ + mic->data = subbuf.head[0].iov_base; + if (subbuf.head[0].iov_len == mic->len) return 0; - /* ..or is the obj contained entirely in the tail? */ - obj->data = subbuf.tail[0].iov_base; - if (subbuf.tail[0].iov_len == obj->len) + /* ..or is the mic contained entirely in the tail? */ + mic->data = subbuf.tail[0].iov_base; + if (subbuf.tail[0].iov_len == mic->len) return 0; - /* use end of tail as storage for obj: - * (We don't copy to the beginning because then we'd have - * to worry about doing a potentially overlapping copy. - * This assumes the object is at most half the length of the - * tail.) */ - if (obj->len > buf->buflen - buf->len) + /* Find a contiguous area in @buf to hold all of @mic */ + if (mic->len > buf->buflen - buf->len) return -ENOMEM; if (buf->tail[0].iov_len != 0) - obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len; + mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len; else - obj->data = buf->head[0].iov_base + buf->head[0].iov_len; - __read_bytes_from_xdr_buf(&subbuf, obj->data, obj->len); + mic->data = buf->head[0].iov_base + buf->head[0].iov_len; + __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len); return 0; } -EXPORT_SYMBOL_GPL(xdr_buf_read_netobj); +EXPORT_SYMBOL_GPL(xdr_buf_read_mic); /* Returns 0 on success, or else a negative error code. */ static int diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2e71f5455c6c..8a45b3ccc313 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -456,6 +456,12 @@ void xprt_release_rqst_cong(struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); +static void xprt_clear_congestion_window_wait_locked(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) + __xprt_lock_write_next_cong(xprt); +} + /* * Clear the congestion window wait flag and wake up the next * entry on xprt->sending @@ -671,6 +677,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) spin_lock(&xprt->transport_lock); xprt_clear_connected(xprt); xprt_clear_write_space_locked(xprt); + xprt_clear_congestion_window_wait_locked(xprt); xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock(&xprt->transport_lock); } @@ -1323,6 +1330,36 @@ xprt_request_dequeue_transmit(struct rpc_task *task) spin_unlock(&xprt->queue_lock); } +/** + * xprt_request_dequeue_xprt - remove a task from the transmit+receive queue + * @task: pointer to rpc_task + * + * Remove a task from the transmit and receive queues, and ensure that + * it is not pinned by the receive work item. 
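The heart of xdr_buf_read_mic() is a straddle test: the region [offset, offset + len) is directly addressable only if it does not cross the head/pages or pages/tail boundary. The predicate in isolation (a sketch; the real code calls xdr_shift_buf() or xdr_shrink_pagelen() when it fires):

#include <stdbool.h>

/* True if [offset, offset + len) crosses the boundary at 'edge'. */
static bool straddles(unsigned int offset, unsigned int len,
		      unsigned int edge)
{
	return offset < edge && offset + len > edge;
}

/* Usage mirroring the patch: test the head edge, then the head+pages
 * edge, fixing up the buffer whenever a test fires. */
static int locate_region(unsigned int offset, unsigned int len,
			 unsigned int head_len, unsigned int page_len)
{
	unsigned int boundary = head_len;

	if (straddles(offset, len, boundary))
		return 1;	/* would call xdr_shift_buf() here */

	boundary += page_len;
	if (straddles(offset, len, boundary))
		return 2;	/* would call xdr_shrink_pagelen() here */

	return 0;		/* contiguous: readable in place */
}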
+ */ +void +xprt_request_dequeue_xprt(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || + test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || + xprt_is_pinned_rqst(req)) { + spin_lock(&xprt->queue_lock); + xprt_request_dequeue_transmit_locked(task); + xprt_request_dequeue_receive_locked(task); + while (xprt_is_pinned_rqst(req)) { + set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + xprt_wait_on_pinned_rqst(req); + spin_lock(&xprt->queue_lock); + clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + } + spin_unlock(&xprt->queue_lock); + } +} + /** * xprt_request_prepare - prepare an encoded request for transport * @req: pointer to rpc_rqst @@ -1747,28 +1784,6 @@ void xprt_retry_reserve(struct rpc_task *task) xprt_do_reserve(xprt, task); } -static void -xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req) -{ - struct rpc_xprt *xprt = req->rq_xprt; - - if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || - test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || - xprt_is_pinned_rqst(req)) { - spin_lock(&xprt->queue_lock); - xprt_request_dequeue_transmit_locked(task); - xprt_request_dequeue_receive_locked(task); - while (xprt_is_pinned_rqst(req)) { - set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); - spin_unlock(&xprt->queue_lock); - xprt_wait_on_pinned_rqst(req); - spin_lock(&xprt->queue_lock); - clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); - } - spin_unlock(&xprt->queue_lock); - } -} - /** * xprt_release - release an RPC request slot * @task: task which is finished with the slot @@ -1788,7 +1803,7 @@ void xprt_release(struct rpc_task *task) } xprt = req->rq_xprt; - xprt_request_dequeue_all(task, req); + xprt_request_dequeue_xprt(task); spin_lock(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 59e624b1d7a0..50e075fcdd8f 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -54,9 +54,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt) { - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - - return r_xprt->rx_buf.rb_bc_srv_max_requests; + return RPCRDMA_BACKWARD_WRS >> 1; } static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 0b6dad7580a1..30065a28628c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -7,67 +7,37 @@ /* Lightweight memory registration using Fast Registration Work * Requests (FRWR). * - * FRWR features ordered asynchronous registration and deregistration - * of arbitrarily sized memory regions. This is the fastest and safest + * FRWR features ordered asynchronous registration and invalidation + * of arbitrarily-sized memory regions. This is the fastest and safest * but most complex memory registration mode. */ /* Normal operation * - * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG + * A Memory Region is prepared for RDMA Read or Write using a FAST_REG * Work Request (frwr_map). When the RDMA operation is finished, this * Memory Region is invalidated using a LOCAL_INV Work Request - * (frwr_unmap_sync). + * (frwr_unmap_async and frwr_unmap_sync). 
* - * Typically these Work Requests are not signaled, and neither are RDMA - * SEND Work Requests (with the exception of signaling occasionally to - * prevent provider work queue overflows). This greatly reduces HCA + * Typically FAST_REG Work Requests are not signaled, and neither are + * RDMA Send Work Requests (with the exception of signaling occasionally + * to prevent provider work queue overflows). This greatly reduces HCA * interrupt workload. - * - * As an optimization, frwr_unmap marks MRs INVALID before the - * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on - * rb_mrs immediately so that no work (like managing a linked list - * under a spinlock) is needed in the completion upcall. - * - * But this means that frwr_map() can occasionally encounter an MR - * that is INVALID but the LOCAL_INV WR has not completed. Work Queue - * ordering prevents a subsequent FAST_REG WR from executing against - * that MR while it is still being invalidated. */ /* Transport recovery * - * ->op_map and the transport connect worker cannot run at the same - * time, but ->op_unmap can fire while the transport connect worker - * is running. Thus MR recovery is handled in ->op_map, to guarantee - * that recovered MRs are owned by a sending RPC, and not one where - * ->op_unmap could fire at the same time transport reconnect is - * being done. + * frwr_map and frwr_unmap_* cannot run at the same time the transport + * connect worker is running. The connect worker holds the transport + * send lock, just as ->send_request does. This prevents frwr_map and + * the connect worker from running concurrently. When a connection is + * closed, the Receive completion queue is drained before allowing + * the connect worker to get control. This prevents frwr_unmap and the + * connect worker from running concurrently. * - * When the underlying transport disconnects, MRs are left in one of - * four states: - * - * INVALID: The MR was not in use before the QP entered ERROR state. - * - * VALID: The MR was registered before the QP entered ERROR state. - * - * FLUSHED_FR: The MR was being registered when the QP entered ERROR - * state, and the pending WR was flushed. - * - * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR - * state, and the pending WR was flushed. - * - * When frwr_map encounters FLUSHED and VALID MRs, they are recovered - * with ib_dereg_mr and then are re-initialized. Because MR recovery - * allocates fresh resources, it is deferred to a workqueue, and the - * recovered MRs are placed back on the rb_mrs list when recovery is - * complete. frwr_map allocates another MR for the current RPC while - * the broken MR is reset. - * - * To ensure that frwr_map doesn't encounter an MR that is marked - * INVALID but that is about to be flushed due to a previous transport - * disconnect, the transport connect worker attempts to drain all - * pending send queue WRs before the transport is reconnected. + * When the underlying transport disconnects, MRs that are in flight + * are flushed and are likely unusable. Thus all flushed MRs are + * destroyed. New MRs are created on demand. */ #include @@ -118,15 +88,8 @@ void frwr_release_mr(struct rpcrdma_mr *mr) kfree(mr); } -/* MRs are dynamically allocated, so simply clean up and release the MR. - * A replacement MR will subsequently be allocated on demand.
- */ -static void -frwr_mr_recycle_worker(struct work_struct *work) +static void frwr_mr_recycle(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { - struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - trace_xprtrdma_mr_recycle(mr); if (mr->mr_dir != DMA_NONE) { @@ -136,14 +99,40 @@ frwr_mr_recycle_worker(struct work_struct *work) mr->mr_dir = DMA_NONE; } - spin_lock(&r_xprt->rx_buf.rb_mrlock); + spin_lock(&r_xprt->rx_buf.rb_lock); list_del(&mr->mr_all); r_xprt->rx_stats.mrs_recycled++; - spin_unlock(&r_xprt->rx_buf.rb_mrlock); + spin_unlock(&r_xprt->rx_buf.rb_lock); frwr_release_mr(mr); } +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. + */ +static void +frwr_mr_recycle_worker(struct work_struct *work) +{ + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, + mr_recycle); + + frwr_mr_recycle(mr->mr_xprt, mr); +} + +/* frwr_recycle - Discard MRs + * @req: request to reset + * + * Used after a reconnect. These MRs could be in flight; we can't + * tell. The safe thing to do is to release them. + */ +void frwr_recycle(struct rpcrdma_req *req) +{ + struct rpcrdma_mr *mr; + + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) + frwr_mr_recycle(mr->mr_xprt, mr); +} + /* frwr_reset - Place MRs back on the free list * @req: request to reset * @@ -156,12 +145,10 @@ frwr_mr_recycle_worker(struct work_struct *work) */ void frwr_reset(struct rpcrdma_req *req) { - while (!list_empty(&req->rl_registered)) { - struct rpcrdma_mr *mr; + struct rpcrdma_mr *mr; - mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_unmap_and_put(mr); - } + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) + rpcrdma_mr_put(mr); } /** @@ -179,11 +166,14 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) struct ib_mr *frmr; int rc; + /* NB: ib_alloc_mr and device drivers typically allocate + * memory with GFP_KERNEL. + */ frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); if (IS_ERR(frmr)) goto out_mr_err; - sg = kcalloc(depth, sizeof(*sg), GFP_KERNEL); + sg = kcalloc(depth, sizeof(*sg), GFP_NOFS); if (!sg) goto out_list_err; @@ -203,8 +193,6 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) return rc; out_list_err: - dprintk("RPC: %s: sg allocation failure\n", - __func__); ib_dereg_mr(frmr); return -ENOMEM; } @@ -290,8 +278,8 @@ int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep) ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ - ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / - ia->ri_max_frwr_depth); + ia->ri_max_segs = + DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth); /* Reply chunks require segments for head and tail buffers */ ia->ri_max_segs += 2; if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS) @@ -323,31 +311,25 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt) * @nsegs: number of segments remaining * @writing: true when RDMA Write will be used * @xid: XID of RPC using the registered memory - * @out: initialized MR + * @mr: MR to fill in * * Prepare a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. * * Returns the next segment or a negative errno pointer. - * On success, the prepared MR is planted in @out. + * On success, @mr is filled in.
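Several hunks convert drain loops from the while (!list_empty()) { pop; } shape to while ((mr = rpcrdma_mr_pop(...))), which is safe because the pop helper returns NULL on an empty list. The idiom, self-contained:

#include <stddef.h>
#include <stdio.h>

struct mr { struct mr *next; int id; };

/* Pop helper that returns NULL on an empty list, which is what lets
 * the caller fold the emptiness test into the loop condition. */
static struct mr *mr_pop(struct mr **head)
{
	struct mr *mr = *head;

	if (mr)
		*head = mr->next;
	return mr;
}

int main(void)
{
	struct mr c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
	struct mr *head = &a, *mr;

	while ((mr = mr_pop(&head)))	/* drain, frwr_reset() style */
		printf("releasing MR %d\n", mr->id);
	return 0;
}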
*/ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr **out) + struct rpcrdma_mr *mr) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; - struct rpcrdma_mr *mr; - struct ib_mr *ibmr; struct ib_reg_wr *reg_wr; + struct ib_mr *ibmr; int i, n; u8 key; - mr = rpcrdma_mr_get(r_xprt); - if (!mr) - goto out_getmr_err; - if (nsegs > ia->ri_max_frwr_depth) nsegs = ia->ri_max_frwr_depth; for (i = 0; i < nsegs;) { @@ -362,7 +344,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, ++seg; ++i; - if (holes_ok) + if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS) continue; if ((i < nsegs && offset_in_page(seg->mr_offset)) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) @@ -397,22 +379,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_offset = ibmr->iova; trace_xprtrdma_mr_map(mr); - *out = mr; return seg; -out_getmr_err: - xprt_wait_for_buffer_space(&r_xprt->rx_xprt); - return ERR_PTR(-EAGAIN); - out_dmamap_err: mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); - rpcrdma_mr_put(mr); return ERR_PTR(-EIO); out_mapmr_err: trace_xprtrdma_frwr_maperr(mr, n); - rpcrdma_mr_recycle(mr); return ERR_PTR(-EIO); } @@ -485,7 +460,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); trace_xprtrdma_mr_remoteinv(mr); - rpcrdma_mr_unmap_and_put(mr); + rpcrdma_mr_put(mr); break; /* only one invalidated MR per RPC */ } } @@ -495,7 +470,7 @@ static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) if (wc->status != IB_WC_SUCCESS) rpcrdma_mr_recycle(mr); else - rpcrdma_mr_unmap_and_put(mr); + rpcrdma_mr_put(mr); } /** @@ -532,8 +507,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_wake(wc, frwr); - complete(&frwr->fr_linv_done); __frwr_release_mr(wc, mr); + complete(&frwr->fr_linv_done); } /** @@ -562,8 +537,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ frwr = NULL; prev = &first; - while (!list_empty(&req->rl_registered)) { - mr = rpcrdma_mr_pop(&req->rl_registered); + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; @@ -632,11 +606,15 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_rep *rep = mr->mr_req->rl_reply; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_done(wc, frwr); - rpcrdma_complete_rqst(frwr->fr_req->rl_reply); __frwr_release_mr(wc, mr); + + /* Ensure @rep is generated before __frwr_release_mr */ + smp_rmb(); + rpcrdma_complete_rqst(rep); } /** @@ -662,15 +640,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) */ frwr = NULL; prev = &first; - while (!list_empty(&req->rl_registered)) { - mr = rpcrdma_mr_pop(&req->rl_registered); + while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; - frwr->fr_req = req; last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c 
b/net/sunrpc/xprtrdma/rpc_rdma.c index 4345e6912392..b86b5fd62d9f 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -342,6 +342,32 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, return 0; } +static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, + struct rpcrdma_mr **mr) +{ + *mr = rpcrdma_mr_pop(&req->rl_free_mrs); + if (!*mr) { + *mr = rpcrdma_mr_get(r_xprt); + if (!*mr) + goto out_getmr_err; + trace_xprtrdma_mr_get(req); + (*mr)->mr_req = req; + } + + rpcrdma_mr_push(*mr, &req->rl_registered); + return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); + +out_getmr_err: + trace_xprtrdma_nomrs(req); + xprt_wait_for_buffer_space(&r_xprt->rx_xprt); + if (r_xprt->rx_ep.rep_connected != -ENODEV) + schedule_work(&r_xprt->rx_buf.rb_refresh_worker); + return ERR_PTR(-EAGAIN); +} + /* Register and XDR encode the Read list. Supports encoding a list of read * segments that belong to a single read chunk. * @@ -356,9 +382,10 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, * * Only a single @pos value is currently supported. */ -static noinline int -rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) +static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype rtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; @@ -379,10 +406,9 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return nsegs; do { - seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; @@ -411,9 +437,10 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * * Only a single Write chunk is currently supported. */ -static noinline int -rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; @@ -440,10 +467,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; @@ -474,9 +500,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * Returns zero on success, or a negative errno if a failure occurred. * @xdr is advanced to the next position in the stream. 
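rpcrdma_mr_prepare() layers a per-request free list over the slower global MR pool: pop rl_free_mrs first, fall back to rpcrdma_mr_get(), and on total failure park the RPC and kick the refresh worker. The two-tier shape, sketched with toy types (none of these are the rpcrdma structures):

#include <stddef.h>
#include <stdlib.h>

struct mr { struct mr *next; };
struct request { struct mr *free_mrs; };	/* fast per-request cache */

static struct mr *mr_pop(struct mr **head)
{
	struct mr *m = *head;

	if (m)
		*head = m->next;
	return m;
}

/* Slow-path stand-in for rpcrdma_mr_get(): create a fresh MR. */
static struct mr *mr_get_slow(void)
{
	return calloc(1, sizeof(struct mr));
}

static struct mr *mr_prepare(struct request *req)
{
	struct mr *mr = mr_pop(&req->free_mrs);	/* tier 1: local cache */

	if (!mr) {
		mr = mr_get_slow();		/* tier 2: global pool */
		if (!mr)
			return NULL;		/* caller backs off (-EAGAIN) */
	}
	return mr;
}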
*/ -static noinline int -rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct rpc_rqst *rqst, + enum rpcrdma_chunktype wtype) { struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; @@ -501,10 +528,9 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); + seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); - rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; @@ -841,12 +867,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) * chunks. Very likely the connection has been replaced, * so these registrations are invalid and unusable. */ - while (unlikely(!list_empty(&req->rl_registered))) { - struct rpcrdma_mr *mr; - - mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_recycle(mr); - } + frwr_recycle(req); /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: @@ -1240,8 +1261,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) struct rpc_rqst *rqst = rep->rr_rqst; int status; - xprt->reestablish_timeout = 0; - switch (rep->rr_proc) { case rdma_msg: status = rpcrdma_decode_msg(r_xprt, rep, rqst); @@ -1300,6 +1319,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) u32 credits; __be32 *p; + /* Any data means we had a useful conversation, so + * we don't need to delay the next reconnect. + */ + if (xprt->reestablish_timeout) + xprt->reestablish_timeout = 0; + /* Fixed transport header fields */ xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, rep->rr_hdrbuf.head[0].iov_base, NULL); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 2ec349ed4770..160558b4135e 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -423,8 +423,6 @@ void xprt_rdma_close(struct rpc_xprt *xprt) if (ep->rep_connected == -ENODEV) return; - if (ep->rep_connected > 0) - xprt->reestablish_timeout = 0; rpcrdma_ep_disconnect(ep, ia); /* Prepare @xprt for the next connection by reinitializing @@ -434,6 +432,7 @@ void xprt_rdma_close(struct rpc_xprt *xprt) xprt->cwnd = RPC_CWNDSHIFT; out: + xprt->reestablish_timeout = 0; ++xprt->connect_cookie; xprt_disconnect_done(xprt); } @@ -494,9 +493,9 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) * @reconnect_timeout: reconnect timeout after server disconnects * */ -static void xprt_rdma_tcp_set_connect_timeout(struct rpc_xprt *xprt, - unsigned long connect_timeout, - unsigned long reconnect_timeout) +static void xprt_rdma_set_connect_timeout(struct rpc_xprt *xprt, + unsigned long connect_timeout, + unsigned long reconnect_timeout) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); @@ -571,6 +570,7 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) return; out_sleep: + set_bit(XPRT_CONGESTED, &xprt->state); rpc_sleep_on(&xprt->backlog, task, NULL); task->tk_status = -EAGAIN; } @@ -589,7 +589,8 @@ xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst) memset(rqst, 0, sizeof(*rqst)); rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst)); - rpc_wake_up_next(&xprt->backlog); + if (unlikely(!rpc_wake_up_next(&xprt->backlog))) + clear_bit(XPRT_CONGESTED, &xprt->state); }
static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt, @@ -803,7 +804,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = { .send_request = xprt_rdma_send_request, .close = xprt_rdma_close, .destroy = xprt_rdma_destroy, - .set_connect_timeout = xprt_rdma_tcp_set_connect_timeout, + .set_connect_timeout = xprt_rdma_set_connect_timeout, .print_stats = xprt_rdma_print_stats, .enable_swap = xprt_rdma_enable_swap, .disable_swap = xprt_rdma_disable_swap, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b10aa16557f0..3a907537e2cf 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -74,8 +75,10 @@ * internal functions */ static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); +static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); +static void rpcrdma_mr_free(struct rpcrdma_mr *mr); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags); @@ -405,9 +408,8 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_req *req; - struct rpcrdma_rep *rep; - cancel_delayed_work_sync(&buf->rb_refresh_worker); + cancel_work_sync(&buf->rb_refresh_worker); /* This is similar to rpcrdma_ep_destroy, but: * - Don't cancel the connect worker. @@ -429,8 +431,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) /* The ULP is responsible for ensuring all DMA * mappings and MRs are gone. */ - list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list) - rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); + rpcrdma_reps_destroy(buf); list_for_each_entry(req, &buf->rb_allreqs, rl_all) { rpcrdma_regbuf_dma_unmap(req->rl_rdmabuf); rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); @@ -604,10 +605,10 @@ void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) * Unlike a normal reconnection, a fresh PD and a new set * of MRs and buffers is needed. 
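The reworked connect paths below stop handing the saved ep->rep_attr template straight to rdma_create_qp() and instead pass a scratch copy, apparently because the provider may adjust the requested capabilities in place, so reusing the template directly would let those adjustments accumulate across reconnects. A standalone sketch of the copy-the-template idea (create_qp() is a stand-in, and the rounding it performs is invented for the demo):

    #include <stdio.h>
    #include <string.h>

    struct qp_attr {
            unsigned int max_send_wr;
    };

    /* stand-in for a provider call that rounds the request up in place */
    static int create_qp(struct qp_attr *attr)
    {
            attr->max_send_wr += 7;
            return 0;
    }

    int main(void)
    {
            struct qp_attr saved = { .max_send_wr = 128 }; /* pristine template */
            struct qp_attr scratch;
            int i;

            for (i = 0; i < 3; i++) {               /* three "reconnects" */
                    memcpy(&scratch, &saved, sizeof(scratch));
                    create_qp(&scratch);            /* scribbles on the copy only */
            }
            printf("template still requests %u\n", saved.max_send_wr); /* 128 */
            return 0;
    }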
*/ -static int -rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, + struct ib_qp_init_attr *qp_init_attr) { + struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc, err; trace_xprtrdma_reinsert(r_xprt); @@ -624,7 +625,7 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, } rc = -ENETUNREACH; - err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr); if (err) { pr_err("rpcrdma: rdma_create_qp returned %d\n", err); goto out3; @@ -641,16 +642,16 @@ rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, return rc; } -static int -rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, - struct rpcrdma_ia *ia) +static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, + struct ib_qp_init_attr *qp_init_attr) { + struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rdma_cm_id *id, *old; int err, rc; trace_xprtrdma_reconnect(r_xprt); - rpcrdma_ep_disconnect(ep, ia); + rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia); rc = -EHOSTUNREACH; id = rpcrdma_create_id(r_xprt, ia); @@ -672,7 +673,7 @@ rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, goto out_destroy; } - err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + err = rdma_create_qp(id, ia->ri_pd, qp_init_attr); if (err) goto out_destroy; @@ -697,25 +698,27 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct ib_qp_init_attr qp_init_attr; int rc; retry: + memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr)); switch (ep->rep_connected) { case 0: dprintk("RPC: %s: connecting...\n", __func__); - rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr); if (rc) { rc = -ENETUNREACH; goto out_noupdate; } break; case -ENODEV: - rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia); + rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr); if (rc) goto out_noupdate; break; default: - rc = rpcrdma_ep_reconnect(r_xprt, ep, ia); + rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr); if (rc) goto out; } @@ -729,6 +732,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) if (rc) goto out; + if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); if (ep->rep_connected <= 0) { if (ep->rep_connected == -EAGAIN) @@ -942,14 +947,12 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ia *ia = &r_xprt->rx_ia; unsigned int count; - LIST_HEAD(free); - LIST_HEAD(all); for (count = 0; count < ia->ri_max_segs; count++) { struct rpcrdma_mr *mr; int rc; - mr = kzalloc(sizeof(*mr), GFP_KERNEL); + mr = kzalloc(sizeof(*mr), GFP_NOFS); if (!mr) break; @@ -961,15 +964,13 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) mr->mr_xprt = r_xprt; - list_add(&mr->mr_list, &free); - list_add(&mr->mr_all, &all); + spin_lock(&buf->rb_lock); + list_add(&mr->mr_list, &buf->rb_mrs); + list_add(&mr->mr_all, &buf->rb_all_mrs); + spin_unlock(&buf->rb_lock); } - spin_lock(&buf->rb_mrlock); - list_splice(&free, &buf->rb_mrs); - list_splice(&all, &buf->rb_all); r_xprt->rx_stats.mrs_allocated += count; - spin_unlock(&buf->rb_mrlock); trace_xprtrdma_createmrs(r_xprt, count); } @@ -977,7 +978,7 @@ static 
void rpcrdma_mr_refresh_worker(struct work_struct *work) { struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, - rb_refresh_worker.work); + rb_refresh_worker); struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); @@ -999,12 +1000,18 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; + size_t maxhdrsize; req = kzalloc(sizeof(*req), flags); if (req == NULL) goto out1; - rb = rpcrdma_regbuf_alloc(RPCRDMA_HDRBUF_SIZE, DMA_TO_DEVICE, flags); + /* Compute maximum header buffer size in bytes */ + maxhdrsize = rpcrdma_fixed_maxsz + 3 + + r_xprt->rx_ia.ri_max_segs * rpcrdma_readchunk_maxsz; + maxhdrsize *= sizeof(__be32); + rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), + DMA_TO_DEVICE, flags); if (!rb) goto out2; req->rl_rdmabuf = rb; @@ -1018,6 +1025,7 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, if (!req->rl_recvbuf) goto out4; + INIT_LIST_HEAD(&req->rl_free_mrs); INIT_LIST_HEAD(&req->rl_registered); spin_lock(&buffer->rb_lock); list_add(&req->rl_all, &buffer->rb_allreqs); @@ -1065,6 +1073,40 @@ static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, return NULL; } +static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) +{ + rpcrdma_regbuf_free(rep->rr_rdmabuf); + kfree(rep); +} + +static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) +{ + struct llist_node *node; + + /* Calls to llist_del_first are required to be serialized */ + node = llist_del_first(&buf->rb_free_reps); + if (!node) + return NULL; + return llist_entry(node, struct rpcrdma_rep, rr_node); +} + +static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, + struct rpcrdma_rep *rep) +{ + if (!rep->rr_temp) + llist_add(&rep->rr_node, &buf->rb_free_reps); + else + rpcrdma_rep_destroy(rep); +} + +static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_rep *rep; + + while ((rep = rpcrdma_rep_get_locked(buf)) != NULL) + rpcrdma_rep_destroy(rep); +} + /** * rpcrdma_buffer_create - Create initial set of req/rep objects * @r_xprt: transport instance to (re)initialize @@ -1078,12 +1120,10 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests; buf->rb_bc_srv_max_requests = 0; - spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); INIT_LIST_HEAD(&buf->rb_mrs); - INIT_LIST_HEAD(&buf->rb_all); - INIT_DELAYED_WORK(&buf->rb_refresh_worker, - rpcrdma_mr_refresh_worker); + INIT_LIST_HEAD(&buf->rb_all_mrs); + INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); rpcrdma_mrs_create(r_xprt); @@ -1102,7 +1142,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } buf->rb_credits = 1; - INIT_LIST_HEAD(&buf->rb_recv_bufs); + init_llist_head(&buf->rb_free_reps); rc = rpcrdma_sendctxs_create(r_xprt); if (rc) @@ -1114,12 +1154,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) return rc; } -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) -{ - rpcrdma_regbuf_free(rep->rr_rdmabuf); - kfree(rep); -} - /** * rpcrdma_req_destroy - Destroy an rpcrdma_req object * @req: unused object to be destroyed @@ -1127,11 +1161,13 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) * This function assumes that the caller prevents concurrent device * unload and transport tear-down. 
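Note how the receive buffer pool above moves from a spinlock-protected list_head to an llist: completion handlers return reps with a lock-free llist_add(), while the llist_del_first() consumers (receive posting and teardown) must serialize among themselves, as the comment in rpcrdma_rep_get_locked() warns. A C11 userspace analogue of those llist semantics (illustrative names; this is not the kernel implementation):

    #include <stdatomic.h>
    #include <stddef.h>

    struct node {
            struct node *next;
    };

    struct lstack {
            _Atomic(struct node *) first;
    };

    /* multiple concurrent pushers are safe, like llist_add() */
    static void lstack_push(struct lstack *s, struct node *n)
    {
            struct node *old = atomic_load(&s->first);

            do {
                    n->next = old;
            } while (!atomic_compare_exchange_weak(&s->first, &old, n));
    }

    /* poppers must serialize among themselves, like llist_del_first();
     * two concurrent pops could race on old->next (the ABA problem) */
    static struct node *lstack_pop(struct lstack *s)
    {
            struct node *old = atomic_load(&s->first);

            do {
                    if (!old)
                            return NULL;
            } while (!atomic_compare_exchange_weak(&s->first, &old,
                                                   old->next));
            return old;
    }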
*/ -void -rpcrdma_req_destroy(struct rpcrdma_req *req) +void rpcrdma_req_destroy(struct rpcrdma_req *req) { list_del(&req->rl_all); + while (!list_empty(&req->rl_free_mrs)) + rpcrdma_mr_free(rpcrdma_mr_pop(&req->rl_free_mrs)); + rpcrdma_regbuf_free(req->rl_recvbuf); rpcrdma_regbuf_free(req->rl_sendbuf); rpcrdma_regbuf_free(req->rl_rdmabuf); @@ -1147,25 +1183,19 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) unsigned int count; count = 0; - spin_lock(&buf->rb_mrlock); - while (!list_empty(&buf->rb_all)) { - mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all); + spin_lock(&buf->rb_lock); + while ((mr = list_first_entry_or_null(&buf->rb_all_mrs, + struct rpcrdma_mr, + mr_all)) != NULL) { list_del(&mr->mr_all); - - spin_unlock(&buf->rb_mrlock); - - /* Ensure MW is not on any rl_registered list */ - if (!list_empty(&mr->mr_list)) - list_del(&mr->mr_list); + spin_unlock(&buf->rb_lock); frwr_release_mr(mr); count++; - spin_lock(&buf->rb_mrlock); + spin_lock(&buf->rb_lock); } - spin_unlock(&buf->rb_mrlock); + spin_unlock(&buf->rb_lock); r_xprt->rx_stats.mrs_allocated = 0; - - dprintk("RPC: %s: released %u MRs\n", __func__, count); } /** @@ -1179,18 +1209,10 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - cancel_delayed_work_sync(&buf->rb_refresh_worker); + cancel_work_sync(&buf->rb_refresh_worker); rpcrdma_sendctxs_destroy(buf); - - while (!list_empty(&buf->rb_recv_bufs)) { - struct rpcrdma_rep *rep; - - rep = list_first_entry(&buf->rb_recv_bufs, - struct rpcrdma_rep, rr_list); - list_del(&rep->rr_list); - rpcrdma_rep_destroy(rep); - } + rpcrdma_reps_destroy(buf); while (!list_empty(&buf->rb_send_bufs)) { struct rpcrdma_req *req; @@ -1215,54 +1237,20 @@ struct rpcrdma_mr * rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_mr *mr = NULL; + struct rpcrdma_mr *mr; - spin_lock(&buf->rb_mrlock); - if (!list_empty(&buf->rb_mrs)) - mr = rpcrdma_mr_pop(&buf->rb_mrs); - spin_unlock(&buf->rb_mrlock); - - if (!mr) - goto out_nomrs; + spin_lock(&buf->rb_lock); + mr = rpcrdma_mr_pop(&buf->rb_mrs); + spin_unlock(&buf->rb_lock); return mr; - -out_nomrs: - trace_xprtrdma_nomrs(r_xprt); - if (r_xprt->rx_ep.rep_connected != -ENODEV) - schedule_delayed_work(&buf->rb_refresh_worker, 0); - - /* Allow the reply handler and refresh worker to run */ - cond_resched(); - - return NULL; -} - -static void -__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr) -{ - spin_lock(&buf->rb_mrlock); - rpcrdma_mr_push(mr, &buf->rb_mrs); - spin_unlock(&buf->rb_mrlock); } /** - * rpcrdma_mr_put - Release an rpcrdma_mr object - * @mr: object to release + * rpcrdma_mr_put - DMA unmap an MR and release it + * @mr: MR to release * */ -void -rpcrdma_mr_put(struct rpcrdma_mr *mr) -{ - __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr); -} - -/** - * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it - * @mr: object to release - * - */ -void -rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) +void rpcrdma_mr_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; @@ -1272,7 +1260,19 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } - __rpcrdma_mr_put(&r_xprt->rx_buf, mr); + + rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); +} + +static void rpcrdma_mr_free(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + + mr->mr_req = NULL; + 
spin_lock(&buf->rb_lock); + rpcrdma_mr_push(mr, &buf->rb_mrs); + spin_unlock(&buf->rb_lock); } /** @@ -1303,39 +1303,24 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) */ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) { - struct rpcrdma_rep *rep = req->rl_reply; - + if (req->rl_reply) + rpcrdma_rep_put(buffers, req->rl_reply); req->rl_reply = NULL; spin_lock(&buffers->rb_lock); list_add(&req->rl_list, &buffers->rb_send_bufs); - if (rep) { - if (!rep->rr_temp) { - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - rep = NULL; - } - } spin_unlock(&buffers->rb_lock); - if (rep) - rpcrdma_rep_destroy(rep); } -/* - * Put reply buffers back into pool when not attached to - * request. This happens in error conditions. +/** + * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list + * @rep: rep to release + * + * Used after error conditions. */ -void -rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) +void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) { - struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; - - if (!rep->rr_temp) { - spin_lock(&buffers->rb_lock); - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - spin_unlock(&buffers->rb_lock); - } else { - rpcrdma_rep_destroy(rep); - } + rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep); } /* Returns a pointer to a rpcrdma_regbuf object, or NULL. @@ -1483,7 +1468,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); - if (ep->rep_receive_count > needed) + if (likely(ep->rep_receive_count > needed)) goto out; needed -= ep->rep_receive_count; if (!temp) @@ -1491,22 +1476,10 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) /* fast path: all needed reps can be found on the free list */ wr = NULL; - spin_lock(&buf->rb_lock); while (needed) { - rep = list_first_entry_or_null(&buf->rb_recv_bufs, - struct rpcrdma_rep, rr_list); + rep = rpcrdma_rep_get_locked(buf); if (!rep) - break; - - list_del(&rep->rr_list); - rep->rr_recv_wr.next = wr; - wr = &rep->rr_recv_wr; - --needed; - } - spin_unlock(&buf->rb_lock); - - while (needed) { - rep = rpcrdma_rep_create(r_xprt, temp); + rep = rpcrdma_rep_create(r_xprt, temp); if (!rep) break; @@ -1523,7 +1496,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) goto release_wrs; - trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe); + trace_xprtrdma_post_recv(rep); ++count; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 92ce09fcea74..65e6b0eb862e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -47,6 +47,7 @@ #include /* atomic_t, etc */ #include /* struct kref */ #include /* struct work_struct */ +#include #include /* RDMA connection api */ #include /* RDMA verbs api */ @@ -117,9 +118,6 @@ struct rpcrdma_ep { #endif /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV - * - * The below structure appears at the front of a large region of kmalloc'd - * memory, which always starts on a good alignment boundary. */ struct rpcrdma_regbuf { @@ -158,25 +156,22 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb) /* To ensure a transport can always make forward progress, * the number of RDMA segments allowed in header chunk lists - * is capped at 8. This prevents less-capable devices and - * memory registrations from overrunning the Send buffer - * while building chunk lists. + * is capped at 16. 
This prevents less-capable devices from + * overrunning the Send buffer while building chunk lists. * * Elements of the Read list take up more room than the - * Write list or Reply chunk. 8 read segments means the Read - * list (or Write list or Reply chunk) cannot consume more - * than + * Write list or Reply chunk. 16 read segments means the + * chunk lists cannot consume more than * - * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes. + * ((16 + 2) * read segment size) + 1 XDR words, * - * And the fixed part of the header is another 24 bytes. - * - * The smallest inline threshold is 1024 bytes, ensuring that - * at least 750 bytes are available for RPC messages. + * or about 400 bytes. The fixed part of the header is + * another 24 bytes. Thus when the inline threshold is + * 1024 bytes, at least 600 bytes are available for RPC + * message bodies. */ enum { - RPCRDMA_MAX_HDR_SEGS = 8, - RPCRDMA_HDRBUF_SIZE = 256, + RPCRDMA_MAX_HDR_SEGS = 16, }; /* @@ -206,7 +201,7 @@ struct rpcrdma_rep { struct rpc_rqst *rr_rqst; struct xdr_buf rr_hdrbuf; struct xdr_stream rr_stream; - struct list_head rr_list; + struct llist_node rr_node; struct ib_recv_wr rr_recv_wr; }; @@ -240,20 +235,20 @@ struct rpcrdma_sendctx { * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). */ -struct rpcrdma_req; struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; struct completion fr_linv_done; - struct rpcrdma_req *fr_req; union { struct ib_reg_wr fr_regwr; struct ib_send_wr fr_invwr; }; }; +struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; + struct rpcrdma_req *mr_req; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; @@ -331,7 +326,8 @@ struct rpcrdma_req { struct list_head rl_all; struct kref rl_kref; - struct list_head rl_registered; /* registered segments */ + struct list_head rl_free_mrs; + struct list_head rl_registered; struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; @@ -344,7 +340,7 @@ rpcr_to_rdmar(const struct rpc_rqst *rqst) static inline void rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list) { - list_add_tail(&mr->mr_list, list); + list_add(&mr->mr_list, list); } static inline struct rpcrdma_mr * @@ -352,8 +348,9 @@ rpcrdma_mr_pop(struct list_head *list) { struct rpcrdma_mr *mr; - mr = list_first_entry(list, struct rpcrdma_mr, mr_list); - list_del_init(&mr->mr_list); + mr = list_first_entry_or_null(list, struct rpcrdma_mr, mr_list); + if (mr) + list_del_init(&mr->mr_list); return mr; } @@ -364,19 +361,19 @@ rpcrdma_mr_pop(struct list_head *list) * One of these is associated with a transport instance */ struct rpcrdma_buffer { - spinlock_t rb_mrlock; /* protect rb_mrs list */ + spinlock_t rb_lock; + struct list_head rb_send_bufs; struct list_head rb_mrs; - struct list_head rb_all; unsigned long rb_sc_head; unsigned long rb_sc_tail; unsigned long rb_sc_last; struct rpcrdma_sendctx **rb_sc_ctxs; - spinlock_t rb_lock; /* protect buf lists */ - struct list_head rb_send_bufs; - struct list_head rb_recv_bufs; struct list_head rb_allreqs; + struct list_head rb_all_mrs; + + struct llist_head rb_free_reps; u32 rb_max_requests; u32 rb_credits; /* most recent credit grant */ @@ -384,7 +381,7 @@ struct rpcrdma_buffer { u32 rb_bc_srv_max_requests; u32 rb_bc_max_requests; - struct delayed_work rb_refresh_worker; + struct work_struct rb_refresh_worker; }; /* @@ -490,7 +487,6 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr 
*rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); -void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); static inline void rpcrdma_mr_recycle(struct rpcrdma_mr *mr) @@ -546,6 +542,7 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ bool frwr_is_supported(struct ib_device *device); +void frwr_recycle(struct rpcrdma_req *req); void frwr_reset(struct rpcrdma_req *req); int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep); int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); @@ -554,7 +551,7 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, - struct rpcrdma_mr **mr); + struct rpcrdma_mr *mr); int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req); void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e2176c167a57..9ac88722fa83 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -562,10 +562,14 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) printk(KERN_WARNING "Callback slot table overflowed\n"); return -ESHUTDOWN; } + if (transport->recv.copied && !req->rq_private_buf.len) + return -ESHUTDOWN; ret = xs_read_stream_request(transport, msg, flags, req); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) xprt_complete_bc_request(req, transport->recv.copied); + else + req->rq_private_buf.len = transport->recv.copied; return ret; } @@ -587,7 +591,7 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) /* Look up and lock the request corresponding to the given XID */ spin_lock(&xprt->queue_lock); req = xprt_lookup_rqst(xprt, transport->recv.xid); - if (!req) { + if (!req || (transport->recv.copied && !req->rq_private_buf.len)) { msg->msg_flags |= MSG_TRUNC; goto out; } @@ -599,6 +603,8 @@ xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) spin_lock(&xprt->queue_lock); if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) xprt_complete_rqst(req->rq_task, transport->recv.copied); + else + req->rq_private_buf.len = transport->recv.copied; xprt_unpin_rqst(req); out: spin_unlock(&xprt->queue_lock); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 947b8ff0227e..bba3104f128f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -206,14 +206,7 @@ static int xdp_umem_map_pages(struct xdp_umem *umem) static void xdp_umem_unpin_pages(struct xdp_umem *umem) { - unsigned int i; - - for (i = 0; i < umem->npgs; i++) { - struct page *page = umem->pgs[i]; - - set_page_dirty_lock(page); - put_page(page); - } + put_user_pages_dirty_lock(umem->pgs, umem->npgs, true); kfree(umem->pgs); umem->pgs = NULL; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c2f1af3b6a7c..fa8fbb8fa3c8 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -977,7 +977,7 @@ static int xsk_mmap(struct file *file, struct socket *sock, /* Matches the smp_wmb() in xsk_init_queue */ smp_rmb(); qpg = virt_to_head_page(q->ring); - if (size > (PAGE_SIZE << compound_order(qpg))) + if (size > page_size(qpg)) return -EINVAL; pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 93a7edfe0f05..6fcc66afb088 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -62,6 +62,8 @@ my $conststructsfile = 
"$D/const_structs.checkpatch"; my $typedefsfile = ""; my $color = "auto"; my $allow_c99_comments = 1; # Can be overridden by --ignore C99_COMMENT_TOLERANCE +# git output parsing needs US English output, so first set backtick child process LANGUAGE +my $git_command ='export LANGUAGE=en_US.UTF-8; git'; sub help { my ($exitcode) = @_; @@ -904,7 +906,7 @@ sub seed_camelcase_includes { $camelcase_seeded = 1; if (-e ".git") { - my $git_last_include_commit = `git log --no-merges --pretty=format:"%h%n" -1 -- include`; + my $git_last_include_commit = `${git_command} log --no-merges --pretty=format:"%h%n" -1 -- include`; chomp $git_last_include_commit; $camelcase_cache = ".checkpatch-camelcase.git.$git_last_include_commit"; } else { @@ -932,7 +934,7 @@ sub seed_camelcase_includes { } if (-e ".git") { - $files = `git ls-files "include/*.h"`; + $files = `${git_command} ls-files "include/*.h"`; @include_files = split('\n', $files); } @@ -956,13 +958,13 @@ sub git_commit_info { return ($id, $desc) if ((which("git") eq "") || !(-e ".git")); - my $output = `git log --no-color --format='%H %s' -1 $commit 2>&1`; + my $output = `${git_command} log --no-color --format='%H %s' -1 $commit 2>&1`; $output =~ s/^\s*//gm; my @lines = split("\n", $output); return ($id, $desc) if ($#lines < 0); - if ($lines[0] =~ /^error: short SHA1 $commit is ambiguous\./) { + if ($lines[0] =~ /^error: short SHA1 $commit is ambiguous/) { # Maybe one day convert this block of bash into something that returns # all matching commit ids, but it's very slow... # @@ -1006,7 +1008,7 @@ if ($git) { } else { $git_range = "-1 $commit_expr"; } - my $lines = `git log --no-color --no-merges --pretty=format:'%H %s' $git_range`; + my $lines = `${git_command} log --no-color --no-merges --pretty=format:'%H %s' $git_range`; foreach my $line (split(/\n/, $lines)) { $line =~ /^([0-9a-fA-F]{40,40}) (.*)$/; next if (!defined($1) || !defined($2)); @@ -2725,8 +2727,10 @@ sub process { ($line =~ /^\s*(?:WARNING:|BUG:)/ || $line =~ /^\s*\[\s*\d+\.\d{6,6}\s*\]/ || # timestamp - $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/)) { - # stack dump address + $line =~ /^\s*\[\<[0-9a-fA-F]{8,}\>\]/) || + $line =~ /^(?:\s+\w+:\s+[0-9a-fA-F]+){3,3}/ || + $line =~ /^\s*\#\d+\s*\[[0-9a-fA-F]+\]\s*\w+ at [0-9a-fA-F]+/) { + # stack dump address styles $commit_log_possible_stack_dump = 1; } @@ -2898,6 +2902,17 @@ sub process { } } +# check for invalid commit id + if ($in_commit_log && $line =~ /(^fixes:|\bcommit)\s+([0-9a-f]{6,40})\b/i) { + my $id; + my $description; + ($id, $description) = git_commit_info($2, undef, undef); + if (!defined($id)) { + WARN("UNKNOWN_COMMIT_ID", + "Unknown commit id '$2', maybe rebased or not pulled?\n" . $herecurr); + } + } + # ignore non-hunk lines and lines being removed next if (!$hunk_line || $line =~ /^-/); @@ -3069,21 +3084,21 @@ sub process { # check SPDX comment style for .[chsS] files if ($realfile =~ /\.[chsS]$/ && $rawline =~ /SPDX-License-Identifier:/ && - $rawline !~ /^\+\s*\Q$comment\E\s*/) { + $rawline !~ m@^\+\s*\Q$comment\E\s*@) { WARN("SPDX_LICENSE_TAG", "Improper SPDX comment style for '$realfile', please use '$comment' instead\n" . $herecurr); } if ($comment !~ /^$/ && - $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) { - WARN("SPDX_LICENSE_TAG", - "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr); + $rawline !~ m@^\+\Q$comment\E SPDX-License-Identifier: @) { + WARN("SPDX_LICENSE_TAG", + "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . 
$herecurr); } elsif ($rawline =~ /(SPDX-License-Identifier: .*)/) { - my $spdx_license = $1; - if (!is_SPDX_License_valid($spdx_license)) { - WARN("SPDX_LICENSE_TAG", - "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr); - } + my $spdx_license = $1; + if (!is_SPDX_License_valid($spdx_license)) { + WARN("SPDX_LICENSE_TAG", + "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr); + } } } } @@ -4660,7 +4675,7 @@ sub process { # closing brace should have a space following it when it has anything # on the line - if ($line =~ /}(?!(?:,|;|\)))\S/) { + if ($line =~ /}(?!(?:,|;|\)|\}))\S/) { if (ERROR("SPACING", "space required after that close brace '}'\n" . $herecurr) && $fix) { @@ -5191,7 +5206,7 @@ sub process { next if ($arg =~ /\.\.\./); next if ($arg =~ /^type$/i); my $tmp_stmt = $define_stmt; - $tmp_stmt =~ s/\b(typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g; + $tmp_stmt =~ s/\b(sizeof|typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g; $tmp_stmt =~ s/\#+\s*$arg\b//g; $tmp_stmt =~ s/\b$arg\s*\#\#//g; my $use_cnt = () = $tmp_stmt =~ /\b$arg\b/g; @@ -5873,6 +5888,18 @@ sub process { "__aligned(size) is preferred over __attribute__((aligned(size)))\n" . $herecurr); } +# Check for __attribute__ section, prefer __section + if ($realfile !~ m@\binclude/uapi/@ && + $line =~ /\b__attribute__\s*\(\s*\(.*_*section_*\s*\(\s*("[^"]*")/) { + my $old = substr($rawline, $-[1], $+[1] - $-[1]); + my $new = substr($old, 1, -1); + if (WARN("PREFER_SECTION", + "__section($new) is preferred over __attribute__((section($old)))\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\b__attribute__\s*\(\s*\(\s*_*section_*\s*\(\s*\Q$old\E\s*\)\s*\)\s*\)/__section($new)/; + } + } + # Check for __attribute__ format(printf, prefer __printf if ($realfile !~ m@\binclude/uapi/@ && $line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) { @@ -6480,6 +6507,12 @@ sub process { "Using $1 should generally have parentheses around the comparison\n" . $herecurr); } +# nested likely/unlikely calls + if ($line =~ /\b(?:(?:un)?likely)\s*\(\s*!?\s*(IS_ERR(?:_OR_NULL|_VALUE)?|WARN)/) { + WARN("LIKELY_MISUSE", + "nested (un)?likely() calls, $1 already uses unlikely() internally\n" . 
$herecurr); + } + # whine mightly about in_atomic if ($line =~ /\bin_atomic\s*\(/) { if ($realfile =~ m@^drivers/@) { diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index 2f5b95f09fa0..34e40e96dee2 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -77,12 +77,12 @@ lx-symbols command.""" gdb.write("scanning for modules in {0}\n".format(path)) for root, dirs, files in os.walk(path): for name in files: - if name.endswith(".ko"): + if name.endswith(".ko") or name.endswith(".ko.debug"): self.module_files.append(root + "/" + name) self.module_files_updated = True def _get_module_file(self, module_name): - module_pattern = ".*/{0}\.ko$".format( + module_pattern = ".*/{0}\.ko(?:.debug)?$".format( module_name.replace("_", r"[_\-]")) for name in self.module_files: if re.match(module_pattern, name) and os.path.exists(name): diff --git a/security/keys/trusted.c b/security/keys/trusted.c index ade699131065..1fbd77816610 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -1228,11 +1228,16 @@ static int __init trusted_shash_alloc(void) static int __init init_digests(void) { + int i; + digests = kcalloc(chip->nr_allocated_banks, sizeof(*digests), GFP_KERNEL); if (!digests) return -ENOMEM; + for (i = 0; i < chip->nr_allocated_banks; i++) + digests[i].alg_id = chip->allocated_banks[i].alg_id; + return 0; } diff --git a/security/safesetid/securityfs.c b/security/safesetid/securityfs.c index d568e17dd773..74a13d432ed8 100644 --- a/security/safesetid/securityfs.c +++ b/security/safesetid/securityfs.c @@ -187,7 +187,8 @@ static ssize_t handle_policy_update(struct file *file, out_free_buf: kfree(buf); out_free_pol: - release_ruleset(pol); + if (pol) + release_ruleset(pol); return err; } diff --git a/security/security.c b/security/security.c index 968e460dd6ef..d6dbc8d5607b 100644 --- a/security/security.c +++ b/security/security.c @@ -870,6 +870,12 @@ int security_move_mount(const struct path *from_path, const struct path *to_path return call_int_hook(move_mount, 0, from_path, to_path); } +int security_path_notify(const struct path *path, u64 mask, + unsigned int obj_type) +{ + return call_int_hook(path_notify, 0, path, mask, obj_type); +} + int security_inode_alloc(struct inode *inode) { int rc = lsm_inode_alloc(inode); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 74dd46de01b6..9625b99e677f 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -89,6 +89,8 @@ #include #include /* for hashlen_string() */ #include +#include +#include #include "avc.h" #include "objsec.h" @@ -3275,6 +3277,50 @@ static int selinux_inode_removexattr(struct dentry *dentry, const char *name) return -EACCES; } +static int selinux_path_notify(const struct path *path, u64 mask, + unsigned int obj_type) +{ + int ret; + u32 perm; + + struct common_audit_data ad; + + ad.type = LSM_AUDIT_DATA_PATH; + ad.u.path = *path; + + /* + * Set permission needed based on the type of mark being set. + * Performs an additional check for sb watches. 
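+	 * An fsnotify mark targets an inode, a vfsmount, or a whole
+	 * superblock; superblock marks are the broadest scope, so they
+	 * additionally require filesystem:watch on the superblock itself.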
+ */ + switch (obj_type) { + case FSNOTIFY_OBJ_TYPE_VFSMOUNT: + perm = FILE__WATCH_MOUNT; + break; + case FSNOTIFY_OBJ_TYPE_SB: + perm = FILE__WATCH_SB; + ret = superblock_has_perm(current_cred(), path->dentry->d_sb, + FILESYSTEM__WATCH, &ad); + if (ret) + return ret; + break; + case FSNOTIFY_OBJ_TYPE_INODE: + perm = FILE__WATCH; + break; + default: + return -EINVAL; + } + + /* blocking watches require the file:watch_with_perm permission */ + if (mask & (ALL_FSNOTIFY_PERM_EVENTS)) + perm |= FILE__WATCH_WITH_PERM; + + /* watches on read-like events need the file:watch_reads permission */ + if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE)) + perm |= FILE__WATCH_READS; + + return path_has_perm(current_cred(), path, perm); +} + /* * Copy the inode security context value to the user. * @@ -3403,7 +3449,7 @@ static int selinux_inode_copy_up_xattr(const char *name) static int selinux_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { - const struct task_security_struct *tsec = current_security(); + const struct task_security_struct *tsec = selinux_cred(current_cred()); u32 parent_sid, newsid, clen; int rc; char *context; @@ -6818,6 +6864,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(inode_getsecid, selinux_inode_getsecid), LSM_HOOK_INIT(inode_copy_up, selinux_inode_copy_up), LSM_HOOK_INIT(inode_copy_up_xattr, selinux_inode_copy_up_xattr), + LSM_HOOK_INIT(path_notify, selinux_path_notify), LSM_HOOK_INIT(kernfs_init_security, selinux_kernfs_init_security), diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 201f7e588a29..32e9b03be3dd 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -7,7 +7,8 @@ #define COMMON_FILE_PERMS COMMON_FILE_SOCK_PERMS, "unlink", "link", \ "rename", "execute", "quotaon", "mounton", "audit_access", \ - "open", "execmod" + "open", "execmod", "watch", "watch_mount", "watch_sb", \ + "watch_with_perm", "watch_reads" #define COMMON_SOCK_PERMS COMMON_FILE_SOCK_PERMS, "bind", "connect", \ "listen", "accept", "getopt", "setopt", "shutdown", "recvfrom", \ @@ -60,7 +61,7 @@ struct security_class_mapping secclass_map[] = { { "filesystem", { "mount", "remount", "unmount", "getattr", "relabelfrom", "relabelto", "associate", "quotamod", - "quotaget", NULL } }, + "quotaget", "watch", NULL } }, { "file", { COMMON_FILE_PERMS, "execute_no_trans", "entrypoint", NULL } }, diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 91c5395dd20c..586b7abd0aa7 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -37,16 +37,6 @@ struct task_security_struct { u32 sockcreate_sid; /* fscreate SID */ }; -/* - * get the subjective security ID of the current task - */ -static inline u32 current_sid(void) -{ - const struct task_security_struct *tsec = current_security(); - - return tsec->sid; -} - enum label_initialized { LABEL_INVALID, /* invalid or not initialized */ LABEL_INITIALIZED, /* initialized */ @@ -185,4 +175,14 @@ static inline struct ipc_security_struct *selinux_ipc( return ipc->security + selinux_blob_sizes.lbs_ipc; } +/* + * get the subjective security ID of the current task + */ +static inline u32 current_sid(void) +{ + const struct task_security_struct *tsec = selinux_cred(current_cred()); + + return tsec->sid; +} + #endif /* _SELINUX_OBJSEC_H_ */ diff --git a/security/selinux/netif.c b/security/selinux/netif.c index 9cb83eeee1d9..e40fecd73752 100644 --- 
a/security/selinux/netif.c +++ b/security/selinux/netif.c @@ -132,9 +132,9 @@ static void sel_netif_destroy(struct sel_netif *netif) */ static int sel_netif_sid_slow(struct net *ns, int ifindex, u32 *sid) { - int ret; + int ret = 0; struct sel_netif *netif; - struct sel_netif *new = NULL; + struct sel_netif *new; struct net_device *dev; /* NOTE: we always use init's network namespace since we don't @@ -151,32 +151,27 @@ static int sel_netif_sid_slow(struct net *ns, int ifindex, u32 *sid) netif = sel_netif_find(ns, ifindex); if (netif != NULL) { *sid = netif->nsec.sid; - ret = 0; goto out; } + + ret = security_netif_sid(&selinux_state, dev->name, sid); + if (ret != 0) + goto out; new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (new == NULL) { - ret = -ENOMEM; - goto out; + if (new) { + new->nsec.ns = ns; + new->nsec.ifindex = ifindex; + new->nsec.sid = *sid; + if (sel_netif_insert(new)) + kfree(new); } - ret = security_netif_sid(&selinux_state, dev->name, &new->nsec.sid); - if (ret != 0) - goto out; - new->nsec.ns = ns; - new->nsec.ifindex = ifindex; - ret = sel_netif_insert(new); - if (ret != 0) - goto out; - *sid = new->nsec.sid; out: spin_unlock_bh(&sel_netif_lock); dev_put(dev); - if (unlikely(ret)) { + if (unlikely(ret)) pr_warn("SELinux: failure in %s(), unable to determine network interface label (%d)\n", __func__, ifindex); - kfree(new); - } return ret; } diff --git a/security/selinux/netnode.c b/security/selinux/netnode.c index cae1fcaffd1a..9ab84efa46c7 100644 --- a/security/selinux/netnode.c +++ b/security/selinux/netnode.c @@ -189,9 +189,9 @@ static void sel_netnode_insert(struct sel_netnode *node) */ static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) { - int ret = -ENOMEM; + int ret; struct sel_netnode *node; - struct sel_netnode *new = NULL; + struct sel_netnode *new; spin_lock_bh(&sel_netnode_lock); node = sel_netnode_find(addr, family); @@ -200,38 +200,36 @@ static int sel_netnode_sid_slow(void *addr, u16 family, u32 *sid) spin_unlock_bh(&sel_netnode_lock); return 0; } + new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (new == NULL) - goto out; switch (family) { case PF_INET: ret = security_node_sid(&selinux_state, PF_INET, addr, sizeof(struct in_addr), sid); - new->nsec.addr.ipv4 = *(__be32 *)addr; + if (new) + new->nsec.addr.ipv4 = *(__be32 *)addr; break; case PF_INET6: ret = security_node_sid(&selinux_state, PF_INET6, addr, sizeof(struct in6_addr), sid); - new->nsec.addr.ipv6 = *(struct in6_addr *)addr; + if (new) + new->nsec.addr.ipv6 = *(struct in6_addr *)addr; break; default: BUG(); ret = -EINVAL; } - if (ret != 0) - goto out; + if (ret == 0 && new) { + new->nsec.family = family; + new->nsec.sid = *sid; + sel_netnode_insert(new); + } else + kfree(new); - new->nsec.family = family; - new->nsec.sid = *sid; - sel_netnode_insert(new); - -out: spin_unlock_bh(&sel_netnode_lock); - if (unlikely(ret)) { + if (unlikely(ret)) pr_warn("SELinux: failure in %s(), unable to determine network node label\n", __func__); - kfree(new); - } return ret; } diff --git a/security/selinux/netport.c b/security/selinux/netport.c index 364b6d5b8968..3f8b2c0458c8 100644 --- a/security/selinux/netport.c +++ b/security/selinux/netport.c @@ -137,9 +137,9 @@ static void sel_netport_insert(struct sel_netport *port) */ static int sel_netport_sid_slow(u8 protocol, u16 pnum, u32 *sid) { - int ret = -ENOMEM; + int ret; struct sel_netport *port; - struct sel_netport *new = NULL; + struct sel_netport *new; spin_lock_bh(&sel_netport_lock); port = sel_netport_find(protocol, pnum); @@ -148,25 +148,23 
@@ static int sel_netport_sid_slow(u8 protocol, u16 pnum, u32 *sid) spin_unlock_bh(&sel_netport_lock); return 0; } - new = kzalloc(sizeof(*new), GFP_ATOMIC); - if (new == NULL) - goto out; + ret = security_port_sid(&selinux_state, protocol, pnum, sid); if (ret != 0) goto out; - - new->psec.port = pnum; - new->psec.protocol = protocol; - new->psec.sid = *sid; - sel_netport_insert(new); + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (new) { + new->psec.port = pnum; + new->psec.protocol = protocol; + new->psec.sid = *sid; + sel_netport_insert(new); + } out: spin_unlock_bh(&sel_netport_lock); - if (unlikely(ret)) { + if (unlikely(ret)) pr_warn("SELinux: failure in %s(), unable to determine network port label\n", __func__); - kfree(new); - } return ret; } diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index f8efaa9f647c..1260f5fb766e 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -176,397 +176,6 @@ static struct policydb_compat_info *policydb_lookup_compat(int version) return info; } -/* - * Initialize the role table. - */ -static int roles_init(struct policydb *p) -{ - char *key = NULL; - int rc; - struct role_datum *role; - - role = kzalloc(sizeof(*role), GFP_KERNEL); - if (!role) - return -ENOMEM; - - rc = -EINVAL; - role->value = ++p->p_roles.nprim; - if (role->value != OBJECT_R_VAL) - goto out; - - rc = -ENOMEM; - key = kstrdup(OBJECT_R, GFP_KERNEL); - if (!key) - goto out; - - rc = hashtab_insert(p->p_roles.table, key, role); - if (rc) - goto out; - - return 0; -out: - kfree(key); - kfree(role); - return rc; -} - -static u32 filenametr_hash(struct hashtab *h, const void *k) -{ - const struct filename_trans *ft = k; - unsigned long hash; - unsigned int byte_num; - unsigned char focus; - - hash = ft->stype ^ ft->ttype ^ ft->tclass; - - byte_num = 0; - while ((focus = ft->name[byte_num++])) - hash = partial_name_hash(focus, hash); - return hash & (h->size - 1); -} - -static int filenametr_cmp(struct hashtab *h, const void *k1, const void *k2) -{ - const struct filename_trans *ft1 = k1; - const struct filename_trans *ft2 = k2; - int v; - - v = ft1->stype - ft2->stype; - if (v) - return v; - - v = ft1->ttype - ft2->ttype; - if (v) - return v; - - v = ft1->tclass - ft2->tclass; - if (v) - return v; - - return strcmp(ft1->name, ft2->name); - -} - -static u32 rangetr_hash(struct hashtab *h, const void *k) -{ - const struct range_trans *key = k; - return (key->source_type + (key->target_type << 3) + - (key->target_class << 5)) & (h->size - 1); -} - -static int rangetr_cmp(struct hashtab *h, const void *k1, const void *k2) -{ - const struct range_trans *key1 = k1, *key2 = k2; - int v; - - v = key1->source_type - key2->source_type; - if (v) - return v; - - v = key1->target_type - key2->target_type; - if (v) - return v; - - v = key1->target_class - key2->target_class; - - return v; -} - -static int (*destroy_f[SYM_NUM]) (void *key, void *datum, void *datap); - -/* - * Initialize a policy database structure. 
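The slow-path rewrites in netif.c, netnode.c and netport.c above all follow the same shape: perform the authoritative security_*_sid() lookup first, then treat the cache entry as strictly best-effort, so a failed GFP_ATOMIC allocation or insertion no longer fails the lookup itself. A minimal userspace sketch of that shape (names invented, locking omitted):

    #include <stdlib.h>

    struct cache_entry {
            int port;
            int label;
            struct cache_entry *next;
    };
    static struct cache_entry *cache;

    /* stand-in for the authoritative security server lookup */
    static int compute_label(int port, int *label)
    {
            *label = port & 0xff;   /* arbitrary demo mapping */
            return 0;
    }

    static int label_lookup_slow(int port, int *label)
    {
            struct cache_entry *e;
            int rc = compute_label(port, label);

            if (rc)
                    return rc;      /* only the lookup itself can fail */

            e = malloc(sizeof(*e)); /* cache fill is best-effort */
            if (e) {
                    e->port = port;
                    e->label = *label;
                    e->next = cache;
                    cache = e;
            }
            return 0;               /* succeed even when e == NULL */
    }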
- */ -static int policydb_init(struct policydb *p) -{ - int i, rc; - - memset(p, 0, sizeof(*p)); - - for (i = 0; i < SYM_NUM; i++) { - rc = symtab_init(&p->symtab[i], symtab_sizes[i]); - if (rc) - goto out; - } - - rc = avtab_init(&p->te_avtab); - if (rc) - goto out; - - rc = roles_init(p); - if (rc) - goto out; - - rc = cond_policydb_init(p); - if (rc) - goto out; - - p->filename_trans = hashtab_create(filenametr_hash, filenametr_cmp, (1 << 10)); - if (!p->filename_trans) { - rc = -ENOMEM; - goto out; - } - - p->range_tr = hashtab_create(rangetr_hash, rangetr_cmp, 256); - if (!p->range_tr) { - rc = -ENOMEM; - goto out; - } - - ebitmap_init(&p->filename_trans_ttypes); - ebitmap_init(&p->policycaps); - ebitmap_init(&p->permissive_map); - - return 0; -out: - hashtab_destroy(p->filename_trans); - hashtab_destroy(p->range_tr); - for (i = 0; i < SYM_NUM; i++) { - hashtab_map(p->symtab[i].table, destroy_f[i], NULL); - hashtab_destroy(p->symtab[i].table); - } - return rc; -} - -/* - * The following *_index functions are used to - * define the val_to_name and val_to_struct arrays - * in a policy database structure. The val_to_name - * arrays are used when converting security context - * structures into string representations. The - * val_to_struct arrays are used when the attributes - * of a class, role, or user are needed. - */ - -static int common_index(void *key, void *datum, void *datap) -{ - struct policydb *p; - struct common_datum *comdatum; - - comdatum = datum; - p = datap; - if (!comdatum->value || comdatum->value > p->p_commons.nprim) - return -EINVAL; - - p->sym_val_to_name[SYM_COMMONS][comdatum->value - 1] = key; - - return 0; -} - -static int class_index(void *key, void *datum, void *datap) -{ - struct policydb *p; - struct class_datum *cladatum; - - cladatum = datum; - p = datap; - if (!cladatum->value || cladatum->value > p->p_classes.nprim) - return -EINVAL; - - p->sym_val_to_name[SYM_CLASSES][cladatum->value - 1] = key; - p->class_val_to_struct[cladatum->value - 1] = cladatum; - return 0; -} - -static int role_index(void *key, void *datum, void *datap) -{ - struct policydb *p; - struct role_datum *role; - - role = datum; - p = datap; - if (!role->value - || role->value > p->p_roles.nprim - || role->bounds > p->p_roles.nprim) - return -EINVAL; - - p->sym_val_to_name[SYM_ROLES][role->value - 1] = key; - p->role_val_to_struct[role->value - 1] = role; - return 0; -} - -static int type_index(void *key, void *datum, void *datap) -{ - struct policydb *p; - struct type_datum *typdatum; - - typdatum = datum; - p = datap; - - if (typdatum->primary) { - if (!typdatum->value - || typdatum->value > p->p_types.nprim - || typdatum->bounds > p->p_types.nprim) - return -EINVAL; - p->sym_val_to_name[SYM_TYPES][typdatum->value - 1] = key; - p->type_val_to_struct_array[typdatum->value - 1] = typdatum; - } - - return 0; -} - -static int user_index(void *key, void *datum, void *datap) -{ - struct policydb *p; - struct user_datum *usrdatum; - - usrdatum = datum; - p = datap; - if (!usrdatum->value - || usrdatum->value > p->p_users.nprim - || usrdatum->bounds > p->p_users.nprim) - return -EINVAL; - - p->sym_val_to_name[SYM_USERS][usrdatum->value - 1] = key; - p->user_val_to_struct[usrdatum->value - 1] = usrdatum; - return 0; -} - -static int sens_index(void *key, void *datum, void *datap) -{ - struct policydb *p; - struct level_datum *levdatum; - - levdatum = datum; - p = datap; - - if (!levdatum->isalias) { - if (!levdatum->level->sens || - levdatum->level->sens > p->p_levels.nprim) - return -EINVAL; - 
- p->sym_val_to_name[SYM_LEVELS][levdatum->level->sens - 1] = key; - } - - return 0; -} - -static int cat_index(void *key, void *datum, void *datap) -{ - struct policydb *p; - struct cat_datum *catdatum; - - catdatum = datum; - p = datap; - - if (!catdatum->isalias) { - if (!catdatum->value || catdatum->value > p->p_cats.nprim) - return -EINVAL; - - p->sym_val_to_name[SYM_CATS][catdatum->value - 1] = key; - } - - return 0; -} - -static int (*index_f[SYM_NUM]) (void *key, void *datum, void *datap) = -{ - common_index, - class_index, - role_index, - type_index, - user_index, - cond_index_bool, - sens_index, - cat_index, -}; - -#ifdef DEBUG_HASHES -static void hash_eval(struct hashtab *h, const char *hash_name) -{ - struct hashtab_info info; - - hashtab_stat(h, &info); - pr_debug("SELinux: %s: %d entries and %d/%d buckets used, " - "longest chain length %d\n", hash_name, h->nel, - info.slots_used, h->size, info.max_chain_len); -} - -static void symtab_hash_eval(struct symtab *s) -{ - int i; - - for (i = 0; i < SYM_NUM; i++) - hash_eval(s[i].table, symtab_name[i]); -} - -#else -static inline void hash_eval(struct hashtab *h, char *hash_name) -{ -} -#endif - -/* - * Define the other val_to_name and val_to_struct arrays - * in a policy database structure. - * - * Caller must clean up on failure. - */ -static int policydb_index(struct policydb *p) -{ - int i, rc; - - if (p->mls_enabled) - pr_debug("SELinux: %d users, %d roles, %d types, %d bools, %d sens, %d cats\n", - p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim, - p->p_bools.nprim, p->p_levels.nprim, p->p_cats.nprim); - else - pr_debug("SELinux: %d users, %d roles, %d types, %d bools\n", - p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim, - p->p_bools.nprim); - - pr_debug("SELinux: %d classes, %d rules\n", - p->p_classes.nprim, p->te_avtab.nel); - -#ifdef DEBUG_HASHES - avtab_hash_eval(&p->te_avtab, "rules"); - symtab_hash_eval(p->symtab); -#endif - - p->class_val_to_struct = kcalloc(p->p_classes.nprim, - sizeof(*p->class_val_to_struct), - GFP_KERNEL); - if (!p->class_val_to_struct) - return -ENOMEM; - - p->role_val_to_struct = kcalloc(p->p_roles.nprim, - sizeof(*p->role_val_to_struct), - GFP_KERNEL); - if (!p->role_val_to_struct) - return -ENOMEM; - - p->user_val_to_struct = kcalloc(p->p_users.nprim, - sizeof(*p->user_val_to_struct), - GFP_KERNEL); - if (!p->user_val_to_struct) - return -ENOMEM; - - p->type_val_to_struct_array = kvcalloc(p->p_types.nprim, - sizeof(*p->type_val_to_struct_array), - GFP_KERNEL); - if (!p->type_val_to_struct_array) - return -ENOMEM; - - rc = cond_init_bool_indexes(p); - if (rc) - goto out; - - for (i = 0; i < SYM_NUM; i++) { - p->sym_val_to_name[i] = kvcalloc(p->symtab[i].nprim, - sizeof(char *), - GFP_KERNEL); - if (!p->sym_val_to_name[i]) - return -ENOMEM; - - rc = hashtab_map(p->symtab[i].table, index_f[i], p); - if (rc) - goto out; - } - rc = 0; -out: - return rc; -} - /* * The following *_destroy functions are used to * free any memory allocated for each kind of @@ -723,6 +332,7 @@ static int (*destroy_f[SYM_NUM]) (void *key, void *datum, void *datap) = static int filenametr_destroy(void *key, void *datum, void *p) { struct filename_trans *ft = key; + kfree(ft->name); kfree(key); kfree(datum); @@ -733,6 +343,7 @@ static int filenametr_destroy(void *key, void *datum, void *p) static int range_tr_destroy(void *key, void *datum, void *p) { struct mls_range *rt = datum; + kfree(key); ebitmap_destroy(&rt->level[0].cat); ebitmap_destroy(&rt->level[1].cat); @@ -754,6 +365,397 @@ static void 
ocontext_destroy(struct ocontext *c, int i) kfree(c); } +/* + * Initialize the role table. + */ +static int roles_init(struct policydb *p) +{ + char *key = NULL; + int rc; + struct role_datum *role; + + role = kzalloc(sizeof(*role), GFP_KERNEL); + if (!role) + return -ENOMEM; + + rc = -EINVAL; + role->value = ++p->p_roles.nprim; + if (role->value != OBJECT_R_VAL) + goto out; + + rc = -ENOMEM; + key = kstrdup(OBJECT_R, GFP_KERNEL); + if (!key) + goto out; + + rc = hashtab_insert(p->p_roles.table, key, role); + if (rc) + goto out; + + return 0; +out: + kfree(key); + kfree(role); + return rc; +} + +static u32 filenametr_hash(struct hashtab *h, const void *k) +{ + const struct filename_trans *ft = k; + unsigned long hash; + unsigned int byte_num; + unsigned char focus; + + hash = ft->stype ^ ft->ttype ^ ft->tclass; + + byte_num = 0; + while ((focus = ft->name[byte_num++])) + hash = partial_name_hash(focus, hash); + return hash & (h->size - 1); +} + +static int filenametr_cmp(struct hashtab *h, const void *k1, const void *k2) +{ + const struct filename_trans *ft1 = k1; + const struct filename_trans *ft2 = k2; + int v; + + v = ft1->stype - ft2->stype; + if (v) + return v; + + v = ft1->ttype - ft2->ttype; + if (v) + return v; + + v = ft1->tclass - ft2->tclass; + if (v) + return v; + + return strcmp(ft1->name, ft2->name); + +} + +static u32 rangetr_hash(struct hashtab *h, const void *k) +{ + const struct range_trans *key = k; + + return (key->source_type + (key->target_type << 3) + + (key->target_class << 5)) & (h->size - 1); +} + +static int rangetr_cmp(struct hashtab *h, const void *k1, const void *k2) +{ + const struct range_trans *key1 = k1, *key2 = k2; + int v; + + v = key1->source_type - key2->source_type; + if (v) + return v; + + v = key1->target_type - key2->target_type; + if (v) + return v; + + v = key1->target_class - key2->target_class; + + return v; +} + +/* + * Initialize a policy database structure. + */ +static int policydb_init(struct policydb *p) +{ + int i, rc; + + memset(p, 0, sizeof(*p)); + + for (i = 0; i < SYM_NUM; i++) { + rc = symtab_init(&p->symtab[i], symtab_sizes[i]); + if (rc) + goto out; + } + + rc = avtab_init(&p->te_avtab); + if (rc) + goto out; + + rc = roles_init(p); + if (rc) + goto out; + + rc = cond_policydb_init(p); + if (rc) + goto out; + + p->filename_trans = hashtab_create(filenametr_hash, filenametr_cmp, + (1 << 10)); + if (!p->filename_trans) { + rc = -ENOMEM; + goto out; + } + + p->range_tr = hashtab_create(rangetr_hash, rangetr_cmp, 256); + if (!p->range_tr) { + rc = -ENOMEM; + goto out; + } + + ebitmap_init(&p->filename_trans_ttypes); + ebitmap_init(&p->policycaps); + ebitmap_init(&p->permissive_map); + + return 0; +out: + hashtab_destroy(p->filename_trans); + hashtab_destroy(p->range_tr); + for (i = 0; i < SYM_NUM; i++) { + hashtab_map(p->symtab[i].table, destroy_f[i], NULL); + hashtab_destroy(p->symtab[i].table); + } + return rc; +} + +/* + * The following *_index functions are used to + * define the val_to_name and val_to_struct arrays + * in a policy database structure. The val_to_name + * arrays are used when converting security context + * structures into string representations. The + * val_to_struct arrays are used when the attributes + * of a class, role, or user are needed. 
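+ * (Values are 1-based in the policydb, so value N lands in slot
+ * N - 1; each *_index callback below range-checks the value before
+ * filling its slot.)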
+ */ + +static int common_index(void *key, void *datum, void *datap) +{ + struct policydb *p; + struct common_datum *comdatum; + + comdatum = datum; + p = datap; + if (!comdatum->value || comdatum->value > p->p_commons.nprim) + return -EINVAL; + + p->sym_val_to_name[SYM_COMMONS][comdatum->value - 1] = key; + + return 0; +} + +static int class_index(void *key, void *datum, void *datap) +{ + struct policydb *p; + struct class_datum *cladatum; + + cladatum = datum; + p = datap; + if (!cladatum->value || cladatum->value > p->p_classes.nprim) + return -EINVAL; + + p->sym_val_to_name[SYM_CLASSES][cladatum->value - 1] = key; + p->class_val_to_struct[cladatum->value - 1] = cladatum; + return 0; +} + +static int role_index(void *key, void *datum, void *datap) +{ + struct policydb *p; + struct role_datum *role; + + role = datum; + p = datap; + if (!role->value + || role->value > p->p_roles.nprim + || role->bounds > p->p_roles.nprim) + return -EINVAL; + + p->sym_val_to_name[SYM_ROLES][role->value - 1] = key; + p->role_val_to_struct[role->value - 1] = role; + return 0; +} + +static int type_index(void *key, void *datum, void *datap) +{ + struct policydb *p; + struct type_datum *typdatum; + + typdatum = datum; + p = datap; + + if (typdatum->primary) { + if (!typdatum->value + || typdatum->value > p->p_types.nprim + || typdatum->bounds > p->p_types.nprim) + return -EINVAL; + p->sym_val_to_name[SYM_TYPES][typdatum->value - 1] = key; + p->type_val_to_struct[typdatum->value - 1] = typdatum; + } + + return 0; +} + +static int user_index(void *key, void *datum, void *datap) +{ + struct policydb *p; + struct user_datum *usrdatum; + + usrdatum = datum; + p = datap; + if (!usrdatum->value + || usrdatum->value > p->p_users.nprim + || usrdatum->bounds > p->p_users.nprim) + return -EINVAL; + + p->sym_val_to_name[SYM_USERS][usrdatum->value - 1] = key; + p->user_val_to_struct[usrdatum->value - 1] = usrdatum; + return 0; +} + +static int sens_index(void *key, void *datum, void *datap) +{ + struct policydb *p; + struct level_datum *levdatum; + + levdatum = datum; + p = datap; + + if (!levdatum->isalias) { + if (!levdatum->level->sens || + levdatum->level->sens > p->p_levels.nprim) + return -EINVAL; + + p->sym_val_to_name[SYM_LEVELS][levdatum->level->sens - 1] = key; + } + + return 0; +} + +static int cat_index(void *key, void *datum, void *datap) +{ + struct policydb *p; + struct cat_datum *catdatum; + + catdatum = datum; + p = datap; + + if (!catdatum->isalias) { + if (!catdatum->value || catdatum->value > p->p_cats.nprim) + return -EINVAL; + + p->sym_val_to_name[SYM_CATS][catdatum->value - 1] = key; + } + + return 0; +} + +static int (*index_f[SYM_NUM]) (void *key, void *datum, void *datap) = +{ + common_index, + class_index, + role_index, + type_index, + user_index, + cond_index_bool, + sens_index, + cat_index, +}; + +#ifdef DEBUG_HASHES +static void hash_eval(struct hashtab *h, const char *hash_name) +{ + struct hashtab_info info; + + hashtab_stat(h, &info); + pr_debug("SELinux: %s: %d entries and %d/%d buckets used, longest chain length %d\n", + hash_name, h->nel, info.slots_used, h->size, + info.max_chain_len); +} + +static void symtab_hash_eval(struct symtab *s) +{ + int i; + + for (i = 0; i < SYM_NUM; i++) + hash_eval(s[i].table, symtab_name[i]); +} + +#else +static inline void hash_eval(struct hashtab *h, char *hash_name) +{ +} +#endif + +/* + * Define the other val_to_name and val_to_struct arrays + * in a policy database structure. + * + * Caller must clean up on failure. 
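+ * (An allocation failure below simply returns -ENOMEM without
+ * unwinding; the caller is expected to dispose of the partially
+ * built tables, typically via policydb_destroy().)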
+ */ +static int policydb_index(struct policydb *p) +{ + int i, rc; + + if (p->mls_enabled) + pr_debug("SELinux: %d users, %d roles, %d types, %d bools, %d sens, %d cats\n", + p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim, + p->p_bools.nprim, p->p_levels.nprim, p->p_cats.nprim); + else + pr_debug("SELinux: %d users, %d roles, %d types, %d bools\n", + p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim, + p->p_bools.nprim); + + pr_debug("SELinux: %d classes, %d rules\n", + p->p_classes.nprim, p->te_avtab.nel); + +#ifdef DEBUG_HASHES + avtab_hash_eval(&p->te_avtab, "rules"); + symtab_hash_eval(p->symtab); +#endif + + p->class_val_to_struct = kcalloc(p->p_classes.nprim, + sizeof(*p->class_val_to_struct), + GFP_KERNEL); + if (!p->class_val_to_struct) + return -ENOMEM; + + p->role_val_to_struct = kcalloc(p->p_roles.nprim, + sizeof(*p->role_val_to_struct), + GFP_KERNEL); + if (!p->role_val_to_struct) + return -ENOMEM; + + p->user_val_to_struct = kcalloc(p->p_users.nprim, + sizeof(*p->user_val_to_struct), + GFP_KERNEL); + if (!p->user_val_to_struct) + return -ENOMEM; + + p->type_val_to_struct = kvcalloc(p->p_types.nprim, + sizeof(*p->type_val_to_struct), + GFP_KERNEL); + if (!p->type_val_to_struct) + return -ENOMEM; + + rc = cond_init_bool_indexes(p); + if (rc) + goto out; + + for (i = 0; i < SYM_NUM; i++) { + p->sym_val_to_name[i] = kvcalloc(p->symtab[i].nprim, + sizeof(char *), + GFP_KERNEL); + if (!p->sym_val_to_name[i]) + return -ENOMEM; + + rc = hashtab_map(p->symtab[i].table, index_f[i], p); + if (rc) + goto out; + } + rc = 0; +out: + return rc; +} + /* * Free any memory allocated by a policy database structure. */ @@ -777,7 +779,7 @@ void policydb_destroy(struct policydb *p) kfree(p->class_val_to_struct); kfree(p->role_val_to_struct); kfree(p->user_val_to_struct); - kvfree(p->type_val_to_struct_array); + kvfree(p->type_val_to_struct); avtab_destroy(&p->te_avtab); @@ -1722,7 +1724,7 @@ static int type_bounds_sanity_check(void *key, void *datum, void *datap) return -EINVAL; } - upper = p->type_val_to_struct_array[upper->bounds - 1]; + upper = p->type_val_to_struct[upper->bounds - 1]; BUG_ON(!upper); if (upper->attribute) { diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h index fcc6366b447f..162d0e79b85b 100644 --- a/security/selinux/ss/policydb.h +++ b/security/selinux/ss/policydb.h @@ -253,7 +253,7 @@ struct policydb { struct class_datum **class_val_to_struct; struct role_datum **role_val_to_struct; struct user_datum **user_val_to_struct; - struct type_datum **type_val_to_struct_array; + struct type_datum **type_val_to_struct; /* type enforcement access vectors and transitions */ struct avtab te_avtab; diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index d61563a3695e..3a29e7c24ba9 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -542,13 +542,13 @@ static void type_attribute_bounds_av(struct policydb *policydb, struct type_datum *target; u32 masked = 0; - source = policydb->type_val_to_struct_array[scontext->type - 1]; + source = policydb->type_val_to_struct[scontext->type - 1]; BUG_ON(!source); if (!source->bounds) return; - target = policydb->type_val_to_struct_array[tcontext->type - 1]; + target = policydb->type_val_to_struct[tcontext->type - 1]; BUG_ON(!target); memset(&lo_avd, 0, sizeof(lo_avd)); @@ -891,7 +891,7 @@ int security_bounded_transition(struct selinux_state *state, index = new_context->type; while (true) { - type = policydb->type_val_to_struct_array[index - 1]; + type = 
policydb->type_val_to_struct[index - 1];
 		BUG_ON(!type);
 
 		/* not bounded anymore */
diff --git a/security/selinux/ss/sidtab.c b/security/selinux/ss/sidtab.c
index 1f0a6eaa2d6a..7d49994e8d5f 100644
--- a/security/selinux/ss/sidtab.c
+++ b/security/selinux/ss/sidtab.c
@@ -12,7 +12,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
-#include <linux/atomic.h>
+#include <asm/barrier.h>
 #include "flask.h"
 #include "security.h"
 #include "sidtab.h"
@@ -23,14 +23,14 @@ int sidtab_init(struct sidtab *s)
 
 	memset(s->roots, 0, sizeof(s->roots));
 
+	/* max count is SIDTAB_MAX so valid index is always < SIDTAB_MAX */
 	for (i = 0; i < SIDTAB_RCACHE_SIZE; i++)
-		atomic_set(&s->rcache[i], -1);
+		s->rcache[i] = SIDTAB_MAX;
 
 	for (i = 0; i < SECINITSID_NUM; i++)
 		s->isids[i].set = 0;
 
-	atomic_set(&s->count, 0);
-
+	s->count = 0;
 	s->convert = NULL;
 
 	spin_lock_init(&s->lock);
@@ -130,14 +130,12 @@ static struct context *sidtab_do_lookup(struct sidtab *s, u32 index, int alloc)
 
 static struct context *sidtab_lookup(struct sidtab *s, u32 index)
 {
-	u32 count = (u32)atomic_read(&s->count);
+	/* read entries only after reading count */
+	u32 count = smp_load_acquire(&s->count);
 
 	if (index >= count)
 		return NULL;
 
-	/* read entries after reading count */
-	smp_rmb();
-
 	return sidtab_do_lookup(s, index, 0);
 }
 
@@ -210,10 +208,10 @@ static int sidtab_find_context(union sidtab_entry_inner entry,
 static void sidtab_rcache_update(struct sidtab *s, u32 index, u32 pos)
 {
 	while (pos > 0) {
-		atomic_set(&s->rcache[pos], atomic_read(&s->rcache[pos - 1]));
+		WRITE_ONCE(s->rcache[pos], READ_ONCE(s->rcache[pos - 1]));
 		--pos;
 	}
-	atomic_set(&s->rcache[0], (int)index);
+	WRITE_ONCE(s->rcache[0], index);
 }
 
 static void sidtab_rcache_push(struct sidtab *s, u32 index)
@@ -227,14 +225,14 @@ static int sidtab_rcache_search(struct sidtab *s, struct context *context,
 	u32 i;
 
 	for (i = 0; i < SIDTAB_RCACHE_SIZE; i++) {
-		int v = atomic_read(&s->rcache[i]);
+		u32 v = READ_ONCE(s->rcache[i]);
 
-		if (v < 0)
+		if (v >= SIDTAB_MAX)
 			continue;
 
-		if (context_cmp(sidtab_do_lookup(s, (u32)v, 0), context)) {
-			sidtab_rcache_update(s, (u32)v, i);
-			*index = (u32)v;
+		if (context_cmp(sidtab_do_lookup(s, v, 0), context)) {
+			sidtab_rcache_update(s, v, i);
+			*index = v;
 			return 0;
 		}
 	}
@@ -245,8 +243,7 @@ static int sidtab_reverse_lookup(struct sidtab *s, struct context *context,
 				 u32 *index)
 {
 	unsigned long flags;
-	u32 count = (u32)atomic_read(&s->count);
-	u32 count_locked, level, pos;
+	u32 count, count_locked, level, pos;
 	struct sidtab_convert_params *convert;
 	struct context *dst, *dst_convert;
 	int rc;
@@ -255,11 +252,10 @@ static int sidtab_reverse_lookup(struct sidtab *s, struct context *context,
 	if (rc == 0)
 		return 0;
 
+	/* read entries only after reading count */
+	count = smp_load_acquire(&s->count);
 	level = sidtab_level_from_count(count);
 
-	/* read entries after reading count */
-	smp_rmb();
-
 	pos = 0;
 	rc = sidtab_find_context(s->roots[level], &pos, count, level,
 				 context, index);
@@ -272,7 +268,7 @@ static int sidtab_reverse_lookup(struct sidtab *s, struct context *context,
 	spin_lock_irqsave(&s->lock, flags);
 
 	convert = s->convert;
-	count_locked = (u32)atomic_read(&s->count);
+	count_locked = s->count;
 	level = sidtab_level_from_count(count_locked);
 
 	/* if count has changed before we acquired the lock, then catch up */
@@ -320,7 +316,7 @@ static int sidtab_reverse_lookup(struct sidtab *s, struct context *context,
 		}
 
 		/* at this point we know the insert won't fail */
-		atomic_set(&convert->target->count, count + 1);
+		convert->target->count = count + 1;
 	}
 
 	if (context->len)
@@ -331,9 +327,7 @@ static int 
sidtab_reverse_lookup(struct sidtab *s, struct context *context, *index = count; /* write entries before writing new count */ - smp_wmb(); - - atomic_set(&s->count, count + 1); + smp_store_release(&s->count, count + 1); rc = 0; out_unlock: @@ -423,7 +417,7 @@ int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params) return -EBUSY; } - count = (u32)atomic_read(&s->count); + count = s->count; level = sidtab_level_from_count(count); /* allocate last leaf in the new sidtab (to avoid race with @@ -436,7 +430,7 @@ int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params) } /* set count in case no new entries are added during conversion */ - atomic_set(¶ms->target->count, count); + params->target->count = count; /* enable live convert of new entries */ s->convert = params; diff --git a/security/selinux/ss/sidtab.h b/security/selinux/ss/sidtab.h index bbd5c0d1f3bd..1f4763141aa1 100644 --- a/security/selinux/ss/sidtab.h +++ b/security/selinux/ss/sidtab.h @@ -40,8 +40,8 @@ union sidtab_entry_inner { #define SIDTAB_LEAF_ENTRIES \ (SIDTAB_NODE_ALLOC_SIZE / sizeof(struct sidtab_entry_leaf)) -#define SIDTAB_MAX_BITS 31 /* limited to INT_MAX due to atomic_t range */ -#define SIDTAB_MAX (((u32)1 << SIDTAB_MAX_BITS) - 1) +#define SIDTAB_MAX_BITS 32 +#define SIDTAB_MAX U32_MAX /* ensure enough tree levels for SIDTAB_MAX entries */ #define SIDTAB_MAX_LEVEL \ DIV_ROUND_UP(SIDTAB_MAX_BITS - size_to_shift(SIDTAB_LEAF_ENTRIES), \ @@ -69,13 +69,22 @@ struct sidtab_convert_params { #define SIDTAB_RCACHE_SIZE 3 struct sidtab { + /* + * lock-free read access only for as many items as a prior read of + * 'count' + */ union sidtab_entry_inner roots[SIDTAB_MAX_LEVEL + 1]; - atomic_t count; + /* + * access atomically via {READ|WRITE}_ONCE(); only increment under + * spinlock + */ + u32 count; + /* access only under spinlock */ struct sidtab_convert_params *convert; spinlock_t lock; - /* reverse lookup cache */ - atomic_t rcache[SIDTAB_RCACHE_SIZE]; + /* reverse lookup cache - access atomically via {READ|WRITE}_ONCE() */ + u32 rcache[SIDTAB_RCACHE_SIZE]; /* index == SID - 1 (no entry for SECSID_NULL) */ struct sidtab_isid_entry isids[SECINITSID_NUM]; diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index f1c93a7be9ec..38ac3da4e791 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -465,7 +465,7 @@ char *smk_parse_smack(const char *string, int len) if (i == 0 || i >= SMK_LONGLABEL) return ERR_PTR(-EINVAL); - smack = kzalloc(i + 1, GFP_KERNEL); + smack = kzalloc(i + 1, GFP_NOFS); if (smack == NULL) return ERR_PTR(-ENOMEM); @@ -500,7 +500,7 @@ int smk_netlbl_mls(int level, char *catset, struct netlbl_lsm_secattr *sap, if ((m & *cp) == 0) continue; rc = netlbl_catmap_setbit(&sap->attr.mls.cat, - cat, GFP_KERNEL); + cat, GFP_NOFS); if (rc < 0) { netlbl_catmap_free(sap->attr.mls.cat); return rc; @@ -536,7 +536,7 @@ struct smack_known *smk_import_entry(const char *string, int len) if (skp != NULL) goto freeout; - skp = kzalloc(sizeof(*skp), GFP_KERNEL); + skp = kzalloc(sizeof(*skp), GFP_NOFS); if (skp == NULL) { skp = ERR_PTR(-ENOMEM); goto freeout; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 4c5e5a438f8b..abeb09c30633 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -288,7 +288,7 @@ static struct smack_known *smk_fetch(const char *name, struct inode *ip, if (!(ip->i_opflags & IOP_XATTR)) return ERR_PTR(-EOPNOTSUPP); - buffer = kzalloc(SMK_LONGLABEL, GFP_KERNEL); + buffer = 
kzalloc(SMK_LONGLABEL, GFP_NOFS); if (buffer == NULL) return ERR_PTR(-ENOMEM); @@ -307,7 +307,7 @@ static struct smack_known *smk_fetch(const char *name, struct inode *ip, /** * init_inode_smack - initialize an inode security blob - * @isp: the blob to initialize + * @inode: inode to extract the info from * @skp: a pointer to the Smack label entry to use in the blob * */ @@ -509,7 +509,7 @@ static int smack_ptrace_traceme(struct task_struct *ptp) /** * smack_syslog - Smack approval on syslog - * @type: message type + * @typefrom_file: unused * * Returns 0 on success, error code otherwise. */ @@ -765,7 +765,7 @@ static int smack_sb_eat_lsm_opts(char *options, void **mnt_opts) /** * smack_set_mnt_opts - set Smack specific mount options * @sb: the file system superblock - * @opts: Smack mount options + * @mnt_opts: Smack mount options * @kern_flags: mount option from kernel space or user space * @set_kern_flags: where to store converted mount opts * @@ -937,7 +937,8 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm) if (rc != 0) return rc; - } else if (bprm->unsafe) + } + if (bprm->unsafe & ~LSM_UNSAFE_PTRACE) return -EPERM; bsp->smk_task = isp->smk_task; @@ -958,7 +959,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm) * smack_inode_alloc_security - allocate an inode blob * @inode: the inode in need of a blob * - * Returns 0 if it gets a blob, -ENOMEM otherwise + * Returns 0 */ static int smack_inode_alloc_security(struct inode *inode) { @@ -1164,7 +1165,7 @@ static int smack_inode_rename(struct inode *old_inode, * * This is the important Smack hook. * - * Returns 0 if access is permitted, -EACCES otherwise + * Returns 0 if access is permitted, an error code otherwise */ static int smack_inode_permission(struct inode *inode, int mask) { @@ -1222,8 +1223,7 @@ static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr) /** * smack_inode_getattr - Smack check for getting attributes - * @mnt: vfsmount of the object - * @dentry: the object + * @path: path to extract the info from * * Returns 0 if access is permitted, an error code otherwise */ @@ -1870,14 +1870,13 @@ static int smack_file_receive(struct file *file) /** * smack_file_open - Smack dentry open processing * @file: the object - * @cred: task credential * * Set the security blob in the file structure. * Allow the open only if the task has read access. There are * many read operations (e.g. fstat) that you can do with an * fd even if you have the file open write-only. * - * Returns 0 + * Returns 0 if current has access, error code otherwise */ static int smack_file_open(struct file *file) { @@ -1900,7 +1899,7 @@ static int smack_file_open(struct file *file) /** * smack_cred_alloc_blank - "allocate" blank task-level security credentials - * @new: the new credentials + * @cred: the new credentials * @gfp: the atomicity of any memory allocations * * Prepare a blank set of credentials for modification. This must allocate all @@ -1983,7 +1982,7 @@ static void smack_cred_transfer(struct cred *new, const struct cred *old) /** * smack_cred_getsecid - get the secid corresponding to a creds structure - * @c: the object creds + * @cred: the object creds * @secid: where to put the result * * Sets the secid to contain a u32 version of the smack label. 
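A note on the GFP_NOFS conversions in smk_parse_smack(), smk_netlbl_mls(), smk_import_entry() and smk_fetch() above: these label allocations can be reached while inode security state is being set up (under inode_smack locks), where a GFP_KERNEL allocation could recurse into filesystem reclaim and deadlock; GFP_NOFS forbids that recursion. For comparison, a minimal sketch of the scoped alternative is below. The helper name is hypothetical and memalloc_nofs_save()/memalloc_nofs_restore() from <linux/sched/mm.h> are not used by this patch; they simply make every allocation in the window behave as GFP_NOFS:

#include <linux/sched/mm.h>
#include <linux/slab.h>

/* Hypothetical helper: allocations inside the save/restore window are
 * implicitly treated as GFP_NOFS, so the flag would not have to be
 * threaded through each call site as done in the hunks above.
 */
static void *label_alloc_nofs(size_t len)
{
	unsigned int nofs_flags = memalloc_nofs_save();
	void *p = kzalloc(len, GFP_KERNEL);

	memalloc_nofs_restore(nofs_flags);
	return p;
}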
@@ -2140,8 +2139,6 @@ static int smack_task_getioprio(struct task_struct *p)
 /**
  * smack_task_setscheduler - Smack check on setting scheduler
  * @p: the task object
- * @policy: unused
- * @lp: unused
  *
  * Return 0 if read access is permitted
  */
@@ -2611,8 +2608,9 @@ static void smk_ipv6_port_label(struct socket *sock, struct sockaddr *address)
 /**
  * smk_ipv6_port_check - check Smack port access
- * @sock: socket
+ * @sk: socket
  * @address: address
+ * @act: the action being taken
  *
  * Create or update the port list entry
  */
@@ -2782,7 +2780,7 @@ static int smack_socket_post_create(struct socket *sock, int family,
  *
  * Cross reference the peer labels for SO_PEERSEC
  *
- * Returns 0 on success, and error code otherwise
+ * Returns 0
  */
 static int smack_socket_socketpair(struct socket *socka,
 				   struct socket *sockb)
@@ -3014,13 +3012,13 @@ static int smack_shm_shmctl(struct kern_ipc_perm *isp, int cmd)
  *
  * Returns 0 if current has the requested access, error code otherwise
  */
-static int smack_shm_shmat(struct kern_ipc_perm *ipc, char __user *shmaddr,
+static int smack_shm_shmat(struct kern_ipc_perm *isp, char __user *shmaddr,
 			   int shmflg)
 {
 	int may;
 
 	may = smack_flags_to_may(shmflg);
-	return smk_curacc_shm(ipc, may);
+	return smk_curacc_shm(isp, may);
 }
 
 /**
@@ -3925,6 +3923,8 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		skp = smack_ipv6host_label(&sadd);
 		if (skp == NULL)
 			skp = smack_net_ambient;
+		if (skb == NULL)
+			break;
 #ifdef CONFIG_AUDIT
 		smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
 		ad.a.u.net->family = family;
@@ -4762,7 +4762,7 @@ static __init void init_smack_known_list(void)
 /**
  * smack_init - initialize the smack system
  *
- * Returns 0
+ * Returns 0 on success, -ENOMEM if there's no memory
  */
 static __init int smack_init(void)
diff --git a/sound/firewire/dice/dice-alesis.c b/sound/firewire/dice/dice-alesis.c
index 218292bdace6..f5b325263b67 100644
--- a/sound/firewire/dice/dice-alesis.c
+++ b/sound/firewire/dice/dice-alesis.c
@@ -15,7 +15,7 @@ alesis_io14_tx_pcm_chs[MAX_STREAMS][SND_DICE_RATE_MODE_COUNT] = {
 static const unsigned int
 alesis_io26_tx_pcm_chs[MAX_STREAMS][SND_DICE_RATE_MODE_COUNT] = {
-	{10, 10, 8},	/* Tx0 = Analog + S/PDIF. */
+	{10, 10, 4},	/* Tx0 = Analog + S/PDIF. */
 	{16, 8, 0},	/* Tx1 = ADAT1 + ADAT2.
*/ }; diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 91e71be42fa4..240f4ca76391 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2485,8 +2485,7 @@ static const struct pci_device_id azx_ids[] = { AZX_DCAPS_PM_RUNTIME }, /* AMD Raven */ { PCI_DEVICE(0x1022, 0x15e3), - .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_ATI_SB | - AZX_DCAPS_PM_RUNTIME }, + .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_AMD_SB }, /* ATI HDMI */ { PCI_DEVICE(0x1002, 0x0002), .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS }, diff --git a/sound/pci/hda/patch_analog.c b/sound/pci/hda/patch_analog.c index e283966bdbb1..bc9dd8e6fd86 100644 --- a/sound/pci/hda/patch_analog.c +++ b/sound/pci/hda/patch_analog.c @@ -357,6 +357,7 @@ static const struct hda_fixup ad1986a_fixups[] = { static const struct snd_pci_quirk ad1986a_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x30af, "HP B2800", AD1986A_FIXUP_LAPTOP_IMIC), + SND_PCI_QUIRK(0x1043, 0x1153, "ASUS M9V", AD1986A_FIXUP_LAPTOP_IMIC), SND_PCI_QUIRK(0x1043, 0x1443, "ASUS Z99He", AD1986A_FIXUP_EAPD), SND_PCI_QUIRK(0x1043, 0x1447, "ASUS A8JN", AD1986A_FIXUP_EAPD), SND_PCI_QUIRK_MASK(0x1043, 0xff00, 0x8100, "ASUS P5", AD1986A_FIXUP_3STACK), diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index da1695418731..b000b36ac3c6 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5817,6 +5817,7 @@ enum { ALC292_FIXUP_DELL_E7X, ALC292_FIXUP_DISABLE_AAMIX, ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK, + ALC298_FIXUP_ALIENWARE_MIC_NO_PRESENCE, ALC298_FIXUP_DELL1_MIC_NO_PRESENCE, ALC298_FIXUP_DELL_AIO_MIC_NO_PRESENCE, ALC275_FIXUP_DELL_XPS, @@ -5871,6 +5872,7 @@ enum { ALC256_FIXUP_ASUS_MIC_NO_PRESENCE, ALC299_FIXUP_PREDATOR_SPK, ALC294_FIXUP_ASUS_INTSPK_HEADSET_MIC, + ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE, }; static const struct hda_fixup alc269_fixups[] = { @@ -6506,6 +6508,15 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC292_FIXUP_DISABLE_AAMIX }, + [ALC298_FIXUP_ALIENWARE_MIC_NO_PRESENCE] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x18, 0x01a1913c }, /* headset mic w/o jack detect */ + { } + }, + .chained_before = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE, + }, [ALC298_FIXUP_DELL1_MIC_NO_PRESENCE] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -6927,6 +6938,16 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC }, + [ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x04a11040 }, + { 0x21, 0x04211020 }, + { } + }, + .chained = true, + .chain_id = ALC256_FIXUP_ASUS_HEADSET_MODE + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -7190,6 +7211,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x9e54, "LENOVO NB", ALC269_FIXUP_LENOVO_EAPD), SND_PCI_QUIRK(0x19e5, 0x3204, "Huawei MACH-WX9", ALC256_FIXUP_HUAWEI_MACH_WX9_PINS), SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */ + SND_PCI_QUIRK(0x10ec, 0x118c, "Medion EE4254 MD62100", ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE), #if 0 /* Below is a quirk table taken from the old code. 
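The pin fixups above work by overriding the codec's default pin configuration registers before the generic parser runs. Each 32-bit value packs the jack's connectivity, location, device type, connection type, color and association, per the default-configuration encoding in the Intel HDA specification. The small stand-alone decoder below is illustrative only (not kernel code) and shows how 0x04a11040 from the Medion quirk breaks down:

#include <stdint.h>
#include <stdio.h>

/* Illustrative decoder for an HDA default pin configuration value,
 * following the field layout in the Intel HDA specification.
 */
static void decode_pincfg(uint32_t cfg)
{
	printf("connectivity=%u location=0x%02x device=0x%x type=0x%x "
	       "color=0x%x assoc=%u seq=%u\n",
	       cfg >> 30,		/* 0 = a physical jack */
	       (cfg >> 24) & 0x3f,	/* 0x04 = external, right side */
	       (cfg >> 20) & 0xf,	/* 0xa = mic in */
	       (cfg >> 16) & 0xf,	/* 0x1 = 1/8" jack */
	       (cfg >> 12) & 0xf,	/* 0x1 = black */
	       (cfg >> 4) & 0xf,	/* association group */
	       cfg & 0xf);		/* sequence within the group */
}

int main(void)
{
	decode_pincfg(0x04a11040);	/* headset mic pin from the quirk */
	return 0;
}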
@@ -7358,6 +7380,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC295_FIXUP_CHROME_BOOK, .name = "alc-chrome-book"}, {.id = ALC299_FIXUP_PREDATOR_SPK, .name = "predator-spk"}, {.id = ALC298_FIXUP_HUAWEI_MBX_STEREO, .name = "huawei-mbx-stereo"}, + {.id = ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE, .name = "alc256-medion-headset"}, {} }; #define ALC225_STANDARD_PINS \ @@ -7770,6 +7793,11 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x17, 0x90170110}, {0x1a, 0x03011020}, {0x21, 0x03211030}), + SND_HDA_PIN_QUIRK(0x10ec0298, 0x1028, "Dell", ALC298_FIXUP_ALIENWARE_MIC_NO_PRESENCE, + {0x12, 0xb7a60140}, + {0x17, 0x90170110}, + {0x1a, 0x03a11030}, + {0x21, 0x03211020}), SND_HDA_PIN_QUIRK(0x10ec0299, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, ALC225_STANDARD_PINS, {0x12, 0xb7a60130}, diff --git a/sound/soc/atmel/atmel_ssc_dai.c b/sound/soc/atmel/atmel_ssc_dai.c index 48e9eef34c0f..ca603397651c 100644 --- a/sound/soc/atmel/atmel_ssc_dai.c +++ b/sound/soc/atmel/atmel_ssc_dai.c @@ -116,19 +116,16 @@ static struct atmel_pcm_dma_params ssc_dma_params[NUM_SSC_DEVICES][2] = { static struct atmel_ssc_info ssc_info[NUM_SSC_DEVICES] = { { .name = "ssc0", - .lock = __SPIN_LOCK_UNLOCKED(ssc_info[0].lock), .dir_mask = SSC_DIR_MASK_UNUSED, .initialized = 0, }, { .name = "ssc1", - .lock = __SPIN_LOCK_UNLOCKED(ssc_info[1].lock), .dir_mask = SSC_DIR_MASK_UNUSED, .initialized = 0, }, { .name = "ssc2", - .lock = __SPIN_LOCK_UNLOCKED(ssc_info[2].lock), .dir_mask = SSC_DIR_MASK_UNUSED, .initialized = 0, }, @@ -317,13 +314,10 @@ static int atmel_ssc_startup(struct snd_pcm_substream *substream, snd_soc_dai_set_dma_data(dai, substream, dma_params); - spin_lock_irq(&ssc_p->lock); - if (ssc_p->dir_mask & dir_mask) { - spin_unlock_irq(&ssc_p->lock); + if (ssc_p->dir_mask & dir_mask) return -EBUSY; - } + ssc_p->dir_mask |= dir_mask; - spin_unlock_irq(&ssc_p->lock); return 0; } @@ -355,7 +349,6 @@ static void atmel_ssc_shutdown(struct snd_pcm_substream *substream, dir_mask = 1 << dir; - spin_lock_irq(&ssc_p->lock); ssc_p->dir_mask &= ~dir_mask; if (!ssc_p->dir_mask) { if (ssc_p->initialized) { @@ -369,7 +362,6 @@ static void atmel_ssc_shutdown(struct snd_pcm_substream *substream, ssc_p->cmr_div = ssc_p->tcmr_period = ssc_p->rcmr_period = 0; ssc_p->forced_divider = 0; } - spin_unlock_irq(&ssc_p->lock); /* Shutdown the SSC clock. 
*/ pr_debug("atmel_ssc_dai: Stopping clock\n"); diff --git a/sound/soc/atmel/atmel_ssc_dai.h b/sound/soc/atmel/atmel_ssc_dai.h index ae764cb541c7..3470b966e449 100644 --- a/sound/soc/atmel/atmel_ssc_dai.h +++ b/sound/soc/atmel/atmel_ssc_dai.h @@ -93,7 +93,6 @@ struct atmel_ssc_state { struct atmel_ssc_info { char *name; struct ssc_device *ssc; - spinlock_t lock; /* lock for dir_mask */ unsigned short dir_mask; /* 0=unused, 1=playback, 2=capture */ unsigned short initialized; /* true if SSC has been initialized */ unsigned short daifmt; diff --git a/sound/soc/codecs/pcm3168a.c b/sound/soc/codecs/pcm3168a.c index 50ed86d45c26..88b75695fbf7 100644 --- a/sound/soc/codecs/pcm3168a.c +++ b/sound/soc/codecs/pcm3168a.c @@ -21,8 +21,7 @@ #define PCM3168A_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | \ SNDRV_PCM_FMTBIT_S24_3LE | \ - SNDRV_PCM_FMTBIT_S24_LE | \ - SNDRV_PCM_FMTBIT_S32_LE) + SNDRV_PCM_FMTBIT_S24_LE) #define PCM3168A_FMT_I2S 0x0 #define PCM3168A_FMT_LEFT_J 0x1 diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index ef0b74693093..b517e4bc1b87 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -628,6 +628,16 @@ static int fsl_sai_startup(struct snd_pcm_substream *substream, FSL_SAI_CR3_TRCE_MASK, FSL_SAI_CR3_TRCE); + /* + * EDMA controller needs period size to be a multiple of + * tx/rx maxburst + */ + if (sai->soc_data->use_edma) + snd_pcm_hw_constraint_step(substream->runtime, 0, + SNDRV_PCM_HW_PARAM_PERIOD_SIZE, + tx ? sai->dma_params_tx.maxburst : + sai->dma_params_rx.maxburst); + ret = snd_pcm_hw_constraint_list(substream->runtime, 0, SNDRV_PCM_HW_PARAM_RATE, &fsl_sai_rate_constraints); @@ -1026,30 +1036,35 @@ static int fsl_sai_remove(struct platform_device *pdev) static const struct fsl_sai_soc_data fsl_sai_vf610_data = { .use_imx_pcm = false, + .use_edma = false, .fifo_depth = 32, .reg_offset = 0, }; static const struct fsl_sai_soc_data fsl_sai_imx6sx_data = { .use_imx_pcm = true, + .use_edma = false, .fifo_depth = 32, .reg_offset = 0, }; static const struct fsl_sai_soc_data fsl_sai_imx7ulp_data = { .use_imx_pcm = true, + .use_edma = false, .fifo_depth = 16, .reg_offset = 8, }; static const struct fsl_sai_soc_data fsl_sai_imx8mq_data = { .use_imx_pcm = true, + .use_edma = false, .fifo_depth = 128, .reg_offset = 8, }; static const struct fsl_sai_soc_data fsl_sai_imx8qm_data = { .use_imx_pcm = true, + .use_edma = true, .fifo_depth = 64, .reg_offset = 0, }; diff --git a/sound/soc/fsl/fsl_sai.h b/sound/soc/fsl/fsl_sai.h index b12cb578f6d0..76b15deea80c 100644 --- a/sound/soc/fsl/fsl_sai.h +++ b/sound/soc/fsl/fsl_sai.h @@ -157,6 +157,7 @@ struct fsl_sai_soc_data { bool use_imx_pcm; + bool use_edma; unsigned int fifo_depth; unsigned int reg_offset; }; diff --git a/sound/soc/sh/rcar/ssi.c b/sound/soc/sh/rcar/ssi.c index f6a7466622ea..fc5d089868df 100644 --- a/sound/soc/sh/rcar/ssi.c +++ b/sound/soc/sh/rcar/ssi.c @@ -286,6 +286,11 @@ static int rsnd_ssi_master_clk_start(struct rsnd_mod *mod, if (rsnd_ssi_is_multi_slave(mod, io)) return 0; + if (rsnd_runtime_is_tdm_split(io)) + chan = rsnd_io_converted_chan(io); + + chan = rsnd_channel_normalization(chan); + if (ssi->usrcnt > 0) { if (ssi->rate != rate) { dev_err(dev, "SSI parent/child should use same rate\n"); @@ -300,11 +305,6 @@ static int rsnd_ssi_master_clk_start(struct rsnd_mod *mod, return 0; } - if (rsnd_runtime_is_tdm_split(io)) - chan = rsnd_io_converted_chan(io); - - chan = rsnd_channel_normalization(chan); - main_rate = rsnd_ssi_clk_query(rdai, rate, chan, &idx); if (!main_rate) { dev_err(dev, "unsupported 
clock rate\n"); diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 35f48e9c5ead..88978a3036c4 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -978,7 +978,7 @@ static void soc_cleanup_component(struct snd_soc_component *component) /* For framework level robustness */ snd_soc_component_set_jack(component, NULL, NULL); - list_del(&component->card_list); + list_del_init(&component->card_list); snd_soc_dapm_free(snd_soc_component_get_dapm(component)); soc_cleanup_component_debugfs(component); component->card = NULL; diff --git a/sound/soc/ti/Kconfig b/sound/soc/ti/Kconfig index 87a9b9dd4e98..29f61053ab62 100644 --- a/sound/soc/ti/Kconfig +++ b/sound/soc/ti/Kconfig @@ -200,11 +200,18 @@ config SND_SOC_DM365_AIC3X_CODEC config SND_SOC_DM365_VOICE_CODEC bool "Voice Codec - CQ93VC" - select MFD_DAVINCI_VOICECODEC - select SND_SOC_CQ0093VC help Say Y if you want to add support for SoC On-chip voice codec endchoice +config SND_SOC_DM365_VOICE_CODEC_MODULE + def_tristate y + depends on SND_SOC_DM365_VOICE_CODEC && SND_SOC + select MFD_DAVINCI_VOICECODEC + select SND_SOC_CQ0093VC + help + The is an internal symbol needed to ensure that the codec + and MFD driver can be built as loadable modules if necessary. + endmenu diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 25faf2d3c639..fbfde996fee7 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1658,6 +1658,8 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, case 0x25ce: /* Mytek devices */ case 0x278b: /* Rotel? */ case 0x2ab6: /* T+A devices */ + case 0x3842: /* EVGA */ + case 0xc502: /* HiBy devices */ if (fp->dsd_raw) return SNDRV_PCM_FMTBIT_DSD_U32_BE; break; diff --git a/tools/hv/Build b/tools/hv/Build new file mode 100644 index 000000000000..6cf51fa4b306 --- /dev/null +++ b/tools/hv/Build @@ -0,0 +1,3 @@ +hv_kvp_daemon-y += hv_kvp_daemon.o +hv_vss_daemon-y += hv_vss_daemon.o +hv_fcopy_daemon-y += hv_fcopy_daemon.o diff --git a/tools/hv/Makefile b/tools/hv/Makefile index 5db5e62cebda..b57143d9459c 100644 --- a/tools/hv/Makefile +++ b/tools/hv/Makefile @@ -1,28 +1,55 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for Hyper-V tools - -WARNINGS = -Wall -Wextra -CFLAGS = $(WARNINGS) -g $(shell getconf LFS_CFLAGS) - -CFLAGS += -D__EXPORTED_HEADERS__ -I../../include/uapi -I../../include +include ../scripts/Makefile.include sbindir ?= /usr/sbin libexecdir ?= /usr/libexec sharedstatedir ?= /var/lib -ALL_PROGRAMS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon +ifeq ($(srctree),) +srctree := $(patsubst %/,%,$(dir $(CURDIR))) +srctree := $(patsubst %/,%,$(dir $(srctree))) +endif + +# Do not use make's built-in rules +# (this improves performance and avoids hard-to-debug behaviour); +MAKEFLAGS += -r + +override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include + +ALL_TARGETS := hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon +ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) ALL_SCRIPTS := hv_get_dhcp_info.sh hv_get_dns_info.sh hv_set_ifconfig.sh all: $(ALL_PROGRAMS) -%: %.c - $(CC) $(CFLAGS) -o $@ $^ +export srctree OUTPUT CC LD CFLAGS +include $(srctree)/tools/build/Makefile.include + +HV_KVP_DAEMON_IN := $(OUTPUT)hv_kvp_daemon-in.o +$(HV_KVP_DAEMON_IN): FORCE + $(Q)$(MAKE) $(build)=hv_kvp_daemon +$(OUTPUT)hv_kvp_daemon: $(HV_KVP_DAEMON_IN) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ + +HV_VSS_DAEMON_IN := $(OUTPUT)hv_vss_daemon-in.o +$(HV_VSS_DAEMON_IN): FORCE + $(Q)$(MAKE) $(build)=hv_vss_daemon +$(OUTPUT)hv_vss_daemon: $(HV_VSS_DAEMON_IN) + $(QUIET_LINK)$(CC) $(CFLAGS) 
$(LDFLAGS) $< -o $@ + +HV_FCOPY_DAEMON_IN := $(OUTPUT)hv_fcopy_daemon-in.o +$(HV_FCOPY_DAEMON_IN): FORCE + $(Q)$(MAKE) $(build)=hv_fcopy_daemon +$(OUTPUT)hv_fcopy_daemon: $(HV_FCOPY_DAEMON_IN) + $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $< -o $@ clean: - $(RM) hv_kvp_daemon hv_vss_daemon hv_fcopy_daemon + rm -f $(ALL_PROGRAMS) + find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.d' -delete -install: all +install: $(ALL_PROGRAMS) install -d -m 755 $(DESTDIR)$(sbindir); \ install -d -m 755 $(DESTDIR)$(libexecdir)/hypervkvpd; \ install -d -m 755 $(DESTDIR)$(sharedstatedir); \ @@ -33,3 +60,7 @@ install: all for script in $(ALL_SCRIPTS); do \ install $$script -m 755 $(DESTDIR)$(libexecdir)/hypervkvpd/$${script%.sh}; \ done + +FORCE: + +.PHONY: all install clean FORCE prepare diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h index d83763a5327c..e03b1ea23e0e 100644 --- a/tools/include/linux/rbtree.h +++ b/tools/include/linux/rbtree.h @@ -31,25 +31,9 @@ struct rb_root { struct rb_node *rb_node; }; -/* - * Leftmost-cached rbtrees. - * - * We do not cache the rightmost node based on footprint - * size vs number of potential users that could benefit - * from O(1) rb_last(). Just not worth it, users that want - * this feature can always implement the logic explicitly. - * Furthermore, users that want to cache both pointers may - * find it a bit asymmetric, but that's ok. - */ -struct rb_root_cached { - struct rb_root rb_root; - struct rb_node *rb_leftmost; -}; - #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } -#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) @@ -71,12 +55,6 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); -extern void rb_insert_color_cached(struct rb_node *, - struct rb_root_cached *, bool); -extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); -/* Same as rb_first(), but O(1) */ -#define rb_first_cached(root) (root)->rb_leftmost - /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); @@ -84,8 +62,6 @@ extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); -extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, - struct rb_root_cached *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) @@ -129,4 +105,51 @@ static inline void rb_erase_init(struct rb_node *n, struct rb_root *root) rb_erase(n, root); RB_CLEAR_NODE(n); } + +/* + * Leftmost-cached rbtrees. + * + * We do not cache the rightmost node based on footprint + * size vs number of potential users that could benefit + * from O(1) rb_last(). Just not worth it, users that want + * this feature can always implement the logic explicitly. + * Furthermore, users that want to cache both pointers may + * find it a bit asymmetric, but that's ok. 
+ */ +struct rb_root_cached { + struct rb_root rb_root; + struct rb_node *rb_leftmost; +}; + +#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } + +/* Same as rb_first(), but O(1) */ +#define rb_first_cached(root) (root)->rb_leftmost + +static inline void rb_insert_color_cached(struct rb_node *node, + struct rb_root_cached *root, + bool leftmost) +{ + if (leftmost) + root->rb_leftmost = node; + rb_insert_color(node, &root->rb_root); +} + +static inline void rb_erase_cached(struct rb_node *node, + struct rb_root_cached *root) +{ + if (root->rb_leftmost == node) + root->rb_leftmost = rb_next(node); + rb_erase(node, &root->rb_root); +} + +static inline void rb_replace_node_cached(struct rb_node *victim, + struct rb_node *new, + struct rb_root_cached *root) +{ + if (root->rb_leftmost == victim) + root->rb_leftmost = new; + rb_replace_node(victim, new, &root->rb_root); +} + #endif /* __TOOLS_LINUX_PERF_RBTREE_H */ diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index ddd01006ece5..381aa948610d 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -32,17 +32,16 @@ struct rb_augment_callbacks { void (*rotate)(struct rb_node *old, struct rb_node *new); }; -extern void __rb_insert_augmented(struct rb_node *node, - struct rb_root *root, - bool newleft, struct rb_node **leftmost, +extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); + /* * Fixup the rbtree and update the augmented information when rebalancing. * * On insertion, the user must update the augmented information on the path * leading to the inserted node, then call rb_link_node() as usual and - * rb_augment_inserted() instead of the usual rb_insert_color() call. - * If rb_augment_inserted() rebalances the rbtree, it will callback into + * rb_insert_augmented() instead of the usual rb_insert_color() call. + * If rb_insert_augmented() rebalances the rbtree, it will callback into * a user provided function to update the augmented information on the * affected subtrees. 
*/ @@ -50,7 +49,7 @@ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - __rb_insert_augmented(node, root, false, NULL, augment->rotate); + __rb_insert_augmented(node, root, augment->rotate); } static inline void @@ -58,45 +57,92 @@ rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *root, bool newleft, const struct rb_augment_callbacks *augment) { - __rb_insert_augmented(node, &root->rb_root, - newleft, &root->rb_leftmost, augment->rotate); + if (newleft) + root->rb_leftmost = node; + rb_insert_augmented(node, &root->rb_root, augment); } -#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ - rbtype, rbaugmented, rbcompute) \ +/* + * Template for declaring augmented rbtree callbacks (generic case) + * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data + */ + +#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ -rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ - rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ - rbtype augmented = rbcompute(node); \ - if (node->rbaugmented == augmented) \ + RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ + if (RBCOMPUTE(node, true)) \ break; \ - node->rbaugmented = augmented; \ - rb = rb_parent(&node->rbfield); \ + rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ -rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ -rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ - rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ - rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ - new->rbaugmented = old->rbaugmented; \ - old->rbaugmented = rbcompute(old); \ + RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ + RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ + new->RBAUGMENTED = old->RBAUGMENTED; \ + RBCOMPUTE(old, false); \ } \ -rbstatic const struct rb_augment_callbacks rbname = { \ - .propagate = rbname ## _propagate, \ - .copy = rbname ## _copy, \ - .rotate = rbname ## _rotate \ +RBSTATIC const struct rb_augment_callbacks RBNAME = { \ + .propagate = RBNAME ## _propagate, \ + .copy = RBNAME ## _copy, \ + .rotate = RBNAME ## _rotate \ }; +/* + * Template for declaring augmented rbtree callbacks, + * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. 
+ * + * RBSTATIC: 'static' or empty + * RBNAME: name of the rb_augment_callbacks structure + * RBSTRUCT: struct type of the tree nodes + * RBFIELD: name of struct rb_node field within RBSTRUCT + * RBTYPE: type of the RBAUGMENTED field + * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree + * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar + */ + +#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ + RBTYPE, RBAUGMENTED, RBCOMPUTE) \ +static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ +{ \ + RBSTRUCT *child; \ + RBTYPE max = RBCOMPUTE(node); \ + if (node->RBFIELD.rb_left) { \ + child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + if (node->RBFIELD.rb_right) { \ + child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ + if (child->RBAUGMENTED > max) \ + max = child->RBAUGMENTED; \ + } \ + if (exit && node->RBAUGMENTED == max) \ + return true; \ + node->RBAUGMENTED = max; \ + return false; \ +} \ +RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ + RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) + #define RB_RED 0 #define RB_BLACK 1 @@ -139,7 +185,6 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, - struct rb_node **leftmost, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; @@ -147,9 +192,6 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, struct rb_node *parent, *rebalance; unsigned long pc; - if (leftmost && node == *leftmost) - *leftmost = rb_next(node); - if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) @@ -249,8 +291,7 @@ static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - struct rb_node *rebalance = __rb_erase_augmented(node, root, - NULL, augment); + struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } @@ -259,11 +300,9 @@ static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { - struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root, - &root->rb_leftmost, - augment); - if (rebalance) - __rb_erase_color(rebalance, &root->rb_root, augment->rotate); + if (root->rb_leftmost == node) + root->rb_leftmost = rb_next(node); + rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _TOOLS_LINUX_RBTREE_AUGMENTED_H */ diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c index 804f145e3113..2548ff8c4d9c 100644 --- a/tools/lib/rbtree.c +++ b/tools/lib/rbtree.c @@ -83,14 +83,10 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, static __always_inline void __rb_insert(struct rb_node *node, struct rb_root *root, - bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; - if (newleft) - *leftmost = node; - while (true) { /* * Loop invariant: node is red. 
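The RB_DECLARE_CALLBACKS_MAX template above generates the full propagate/copy/rotate callback set from a single per-node scalar function. A minimal instantiation might look like the sketch below (all names hypothetical, following the interval-tree pattern of caching the maximum interval endpoint per subtree; assumes <linux/rbtree_augmented.h>):

/* Hypothetical node: an interval [start, last] plus the cached maximum
 * 'last' over the node's subtree (the RBAUGMENTED field).
 */
struct demo_node {
	struct rb_node rb;		/* RBFIELD */
	unsigned long start, last;
	unsigned long subtree_last;	/* RBAUGMENTED */
};

/* RBCOMPUTE: the per-node scalar the subtree maximum is taken over */
static inline unsigned long demo_last(struct demo_node *node)
{
	return node->last;
}

RB_DECLARE_CALLBACKS_MAX(static, demo_callbacks, struct demo_node, rb,
			 unsigned long, subtree_last, demo_last)

/* Nodes then go through the augmented helpers, e.g.:
 *	rb_insert_augmented(&node->rb, &root, &demo_callbacks);
 *	rb_erase_augmented(&node->rb, &root, &demo_callbacks);
 */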
@@ -436,34 +432,17 @@ static const struct rb_augment_callbacks dummy_callbacks = { void rb_insert_color(struct rb_node *node, struct rb_root *root) { - __rb_insert(node, root, false, NULL, dummy_rotate); + __rb_insert(node, root, dummy_rotate); } void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, root, - NULL, &dummy_callbacks); + rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); if (rebalance) ____rb_erase_color(rebalance, root, dummy_rotate); } -void rb_insert_color_cached(struct rb_node *node, - struct rb_root_cached *root, bool leftmost) -{ - __rb_insert(node, &root->rb_root, leftmost, - &root->rb_leftmost, dummy_rotate); -} - -void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) -{ - struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, &root->rb_root, - &root->rb_leftmost, &dummy_callbacks); - if (rebalance) - ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate); -} - /* * Augmented rbtree manipulation functions. * @@ -472,10 +451,9 @@ void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) */ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, - bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - __rb_insert(node, root, newleft, leftmost, augment_rotate); + __rb_insert(node, root, augment_rotate); } /* @@ -580,15 +558,6 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, __rb_change_child(victim, new, parent, root); } -void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, - struct rb_root_cached *root) -{ - rb_replace_node(victim, new, &root->rb_root); - - if (root->rb_leftmost == victim) - root->rb_leftmost = new; -} - static struct rb_node *rb_left_deepest_node(const struct rb_node *node) { for (;;) { diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index 59753b3917bb..2a9890c8395a 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -38,6 +38,7 @@ static int fact_avx = 0xFF; static unsigned long long fact_trl; static int out_format_json; static int cmd_help; +static int force_online_offline; /* clos related */ static int current_clos = -1; @@ -138,14 +139,14 @@ int out_format_is_json(void) int get_physical_package_id(int cpu) { return parse_int_file( - 1, "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", + 0, "/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu); } int get_physical_core_id(int cpu) { return parse_int_file( - 1, "/sys/devices/system/cpu/cpu%d/topology/core_id", cpu); + 0, "/sys/devices/system/cpu/cpu%d/topology/core_id", cpu); } int get_physical_die_id(int cpu) @@ -165,6 +166,26 @@ int get_topo_max_cpus(void) return topo_max_cpus; } +static void set_cpu_online_offline(int cpu, int state) +{ + char buffer[128]; + int fd; + + snprintf(buffer, sizeof(buffer), + "/sys/devices/system/cpu/cpu%d/online", cpu); + + fd = open(buffer, O_WRONLY); + if (fd < 0) + err(-1, "%s open failed", buffer); + + if (state) + write(fd, "1\n", 2); + else + write(fd, "0\n", 2); + + close(fd); +} + #define MAX_PACKAGE_COUNT 8 #define MAX_DIE_PER_PACKAGE 2 static void for_each_online_package_in_set(void (*callback)(int, void *, void *, @@ -402,6 +423,9 @@ void set_cpu_mask_from_punit_coremask(int cpu, unsigned long long core_mask, int j; for (j = 0; j < topo_max_cpus; ++j) { + if 
(!CPU_ISSET_S(j, present_cpumask_size, present_cpumask)) + continue; + if (cpu_map[j].pkg_id == pkg_id && cpu_map[j].die_id == die_id && cpu_map[j].punit_cpu_core == i) { @@ -484,7 +508,7 @@ int isst_send_mbox_command(unsigned int cpu, unsigned char command, int write = 0; int clos_id, core_id, ret = 0; - debug_printf("CLOS %d\n", cpu); + debug_printf("CPU %d\n", cpu); if (parameter & BIT(MBOX_CMD_WRITE_BIT)) { value = req_data; @@ -649,8 +673,8 @@ static void exec_on_get_ctdp_cpu(int cpu, void *arg1, void *arg2, void *arg3, if (ret) perror("get_tdp_*"); else - isst_display_result(cpu, outf, "perf-profile", (char *)arg3, - *(unsigned int *)arg4); + isst_ctdp_display_core_info(cpu, outf, arg3, + *(unsigned int *)arg4); } #define _get_tdp_level(desc, suffix, object, help) \ @@ -733,9 +757,34 @@ static void set_tdp_level_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, ret = isst_set_tdp_level(cpu, tdp_level); if (ret) perror("set_tdp_level_for_cpu"); - else + else { isst_display_result(cpu, outf, "perf-profile", "set_tdp_level", ret); + if (force_online_offline) { + struct isst_pkg_ctdp_level_info ctdp_level; + int pkg_id = get_physical_package_id(cpu); + int die_id = get_physical_die_id(cpu); + + fprintf(stderr, "Option is set to online/offline\n"); + ctdp_level.core_cpumask_size = + alloc_cpu_set(&ctdp_level.core_cpumask); + isst_get_coremask_info(cpu, tdp_level, &ctdp_level); + if (ctdp_level.cpu_count) { + int i, max_cpus = get_topo_max_cpus(); + for (i = 0; i < max_cpus; ++i) { + if (pkg_id != get_physical_package_id(i) || die_id != get_physical_die_id(i)) + continue; + if (CPU_ISSET_S(i, ctdp_level.core_cpumask_size, ctdp_level.core_cpumask)) { + fprintf(stderr, "online cpu %d\n", i); + set_cpu_online_offline(i, 1); + } else { + fprintf(stderr, "offline cpu %d\n", i); + set_cpu_online_offline(i, 0); + } + } + } + } + } } static void set_tdp_level(void) @@ -744,6 +793,8 @@ static void set_tdp_level(void) fprintf(stderr, "Set Config TDP level\n"); fprintf(stderr, "\t Arguments: -l|--level : Specify tdp level\n"); + fprintf(stderr, + "\t Optional Arguments: -o | online : online/offline for the tdp level\n"); exit(0); } @@ -1082,6 +1133,40 @@ static void dump_clos_config(void) isst_ctdp_display_information_end(outf); } +static void get_clos_info_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, + void *arg4) +{ + int enable, ret, prio_type; + + ret = isst_clos_get_clos_information(cpu, &enable, &prio_type); + if (ret) + perror("isst_clos_get_info"); + else + isst_clos_display_clos_information(cpu, outf, enable, prio_type); +} + +static void dump_clos_info(void) +{ + if (cmd_help) { + fprintf(stderr, + "Print Intel Speed Select Technology core power information\n"); + fprintf(stderr, "\tSpecify targeted cpu id with [--cpu|-c]\n"); + exit(0); + } + + if (!max_target_cpus) { + fprintf(stderr, + "Invalid target cpu. 
Specify with [-c|--cpu]\n"); + exit(0); + } + + isst_ctdp_display_information_start(outf); + for_each_online_target_cpu_in_set(get_clos_info_for_cpu, NULL, + NULL, NULL, NULL); + isst_ctdp_display_information_end(outf); + +} + static void set_clos_config_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, void *arg4) { @@ -1198,7 +1283,7 @@ static void get_clos_assoc_for_cpu(int cpu, void *arg1, void *arg2, void *arg3, if (ret) perror("isst_clos_get_assoc_status"); else - isst_display_result(cpu, outf, "core-power", "get-assoc", clos); + isst_clos_display_assoc_information(cpu, outf, clos); } static void get_clos_assoc(void) @@ -1208,13 +1293,17 @@ static void get_clos_assoc(void) fprintf(stderr, "\tSpecify targeted cpu id with [--cpu|-c]\n"); exit(0); } - if (max_target_cpus) - for_each_online_target_cpu_in_set(get_clos_assoc_for_cpu, NULL, - NULL, NULL, NULL); - else { + + if (!max_target_cpus) { fprintf(stderr, "Invalid target cpu. Specify with [-c|--cpu]\n"); + exit(0); } + + isst_ctdp_display_information_start(outf); + for_each_online_target_cpu_in_set(get_clos_assoc_for_cpu, NULL, + NULL, NULL, NULL); + isst_ctdp_display_information_end(outf); } static struct process_cmd_struct isst_cmds[] = { @@ -1231,10 +1320,11 @@ static struct process_cmd_struct isst_cmds[] = { { "turbo-freq", "info", dump_fact_config }, { "turbo-freq", "enable", set_fact_enable }, { "turbo-freq", "disable", set_fact_disable }, - { "core-power", "info", dump_clos_config }, + { "core-power", "info", dump_clos_info }, { "core-power", "enable", set_clos_enable }, { "core-power", "disable", set_clos_disable }, { "core-power", "config", set_clos_config }, + { "core-power", "get-config", dump_clos_config }, { "core-power", "assoc", set_clos_assoc }, { "core-power", "get-assoc", get_clos_assoc }, { NULL, NULL, NULL } @@ -1316,6 +1406,7 @@ static void parse_cmd_args(int argc, int start, char **argv) static struct option long_options[] = { { "bucket", required_argument, 0, 'b' }, { "level", required_argument, 0, 'l' }, + { "online", required_argument, 0, 'o' }, { "trl-type", required_argument, 0, 'r' }, { "trl", required_argument, 0, 't' }, { "help", no_argument, 0, 'h' }, @@ -1332,7 +1423,7 @@ static void parse_cmd_args(int argc, int start, char **argv) option_index = start; optind = start + 1; - while ((opt = getopt_long(argc, argv, "b:l:t:c:d:e:n:m:p:w:h", + while ((opt = getopt_long(argc, argv, "b:l:t:c:d:e:n:m:p:w:ho", long_options, &option_index)) != -1) { switch (opt) { case 'b': @@ -1344,6 +1435,9 @@ static void parse_cmd_args(int argc, int start, char **argv) case 'l': tdp_level = atoi(optarg); break; + case 'o': + force_online_offline = 1; + break; case 't': sscanf(optarg, "0x%llx", &fact_trl); break; @@ -1362,7 +1456,6 @@ static void parse_cmd_args(int argc, int start, char **argv) /* CLOS related */ case 'c': current_clos = atoi(optarg); - printf("clos %d\n", current_clos); break; case 'd': clos_desired = atoi(optarg); @@ -1433,6 +1526,7 @@ static void core_power_help(void) printf("\tenable\n"); printf("\tdisable\n"); printf("\tconfig\n"); + printf("\tget-config\n"); printf("\tassoc\n"); printf("\tget-assoc\n"); } diff --git a/tools/power/x86/intel-speed-select/isst-core.c b/tools/power/x86/intel-speed-select/isst-core.c index 0bf341ad9697..6dee5332c9d3 100644 --- a/tools/power/x86/intel-speed-select/isst-core.c +++ b/tools/power/x86/intel-speed-select/isst-core.c @@ -619,6 +619,31 @@ int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev) return 0; } +int 
isst_clos_get_clos_information(int cpu, int *enable, int *type) +{ + unsigned int resp; + int ret; + + ret = isst_send_mbox_command(cpu, CONFIG_CLOS, CLOS_PM_QOS_CONFIG, 0, 0, + &resp); + if (ret) + return ret; + + debug_printf("cpu:%d CLOS_PM_QOS_CONFIG resp:%x\n", cpu, resp); + + if (resp & BIT(1)) + *enable = 1; + else + *enable = 0; + + if (resp & BIT(2)) + *type = 1; + else + *type = 0; + + return 0; +} + int isst_pm_qos_config(int cpu, int enable_clos, int priority_type) { unsigned int req, resp; diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index df4aa99c4e92..40346d534f78 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -287,6 +287,26 @@ static void _isst_fact_display_information(int cpu, FILE *outf, int level, format_and_print(outf, base_level + 2, header, value); } +void isst_ctdp_display_core_info(int cpu, FILE *outf, char *prefix, + unsigned int val) +{ + char header[256]; + char value[256]; + + snprintf(header, sizeof(header), "package-%d", + get_physical_package_id(cpu)); + format_and_print(outf, 1, header, NULL); + snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); + format_and_print(outf, 2, header, NULL); + snprintf(header, sizeof(header), "cpu-%d", cpu); + format_and_print(outf, 3, header, NULL); + + snprintf(value, sizeof(value), "%u", val); + format_and_print(outf, 4, prefix, value); + + format_and_print(outf, 1, NULL, NULL); +} + void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, struct isst_pkg_ctdp *pkg_dev) { @@ -503,6 +523,57 @@ void isst_clos_display_information(int cpu, FILE *outf, int clos, format_and_print(outf, 1, NULL, NULL); } +void isst_clos_display_clos_information(int cpu, FILE *outf, + int clos_enable, int type) +{ + char header[256]; + char value[256]; + + snprintf(header, sizeof(header), "package-%d", + get_physical_package_id(cpu)); + format_and_print(outf, 1, header, NULL); + snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); + format_and_print(outf, 2, header, NULL); + snprintf(header, sizeof(header), "cpu-%d", cpu); + format_and_print(outf, 3, header, NULL); + + snprintf(header, sizeof(header), "core-power"); + format_and_print(outf, 4, header, NULL); + + snprintf(header, sizeof(header), "enable-status"); + snprintf(value, sizeof(value), "%d", clos_enable); + format_and_print(outf, 5, header, value); + + snprintf(header, sizeof(header), "priority-type"); + snprintf(value, sizeof(value), "%d", type); + format_and_print(outf, 5, header, value); + + format_and_print(outf, 1, NULL, NULL); +} + +void isst_clos_display_assoc_information(int cpu, FILE *outf, int clos) +{ + char header[256]; + char value[256]; + + snprintf(header, sizeof(header), "package-%d", + get_physical_package_id(cpu)); + format_and_print(outf, 1, header, NULL); + snprintf(header, sizeof(header), "die-%d", get_physical_die_id(cpu)); + format_and_print(outf, 2, header, NULL); + snprintf(header, sizeof(header), "cpu-%d", cpu); + format_and_print(outf, 3, header, NULL); + + snprintf(header, sizeof(header), "get-assoc"); + format_and_print(outf, 4, header, NULL); + + snprintf(header, sizeof(header), "clos"); + snprintf(value, sizeof(value), "%d", clos); + format_and_print(outf, 5, header, value); + + format_and_print(outf, 1, NULL, NULL); +} + void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, int result) { diff --git a/tools/power/x86/intel-speed-select/isst.h 
b/tools/power/x86/intel-speed-select/isst.h index 2f7f62765eb6..d280b27d600d 100644 --- a/tools/power/x86/intel-speed-select/isst.h +++ b/tools/power/x86/intel-speed-select/isst.h @@ -187,12 +187,16 @@ extern int isst_send_msr_command(unsigned int cpu, unsigned int command, int write, unsigned long long *req_resp); extern int isst_get_ctdp_levels(int cpu, struct isst_pkg_ctdp *pkg_dev); +extern int isst_get_coremask_info(int cpu, int config_index, + struct isst_pkg_ctdp_level_info *ctdp_level); extern int isst_get_process_ctdp(int cpu, int tdp_level, struct isst_pkg_ctdp *pkg_dev); extern void isst_get_process_ctdp_complete(int cpu, struct isst_pkg_ctdp *pkg_dev); extern void isst_ctdp_display_information(int cpu, FILE *outf, int tdp_level, struct isst_pkg_ctdp *pkg_dev); +extern void isst_ctdp_display_core_info(int cpu, FILE *outf, char *prefix, + unsigned int val); extern void isst_ctdp_display_information_start(FILE *outf); extern void isst_ctdp_display_information_end(FILE *outf); extern void isst_pbf_display_information(int cpu, FILE *outf, int level, @@ -223,10 +227,14 @@ extern int isst_clos_associate(int cpu, int clos); extern int isst_clos_get_assoc_status(int cpu, int *clos_id); extern void isst_clos_display_information(int cpu, FILE *outf, int clos, struct isst_clos_config *clos_config); - +extern void isst_clos_display_assoc_information(int cpu, FILE *outf, int clos); extern int isst_read_reg(unsigned short reg, unsigned int *val); extern int isst_write_reg(int reg, unsigned int val); extern void isst_display_result(int cpu, FILE *outf, char *feature, char *cmd, int result); + +extern int isst_clos_get_clos_information(int cpu, int *enable, int *type); +extern void isst_clos_display_clos_information(int cpu, FILE *outf, + int clos_enable, int type); #endif diff --git a/tools/testing/selftests/.gitignore b/tools/testing/selftests/.gitignore index 8059ce834247..61df01cdf0b2 100644 --- a/tools/testing/selftests/.gitignore +++ b/tools/testing/selftests/.gitignore @@ -2,3 +2,5 @@ gpiogpio-event-mon gpiogpio-hammer gpioinclude/ gpiolsgpio +tpm2/SpaceTest.log +tpm2/*.pyc diff --git a/tools/testing/selftests/tpm2/Makefile b/tools/testing/selftests/tpm2/Makefile index 9dd848427a7b..bf401f725eef 100644 --- a/tools/testing/selftests/tpm2/Makefile +++ b/tools/testing/selftests/tpm2/Makefile @@ -2,3 +2,4 @@ include ../lib.mk TEST_PROGS := test_smoke.sh test_space.sh +TEST_FILES := tpm2.py tpm2_tests.py diff --git a/usr/Makefile b/usr/Makefile index 6a89eb019275..e6f7cb2f81db 100644 --- a/usr/Makefile +++ b/usr/Makefile @@ -11,6 +11,9 @@ datafile_y = initramfs_data.cpio$(suffix_y) datafile_d_y = .$(datafile_y).d AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)" +# clean rules do not have CONFIG_INITRAMFS_COMPRESSION. So clean up after all +# possible compression formats. +clean-files += initramfs_data.cpio* # Generate builtin.o based on initramfs_data.o obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o
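A closing note on the sidtab rework earlier in this patch: dropping atomic_t in favor of a plain u32 is safe because readers now pair smp_load_acquire() on 'count' with the writer's smp_store_release(), which guarantees that a reader observing the new count also observes every entry written before it. The stand-alone C11 sketch below (hypothetical names, user-space atomics used purely for illustration) captures the same publish/consume protocol:

#include <stdatomic.h>

#define DEMO_MAX 128

static int entries[DEMO_MAX];
static atomic_uint count;

/* Writer (serialized externally, e.g. by a spinlock): fill the entry
 * first, then publish the new count with release semantics.
 */
static void publish(int value)
{
	unsigned int n = atomic_load_explicit(&count, memory_order_relaxed);

	if (n >= DEMO_MAX)
		return;
	entries[n] = value;
	atomic_store_explicit(&count, n + 1, memory_order_release);
}

/* Lock-free reader: the acquire load of count orders all subsequent
 * entry reads after the writer's stores.
 */
static int lookup(unsigned int index, int *out)
{
	unsigned int n = atomic_load_explicit(&count, memory_order_acquire);

	if (index >= n)
		return -1;
	*out = entries[index];
	return 0;
}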